Python KNNWithMeans.train Examples

Programming Language: Python

Namespace/Package Name: surprise

Class/Type: KNNWithMeans

Method/Function: train

Examples at hotexamples.com: 7

Python KNNWithMeans.train - 7 examples found. These are the top rated real world Python examples of surprise.KNNWithMeans.train extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

KNNWithMeans(30)

fit(30)

predict(30)

test(30)

train(7)

get_neighbors(4)

__init__(2)

sim(2)

compute_similarities(1)

prepare_model(1)

Example #1

Show file

def knn_m(data, training, testing):
    '''
        Tune KNN with Means by grid search, then fit and evaluate the best model.

        Args:
            data(Dataset): the whole dataset divided into 5 folds
            training(Dataset): training dataset
            testing(Dataset): test dataset

        Returns:
            rmse: RMSE of KNN with Means with optimized parameters
            top_n: result of get_top_n() on the test predictions with n=5
    '''

    # candidate parameters
    knn_param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False],
        },
    }

    # optimize parameters
    knnm_grid_search = GridSearch(KNNWithMeans, knn_param_grid, measures=['RMSE'], verbose=False)
    knnm_grid_search.evaluate(data)
    param = knnm_grid_search.best_params['RMSE']
    print('KNNWithMeans:', param)

    # fit model using the optimized parameters
    # The similarity settings must travel together as the `sim_options` dict:
    # passed as loose keywords (name=..., min_support=..., user_based=...) they
    # are ignored and the model silently trains with default similarity.
    knnm = KNNWithMeans(k=param['k'], sim_options=param['sim_options'])
    knnm.train(training)

    # evaluate the model using test data
    predictions = knnm.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse, top_n

Example #2

Show file

def knnm_running_time(data):
    '''
        Measures training and prediction running times for KNN with Means

        Hyper-parameters are tuned once, on data[3], and then reused for every
        dataset in the list.

        Args:
            data(Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_KnnMeanstrain: running time for training, one entry per dataset
            elapsedtime_KnnMeanstest: running time for predictions on testset, one entry per dataset
    '''
    elapsedtime_KnnMeanstrain = []
    elapsedtime_KnnMeanstest = []

    # tune the parameters on one reference dataset
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearch(KNNWithMeans,
                             param_grid,
                             measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    k = param['k']
    sim_options = param['sim_options']

    # using the tuned parameters calculate running times
    for dataset in data:
        # training running time
        # NOTE: the timer also covers building the trainset and the
        # anti-testset, exactly as in the original measurement.
        training_start = time.time()
        training = dataset.build_full_trainset()
        testing = training.build_anti_testset()
        # Similarity settings are only honoured inside `sim_options`; loose
        # keywords (name=, min_support=, user_based=) would be ignored.
        knnm = KNNWithMeans(k=k, sim_options=sim_options)
        knnm.train(training)
        elapsedtime_KnnMeanstrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knnm.test(testing)
        elapsedtime_KnnMeanstest.append(time.time() - test_start)
    return elapsedtime_KnnMeanstrain, elapsedtime_KnnMeanstest

Example #3

Show file

def model_training_and_evalution():
    '''
        Train an item-based KNNWithMeans model on the playlist/song ratings
        file and print the 10 nearest neighbours of one hard-coded song.

        Reads the ratings from a fixed local path and the item-name mappings
        from read_item_names(); returns nothing.
    '''
    # Single-argument print(...) calls behave the same as the print statement
    # on Python 2 and are also valid Python 3.
    print("欢迎来到 训练阶段")
    file_path = os.path.expanduser(r'E:\JiangIntellijWorkingSpace\tools\music_recommendation\transform_playlist_song_rating.txt')
    reader = Reader(line_format='user item rating', sep='\t')
    music_data = Dataset.load_from_file(file_path, reader=reader)
    print("构建数据集")
    trainset = music_data.build_full_trainset()
    print("开始训练模型.....")
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    # Must be passed by keyword: the first positional parameter of
    # KNNWithMeans is `k`, so KNNWithMeans(sim_options) would store the dict
    # as k and train with the default similarity instead.
    algo = KNNWithMeans(sim_options=sim_options)
    algo.train(trainset)
    rid_to_name, name_to_rid = read_item_names()
    # print name_to_rid
    toy_story_raw_id = name_to_rid[u'Over The Horizon-SAMSUNG GALAXY THEME']
    # toy_story_raw_id=423245641
    print(toy_story_raw_id)
    # raw id -> inner id, neighbours in inner ids, then back to raw ids and names
    toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
    toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
    toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in toy_story_neighbors)
    toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)
    print('the 10 nearest neighbors of it are(为你推荐最相近的10首歌单):')
    for music in toy_story_neighbors:
        print(music)

Example #4

Show file

# Python 2 script: train KNNWithMeans on the playlist/song ratings file and
# look up the nearest neighbours of one playlist.
# NOTE(review): name_id_dic is not defined in the visible code; presumably it
# maps playlist names to raw ids -- confirm against the rest of the file.
file_path = os.path.expanduser('./data/163_music_suprise_format.txt')
# Specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# Compute the similarity between songs
print "构建数据集..."
trainset = music_data.build_full_trainset()
#sim_options = {'name': 'pearson_baseline', 'user_based': False}

# Find the nearest users
print "开始训练模型..."
#sim_options = {'user_based': False}
#algo = KNNBaseline(sim_options=sim_options)
# Constructed with no arguments, so the library defaults are used.
algo = KNNWithMeans()
algo.train(trainset)

# Pick the key at position 39 of name_id_dic as the playlist to inspect.
current_playlist = list(name_id_dic.keys())[39]
print "歌单名称", current_playlist

# Fetch the neighbours
# Map the name to its id
playlist_id = name_id_dic[current_playlist]
print "歌单id", playlist_id
# Get the corresponding inner user id => to_inner_uid
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print "内部id", playlist_inner_id

playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)

# Convert the song ids to song names

Example #5

Show file

File: recsys.py Project: databill86/RecommenderSystem

class recommender:
    '''
    Common fit/predict front end over several recommendation algorithms.

    The algorithm is selected by name at construction time. Names listed in
    ``surprise_algorithms`` are fitted with the Surprise library; names listed
    in ``devooght_algorithms`` go through the sequence-based pipeline built on
    the ``preprocess`` / ``parse`` / ``train`` / ``handler`` modules that this
    file imports elsewhere.
    '''

    def __init__(self, algorithm):
        '''
        Args:
            algorithm(str): algorithm name; it is lower-cased before use
        '''
        self.name = algorithm.lower()  # SVD, NMF, SAE, LSTM
        self.surprise_algorithms = ['svd', 'nmf', 'knnbasic', 'knnmeans']
        self.devooght_algorithms = ['fism']

        # To implement with surprise:
        #     - Matrix-Factorization Based:
        #         SVDpp: The SVD++ algorithm, an extension of SVD taking into
        #             account implicit ratings.
        #     - Neighbourhood-based:
        #         Coclustering
        #         KNNWithZScore: A basic collaborative filtering algorithm,
        #             taking into account the z-score normalization of each user.
        #         KNNBaseline: A basic collaborative filtering algorithm taking
        #             into account a baseline rating.
        #     - Random Predictor
        #         NormalPredictor: Algorithm predicting a random rating based on
        #             the distribution of the training set, which is assumed to
        #             be normal.
        #     - Baseline
        #         BaselineOnly: Algorithm predicting the baseline estimate for
        #             given user and item.
        #     - Slope One
        #         SlopeOne: A simple yet accurate collaborative filtering algorithm.
        #
        # To implement using RNN:
        #     - LSTM
        #     - GRU (Devooght, Bersini)
        #     - GRU with clustering (Devooght, Bersini)
        #
        # To extract latent factors:
        #     - Stacked Autoencoders
        #     - CNN
        #     - CNN with Stacked Autoencoders

        # Set so that get_model() returns None instead of raising before fit().
        self.model = None
        self.df_known_predictions = None
        self.df_unknown_predictions = None
        self.known_sequence_dict = None
        self.unknown_sequence_dict = None
        self.k = None
        self.k_min = None
        self.metrics = None

    def get_name(self, verbose=False):
        '''Return the lower-cased algorithm name; `verbose` is unused.'''
        return self.name

    @staticmethod
    def _kwarg_or_default(kwargs, key, default, label, verbose):
        '''Return kwargs[key], or `default` (announcing it when verbose) if the key is absent.'''
        if key in kwargs:
            return kwargs[key]
        if verbose:
            print("Using default {}: {}".format(label, default))
        return default

    def fit(self,
            df_ratings=None,
            columns=['userId', 'itemId', 'rating'],
            verbose=False,
            **kwargs):
        '''
        Fit the algorithm selected at construction time.

        Args:
            df_ratings(DataFrame): ratings table; a copy is stored on the
                instance. When None, a previously stored table is reused.
            columns(list): column names of df_ratings (user, item, rating and
                optionally 'timestamp'). The default list is never mutated.
            verbose(bool): print progress / default-value messages
            **kwargs: algorithm options: n_factors, n_epochs (svd, nmf);
                k, k_min (knnbasic, knnmeans)

        Returns:
            0 on success, -1 when the algorithm name is unknown or not configured
        '''
        self.columns = np.array(columns)
        # If Surprise lib is the base package to fit, then df_ratings must be used.
        # Algorithms that use Surprise Lib: NMF, SVD, KNN, SVDpp

        if df_ratings is not None:
            self.df_ratings = df_ratings.copy()

        if self.name in self.surprise_algorithms:  # Surprise-based recommenders
            return self._fit_surprise(columns, verbose, kwargs)

        if self.name in self.devooght_algorithms:
            return self._fit_devooght(df_ratings, verbose)

        if verbose:
            print("Invalid algorithm: {}".format(self.name))
        # Same failure code as the "not configured" paths (was an implicit None).
        return -1

    def _fit_surprise(self, columns, verbose, kwargs):
        '''Build the Surprise trainset from self.df_ratings and train the selected model.'''
        from surprise import Dataset
        from surprise import Reader

        # A reader is still needed but only the rating_scale param is required.
        # The Reader class is used to parse a file containing ratings.
        reader = Reader(rating_scale=(0.5, 5.0))

        # Separating timestamp column (skipped when an earlier fit() already
        # removed it from the stored frame)
        if 'timestamp' in columns and 'timestamp' in self.df_ratings:
            self.df_timestamp = self.df_ratings['timestamp'].copy()
            self.df_ratings.drop(labels='timestamp', inplace=True, axis=1)

        # The columns must correspond to user id, item id and ratings (in that order).
        data = Dataset.load_from_df(
            self.df_ratings[self.columns[np.where(
                self.columns != 'timestamp')]], reader)

        # Creating trainset variable to be used in prediction functions of Surprise
        self.trainset = data.build_full_trainset()

        # Creating Model
        if self.name == 'svd':
            from surprise import SVD

            # Number of factors in matrix factorization / epochs of
            # stochastic gradient descent
            self.n_factors = self._kwarg_or_default(
                kwargs, 'n_factors', 100, 'number of factors', verbose)
            self.n_epochs = self._kwarg_or_default(
                kwargs, 'n_epochs', 20, 'number of epochs', verbose)

            self.model = SVD(n_factors=self.n_factors,
                             n_epochs=self.n_epochs,
                             verbose=verbose)

        elif self.name == 'nmf':
            from surprise import NMF

            self.n_factors = self._kwarg_or_default(
                kwargs, 'n_factors', 15, 'number of factors', verbose)
            self.n_epochs = self._kwarg_or_default(
                kwargs, 'n_epochs', 50, 'number of epochs', verbose)

            self.model = NMF(n_factors=self.n_factors,
                             n_epochs=self.n_epochs,
                             verbose=verbose)

        elif self.name == 'knnbasic':
            from surprise import KNNBasic

            # Number of neighbours / minimum number of neighbours
            self.k = self._kwarg_or_default(kwargs, 'k', 40, 'k', verbose)
            self.k_min = self._kwarg_or_default(kwargs, 'k_min', 1, 'k_min', verbose)

            self.model = KNNBasic(k=self.k,
                                  min_k=self.k_min,
                                  verbose=verbose)

        elif self.name in ('knnmeans', 'kmeans'):
            # 'knnmeans' is the name registered in surprise_algorithms; the
            # branch used to test only for 'kmeans', which made it unreachable.
            from surprise import KNNWithMeans

            self.k = self._kwarg_or_default(kwargs, 'k', 40, 'k', verbose)
            self.k_min = self._kwarg_or_default(kwargs, 'k_min', 1, 'k_min', verbose)

            self.model = KNNWithMeans(k=self.k,
                                      min_k=self.k_min,
                                      verbose=verbose)

        else:
            if verbose:
                print("Algorithm not configured: {}".format(self.name))
            return -1

        # Train the algorithm on the trainset
        self.model.train(self.trainset)

        return 0

    def _fit_devooght(self, df_ratings, verbose):
        '''Prepare the sequence-format files on disk and train the sequence-based model.'''
        # Arguments
        directory_path = os.path.join(
            '.', 'Sequence_based_recommendation_files', self.name)
        preprocess.create_dirs(dirname=directory_path, verbose=verbose)

        data = preprocess.remove_rare_elements(data=df_ratings,
                                               min_user_activity=1,
                                               min_item_popularity=1,
                                               verbose=verbose)

        data = preprocess.save_index_mapping(data=data,
                                             dirname=directory_path,
                                             separator=',')

        train_set, val_set, test_set = preprocess.split_data(
            data=data,
            nb_val_users=0.1,  # val_size
            nb_test_users=0.1,  # test_size
            dirname=directory_path,
            verbose=verbose)

        preprocess.make_sequence_format(train_set=train_set,
                                        val_set=val_set,
                                        test_set=test_set,
                                        dirname=directory_path,
                                        verbose=verbose)

        preprocess.save_data_stats(data=data,
                                   train_set=train_set,
                                   val_set=val_set,
                                   test_set=test_set,
                                   dirname=directory_path,
                                   verbose=verbose)

        # Training Algorithm
        parser = parse.command_parser(parse.predictor_command_parser,
                                      train.training_command_parser,
                                      parse.early_stopping_command_parser)

        if self.name != 'fism':
            if verbose:
                print("Algorithm not configured: {}".format(self.name))
            return -1

        args = parser.parse_args([
            '--dir',
            os.path.join(directory_path, 'models'),
            '-d',
            directory_path,  #directory_path + '/',
            '-b',
            '20',  # Batch size: the number of training examples present in a single batch
            '--max_iter',
            '50',  # Maximum number of iterations: the number of batches needed to complete one epoch
            '--progress',
            '10',  # when progress information should be printed during training
            '-m',
            self.name.upper(),  # Method
            #'-i', '-1', # Number of batches - only on test parser
            '--loss',
            'RMSE',
            '--save',
            'Best'
        ])

        self.model = parse.get_predictor(args)

        dataset = handler.DataHandler(
            dirname=args.dataset,
            extended_training_set=args.extended_set,
            shuffle_training=args.tshuffle)

        self.model.prepare_model(dataset)
        self.metrics = self.model.train(
            dataset,
            save_dir=args.dir,
            time_based_progress=args.time_based_progress,
            progress=float(args.progress),
            autosave=args.save,
            max_progress_interval=args.mpi,
            max_iter=args.max_iter,
            min_iterations=args.min_iter,
            max_time=args.max_time,
            early_stopping=parse.get_early_stopper(args),
            load_last_model=args.load_last_model,
            validation_metrics=args.metrics.split(','))

        return 0

    def get_model(self):
        '''Return the underlying model object (None before fit()).'''
        return self.model

    def get_metrics(self):
        '''Return the metrics stored by the sequence-based training (None otherwise).'''
        return self.metrics

    @staticmethod
    def _predictions_frame(rows):
        '''Turn [userId, itemId, rating, prediction] rows into a DataFrame with integer ids.'''
        # Build the table in one step; reshape keeps four columns even when
        # there are no rows at all.
        table = np.array(rows, dtype=float).reshape(-1, 4)
        frame = pd.DataFrame({
            'userId': table[:, 0],
            'itemId': table[:, 1],
            'rating': table[:, 2],
            'prediction': table[:, 3]
        })
        frame['userId'] = frame['userId'].astype(int)
        frame['itemId'] = frame['itemId'].astype(int)
        return frame

    def calculate_known_predictions(self):
        '''
        Predict every rating present in the trainset and store the result in
        self.df_known_predictions (userId, itemId, rating, prediction and,
        when fit() separated one, timestamp). Only acts for Surprise algorithms.
        '''
        if self.name in self.surprise_algorithms:
            # predictions return raw uid and iid
            known_predictions = self.model.test(
                self.trainset.build_testset())  # all predictions of existing ratings

            rows = [[
                int(prediction.uid),
                int(prediction.iid), prediction.r_ui, prediction.est
            ] for prediction in known_predictions]
            self.df_known_predictions = self._predictions_frame(rows)

            if 'timestamp' in self.columns:
                # fit() moved the timestamp column out of self.df_ratings into
                # self.df_timestamp, so it is re-attached here before joining
                # on (userId, itemId).
                df_time = self.df_ratings.drop(
                    'rating', axis=1).assign(timestamp=self.df_timestamp)
                self.df_known_predictions = self.df_known_predictions.set_index(
                    keys=['userId', 'itemId']).join(
                        df_time.set_index(
                            keys=['userId', 'itemId'])).reset_index()

    def get_known_predictions(self, calculate_predictions=False):
        '''Return the known-ratings predictions, computing them on first use or on request.'''
        if self.df_known_predictions is None or calculate_predictions:
            self.calculate_known_predictions()

        return self.df_known_predictions

    def calculate_unknown_predictions(self):
        '''
        Predict every (user, item) pair absent from the trainset and store the
        result in self.df_unknown_predictions; the rating column is filled
        with 0. Only acts for Surprise algorithms.
        '''
        if self.name in self.surprise_algorithms:
            # predictions return raw uid and iid
            unknown_predictions = self.model.test(
                self.trainset.build_anti_testset()
            )  # all predictions of non-existing ratings

            rows = [[
                int(prediction.uid),
                int(prediction.iid), 0, prediction.est
            ] for prediction in unknown_predictions]
            self.df_unknown_predictions = self._predictions_frame(rows)

    def get_unknown_predictions(self, calculate_predictions=False):
        '''Return the unknown-ratings predictions, computing them on first use or on request.'''
        if self.df_unknown_predictions is None or calculate_predictions:
            self.calculate_unknown_predictions()

        return self.df_unknown_predictions

    def predict(self, userId, itemId, verbose=False):
        '''
        Predict the rating of one item for one user (raw ids, cast to int).

        Returns:
            the estimated rating, 0 when Surprise flags the prediction as
            impossible, or None for non-Surprise algorithms
        '''
        if self.name in self.surprise_algorithms:
            prediction = self.model.predict(
                uid=int(userId),
                iid=int(itemId))  # Take as input the raw user id and item id
            #ref: http://surprise.readthedocs.io/en/stable/algobase.html#surprise.prediction_algorithms.algo_base.AlgoBase.predict

            if prediction.details['was_impossible']:
                if verbose:
                    print(
                        "Impossible to predict item {} rating for user {} (one of them may not have been in training step)"
                        .format(itemId, userId))
                return 0
            return prediction.est

    def get_top_n(self, n=10, source='unknown', calculate_sequence=False):
        '''Return the top-N items for each user, ranked by predicted rating.
        Args:
            n(int): The number of recommendation to output for each user. Default
                is 10.
            source(str): only 'known' (case-insensitive) is handled by the
                code shown here; any other value returns None.
            calculate_sequence(bool): rebuild the cached result; without it a
                previously cached dict is returned even if n differs.
        Returns:
        A dict where keys are user ids and values are arrays holding the
        item ids of the n highest predictions for that user. '''

        if source.lower() == 'known':

            # Checking if known predictions are calculated (this used to call
            # get_unknown_predictions, which left df_known_predictions as None)
            if self.df_known_predictions is None:
                self.get_known_predictions(calculate_predictions=True)

            if calculate_sequence or self.known_sequence_dict is None:
                self.known_sequence_dict = dict()

                for userId in self.df_known_predictions['userId'].unique():
                    # Selecting single user
                    df_user = self.df_known_predictions[
                        self.df_known_predictions['userId'] == userId].copy()

                    # Sorting values by prediction
                    df_user.sort_values(by=['prediction'],
                                        ascending=False,
                                        inplace=True)

                    # Saving the first n in sequence dict
                    self.known_sequence_dict[userId] = np.array(
                        df_user['itemId'].head(n))

            return self.known_sequence_dict

Example #6

Show file

class Surprise_recommender:
    '''
    Helper around Surprise for tuning, training and top-N evaluation.

    train_test_model() stores the fitted algorithm in self.algo, which
    generate_top_n_recommendation() then uses for its predictions.
    '''

    def __init__(self, reader):
        '''
        Constructor

        ------
        Args:
        reader: A reader object for the dataset object in surprise
        '''
        self.reader = reader

    def create_test_set(self, test_data):
        '''
        Function to create test_set
        This function keeps only the first three fields of every record,
        which drops the timestamp from the data

        ------
        Args:
        test_data: input test data

        ------
        Returns:
        ts: test data after removing time stamp feature
        Basically a list with the following format: user, item, rating
        '''
        return [[record[0], record[1], record[2]] for record in test_data]

    def create_train_set(self, train_data):
        '''
        Function to create training set

        ------
        Args:
        train_data: Training set in the form of list
        (presumably records of user, item, rating, timestamp -- confirm
        against the caller)

        ------
        Returns:
        Trainset object from surprise
        '''
        ds = Dataset(self.reader)
        return ds.construct_trainset(train_data)

    @staticmethod
    def _dump_pickle(obj, path):
        '''Pickle obj to path, closing the file even if dumping fails.'''
        with open(path, 'wb') as handle:
            p.dump(obj, handle)

    def _train_and_report(self, algo, train_set, test_set):
        '''Store algo in self.algo, train it and print RMSE / MAE on test_set.'''
        self.algo = algo
        self.algo.train(train_set)
        predictions = self.algo.test(test_set)
        test_rmse = accuracy.rmse(predictions, verbose=True)
        test_mae = accuracy.mae(predictions, verbose=True)
        print("RMSE of predictions", test_rmse)
        print("MAE of predictions", test_mae)

    def train_test_model(self, validation_set, train_set, test_set, algorithm,
                         task):
        '''
        Function to train models using different algorithms. Dumps GridSearch results
        for further analysis.

        ------
        Args:
        validation_set: Dataset for hyperparameter optimization
        train_set: The training data formatted according to the needs of surprise
        test_set: Testing data to check RMSE and MAE after GridSearch
        algorithm: The algorithm for training the model, one of 'SVD', 'NMF'
        or 'KNNWithMeans'; any other value does nothing
        task: Make predictions for rating, sentiment scores or for combined rating
        (used only in the name of the dump file)

        ------
        Returns:None
        '''

        if algorithm == 'SVD':

            param_grid = {
                'n_epochs': np.arange(1, 101, 10).tolist(),
                'n_factors': [10, 50, 100]
            }
            grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'MAE'])

            grid_search.evaluate(validation_set)

            self._dump_pickle(grid_search.cv_results,
                              '../stats/svd_results_' + task + '.p')
            print(type(grid_search.cv_results))
            print(grid_search.cv_results)

            #Test based on best training RMSE
            best = grid_search.best_params['RMSE']
            self._train_and_report(
                SVD(n_epochs=best['n_epochs'], n_factors=best['n_factors']),
                train_set, test_set)

        elif algorithm == 'NMF':

            param_grid = {
                'n_epochs': np.arange(0, 100, 10).tolist(),
                'n_factors': [10, 100]
            }
            grid_search = GridSearch(NMF, param_grid, measures=['RMSE', 'MAE'])

            grid_search.evaluate(validation_set)

            self._dump_pickle(grid_search,
                              '../stats/nmf_results_' + task + '.p')
            print(grid_search.best_score['RMSE'])
            print(grid_search.best_score['MAE'])

            #Test based on best training RMSE
            best = grid_search.best_params['RMSE']
            self._train_and_report(
                NMF(n_epochs=best['n_epochs'], n_factors=best['n_factors']),
                train_set, test_set)

        elif algorithm == 'KNNWithMeans':
            # GridSearch documents `sim_options` as a dict of candidate lists
            # and expands the combinations itself; the same three user-based
            # similarities are searched as before.
            param_grid = {
                'k': np.arange(1, 20).tolist(),
                'sim_options': {
                    'name': ['cosine', 'msd', 'pearson'],
                    'user_based': [True]
                }
            }
            grid_search = GridSearch(KNNWithMeans,
                                     param_grid,
                                     measures=['RMSE', 'MAE'])
            grid_search.evaluate(validation_set)

            self._dump_pickle(grid_search,
                              '../stats/knn_means_results' + task + '.p')

            #Test based on best training RMSE
            best = grid_search.best_params['RMSE']
            self._train_and_report(
                KNNWithMeans(k=best['k'], sim_options=best['sim_options']),
                train_set, test_set)

    def generate_top_n_recommendation(self, test_set, train_set):
        '''
        Function to evaluate top N recommendations: for every user of the
        training set, ranks that user's test items against 1000 randomly drawn
        unseen items and prints mean precision@10, recall@10 and the F score.

        ----
        Args:
        test_set: The testing set as a list
        train_set: The training set as a list
        '''
        user_list = set([x[0] for x in train_set])
        print("Number of users = ", len(user_list))

        # The item universe and the per-user item sets do not depend on the
        # user being processed, so they are built once instead of rescanning
        # both lists for every user.
        item_train_all = set([x[1] for x in train_set])
        item_test_all = set([x[1] for x in test_set])
        item_all = item_train_all.union(item_test_all)

        train_items_by_user = {}
        for record in train_set:
            train_items_by_user.setdefault(record[0], set()).add(record[1])
        test_items_by_user = {}
        for record in test_set:
            test_items_by_user.setdefault(record[0], set()).add(record[1])

        precision_list = []
        recall_list = []
        for j, user in enumerate(user_list, 1):
            if j % 1000 == 0:
                print("Touchdown, j = ", j)
            item_train = train_items_by_user.get(user, set())
            item_test = test_items_by_user.get(user, set())
            negative_items = [
                x for x in item_all
                if x not in item_train and x not in item_test
            ]

            # Get 1000 random negative items (drawn with replacement); a user
            # who touched every item has none, and randint(0, 0) would raise.
            if negative_items:
                negative_indices = np.random.randint(0,
                                                     len(negative_items),
                                                     size=1000)
                negative_subset = [negative_items[x] for x in negative_indices]
            else:
                negative_subset = []
            # All of the user's test items, in random order, are the positives
            positive_subset = list(item_test)
            np.random.shuffle(positive_subset)
            subset = positive_subset + negative_subset
            pred_list = []
            for item in subset:
                pred = self.algo.predict(user, item, r_ui=1, verbose=False)
                pred_list.append(pred)
            predictions = sorted(pred_list, key=lambda x: x.est, reverse=True)
            precision = self.calculate_precision(predictions, positive_subset,
                                                 10)
            recall = self.calculate_recall(predictions, positive_subset, 10)
            precision_list.append(precision)
            recall_list.append(recall)

        precision = np.mean(precision_list)
        recall = np.mean(recall_list)
        print("Mean precision = ", precision)
        print("Mean recall = ", recall)
        print("fscore=", self.calculate_f_measure(precision, recall))
        return

    def calculate_precision(self, predictions, positive_items, N):
        '''
        Function to calculate precision: share of the N top slots filled by a
        positive item. With fewer than N predictions the missing slots count
        as misses instead of raising IndexError.
        '''
        count = sum(1 for pred in predictions[:N]
                    if pred.iid in positive_items)
        return float(count) / N

    def calculate_recall(self, predictions, positive_items, N):
        '''
        Function to calculate recall: share of the positive items found among
        the top N predictions. Returns 0.0 when there are no positive items
        (e.g. a user without test ratings) instead of dividing by zero.
        '''
        if not positive_items:
            return 0.0
        top_iids = set(pred.iid for pred in predictions[:N])  #Get TOP N Predictions
        count = sum(1 for item in positive_items if item in top_iids)
        return float(count) / len(positive_items)

    def calculate_f_measure(self, precision, recall):
        '''
        Function to calculate the F measure (harmonic mean of precision and
        recall); 0 when both are zero.
        '''
        # Checked explicitly: NumPy floats yield NaN on 0/0 rather than raising,
        # so catching the exception did not cover the means computed above.
        total = precision + recall
        if total == 0:
            return 0
        return 2.0 * precision * recall / total

Example #7

Show file

File: user_based.py Project: gusfelhberg/DataMining

# Load the test rows and drop the columns that are not used for prediction.
dftest = pd.read_csv(test_file_path)
dftest = dftest.drop(['test_id', 'date'], axis=1)


# create a trainset object 
reader = Reader()
data = Dataset.load_from_df(dftrain, reader)
trainingSet = data.build_full_trainset()


# create a user-based K-nearest neighbours algorithm
# - uses the Pearson correlation to measure user similarites 
# - takes user bias into account 
sim_options = {'name':'pearson'}
algo = KNNWithMeans(sim_options=sim_options)

# train the algorithm using the training set
########### fails here with MemoryError when I try to use the full set
algo.train(trainingSet)

# use the trained algorithm to predict ratings for the test set 
# output to a csv file
# The context manager closes the file even if a prediction raises part-way
# through; rows are addressed by label 0..len-1, as produced by read_csv.
with open('ub_testOutput.csv', 'w') as f:
    for i in range(len(dftest)):
        pred = algo.predict(dftest.at[i, 'user_id'], dftest.at[i, 'business_id'], r_ui=4, verbose=True)
        predRating = pred.est
        f.write(str(i) + ", " + str(predRating) + '\n')