Example 1
0
def test_performances():
    """Check the dict returned by evaluate(), exercising dumping as well."""

    here = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(here + '/custom_train', here + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    algo = NormalPredictor()
    dump_dir = tempfile.mkdtemp()  # scratch directory for the dump
    performances = evaluate(algo, data, measures=['RmSe', 'Mae'],
                            with_dump=True, dump_dir=dump_dir, verbose=2)
    shutil.rmtree(dump_dir)  # clean up the scratch directory

    print(performances)
    # The returned dict should be case-insensitive: any spelling of a measure
    # name must resolve to the very same value object.
    assert performances['RMSE'] is performances['rmse']
    assert performances['MaE'] is performances['mae']
Example 2
0
def test_unknown_user_or_item():
    """Ensure that all algorithms act gracefully when asked to predict a rating
    of an unknown user, an unknown item, and when both are unknown.
    """

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))
    file_path = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(file_path=file_path, reader=reader)

    # Exhaust the folds generator so trainset/testset hold the last fold.
    for trainset, testset in data.folds():
        pass

    for klass in (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                  KNNBaseline, SVD, SVDpp):
        algo = klass()
        algo.train(trainset)
        # None of these calls should raise, despite the ids being unseen.
        algo.predict(0, 'unknown_item')
        algo.predict('unkown_user', 0)
        algo.predict('unkown_user', 'unknown_item')
Example 3
0
    def get_my_recs(self, my_ratings_dict):
        """Retrain the model with the user's own ratings folded in and store
        ranked ``(title, estimated rating)`` pairs in ``self.my_estimates``.

        Args:
            my_ratings_dict: mapping of movieId -> rating given by the user
                (keys become the 'movieId' column below).
        """
        self.my_rated_movies = list(my_ratings_dict.keys())
        personal_df = pd.DataFrame({'movieId': list(my_ratings_dict.keys()),
                                    'rating': list(my_ratings_dict.values())})

        id_to_title = {mid: title for title, mid
                       in zip(self.movies_df['title'].values,
                              self.movies_df['movieId'])}

        # arbitrarily calling ourselves userId #10000
        personal_df['userId'] = [10000 for _ in range(len(personal_df))]

        personal_df = scale_ratings(personal_df)

        new_df = pd.concat([self.ratings_df, personal_df], ignore_index=True)
        scaled_reader = Reader(rating_scale=(0, 1))
        new_data = convert_df_to_data(new_df, scaled_reader)
        # The unused split(n_folds=5) call was dropped: training uses the
        # full trainset directly.

        new_trainset = new_data.build_full_trainset()
        self.model.train(new_trainset)

        unique_movie_ids = self.ratings_df['movieId'].unique()
        rated = set(self.my_rated_movies)  # O(1) membership tests
        # BUG FIX: exclude movies by id. The original tested
        # `id_to_title[mid] not in self.my_rated_movies`, comparing a *title*
        # against a list of *ids*, so already-rated movies were never filtered.
        my_predictions = [self.model.predict(10000, mid)
                          for mid in unique_movie_ids if mid not in rated]
        my_predictions.sort(key=lambda pred: pred.est, reverse=True)

        self.my_estimates = [(id_to_title[pred.iid], pred.est)
                             for pred in my_predictions]
Example 4
0
def get_your_recs(personal_df, df, movies_df, model):
    """Retrain *model* on *df* plus the user's own ratings and return
    ``(my_top_recs, all_my_estimates_2016, all_my_estimates)``."""

    # arbitrarily calling ourselves userId #700
    personal_df['userId'] = [700] * len(personal_df)

    combined = pd.concat([df, personal_df], ignore_index=True)
    combined_scaled = scale_ratings(combined)
    new_data = convert_df_to_data(combined_scaled, Reader(rating_scale=(0, 1)))
    new_data.split(n_folds=5)

    model.train(new_data.build_full_trainset())

    # Estimate a rating for every distinct movie in the original frame.
    preds = [model.predict(700, mid) for mid in df['movieId'].unique()]

    best_est = max(pred.est for pred in preds)
    top_ids = [pred.iid for pred in preds if pred.est >= best_est]

    id_to_title = {mid: title for title, mid in zip(movies_df['title'].values,
                                                    movies_df['movieId'])}
    # id_to_title = {mid : title for title, mid in movies_id_dict.items()}
    my_top_recs = [id_to_title[mid] for mid in top_ids]

    preds.sort(key=lambda p: p.est, reverse=True)
    all_my_estimates = [(id_to_title[p.iid], p.est) for p in preds]

    all_my_estimates_2016 = [e for e in all_my_estimates if '(2016)' in e[0]]

    return my_top_recs, all_my_estimates_2016, all_my_estimates
Example 5
0
def load_from_panda(df):
    """Wrap a pandas dataframe in a surprise Dataset.

    Args:
        df: pandas dataframe to wrap — presumably with columns
            ['userId', 'movieId', 'rating', 'timestamp']; verify against caller.

    Returns:
        data: surprise Dataset built with a (1, 5) rating scale
    """
    return Dataset.load_from_df(df, Reader(rating_scale=(1, 5)))
Example 6
0
def test_knns():
    """Ensure the k and min_k parameters are effective for knn algorithms."""

    # the test and train files are from the ml-100k dataset (10% of u1.base and
    # 10 % of u1.test)
    base_dir = os.path.dirname(__file__)
    data = Dataset.load_from_folds(
        [(os.path.join(base_dir, './u1_ml100k_train'),
          os.path.join(base_dir, './u1_ml100k_test'))],
        Reader('ml-100k'))

    # KNNWithMeans and KNNBaseline fall back to a default value when there are
    # not enough neighbors, so only KNNBasic can be verified here.
    k, min_k = 20, 5
    for klass in (KNNBasic, ):
        algo = klass(k=k, min_k=min_k)
        for trainset, testset in data.folds():
            algo.train(trainset)
            for pred in algo.test(testset):
                if not pred.details['was_impossible']:
                    assert min_k <= pred.details['actual_k'] <= k
Example 7
0
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
# NOTE(review): relative path — the script must run from the file's directory.
file_path = 'ratings_robotics.dat'
# Space-separated lines of "user item rating", ratings on a 1-5 scale.
reader = Reader(line_format='user item rating', rating_scale=(1, 5), sep=' ')
data = Dataset.load_from_file(file_path, reader=reader)
#data = Dataset.load_builtin('ml-100k')
# Train on every available rating (no held-out folds).
trainset = data.build_full_trainset()
algo = SVD()
algo.train(trainset)  # NOTE(review): train() is the pre-1.0 surprise API; newer versions use fit()

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Keep the 10 highest-estimated items per user and cache the mapping in Redis.
top_n = get_top_n(predictions, n=10)
r = redis.Redis('localhost')
r.hset('test', 'suggestions_dict',json.dumps(top_n))

# Print the recommended items for each user
Example 8
0
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import SVDpp
from surprise.dataset import Dataset
from surprise.dataset import Reader
from surprise.evaluate import evaluate

# the test and train files are from the ml-100k dataset (10% of u1.base and
# 10 % of u1.test)
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
# A single predefined (train, test) fold shared by the test functions below.
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))


def test_SVD_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVD(n_factors=1, n_epochs=1)
    rmse_default = evaluate(algo, data, measures=['rmse'])['rmse']

    # n_factors
    algo = SVD(n_factors=2, n_epochs=1)
    rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_factors

    # n_epochs
from sklearn.metrics import jaccard_similarity_score as jaccard  # NOTE(review): removed in scikit-learn >= 0.23; jaccard_score is the replacement

# data preparation for collaborative filtering
files=glob('homework/*.txt')
users=pd.read_csv(files[0], sep='\t', header=0)
hotels=pd.read_csv(files[1], sep='\t', header=0)
activity=pd.read_csv(files[2], sep='\t', header=0).drop(16135).reset_index().drop('index', axis=1)  # remove the outlier
# One row per (user, hotel) pair, with the count of browse events.
activity_count=activity.assign(browse=1).groupby(['user', 'hotel']).count().reset_index()
# Cross-join every user with every hotel, then attach the browse counts;
# pairs with no activity get browse = 0.
data=users.assign(key=1).merge(hotels.assign(key=1), on='key', how='inner').drop('key', axis=1)
data=data.merge(activity_count, on=['user', 'hotel'], how='left')
data['browse']=data.browse.fillna(0)
data=data[['user', 'hotel', 'browse']]


# tentatively CV test for some algorithms
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(data, reader)  # rebinds 'data' from DataFrame to surprise Dataset

data_cv=data  # NOTE(review): no copy is made — data_cv and data are the same object, so split() below affects both
data_cv.split(n_folds=5)

# SVD test
svd = SVD()
perf = evaluate(svd, data, measures=['RMSE'])
print_perf(perf)      # MSE 0.052

param_svd = {'n_factors': [50, 100], 'lr_all': [0.003, 0.005],
              'reg_all': [0.05, 0.1, 0.5]}
gs = GridSearch(SVD, param_svd, measures=['RMSE'])
gs.evaluate(data_cv) # RMSE 0.2272 ~ 0.2284, after many tests notice 0.2272 is a benchmark, 100, 0.003, 0.1
Example 10
0
        self.model.train(new_trainset)

        unique_movie_ids = self.ratings_df['movieId'].unique()
        my_predictions = [self.model.predict(10000, mid) for mid in unique_movie_ids if id_to_title[mid] not in self.my_rated_movies]
        my_predictions = sorted(my_predictions, key=lambda pred : pred.est, reverse=True)

        my_estimates = [(id_to_title[pred.iid], pred.est) for pred in my_predictions]

        self.my_estimates = my_estimates

if __name__ == '__main__':

    movies_df = pd.read_csv('data/movies/movies.csv')
    # data = Dataset.load_builtin('ml-100k')
    # data.split(n_folds=5)
    # Two readers: one for ratings rescaled to [0, 1], one for the raw 1-5 scale.
    scaled_reader = Reader(rating_scale=(0, 1))
    reader = Reader(rating_scale=(1, 5))
    df = pd.read_csv('data/movies/ratings.csv')
    scaled_df = scale_ratings(df)
    scaled_data = convert_df_to_data(scaled_df, scaled_reader)
    scaled_data.split(n_folds=5)

    # The same ratings, unscaled, also split into 5 folds.
    data = convert_df_to_data(df, reader)
    data.split(n_folds=5)

    # plot some EDA figures:
    plot_average_rating_hist(df)

    # Cross-validation tests for different classification models:
    models = []
    models.append(('GM', GlobalMean()))
Example 11
0
from surprise.dataset import Reader

# NOTE(review): this chunk also relies on os, gc, pd (pandas), SVD, Dataset,
# evaluate and print_perf, whose imports are not visible here.


# NOTE(review): hard-coded absolute Windows paths — not portable.
filePlace = "C:\\Users\\22560\\PycharmProjects\\lastFM\\networkData\\"

gc.collect()
# read train data
os.chdir("C:\\Users\\22560\\PycharmProjects\\lastFM\\hetrec2011-lastfm-2k")
train = pd.read_csv("trainAfterReg.csv")


algo = SVD()
# The rating scale spans the observed min/max listening weights.
reader = Reader(rating_scale=(train.weight.min(),train.weight.max()))
data = Dataset.load_from_df(train[['userID', 'artistID', 'weight']], reader)

data.split(3)  # 3-fold cross-validation

perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

print_perf(perf)