def svd(trainset, testset, predset):

    modelname = 'svd'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    algo = SVD(n_factors=100, n_epochs=40, lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1, reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1)
    print('SVD Model')
    algo.fit(trainset)
    
    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)
def svd_factorization():
    """
    Predict games for user with user_key = 158123
    """
    target_user_key = 158123

    run_reduce_dataset = True

    # reduce dataset:
    if run_reduce_dataset:
        df = import_all_reviews()
        df_reduced = reduce_reviews(df)
        export_reviews(df_reduced)

    # import reduced dataset:
    df = import_reduced_reviews()

    # check for duplicates:
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    # check out our user:
    df_target_user = df[df['user_key'] == target_user_key]

    # build utility matrix:
    data_pivot = df.pivot(index='user_key',
                          columns='game_key',
                          values='rating')

    # calculate sparsity
    sparsity = data_pivot.isnull().sum().sum() / data_pivot.size
    print('Sparsity of utility matrix: ' + str(sparsity))

    # reader belongs to Scikit-surprise
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # split in training and test set
    trainset, testset = train_test_split(data, test_size=0.2)

    # apply SVD algorithm:
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Evaluation:
    rmse = accuracy.rmse(predictions)
    print('RMSE of: ' + str(rmse))

    ### Prediction for target user:
    # Predict ratings for the target user's (user_key, game_key) pairs:
    target_data = Dataset.load_from_df(
        df_target_user[['user_key', 'game_key', 'rating']], reader)
    target_testset = target_data.build_full_trainset().build_testset()
    predictions = algo.test(target_testset)
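    # --- Hedged sketch (not part of the original function): rank the target user's
    # predictions by estimated rating. Assumes `predictions` is the list returned by
    # algo.test() just above; each Prediction exposes .iid (game_key) and .est.
    ranked = sorted(predictions, key=lambda p: p.est, reverse=True)
    print('Top 10 predicted games for user ' + str(target_user_key) + ':')
    for p in ranked[:10]:
        print('   game_key ' + str(p.iid) + ': ' + str(round(p.est, 2)))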
Example #3
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    #Try SVD
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    #Try the NMF
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the NMF
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    
    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(),svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    
    #Other Collaborative Filtering Algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)
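# --- Hedged sketch (not in the original): generate_svd_recommendation_df() is
# annotated to return a pd.DataFrame, but get_top_n() yields a dict that maps each
# user id to a list of (item id, estimated score) tuples. One possible way to turn
# that dict into a flat DataFrame:
import pandas as pd

def top_n_dict_to_df(top_n: dict) -> pd.DataFrame:
    rows = [(uid, iid, est)
            for uid, user_ratings in top_n.items()
            for iid, est in user_ratings]
    return pd.DataFrame(rows, columns=['user_id', 'item_id', 'estimated_score'])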
Example #4
class MangakiSSVD(RecommendationAlgorithm):
    def __init__(self, rank=10, nb_iterations=20, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = SVD(n_factors=rank, n_epochs=nb_iterations)

    def fit(self, X, y):
        self.reader = Reader(rating_scale=(y.min(), y.max()))
        data = Dataset.load_from_df(pd.DataFrame(np.column_stack((X, y))),
                                    self.reader)
        train = data.build_full_trainset()
        self.chrono.save('prepare data')
        self.model.fit(train)
        self.chrono.save('fit')

    def predict(self, X):
        y = np.repeat(0, len(X))
        data = Dataset.load_from_df(pd.DataFrame(np.column_stack((X, y))),
                                    self.reader)
        train = data.build_full_trainset()
        test = train.build_testset()
        pred = self.model.test(test)
        return np.array([rating.est for rating in pred])

    def get_shortname(self):
        return 'ssvd'
Example #5
def fit_and_predict():

    try:
        db = DbCursor()
    except Exception as ex:
        return []
    else:
        sql = 'select * from user_video'
        user_videos = db.get(sql)

        if len(user_videos) > 0:

            df = pd.DataFrame(user_videos)

            reader = Reader(rating_scale=(0, 100))
            data = Dataset.load_from_df(df[['user_id', 'video_id', 'percent']],
                                        reader)

            train_set = data.build_full_trainset()
            algo = SVD()
            algo.fit(train_set)

            test_set = train_set.build_anti_testset()
            predictions = algo.test(test_set)

            top_n = get_top_n(predictions, n=10)

            return top_n
        else:
            return []
Example #6
def modelo_svd_best_n(data):
    reader = Reader(rating_scale=(1, 5))
    # 'lr_all':[0.01,0.002,0.005],
    #'reg_all':[0.01,0.02,0.04],

    data = Dataset.load_from_df(
        data[['userid', 'businessid', 'mean_by_business']], reader)
    param_grid = {
        'n_factors': [5, 20, 50, 100],
        'n_epochs': [100, 200, 300],
    }

    gs = Gridsearch_svd(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=5)
    gs.fit(data)
    # combination of parameters that gave the best RMSE score
    k = gs.best_params['rmse']['n_factors']
    n_epochs = gs.best_params['rmse']['n_epochs']

    #Predictions with best parameters

    data_ = data.build_full_trainset()
    algo = SVD(n_factors=k, n_epochs=n_epochs)
    algo.fit(data_)
    prediciones = algo.test(data_.build_anti_testset())

    return prediciones
Example #7
def run_svd(dataset):


    # Load the built-in dataset named by `dataset` (download it if needed)
    data = Dataset.load_builtin(dataset)

    # sample random trainset and testset
    # test set is made of 33% of the ratings.
    trainset, testset = train_test_split(data, test_size=.33)

    # We'll use the famous SVD algorithm.
    algo = SVD()

    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Then compute RMSE
    accuracy.rmse(predictions)
    y_test = [item[2] for item in testset]
    preds = [pred[3] for pred in predictions]
    preds_round = np.rint(preds)
    rmse_round = np.sqrt(np.mean(np.square(np.array(preds_round - np.array(y_test)))))
    print(f'rmse_round {rmse_round}')
    utils.hist_plot(y_test, preds, preds_round)
Example #8
def get_recommendation(user):

    conn = pymysql.connect(Account.link,
                           Account.user,
                           Account.password,
                           Account.db,
                           charset="utf8mb4")

    df = pd.read_sql_query('SELECT * FROM USERS', conn)

    if (df.empty):
        return "Error - empty DF"

    conn.close()

    # Anime can be rated from 1 - 10
    data = Dataset.load_from_df(df, Reader(rating_scale=(1, 10)))

    algo = SVD()

    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # predict ratings for all pairs (user, score) that are NOT in the train set
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    # Get top 15 predictions
    top_n = get_top_n(predictions, n=15)

    if top_n.get(user) is None:
        return "Error - cannot find User"

    return [iid for (iid, _) in top_n.get(user)]
Example #9
def svd(data, training, testing):
    '''
        Tune SVD parameters, then calculate the RMSE of SVD on the test set

        Args:
            data(Dataset): the whole dataset divided into 5 folds
            training(Dataset): training dataset
            testing(Dataset): test dataset

        Returns:
            rmse: RMSE of SVD with optimized parameters
            top_n: top-n recommended items per user
    '''

    # candidate parameters
    param_grid = {'n_factors': [25, 50, 100, 250], 'n_epochs': [10, 20, 30, 40, 50]}

    # optimize parameters
    grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data)
    param = grid_search.best_params['RMSE']
    print('SVD:', param)

    # fit model using the optimized parameters
    svd = SVD(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
    svd.train(training)

    # evaluate the model using test data
    predictions = svd.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
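# --- Hedged sketch (not in the original): the function above relies on the pre-1.1
# GridSearch API (GridSearch / .evaluate() / .train()). A roughly equivalent tuning
# pass with the current GridSearchCV, shown on the built-in ml-100k data purely for
# illustration:
from surprise import SVD, Dataset
from surprise.model_selection import GridSearchCV

_data = Dataset.load_builtin('ml-100k')
_param_grid = {'n_factors': [25, 50, 100, 250], 'n_epochs': [10, 20, 30, 40, 50]}
_gs = GridSearchCV(SVD, _param_grid, measures=['rmse'], cv=5)
_gs.fit(_data)
_best = _gs.best_params['rmse']
print('SVD:', _best)
_algo = SVD(n_factors=_best['n_factors'], n_epochs=_best['n_epochs'])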
Example #10
def predict_VSD(userid):

    df = pd.read_csv('ratings_small.csv').drop(['timestamp'], axis=1)
    reader = Reader(rating_scale=(1, 5))

    # Read the data using the reader format
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']],
                                reader=reader)

    # Split into training and test sets: 75% of the samples for training, 25% for testing
    trainset, testset = train_test_split(data, test_size=.25)

    model = SVD(n_factors=100)
    model.fit(trainset)

    predictions = model.test(testset)
    top_n = get_top_n(predictions, n=30)

    movie_titles = pd.read_csv('movies_metadata.csv', usecols=['id', 'title'])
    movie_titles = movie_titles.rename(columns={'id': 'movieId'})
    movie_titles['movieId'] = pd.to_numeric(movie_titles['movieId'],
                                            errors='coerce').fillna(0)
    movie_titles['movieId'] = movie_titles['movieId'].astype('int')
    movie_titles = movie_titles.drop_duplicates()

    title_list = []
    for uid, user_ratings in top_n.items():
        if uid == userid:
            title_list = [iid for (iid, _) in user_ratings]

    titles = movie_titles[movie_titles.movieId.isin(title_list)]
    print(titles[2:])
    return titles[2:]
Example #11
class RecipeRecommender:
    def __init__(self, n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02):
        self.model = SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all, random_state=2020)

    def fit(self, reviews):
        # SurPRISE supports only pandas DataFrame or folds as data input
        data = Dataset.load_from_df(
            DataFrame(reviews), 
            Reader(rating_scale=(1, 5))
        )
        self.trainset = data.build_full_trainset()
        self.testset = self.trainset.build_anti_testset()
        return self.model.fit(self.trainset)

    def predict(self, n=20):
        self.predictions = self.model.test(self.testset)
        recommended_dict = RecipeRecommender.get_top_n(self.predictions, n=n)
        return [id_tuple[0] for id_tuple in recommended_dict[1]]

    @staticmethod
    def get_top_n(predictions, n):
        top_n = defaultdict(list)
        for uid, iid, _, est, _ in predictions:
            top_n[uid].append((iid, est))
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]
        return top_n
Example #12
def main():
    """
        ...
    """

    #get data from surprise
    data = Dataset.load_builtin('ml-100k')

    trainset, testset = train_test_split(data, test_size=.25)

    algo = SVD()

    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)
    predictions = algo.test(testset)

    #calculate the delta
    x = [elem[2] - elem[3] for elem in predictions]

    # number of columns (bins) in the histogram
    clmnNb = 69

    plt.hist(x, clmnNb, facecolor='b', alpha=0.75)

    plt.xlabel('Delta values')
    plt.ylabel('Number of same delta')
    plt.title('Delta of rating')

    plt.show()
def surpriseSVD(movieLensDataPath='data_clean.txt'):
    ''' Basic use of the surprise SVD algorithm.
    Params: movieLensDataPath is the path to the MovieLens data we're looking at.
    Note: replace with cleaned data.
    We want to return U and V where, for a matrix Y of movie ratings, Y is approximately U^T V.
    '''

    # Load the data as a pandas data frame, as reading from text didn't quite work at first.
    df = pd.read_csv(movieLensDataPath, sep="\t", header=None)
    df.columns = ["User Id", "Movie Id", "Rating"]

    # We need the rating scale.
    reader = Reader(rating_scale=(1, 5))

    # The columns are User Id, Movie Id, and Rating.
    data = Dataset.load_from_df(df[["User Id", "Movie Id", "Rating"]], reader)
    # To fit to the SVD algorithm, we have to convert it to a trainset.
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    # U and V!
    algop = algo.pu
    algoq = algo.qi

    # Simple crossvalidation
    kf = KFold(n_splits=3)
    algo = SVD()
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)
    # Return U (pu) and V (qi)
    return algop, algoq
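# --- Hedged sketch (not in the original): how the factors returned by surpriseSVD()
# relate to a rating estimate. With the default biased SVD, Surprise predicts
# mu + b_u + b_i + q_i . p_u for inner user id u and inner item id i, where mu is
# the global mean of the trainset.
import numpy as np

def manual_estimate(algo, trainset, inner_uid, inner_iid):
    # Reproduce SVD's estimate from its learned parameters (biased SVD assumed).
    return (trainset.global_mean
            + algo.bu[inner_uid]
            + algo.bi[inner_iid]
            + np.dot(algo.qi[inner_iid], algo.pu[inner_uid]))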
Example #14
    def getPrediction(UserId):
        ratings_dict = {
            "userID": [1, 1, 3, 4, 4, 6],
            "POIID": [1, 2, 1, 4, 2, 6],
            "rating": [5, 5, 1, 4, 5, 3],
        }

        #users = User.objects()
        #for user in users:
        #	print(user)

        frame = pd.DataFrame(ratings_dict)

        print(frame)
        reader = Reader(rating_scale=(1, 5))

        data = Dataset.load_from_df(frame[['userID', 'POIID', 'rating']],
                                    reader)

        cross_validate(NormalPredictor(), data, cv=2)

        trainset = data.build_full_trainset()
        algo = SVD()
        algo.fit(trainset)

        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)

        top_n = Predictions.get_top_n(predictions, n=10)

        for uid, user_ratings in top_n.items():
            if (uid == UserId):
                return [iid for (iid, _) in user_ratings]
class SVDModel:
    def __init__(self):
        self.model = SVD()
        self.name = 'Singular Value Decomposition'

    def best_estimator_gridsearchCV(self,
                                    data,
                                    n_epochs=[5, 10],
                                    lr_all=[0.002, 0.005],
                                    reg_all=[0.4, 0.5],
                                    cv=3):
        param_grid = {
            'n_epochs': n_epochs,
            'lr_all': lr_all,
            'reg_all': reg_all
        }
        gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=cv)
        gs.fit(data)
        params = gs.best_params['rmse']
        return params

    def train(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)

    def predict(self, *args, **kwargs):
        return self.model.predict(*args, **kwargs)

    def test(self, *args, **kwargs):
        return self.model.test(*args, **kwargs)
def collaborative():
    conn = sqlite3.connect("mf.sqlite3")
    movies = pd.read_sql_query(
        "select title, poster_path, runtime, genres, vote_average, vote_count from movies",
        conn)
    ratings = pd.read_sql_query("select * from ratings", conn)
    reader = Reader()
    data = Dataset.load_from_df(ratings[['userid', 'movieid', 'rating']],
                                reader=reader)
    svd = SVD()
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    trainset = data.build_full_trainset()
    print(trainset)
    testset = trainset.build_anti_testset()
    predictions = svd.test(testset)

    top_n = get_top_n(predictions, n=10)

    # Print the recommended items for each user
    recommendations = {}
    for uid, user_ratings in top_n.items():
        recommendations[uid] = [iid for (iid, _) in user_ratings]
    with open('catalog/output.py', 'w') as filehandle:
        filehandle.write('recommendations=')
        filehandle.write(json.dumps(recommendations))

    return recommendations
def recommend_place(user_id):
    try:
        find_user_rating = 'SELECT * FROM rating_place where user_id=%(user_id)s;'
        params = {"user_id" : int(user_id)}
        user_rating = read_data_from_db(find_user_rating, params)
        sql = 'SELECT user_id, place_id, rating FROM rating_place'
        ds = read_data_from_db(sql, None)

        if len(ds) > 0 and len(user_rating) >0:
            reader = Reader()
            data = Dataset.load_from_df(ds[['user_id', 'place_id', 'rating']], reader=reader)
            alg = SVD()
            alg.fit(data.build_full_trainset())

            iids = ds['place_id'].unique()
            rated_iids = ds.loc[ds['user_id'] == user_id, 'place_id']
            iids_to_pred = np.setdiff1d(iids, rated_iids)
            testset = [[user_id, iid, 4.] for iid in iids_to_pred]
            predictions = alg.test(testset)
            evaluate_surprise_alg(predictions)
            predictions.sort(key=lambda x: x.est, reverse=True)
            list_of_ids = []
            for i in range(50 if len(predictions) >= 50 else len(predictions)):
                list_of_ids.append(int(predictions[i].iid))
            similar_places = get_list_db_objects_from_ids(tuple(list_of_ids))
            return Response(similar_places.to_json(orient="records"), status=200, mimetype='application/json')
        return "not found", 404
    except Exception as e:
        print(str(e))
        return "", 500
def testreview():
    df1 = pd.DataFrame(my_client['mimi']['review'].find())
    df2 = pd.DataFrame(my_client['mimi']['appReview'].find())
    df = pd.concat([df1, df2]).reset_index()
    store_df = pd.DataFrame(my_client['mimi']['store'].find())
    store_addr = {}
    store = store_df.values.tolist()
    for s in store:
        store_addr[s[0]] = s[1:]

    # Load the dataset (download it if needed)
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(df[["userName", "resId", "rating"]], reader)
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)
    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    # testset = trainset.build_full_trainset()
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, store_addr)
    # Print the recommended items for each user

    # recom_qs = pd.DataFrame.my_client['mimi']['recommand'].find("Uid" : mid)
    x = my_client['mimi']['recommand'].insert_many(top_n)
    print(len(x.inserted_ids))
Example #19
def train_benchmark():
    # Load the movielens-100k dataset (download it if needed),
    data = Dataset.load_builtin('ml-100k')

    # sample random trainset and testset
    # test set is made of 25% of the ratings.
    trainset, testset = train_test_split(data, test_size=.25)

    # We'll use the famous SVD algorithm.
    algo = SVD()

    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)

    #print benchmark:
    predictions = algo.test(testset)
    print(accuracy.rmse(predictions))

    algo_filename = 'rec_algo.pkl'
    testset_filename = 'testset.pkl'
    with open(algo_filename, 'wb') as f:
        print("saving model to disk")
        pickle.dump(algo, f)

    with open(testset_filename, 'wb') as f:
        print("saving testset to disk")
        pickle.dump(testset, f)
Example #20
def surprise_SVD(train_file, test_file):
    """
    SVD with the Surprise library.
    Compute the predictions on a test set after training on a train set using the SVD method from Surprise.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        n_factors: the number of factors
        n_epochs: the number of iterations of the SGD procedure
        lr_all: the learning rate for all parameters
        reg_all: the regularization term for all parameters


    Returns:
        numpy array: predictions
    """
    print("SVD")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    # Algorithm
    algo = SVD(n_epochs=30, lr_all=0.01, reg_all=0.1)
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        pred[i] = val
    return pred
Example #21
def get_start(user=85):
    ml = MovieLens()
    print("Loading movie ratings...")
    data = ml.loadMovieLensLatestSmall()

    testSubject = user
    user_preference(testSubject, ml)

    print(
        "\nBuilding SVD recommendation model using the WHOLE dataset as trainSet(only for test)..."
    )
    trainSet = data.build_full_trainset(
    )  # Do not split the dataset into folds and just return a trainset as is, built from the whole dataset.

    algo = SVD()
    algo.fit(trainSet)

    print("Computing recommendations...")
    testSet = BuildAntiTestSetForUser(testSubject, trainSet)
    predictions = algo.test(testSet)

    recommendations = []

    print("\nWe recommend:")
    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        intMovieID = int(movieID)
        recommendations.append((intMovieID, estimatedRating))

    recommendations.sort(key=lambda x: x[1], reverse=True)

    for ratings in recommendations[:10]:
        print(ml.getMovieName(ratings[0]))
def solve_matrix_factorisation(pathw):
    reader = Reader(line_format='user item rating timestamp', sep=',')
    data = Dataset.load_from_file(pathw, reader=reader)

    # param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 'lr_all': [0.001, 0.003, 0.005, 0.008],
    #               'reg_all': [0.08, 0.1, 0.15]}
    # gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
    # gs.fit(data)
    # algo = gs.best_estimator['rmse']
    # print(gs.best_score['rmse'])
    # print(gs.best_params['rmse'])
    # cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # print("reached")
    # Use the new parameters with the train data
    algo = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1)

    trainset = data.build_full_trainset()
    algo.fit(trainset)
    print("fitting crossed")
    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)
    # Print the recommended items for each user
    for uid, user_ratings in top_n.items():
        if uid == '615':
            # print(uid, [iid for (iid, _) in user_ratings])
            return [iid for (iid, _) in user_ratings]
    def collaborative(self, ratings, user_id):

        reader = Reader()
        #ratings.head()

        temp_ratings = ratings
        data = Dataset.load_from_df(
            temp_ratings[['user_id', 'book_id', 'rating']], reader)
        data.split(n_folds=2)

        ## Training the data ##
        svd = SVD()
        evaluate(svd, data, measures=['RMSE', 'MAE'])

        trainset = data.build_full_trainset()

        algo = SVD()
        algo.fit(trainset)

        #svd.train(trainset)
        ## Testing the data ##
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)
        count = 0

        for uid, iid, true_r, est, _ in predictions:
            if uid == user_id:
                count = count + 1
                temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][[
            'book_id', 'rating'
        ]]

        return (cb)
Example #24
def get_svd_recommender(df, test_size=0.25, path="", exists=False):
    """
    builds and trains an SVD recommender
    :param df: a dataframe containing user ID's, beer ID's and ratings
    :param test_size: the fraction of samples that should be reserved for testing
    :param path: the path to an existing svd recommender that was saved to a file
    :param exists: whether or not to upload the algo from a saved file
    :return: trained recommender, list of predictions, and the root mean square error of the recommender
    """
    if exists:
        return dump.load(path)[1]

    # allows surprise to read df
    reader = Reader(rating_scale=(1, 5))
    # must load in particular column order
    data = Dataset.load_from_df(df[['user_id', 'beer_id', 'user_score']],
                                reader)

    trainset, testset = train_test_split(data, test_size=test_size)
    algo = SVD()
    # Train the algorithm on the trainset
    algo.fit(trainset)
    # and predict ratings for the testset. test() returns a list of prediction objects
    # which have several attributes such as est (the prediction) and r_ui (the true rating)
    predictions = algo.test(testset)

    # rmse below 1 is considered low
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)

    return algo, predictions, rmse
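# --- Hedged sketch (not in the original): the exists=True branch above expects a
# recommender previously saved with surprise.dump, since dump.load(path) returns a
# (predictions, algo) tuple. A possible save/reload round trip, using a tiny
# illustrative ratings DataFrame (column names as the function requires):
import pandas as pd
from surprise import dump

_df = pd.DataFrame({'user_id': [1, 1, 2, 2, 3, 3],
                    'beer_id': [10, 11, 10, 12, 11, 12],
                    'user_score': [4, 5, 3, 4, 2, 5]})
_algo, _predictions, _rmse = get_svd_recommender(_df, test_size=0.5)
dump.dump('svd_recommender.pkl', predictions=_predictions, algo=_algo)  # save
_loaded_algo = get_svd_recommender(_df, path='svd_recommender.pkl', exists=True)  # reload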
Example #25
def retrain():
    file = os.path.join(cwd, 'src', 'rec_sys', 'rec_methods', 'data',
                        'custom_dataset.data')

    # 1. Load the dataset
    data = Dataset.load_from_file(file, reader=reader)
    logger.info("> dataset OK")

    # 2. Creating train dataset...
    trainset = data.build_full_trainset()
    logger.info("> train dataset OK")

    # 3. Training...
    algo = SVD()
    algo.fit(trainset)
    logger.info("> Training OK")

    # 4. Predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    logger.info("> Predictions OK")

    top_n = get_top_n(predictions, n=5)
    logger.info("Top N retrieved > OK")

    return top_n
Example #26
def recommend(given_user_id):
    # given_user_id = int(get_object_or_404(User, username=given_user_id).id)
    print(given_user_id, "recommend function printing given_user_id")
    queryset = Rate.objects.all()
    query, params = queryset.query.as_sql(
        compiler='django.db.backends.sqlite3.compiler.SQLCompiler',
        connection=connections['default'])
    df = pd.read_sql_query(query, con=connections['default'], params=params)
    print("load df")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'item_id', 'rate']], reader)
    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()
    algo = SVD()
    algo.fit(trainset)
    print("fit 완료")
    predictions = algo.test(testset)
    print("예측 완료")
    top_10_items = get_top_n(predictions, 10, given_user_id)
    print("top 10 선별 완료, 길이 : %s" % len(list(top_10_items.keys())))
    print(top_10_items[given_user_id])
    for item_prediction in top_10_items[given_user_id]:
        if Prediction.objects.filter(item_id=item_prediction[0],
                                     user_id=given_user_id):
            pass
        else:
            obj = Prediction(user_id=given_user_id,
                             item_id=item_prediction[0],
                             prediction=round(item_prediction[1], 1))
            obj.save()
    print("해당 유저 %s 에 대한 데이터 저장완료" % given_user_id)
    # return [item_prediction[0] for item_prediction in top_10_items[given_user_id]]
    return top_10_items[given_user_id]
Example #27
def predict_ratings(data):
    """
    Instead of running cross-validation, the algorithm can simply be fit on the
    whole dataset. This is done with build_full_trainset(), which builds a trainset
    object from all of the data; ratings can then be predicted by calling the
    predict() method directly.
    :return:
    """
    trainset = data.build_full_trainset()

    svg = SVD()
    svg.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions = svg.test(testset)

    algo = KNNBasic()
    algo.fit(trainset)

    # Rating prediction: suppose we are interested in user 196 and item 302 (make sure they are in the trainset!) and we know the true rating r_ui = 4
    uid = str(196)
    iid = str(302)

    # algo.predict(uid,iid,r_ui=4,verbose=True)

    return predictions
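# --- Hedged sketch (not in the original): the docstring above also mentions
# predicting a single rating by calling predict() directly on a fitted algorithm.
# A self-contained illustration on the built-in ml-100k data (raw ids are strings):
from surprise import SVD, Dataset

_data = Dataset.load_builtin('ml-100k')
_trainset = _data.build_full_trainset()
_algo = SVD()
_algo.fit(_trainset)
_pred = _algo.predict(uid=str(196), iid=str(302), r_ui=4, verbose=True)
print(_pred.est)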
Example #28
    def SVDTopNRecs(self, ml, userId, n):

        #Using recommender SVD
        SVDAlgorithm = SVD(n_factors=100, random_state=10)

        #Building recommendation model...
        trainSet = self.dataset.GetFullTrainSet()
        SVDAlgorithm.fit(trainSet)

        #Computing recommendations...
        testSet = self.dataset.GetAntiTestSetForUser(userId)

        predictions = SVDAlgorithm.test(testSet)

        recommendations = []

        #filtering movieid and estimate rating from predictions to recommendations
        for userID, movieID, actualRating, estimatedRating, _ in predictions:
            intMovieID = int(movieID)
            recommendations.append((intMovieID, estimatedRating))

        #Sorting the recommendations list using ratings in descending order to return top n recs
        recommendations.sort(key=lambda x: x[1], reverse=True)
        recommendations = recommendations[:n]

        return recommendations
Example #29
    def train(self):
        # ratings range from 1 to 10
        self.df = pd.read_csv(csv_name)
        reader = Reader(rating_scale=(1, 10))
        data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']],
                                    reader)
        # TrainSet
        trainset = data.build_full_trainset()
        # algo = self.checkBestAlgorithm()[0]
        algo = SVD()
        algo.fit(trainset)
        # TestSet
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)

        self.predictions = predictions
        self.algo = algo

        # Validate Algo
        cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=5,
                       verbose=True)
        # Save Dump
        dump.dump(file_name, predictions=predictions, algo=algo)
Example #30
def svd_model(df):
    """
    Creates an SVD model for predictions and cross-validation.
    Returns: data, test RMSE, and cross-validated RMSE scores
    """
    from surprise.model_selection.split import train_test_split
    data = df[['user_id', 'business_id',
               'average_stars']].loc[df.city == 'Scottsdale']

    reader = Reader()

    data = Dataset.load_from_df(data, reader)

    trainset, testset = train_test_split(data, test_size=0.25)

    algo = SVD()
    algo.fit(trainset)

    predictions = algo.test(testset)

    acc = accuracy.rmse(predictions)

    svd_cv = cross_validate(SVD(), data, cv=5)

    return data, acc, svd_cv['test_rmse']
    def collaborative(self,ratings,user_id):

        reader = Reader()
        #ratings.head()

        temp_ratings = ratings



        data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
        data.split(n_folds=2)

        ## Training the data ##
        svd = SVD()
        evaluate(svd, data, measures=['RMSE', 'MAE'])

        trainset = data.build_full_trainset()

        algo = SVD()
        algo.fit(trainset)

        #svd.train(trainset)
        ## Testing the data ##

        from collections import defaultdict
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)

        count = 0
     
        for uid, iid, true_r, est, _ in predictions:

             if uid == user_id:
                count = count+1
                temp_ratings.loc[len(temp_ratings)+1]= [uid,iid,est]

        #print("count\n")
        #print(count)
        #print("\n--------here-------\n")	
        #print(temp_ratings)

        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]
        #print("\n--------here-------\n")
        #print(cb)
        
        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]

        return(cb)
from surprise import Dataset
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold


data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end='  ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)
import os

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
def hybrid(userId,train_rd):
    #get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate

    import warnings; warnings.simplefilter('ignore')


    # In[2]:


    #Popularity#

    md = pd.read_csv('CustomData/FinalData.csv')

    fd = pd.read_csv('avg_ratings1.csv')



    fd['rating'] = fd['rating'].astype('float')
    vote_averages = fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()


    fd1 = pd.read_csv('ratings_count.csv')


    fd1['rating'] = fd1['rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']


    # In[3]:


    m = vote_counts.quantile(0.75)



    # In[4]:


    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']


    # In[28]:


    #print(md.shape)
    qualified = md[(md['ratings_count'].notnull())][['book_id','title', 'authors', 'ratings_count', 'average_rating']]

    qualified['ratings_count'] = qualified['ratings_count'].astype('float')

    qualified['average_rating'] = qualified['average_rating'].astype('float')

    #qualified.shape


    # In[29]:


    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v/(v+m) * R) + (m/(m+v) * C)


    # In[30]:


    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    #qualified['wr']
    #qualified = qualified.sort_values('popularity_rating', ascending=False).head(250)
    pop = qualified[['book_id','popularity_rating']]
    #print(qualified.shape)
    #print(pop.shape)


    # In[11]:


    ### Collaborative ##

    reader = Reader()
    ratings=train_rd
    #ratings = pd.read_csv('ratings.csv')
    #ratings.head()

    temp_ratings = ratings[0:1000]

    #print(temp_ratings)
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)


    # In[12]:


    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])


    # In[13]:


    trainset = data.build_full_trainset()
    #svd.train(trainset)
    algo = SVD()
    algo.fit(trainset)

    ## usefule = temp_rating[rating]


    # In[14]:


#print(len(temp_ratings[temp_ratings['user_id']==userId]))


    # In[ ]:


    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            #user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n


    # In[15]:


    from collections import defaultdict
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    '''
    top_n = get_top_n(predictions, n=10000)

    #print(top_n)
    #result = pd.DataFrame(top_n)
    #print(result)
    for uid, user_ratings in top_n.items():
    
        #print(uid, [iid for (iid  , _) in user_ratings])
        for uid, iid, true_r, est, _ in predictions:
        
            temp_ratings.loc[uid]= [uid,iid,est]
        #temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']]
        
    '''
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        
         if uid == userId:
            count = count+1
            temp_ratings.loc[len(temp_ratings)+1]= [uid,iid,est]
            #print('here')

            #print(uid)
            #temp_ratings.append([uid,iid,est],ignore_index=True)

    #print(count)
    #print(temp_ratings)



    # In[16]:


    #print(len(temp_ratings[temp_ratings['user_id']==2]))


    # In[ ]:





    # In[46]:


    ##### CONTENT ######

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate
    import csv
    import warnings; warnings.simplefilter('ignore')


    # In[48]:



    md=pd.read_csv('CustomData/FinalData.csv')
    rd=train_rd
    #rd=pd.read_csv('ratings.csv')
    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')

    #print(md.head())


    md['authors'] = md['authors'].str.replace(' ','')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',',' ')

    #print(md.head())

    md['authors'] = md['authors'].apply(lambda x: [x,x])
    #print(md['authors'])

    md['Genres']=md['Genres'].str.split(';')
    #print(md['Genres'])

    md['soup'] = md['authors'] + md['Genres']
    #print(md['soup'])

    md['soup'] = md['soup'].str.join(' ')

    #md['soup'].fillna({})
    #print(md['soup'])

    count = CountVectorizer(analyzer='word',ngram_range=(1,1),min_df=0, stop_words='english')
    count_matrix = count.fit_transform(md['soup'])
    #print (count_matrix.shape)
    #print np.array(count.get_feature_names())
    #print(count_matrix.shape)

    cosine_sim = cosine_similarity(count_matrix, count_matrix)


    # In[91]:


    def build_user_profiles():
        user_profiles=np.zeros((53421,999))
        #print(rd.iloc[0]['user_id'])
	#len(rd['book_id'])
        for i in range(0,1000):
            u=rd.iloc[i]['user_id']
            b=rd.iloc[i]['book_id']
            #print(u,b)
            #print(i)
            #if b<999:
                #print("match at "+str(b))
            user_profiles[u][b-1]=rd.iloc[i]['rating']
        #print(user_profiles)
        return user_profiles

    user_profiles=build_user_profiles()
    def _get_similar_items_to_user_profile(person_id):
            #Computes the cosine similarity between the user profile and all item profiles
            #print(user_profiles[person_id])
        #print("\n---------\n")
        #print(cosine_sim[0])
        user_ratings = np.empty((999,1))
        cnt=0
        for i in range(0,998):
            book_sim=cosine_sim[i]
            user_sim=user_profiles[person_id]
            user_ratings[i]=(book_sim.dot(user_sim))/sum(cosine_sim[i])
        maxval = max(user_ratings)
    #print(maxval)

        for i in range(0,998):
            user_ratings[i]=((user_ratings[i]*5.0)/(maxval))
            #print(user_ratings[i])
            if(user_ratings[i]>3):
                #print("MILA KUCCHHH")
                cnt+=1
        #print(max(user_ratings))
        #print (cnt)
       
            #print(cosine_similarities)
            
            #return similar_items
        return user_ratings
    content_ratings = _get_similar_items_to_user_profile(userId)



    # In[100]:


    num = md[['book_id']]
    #print(num)

    num1 = pd.DataFrame(data=content_ratings[0:,0:])


    frames = [num, num1]
    #result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])

    mer = pd.concat(frames, axis=1).reindex(num.index)
    mer.columns=['book_id', 'content_rating']
    #print(mer.shape)
    #print('here')
    #print(mer)





    # In[102]:


    ## for user 2 #

#print(temp_ratings.shape)
    cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']]
#   print(cb.shape)
#   print(pop.shape)
    hyb = md[['book_id']]
    hyb = hyb.merge(cb,on = 'book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')
    #hyb.shape


    # In[106]:


    def weighted_rating(x):
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4*v + 0.2*R + 0.4 * c


    # In[107]:


    print(hyb)
    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    #print(hyb['final'])

    print(hyb)
    return hyb
"""
This module shows how to serialize an algorithm: the SVD algorithm is trained and
dumped to a file; it is then reloaded and can be used again for making predictions.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump


data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

algo = SVD()
algo.fit(trainset)

# Compute predictions of the 'original' algorithm.
predictions = algo.test(trainset.build_testset())

# Dump algorithm and reload it.
file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=algo)
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')