Example #1
def getSimilarUsers(data, n, user_id):
    if user_id not in data['userID']:
        return []
    df = pd.DataFrame(data)
    reader = Reader()
    data = Dataset.load_from_df(df[['userID', 'eventID', 'rating']], reader)
    trainSet = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': True}
    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()
    testUserInnerID = trainSet.to_inner_uid(user_id)
    similarityRow = simsMatrix[testUserInnerID]
    similarUsers = [(innerID, score)
                    for innerID, score in enumerate(similarityRow)
                    if innerID != testUserInnerID]
    kNeighbors = heapq.nlargest(n, similarUsers, key=lambda t: t[1])
    # Note: these are Surprise inner ids; map them back with
    # trainSet.to_raw_uid() if raw user ids are needed.
    return [innerID for innerID, _ in kNeighbors]
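A minimal usage sketch for the function above; the ratings dict is hypothetical, and its column names match what the function expects:

# Hypothetical ratings data matching the column names used above.
ratings = {
    'userID': [1, 1, 2, 2, 3],
    'eventID': [10, 20, 10, 30, 20],
    'rating': [5, 3, 4, 2, 5],
}
# Prints the inner ids of the 2 users most similar to user 1.
print(getSimilarUsers(ratings, n=2, user_id=1))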
Example #2
def build_rec_list(**params):
    ml = RatingsLoader()
    data = ml.loadDataset()
    trainset = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions)
    Recommend = Object.extend('Recommend')
    for uid, user_ratings in top_n.items():
        rec = Recommend()
        rec.set('uId', uid)
        rec.set('pIds', [iid for (iid, _) in user_ratings])
        rec.set('pTitles',
                [ml.getProductName(int(iid)) for (iid, _) in user_ratings])
        rec.set(
            'products',
            [json.dumps(ml.getProduct(int(iid))) for (iid, _) in user_ratings])
        rec.save()
    print('Run succeeded')
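Several examples on this page call a get_top_n helper without defining it. A sketch matching the helper from the Surprise documentation FAQ:

from collections import defaultdict


def get_top_n(predictions, n=10):
    # Map each user id to their n highest-estimated (item id, rating) pairs.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n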
Example #3
def test_nearest_neighbors():
    """Ensure the nearest neighbors are different when using user-user
    similarity vs item-item."""

    # rating_scale is a Reader parameter, not a Dataset.load_from_file parameter
    reader = Reader(line_format='user item rating', sep=' ',
                    rating_scale=(1, 5), skip_lines=3)

    data_file = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(data_file, reader)
    trainset = data.build_full_trainset()

    algo_ub = KNNBasic(sim_options={'user_based': True})
    algo_ub.fit(trainset)
    algo_ib = KNNBasic(sim_options={'user_based': False})
    algo_ib.fit(trainset)
    assert algo_ub.get_neighbors(0, k=10) != algo_ib.get_neighbors(0, k=10)
Example #4
    def EvaluateAllModels(self):
        """
                         test_rmse   fit_time  test_time
        Algorithm
        SVDpp             0.965824   9.401286   0.151476
        SVD               0.967286   1.474139   0.062471
        BaselineOnly      0.972408   0.108964   0.057277
        NMF               0.992677   4.073005   0.171846
        KNNWithZScore     1.001898   0.620192   0.083341
        KNNWithMeans      1.002924   0.489803   0.078121
        SlopeOne          1.006664  19.091191   1.275676
        KNNBaseline       1.007437   0.890452   0.088495
        KNNBasic          1.016717   0.432159   0.072929
        NormalPredictor   1.253265   0.041646   0.078105
        CoClustering      1.828291   3.020921   0.052071
        :return: the algorithm with the lowest test_rmse is selected.
        """
        benchmark = []
        # Iterate over all algorithms
        for algorithm in [
                SVD(),
                SVDpp(),
                SlopeOne(),
                NMF(),
                NormalPredictor(),
                KNNBaseline(),
                KNNBasic(),
                KNNWithMeans(),
                KNNWithZScore(),
                BaselineOnly(),
                CoClustering()
        ]:
            # Perform cross validation
            results = cross_validate(algorithm,
                                     self.data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Get results & append algorithm name
            # (pd.concat replaces Series.append, removed in pandas 2.0)
            tmp = pd.DataFrame.from_dict(results).mean(axis=0)
            tmp = pd.concat([tmp,
                             pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                                       index=['Algorithm'])])
            benchmark.append(tmp)

        result = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
            'test_rmse')
        print(result)

        return result
Example #5
def knn(data, training, testing):
    '''
        Tunes Basic KNN parameters, then calculates RMSE, coverage and running time of Basic KNN

        Args:
            data(Dataset): the whole dataset divided into 5 folds
            training(Dataset): training dataset
            testing(Dataset): test dataset

        Returns:
            rmse: RMSE of Basic KNN with optimized parameters
            top_n: number of unique predictions for top n items
    '''

    # candidate parameters
    knn_param_grid = {'k': [5, 10, 20], 'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                                                        'min_support': [1, 5], 'user_based': [False]}}

    # optimize parameters
    knn_grid_search = GridSearch(KNNBasic, knn_param_grid, measures=['RMSE'], verbose=False)
    knn_grid_search.evaluate(data)
    param = knn_grid_search.best_params['RMSE']
    print('KNNBasic:', param)
    # RMSE against parameters
    result_df = pd.DataFrame.from_dict(knn_grid_search.cv_results)
    result_df.to_csv('data/knn_rmse_against_param.csv')


    # fit model using the optimized parameters; similarity settings must be
    # passed through sim_options, not as top-level keyword arguments
    knn = KNNBasic(k=param['k'], sim_options=param['sim_options'])
    knn.train(training)

    # evaluate the model using test data
    predictions = knn.test(testing)
    top_n = get_top_n(predictions, n=5)

    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
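GridSearch and KNNBasic.train() here are the legacy (pre-1.05) Surprise API. On current releases, the equivalent search would look roughly like this, assuming data is a Surprise Dataset as above:

from surprise import KNNBasic
from surprise.model_selection import GridSearchCV

param_grid = {'k': [5, 10, 20],
              'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                              'min_support': [1, 5],
                              'user_based': [False]}}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=5)
gs.fit(data)  # `data` is a surprise Dataset, as in the function above
print(gs.best_params['rmse'])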
Example #6
def gen_pred_matrix_ubcf(co_pe):

    # ---------------------------------------------------- UBCF as is

    # INITIALIZE REQUIRED PARAMETERS
    path = 'ml-100k/u.user'
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep='\t', rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.fit(trainset)
    print("ALGORITHM USED:", co_pe)

    print("CF Type:", prnt, "BASED")

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)

    # ---------------------------------------------------- UBCF as is

    csvfile = 'pred_matrix-full_ubcf.csv'
    with open(csvfile, "w") as output:
        writer = csv.writer(output, delimiter=',', lineterminator='\n')
        writer.writerow(['uid', 'iid', 'rat'])
        for uid, user_ratings in top_n.items():
            for (iid, r) in user_ratings:
                value = uid, iid, r
                writer.writerow(value)
    print "Done! You may now check the file in same Dir. as of Program"
Example #7
def user_based_cf(co_pe):
    # INITIALIZE REQUIRED PARAMETERS
    path = 'ml-100k/u.user'
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep='\t', rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.fit(trainset)
    print("ALGORITHM USED:", co_pe)

# --------------------------------------------- MARKERS

    f = open("AlgoHist_ub.txt", "w")
    f.write(repr(co_pe))
    f.close()

# --------------------------------------------- MARKERS END

    print("CF Type:", prnt, "BASED")

    # PEEKING PREDICTED VALUES
    search_key = input("Enter User ID:")
    item_id = input("Enter Item ID:")
    actual_rating = float(input("Enter actual Rating:"))

    print(algo.predict(str(search_key), item_id, actual_rating))

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)
    result_u = True

    k = int(input("Enter size of Neighborhood (Min:1, Max:40):"))

    # to_inner_uid, not to_inner_iid: neighbors are looked up per user here
    inner_id = algo.trainset.to_inner_uid(search_key)
    neighbors = algo.get_neighbors(inner_id, k=k)
    print("Nearest Matching users are:")
    for i in neighbors:
        print("\t " * 6, i)
    return top_n, result_u
Example #8
class RecommenderUserBased(Recommender):
    def __init__(self, movies, similarity='cosine'):
        super(RecommenderUserBased, self).__init__(movies)
        sim_options = {'name': similarity, 'user_based': True}
        self.algorithm = KNNBasic(sim_options=sim_options)

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20, k_inner_item=200):
        full_dataset = self.algorithm.trainset

        # watched movies
        watched = {
            full_dataset.to_inner_iid(key): value
            for key, value in watched.items()
        }

        # get similar users
        similar_users = self.get_similar_user_ids(watched, k=k_inner_item)

        # get most similar items, based on cosine similarity and most similar users
        candidates = defaultdict(float)
        for user_id, similarity in similar_users.items():
            for inner_movie_id, rate in full_dataset.ur[user_id]:
                if inner_movie_id not in watched:
                    candidates[inner_movie_id] += similarity * rate

        # return top-n movies
        movie_ids = [
            full_dataset.to_raw_iid(i)
            for i in heapq.nlargest(k, candidates, key=candidates.get)
        ]

        return self.movies.get_movie_by_movie_ids(movie_ids)
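get_recommendation calls self.get_similar_user_ids, which is not shown. A plausible sketch of such a helper, an assumption rather than the original implementation, reusing the defaultdict and heapq imports the class already depends on:

    def get_similar_user_ids(self, watched, k=200):
        # Hypothetical helper: score each training user by the ratings
        # they gave to the movies in `watched` (keyed by inner item id),
        # then keep the k highest-scoring users.
        full_dataset = self.algorithm.trainset
        scores = defaultdict(float)
        for inner_movie_id, my_rating in watched.items():
            for user_id, rating in full_dataset.ir[inner_movie_id]:
                scores[user_id] += my_rating * rating
        top_users = heapq.nlargest(k, scores, key=scores.get)
        return {user_id: scores[user_id] for user_id in top_users}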
Example #9
def algoFunc(train_data, test_data):
    SVD_var = SVD()
    print("Singular Value Decomposition :\n")
    SVD_var.fit(train_data)
    predict_var = SVD_var.test(test_data)
    SVD_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    SVD_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nProbabilistic Matrix Factorization :\n")
    PMF_var = SVD(biased=False)
    PMF_var.fit(train_data)
    predict_var = PMF_var.test(test_data)
    PMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    PMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nNon-negative Matrix Factorization :\n")
    NMF_var = NMF()
    NMF_var.fit(train_data)
    predict_var = NMF_var.test(test_data)
    NMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    NMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nUser based Collaborative Filtering algorithm :\n")
    UB_var = KNNBasic(sim_options={'user_based': True})
    UB_var.fit(train_data)
    predict_var = UB_var.test(test_data)
    user_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    user_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nItem based Collaborative Filtering algorithm :\n")
    IB_var = KNNBasic(sim_options={'user_based': False})
    IB_var.fit(train_data)
    predict_var = IB_var.test(test_data)
    item_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    item_MAE_var = accuracy.mae(predict_var, verbose=True)
    print("\n")

    return SVD_RMSE_var, SVD_MAE_var, PMF_RMSE_var, PMF_MAE_var, NMF_RMSE_var, NMF_MAE_var, user_RMSE_var, user_MAE_var, item_RMSE_var, item_MAE_var
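algoFunc expects an already-split Surprise trainset and testset. A minimal driver sketch, assuming the built-in ml-100k dataset:

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
train_data, test_data = train_test_split(data, test_size=0.25)
print(algoFunc(train_data, test_data))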
Example #10
def problem14():
    plotRMSE = []
    plotMAE = []
    print("-----MSD similarity in User based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'MSD', 'user_based': True})
    user_MSD = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["User-based filtering", 1, user_MSD["test_rmse"].mean()])
    plotMAE.append(["User-based filtering", 1, user_MSD["test_mae"].mean()])

    print("-----Cosine similarity in User based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    user_COS = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["User-based filtering", 2, user_COS["test_rmse"].mean()])
    plotMAE.append(["User-based filtering", 2, user_COS["test_mae"].mean()])

    print("-----Pearson similarity in User based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'pearson', 'user_based': True})
    user_Pearson = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(
        ["User-based filtering", 3, user_Pearson["test_rmse"].mean()])
    plotMAE.append(
        ["User-based filtering", 3, user_Pearson["test_mae"].mean()])

    print("-----MSD similarity in Item based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'MSD', 'user_based': False})
    item_MSD = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["Item-based filtering", 1, item_MSD["test_rmse"].mean()])
    plotMAE.append(["Item-based filtering", 1, item_MSD["test_mae"].mean()])

    print("-----Cosine similarity in Item based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    item_Cos = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["Item-based filtering", 2, item_Cos["test_rmse"].mean()])
    plotMAE.append(["Item-based filtering", 2, item_Cos["test_mae"].mean()])

    print("-----Pearson similarity in Item based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'pearson', 'user_based': False})
    item_Pearson = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(
        ["Item-based filtering", 3, item_Pearson["test_rmse"].mean()])
    plotMAE.append(
        ["Item-based filtering", 3, item_Pearson["test_mae"].mean()])

    plotRMSE = pd.DataFrame(data=plotRMSE,
                            columns=["Filter", "Similarity", "RMSE"])
    plotRMSE.pivot(index="Similarity", columns="Filter", values="RMSE").plot(kind="bar")
    plt.title("User vs Item (RMSE)")
    plt.ylabel("RMSE")
    plt.ylim(.9, 1.1)
    plt.show()

    plotMAE = pd.DataFrame(data=plotMAE,
                           columns=["Filter", "Similarity", "MAE"])
    plotMAE.pivot(index="Similarity", columns="Filter", values="MAE").plot(kind="bar")
    plt.title("User vs Item (MAE)")
    plt.ylabel("MAE")
    plt.ylim(.7, .9)
    plt.show()
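Example #11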
def use_knn():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using KNN')
    algo_KNN = KNNBasic()
    algo_KNN.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_KNN = algo_KNN.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_KNN)
    accuracy_mae = accuracy.mae(predictions_KNN)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
Example #12
    def __recommend_movies(self, username):
        reader = Reader(rating_scale=(1, 10))
        df = pd.DataFrame(self.ratings_dict)
        data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
        sim_options = {
            "name": "cosine",
            'user_based': True,
            # 'min_support': 2
        }
        algo = KNNBasic(sim_options=sim_options)
        # algo = SVD()

        algo.fit(data.build_full_trainset())

        self.__get_all_movies()

        for movies in self.movies:
            prediction = algo.predict(username, movies)
            self.predictions[movies] = prediction.est

        for user_rated_movies in self.__get_user_rated_movies(
                self.__get_username_id(username)):
            del self.predictions[user_rated_movies]
Example #13
def UBCFMSD():
    file_path = os.path.expanduser('restaurant_ratings.txt')
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    data.split(n_folds=3)

    algo = KNNBasic(  #k=x,
        sim_options={
            'name': 'MSD',
            'user_based': True
        })
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
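data.split(), evaluate() and print_perf() belong to the legacy Surprise API and were removed in later releases. A sketch of the same evaluation on current versions:

from surprise.model_selection import cross_validate

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)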
Example #14
def problem15():
    plotNeighbors = []
    i = 1
    while i < 17:
        algo = KNNBasic(k=i, sim_options={'name': 'MSD', 'user_based': True})
        user = cross_validate(algo, data, cv=3, verbose=False)
        plotNeighbors.append([
            "User based Collobarative Filtering", i, user["test_rmse"].mean()
        ])
        algo = KNNBasic(k=i, sim_options={'name': 'MSD', 'user_based': False})
        item_MSD = cross_validate(algo, data, cv=3, verbose=False)
        plotNeighbors.append([
            "Item based Collaborative Filtering", i,
            item_MSD["test_rmse"].mean()
        ])
        i += 1
    plotDF = pd.DataFrame(data=plotNeighbors,
                          columns=["Classifier", "K", "Score"])
    plotDF.pivot(index="K", columns="Classifier", values="Score").plot(kind="bar")
    plt.ylim(0.8, 1.6)
    plt.title("User/Item based collaborative filtering in terms of k-value")
    plt.ylabel("RMSE")
    plt.show()
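Both problem14 and problem15 read a module-level data object that is not shown. A plausible setup, assuming the MovieLens 100k benchmark:

from surprise import Dataset

data = Dataset.load_builtin('ml-100k')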
Example #15
def fit_model_surprise_basic(df, k):
    import time
    from surprise import Reader, Dataset
    from surprise import KNNBasic, KNNWithMeans, SVD, SVDpp
    from surprise.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    import pandas as pd
    start_time = time.time()

    reader = Reader(rating_scale=(0, 1))
    data_r = Dataset.load_from_df(df[['userid', 'itemid', 'event']], reader)
    daftar_algo = {
        "KNNBasicUser": KNNBasic(sim_options={"user_based": True}),
        "KNNBasicItem": KNNBasic(sim_options={"user_based": False}),
        "KNNWithMeanItem": KNNWithMeans(sim_options={"user_based": False}),
        "KNNWithMeanUser": KNNWithMeans(sim_options={"user_based": True}),
        "SVD": SVD(),
        "SVDnoBias": SVD(biased=False),
        "SVDpp": SVDpp()
    }
    trainset, testset = train_test_split(data_r, test_size=0.25)
    algo = daftar_algo[k]
    algo.fit(trainset)
    # Make predictions
    predictions = algo.test(testset)
    pred = pd.DataFrame(predictions)
    pred.r_ui.replace({1.0: "transaction", 0.0: "view"}, inplace=True)
    pred.r_ui.replace({
        "view": 0,
        "addtocart": 0,
        "transaction": 1
    },
                      inplace=True)
    auc = roc_auc_score(pred.r_ui, pred.est)
    end_time = time.time()

    return auc, end_time - start_time
Example #16
    def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
        conn = sqlite3.connect(DATABASE_NAME)
        df = pd.read_sql_query(
            "SELECT userID, glassID, relativeRating FROM ratings", conn)

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            df[['userID', 'glassID', 'relativeRating']], reader)

        trainset, testset = train_test_split(data, test_size=.20)

        isUserBased = (isUserBased == "Yes")
        if similarityMeasure == 1:
            similarityMeasure = "cosine"
        elif similarityMeasure == 2:
            similarityMeasure = "pearson"
        else:
            similarityMeasure = "pearson_baseline"

        sim_options = {'name': similarityMeasure, 'user_based': isUserBased}

        if method == 1:
            algo = SVD()
        elif method == 2:
            algo = SlopeOne()
        elif method == 3:
            algo = NMF()
        elif method == 4:
            algo = NormalPredictor()
        elif method == 5:
            algo = KNNBaseline(sim_options=sim_options)
        elif method == 6:
            algo = KNNBasic(sim_options=sim_options)
        elif method == 7:
            algo = KNNWithMeans(sim_options=sim_options)
        elif method == 8:
            algo = KNNWithZScore(sim_options=sim_options)
        elif method == 9:
            algo = BaselineOnly()
        else:
            algo = CoClustering()

        algo.fit(trainset)
        predictions = algo.test(testset)

        conn.close()

        #cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
        return round(accuracy.rmse(predictions, verbose=False), 4)
Example #17
def train_model(new_users, data, neighbors = 30, min_neighbors = 5, seed = 12345):
    """ Trains the KNN Basic model using the surprise package using
    the existing ratings data combined with all the new user possible combinations

    Args:
        new_users (pandas.Dataframe): The dataframe with the 'ratings' of all the possible combinations of user input
        data (pandas.Dataframe): The existing ratings dataframe 
        neighbors (int): the number of nearest neighbors to train the model on, default is 30
        min_neighbors (int): the minimum number of neighbors a user must have to receive a prediction.
                            If there are not enough neighbors, the prediction is set to the global mean of all ratings,
                            default is 5.
        seed (int): setting the random state, default is 12345
    Returns:
        predictions (list of prediction objects):  The predicted recommendations from the model
    """
	
    #ensure a nice distribution of ratings
    ratings_counts = data['rating'].value_counts().to_dict()
    logger.info("Ratings Distributions:")
    logger.info(ratings_counts)

    #combine actual ratings with all possible ratings users could input
    full_data = pd.concat([new_users, data])
	
    #use surprise Reader function to read in data in surprise format
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(full_data[['user_id', 'book_id', 'rating']], reader)
	
    trainset = data.build_full_trainset()
    algo = KNNBasic(k=neighbors, min_k=min_neighbors, random_state=seed)
    algo.fit(trainset)
        
    # predict all the cells without values
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    return predictions
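A hypothetical call to train_model; both DataFrames below are invented for illustration and follow the column names the function expects:

import pandas as pd

# Existing ratings (hypothetical).
ratings = pd.DataFrame({'user_id': [1, 1, 2],
                        'book_id': [1, 2, 1],
                        'rating': [4.0, 5.0, 3.0]})
# One new user (id 99999) rating three candidate books.
new_users = pd.DataFrame({'user_id': [99999, 99999, 99999],
                          'book_id': [1, 2, 3],
                          'rating': [5.0, 4.0, 3.0]})
predictions = train_model(new_users, ratings)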
Example #18
    def fit(self, trainset):
        self.trainset = trainset
        self.trainset.rating_scale = (1, 13)
        AlgoBase.fit(self, trainset) 
        
        # sim_options = {'name': 'cosine',
        #        'user_based': True
        #        }
        
        model = KNNBasic(sim_options=self.sim_options, k=self.k)
        model.fit(trainset)
        simsMatrix = model.compute_similarities()

        for userId in range(trainset.n_users):
            similarityRow = simsMatrix[userId]
            kNeighbors = heapq.nlargest(
                10,
                [(innerId, score) for (innerId, score) in enumerate(similarityRow)
                 if innerId != userId],
                key=lambda t: t[1])
            self.nearestNeigbors[userId] = kNeighbors

        print("...done.")

        return self
Example #19
def RecommendMovie(user_id):

    # user_id = input((" UserID "))
    np.random.seed(0)
    random.seed(0)

    (ml, evaluationData, rankings) = LoadMovieLensData()

    evaluator = Evaluator(evaluationData, rankings)

    UserKNN = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    evaluator.AddAlgorithm(UserKNN, "User KNN")

    res = evaluator.SampleTopNRecs(ml, testSubject=user_id)
    return res
Example #20
    def __init__(self, modelName, dataPath):
        self.modelDict = {
            "KNNBasic": KNNBasic(),
            "KNNWithMeans": KNNWithMeans(),
            "KNNWithZScore": KNNWithZScore(),
            "SVD": SVD(),
            "SVDpp": SVDpp(),
            "NMF": NMF(),
            "SlopeOne": SlopeOne(),
            "CoClustering": CoClustering()
        }
        self.trainset = None
        self.testset = None
        self.data = None
        self.model = self.modelDict[modelName]
        self.loadData(os.path.expanduser(dataPath))
Example #21
def data_model_forcrossvalidation(data, config_train):
    """Creates data and empty model for use in kfold_crossvalidation function
    
    Arguments:
        data {pd.DataFrame} -- Pandas DataFrame
        config_train {dict} -- Dictionary of configurations corresponding to the train_model script
    
    Returns:
        data {surprise.dataset.DatasetAutoFolds} -- Surprise Dataset ready for cross validation
        model {surprise.prediction_algorithms.knns.KNNBasic} -- Surprise KNNBasic Model
    """
    t_configs = config_train['build_trainset'] #configurations for trainset
    data = data[t_configs['colnames']] #colnames configuration
    reader = Reader()
    data = Dataset.load_from_df(data, reader) #create surprise dataset
    model = KNNBasic(**config_train['create_KNNmodel']) #create knnmodel
    return data, model
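Example #22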
def ComputeCollaborativeFiltering_User_User(recipe_df, train_rating_df, pd, benchmark, knnmeans=False):
    print("\n###### Compute CollaborativeFiltering_User_User ######")
    df = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'recipe_id', 'rating']], reader)
    trainSet, testSet = train_test_split(data, test_size=.2, random_state=0)

    # compute similarities between users (user_based=True)
    sim_options = {'name': 'cosine', 'user_based': True}

    if knnmeans:
        algo = KNNWithMeans(sim_options=sim_options, verbose=False)
    else:
        algo = KNNBasic(sim_options=sim_options, verbose=False)
    algo.fit(trainSet)
    predictions = algo.test(testSet)

    Evaluators.RunAllEvals(predictions, benchmark)
Example #23
    def __init__(self, df, algo='KNN', user_based=False):
        self.df = df
        self.algo = algo
        self.user_based = user_based

        reader = Reader(line_format='user item rating')
        data = Dataset.load_from_df(df=self.df, reader=reader)
        self.eval_data = EvaluationData(data)

        if self.algo == 'KNN':
            sim_options = {'name': 'cosine', 'user_based': self.user_based}
            self.model = KNNBasic(sim_options=sim_options)
        elif self.algo == 'SVD':
            self.model = SVD()
        elif self.algo == 'SVD++':
            self.model = SVDpp()
        elif self.algo == 'Random':
            self.model = NormalPredictor()
Example #24
def knn_running_time(data):
    '''
        Calculates the running times for training and predictions for Basic KNN

        Args:
            data(Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_KnnBasictrain: running time for training
            elapsedtime_KnnBasictest: running time for predictions on testset
    '''
    elapsedtime_KnnBasictrain = []
    elapsedtime_KnnBasictest = []

    # tune the parameters on the entire data
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearch(KNNBasic,
                             param_grid,
                             measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    k = param['k']
    sim = param['sim_options']['name']
    min_support = param['sim_options']['min_support']
    user_based = param['sim_options']['user_based']

    # using the tuned parameters calculate running times
    for i in range(len(data)):
        # training running time
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        # similarity settings must go through sim_options, not top-level kwargs
        knn = KNNBasic(k=k,
                       sim_options={'name': sim,
                                    'min_support': min_support,
                                    'user_based': user_based})
        knn.train(training)
        elapsedtime_KnnBasictrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knn.test(testing)
        elapsedtime_KnnBasictest.append(time.time() - test_start)
    return elapsedtime_KnnBasictrain, elapsedtime_KnnBasictest
Example #25
def main():
    book_df = pd.read_csv("../../data/processed/filtered_ratings.csv")
    # Reader object and rating scale specification
    book_df = book_df.drop('Unnamed: 0', axis=1)
    reader = Reader(rating_scale=(1, 5))
    # Load data
    data = Dataset.load_from_df(book_df[["user_id", "book_id", "rating"]],
                                reader)

    # Split data into train and test sets
    train_set, test_set = train_test_split(data, test_size=0.20)

    algorithm_list = [
        NormalPredictor(),
        BaselineOnly(),
        KNNWithZScore(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNWithMeans(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBaseline(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBasic(k=10, sim_options=similarity_measure('pearson', 1)),
        SVDpp(),
        SVD(),
        NMF()
    ]

    # # Fit model for normal predictor and get rmse
    # basic_model_based(train_set, test_set, NormalPredictor())
    #
    # # Fit model for Baselineonly algorithm
    # basic_model_based(train_set, test_set, BaselineOnly())
    #
    # # Fit model for KNN algorithms
    # basic_model_based(train_set, test_set, KNNBasic(k=10, sim_options=similarity_measure('pearson', 1)))
    #
    # plot_for_rmse(train_set, test_set)
    # Crossvalidation results
    # res = crossvalidate(data)
    # print(res)
    results = {}
    for algo in algorithm_list:
        rmse, preci, recall, f1 = basic_model_based(train_set, test_set, algo)
        print("Algorithm:", algo, preci, recall, f1)
        print(
            "**------------------------------------------------------------------------------------------**"
        )
Example #26
    def get(self):
        (gb, evaluationData, rankings) = self.LoadGoodBooksData()
        evaluator = Evaluator(evaluationData, rankings)
        ItemKNN = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
        evaluator.AddAlgorithm(ItemKNN, "Item KNN")
        evaluator.Evaluate(False)

        book_ids = evaluator.SampleTopNRecs(gb, testSubject=12)

        conn = self.get_db()
        cur = conn.cursor()
        # convert each book_id to a string, then join them with ', ' as a separator.
        SQL_book_ids = ', '.join([str(x) for x in book_ids])
        cur.execute(
            "SELECT * FROM books WHERE book_id IN ({})".format(SQL_book_ids))
        books = cur.fetchall()
        conn.commit()

        return books
Example #27
    def checkBestAlgorithm(self):
        self.df = pd.read_csv(csv_name)
        reader = Reader(rating_scale=(1, 10))
        data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']],
                                    reader)
        benchmark = []
        rmseTuple = []
        # Iterate over all algorithms.
        for algorithm in [
                SVD(),
                SVDpp(),
                SlopeOne(),
                NormalPredictor(),
                KNNBaseline(),
                KNNBasic(),
                KNNWithMeans(),
                KNNWithZScore(),
                BaselineOnly(),
                CoClustering()
        ]:
            # Perform cross-validation.
            results = cross_validate(algorithm,
                                     data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Store the results and append the algorithm name
            # (pd.concat replaces Series.append, removed in pandas 2.0).
            tmp = pd.DataFrame.from_dict(results).mean(axis=0)
            rmseTuple.append((algorithm, tmp['test_rmse']))
            tmp = pd.concat([tmp,
                             pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                                       index=['Algorithm'])])
            benchmark.append(tmp)
        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
        print("\n")
        rmseTuple.sort(key=lambda x: x[1])

        print("Best algorithm : ")
        print(str(rmseTuple[0]).split(' ')[0].split('.')[-1])
        return rmseTuple[0]
Example #28
def build_collabrative_model(userData,mode='svd'):
	mode_opts = ['knn','knn_with_means','svd','svd++']
	assert mode in mode_opts, "Invalid mode. Choose from "+str(mode_opts)

	from surprise import Reader, Dataset
	# from surprise.model_selection import cross_validate, train_test_split
	# from surprise import accuracy

	reader = Reader()
	userData = Dataset.load_from_df(userData[['userId', 'movieId', 'rating']].astype('str'), reader)
	
	# trainset, testset = train_test_split(userData, test_size=0)
	trainset = userData.build_full_trainset()

	model = None

	if mode == "knn":
		from surprise import KNNBasic
		model = KNNBasic(verbose=True)

	elif mode=='knn_with_means':
		from surprise import KNNWithMeans

		# To use item-based cosine similarity, set user_based = False
		sim_options = {
		    "name": "cosine",
		    "user_based": True,  # Compute similarities between users
		}
		model = KNNWithMeans(verbose=True,sim_options=sim_options)
	

	elif mode == "svd":
		from surprise import SVD
		model = SVD(verbose=True)

	elif mode == "svd++":
		from surprise import SVDpp
		model = SVDpp(verbose=True)

	
	model.fit(trainset)
	return model
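Example #29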
def knn_basic_movie(train, test, ids, Xtest, Xids):
    """
    kNN basic approach on movies
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """

    print('kNN Basic Movie')
    # similarity settings must go through sim_options, not top-level kwargs
    algo = KNNBasic(k=21,
                    sim_options={'name': 'msd',
                                 'min_support': 2,
                                 'user_based': False},
                    verbose=False)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
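Example #30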
def set_algo(name="cosine", user_based=True, algo_type="KNNBasic"):
    '''Function to facilitate switching between different algorithms
    '''

    # Similarity configuration; `user_based` switches between
    # user-user (True) and item-item (False) similarities.
    sim_options = {
        "name": name,
        "user_based": user_based,
    }
    if algo_type == "KNNBasic":
        algo = KNNBasic(k=10, min_k=1, sim_options=sim_options)

    elif algo_type == "KNNWithMeans":
        algo = KNNWithMeans(k=10, min_k=1, sim_options=sim_options)

    elif algo_type == "KNNWithZScore":
        algo = KNNWithZScore(k=10, min_k=1, sim_options=sim_options)

    else:
        raise NameError('Unknown algorithm type.')

    return algo
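A short usage sketch for set_algo, assuming the built-in ml-100k dataset:

from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

# Item-based Pearson kNN with means.
algo = set_algo(name="pearson", user_based=False, algo_type="KNNWithMeans")
algo.fit(trainset)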
Example #31
    def __init__(self, MainDir, ExpName):
        self.MODEL_NAME = "MODEL_NAME"
        self.FEATURE_PATH = "FEATURE_PATH"
        self.TEST_FEATURE_PATH = "TEST_FEATURE_PATH"
        self.MODEL_DICT = {
            "KNNBasic": KNNBasic(),
            "KNNWithMeans": KNNWithMeans(),
            "KNNWithZScore": KNNWithZScore(),
            "SVD": SVD(),
            "SVDpp": SVDpp(),
            "NMF": NMF(),
            "SlopeOne": SlopeOne(),
            "CoClustering": CoClustering()
        }
        self.TOP_RECOMMEND_RESULT_NUM = "TOP_RECOMMEND_RESULT_NUM"
        self.MODEL_PATH = "MODEL_PATH"
        self.HYPER_PARAMETER = "HYPER_PARAMETER"
        self.ONLINE_EXP_TYPE = "ONLINE"
        self.OFFLINE_EXP_TYPE = "OFFLINE"
        self.CONFIG_RELATIVE_PATH = "/inference/configuration/config.json"

        self.CONTENT_CONFIG = "CONTENT_CONFIG"
        self.REC_NUM = "REC_NUM"
        self.CONTENT_FEATURE_PATH = "CONTENT_FEATURE_PATH"

        self.MAIN_DIR_PATH = MainDir
        self.ExpName = ExpName
        self.ExpType = None
        self.trainset, self.testset, self.rawMovieList, self.rawUserList = None, None, None, None
        self.loadConfig(self.MAIN_DIR_PATH + self.CONFIG_RELATIVE_PATH,
                        ExpName)
        if self.MODEL_NAME in self.config and self.config[
                self.MODEL_NAME] in self.MODEL_DICT:
            self.model = self.MODEL_DICT[self.config[self.MODEL_NAME]]
        else:
            raise AttributeError("Model Initialization error")
        self.contentModel = ContentBaseModel.ContentModel(
            self.content_config[self.REC_NUM], self.MAIN_DIR_PATH +
            self.content_config[self.CONTENT_FEATURE_PATH])