Example #1
    def __build_model(self):
        model_path = '{}{}'.format(self.file_prefix, self.model_path)
        try:
            model = joblib.load(model_path)
            print('recommender exists, load it')
            return model
        except Exception as e:
            print('recommender does not exist, build new recommender')

            # the training data is already prepared as self.trainset

            # initialize the KNN recommender
            algo = KNNWithMeans(k=50,
                                sim_options={
                                    'name': 'pearson_baseline',
                                    'user_based': False
                                })
            # train model
            algo.fit(self.trainset)
            # save model
            joblib.dump(algo, model_path)
            # validation
            test_pred = algo.test(self.testset)
            accuracy.rmse(test_pred)

            return algo
        def cal_KNNWithMeans(trainset, df):
            # KNNWithMeans

            sim_options = {'name': 'cosine', 'user_based': True}
            algo_knnm = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
            algo_knnm.fit(trainset)
            users = []
            items = []
            real = []
            estimate = []
            for i in range(len(df)):
                uid = df[i:i + 1].user.values[0]
                users.append(uid)
                iid = df[i:i + 1].store.values[0]
                items.append(iid)
                r_ui = df[i:i + 1].stars.values[0]
                real.append(r_ui)
                pred = algo_knnm.predict(uid, iid, r_ui, verbose=True)
                estimate.append(pred)
            print("end")
            # knn With Means
            df4 = pd.DataFrame(columns=['user', 'item', 'r_ui', 'est'])
            df4['user'] = users
            df4['item'] = items
            df4['r_ui'] = real
            df4['est'] = estimate
            #df3.head()
            df4['est'] = df4['est'].apply(lambda x: x[-2])  # pull the estimate out of the Prediction tuple
            df4['err'] = abs(df4.est - df4.r_ui)
            df4.to_csv(save_file2)
    def fit(self, trainset):
        """Model fitting for KNN with significance weighting

        Calls the parent class fit method and then generates the overlap matrix
        needed by the significance weighting.

        :param trainset:
        :return: self
        """

        # Call parent class function
        KNNWithMeans.fit(self, trainset)

        # Create an "overlap" matrix counting the number of items that
        # pairs of users have in common.
        # See the creation of the "freq" matrix in the "similarities.pyx" file.
        if self.sim_options['user_based']:
            n_x, yr = self.trainset.n_users, self.trainset.ir
        else:
            n_x, yr = self.trainset.n_items, self.trainset.ur

        self.overlap = np.zeros((n_x, n_x), np.int64)
        for y, y_ratings in yr.items():
            for xi, ri in y_ratings:
                for xj, rj in y_ratings:
                    self.overlap[xi, xj] += 1
        
        # Use overlap matrix to update the sim matrix, discounting by the significance weight factor.
        for xi in range(n_x):
            for xj in range(n_x):
                weight = self.sig_weight(xi, xj)
                self.sim[xi, xj] = self.sim[xi, xj] * weight
        return self
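`sig_weight` is not shown in this snippet (Example #14 below references a helper with the same name). A minimal sketch, assuming the classic significance-weighting discount in which pairs with fewer than `min_overlap` co-rated items are penalized linearly; `min_overlap` is an assumed cutoff, not taken from the original:

    def sig_weight(self, xi, xj, min_overlap=50):
        # Hypothetical helper: pairs with at least `min_overlap` co-rated
        # items keep their full similarity; sparser pairs are discounted
        # in proportion to their overlap.
        return min(self.overlap[xi, xj], min_overlap) / min_overlap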
Example #4
def main():

    # Load the MovieLens 100k dataset
    movielens_ds = Dataset.load_builtin('ml-100k')

    # Create train and test sets (85% / 15%)
    trainset, testset = train_test_split(movielens_ds, test_size=.15)

    algo = KNNWithMeans()

    # Train on the trainset
    algo.fit(trainset)
    # Predict on the testset
    predictions = algo.test(testset)

    # Print the RMSE
    accuracy.rmse(predictions)

    #print(predictions)

    result = []
    for prediction in predictions:
        # Difference between prediction and ground truth
        result.append(prediction.r_ui - prediction.est)

    # Histogram of the result
    plt.hist(result, 100)

    plt.show()
Example #5
    def recommender_knn_baseline(self, train_file, test_file, output):

        train, test, train_dataset, test_dataset = prepare_datasets(
            train_file, test_file)
        # Use user_based true/false to switch between user-based or item-based collaborative filtering
        algo_knn_means = KNNWithMeans(verbose=False)

        algo_knn_means.fit(train)

        #not_seen_elems = self.merge_train_set(train_dataset, test_dataset)

        #predictions_precision_svd = algo_svd.test(not_seen_elems, test, verbose=False, not_seen_flag=True)
        predictions_knn_means = algo_knn_means.test(test, verbose=False)

        #precisions, recalls = self.precision_recall_at_k(predictions_precision_svd, 10, threshold=0.0)
        # Precision and recall can then be averaged over all users
        #precision_avg = sum(prec for prec in precisions.values()) / len(precisions)
        #recall_avg = sum(rec for rec in recalls.values()) / len(recalls)
        #print('Precision: ' + str(precision_avg) + ' Recall: ' + str(recall_avg) + ' RMSE: ' + str(
        #    rmse(predictions_svd, verbose=False)) + ' MAE: ' + str(mae(predictions_svd, verbose=False)))
        print('KNN_WITH_MEANS: RMSE ' +
              str(rmse(predictions_knn_means, verbose=False)) + ' MAE ' +
              str(mae(predictions_knn_means, verbose=False)))

        return algo_knn_means
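`prepare_datasets` is not shown here. A plausible sketch, assuming the two files are plain `user item rating` CSVs; the return order matches the unpacking above:

from surprise import Dataset, Reader

def prepare_datasets(train_file, test_file):
    # Hypothetical helper: load two rating files and return the built
    # trainset, the testset, and both raw Dataset objects.
    reader = Reader(line_format='user item rating', sep=',')
    train_dataset = Dataset.load_from_file(train_file, reader)
    test_dataset = Dataset.load_from_file(test_file, reader)
    train = train_dataset.build_full_trainset()
    test = test_dataset.build_full_trainset().build_testset()
    return train, test, train_dataset, test_dataset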
    def evaluate_on_test(self, train_set, test_set):
        """
        Evaluate the algorithm on the test set after training it on the train set
        :param train_set:
        :param test_set:
        :return: RMSE value on test set
        """
        if train_set is not None and test_set is not None:
            print("Evaluate RMSE on test data")
            self.LOG_HANDLE.info("Evaluate RMSE on test data")

            similarity_options = {
                'name': 'msd',
                'user_based': False,
            }

            # Use the KNN algorithm
            algo = KNNWithMeans(sim_options=similarity_options)

            # Train the algorithm on the trainset, and predict ratings for the testset
            algo.fit(train_set)
            predictions = algo.test(test_set)

            # Then compute RMSE
            return accuracy.rmse(predictions)
Example #7
def knnBasico(df, testSize, vecinos, pr, user_based):

    # df = pd.read_csv('../datasets/yelp_beautySpa_aspects.csv', header=0)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)
    trainset, testset = train_test_split(data,
                                         test_size=testSize,
                                         shuffle=False)

    sim_options = {
        'name': 'cosine',
        'user_based': user_based  # True: user-user similarities; False: item-item
    }
    algo = KNNWithMeans(k=vecinos, sim_options=sim_options)

    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, pr, 4)

    # Precision and recall can then be averaged over all users
    # print(sum(prec for prec in precisions.values()) / len(precisions))
    # print(sum(rec for rec in recalls.values()) / len(recalls))

    precision = round(
        sum(prec for prec in precisions.values()) / len(precisions), 3)
    recall = round(sum(rec for rec in recalls.values()) / len(recalls), 3)

    return precision, recall
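`precision_recall_at_k` is not defined in this snippet. A minimal sketch, adapted from the precision/recall-at-k recipe in the Surprise FAQ; the positional argument order `(predictions, k, threshold)` matches the call above:

from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=4):
    # Rank each user's predictions by estimated rating, then compute
    # precision@k and recall@k against the true ratings.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, ratings in user_est_true.items():
        ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in ratings)
        n_rec_k = sum(est >= threshold for est, _ in ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold and est >= threshold)
                              for est, true_r in ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls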
Example #8
def main():
    # Load the MovieLens 100k dataset
    data = Dataset.load_builtin('ml-100k')

    # Create train and test sets (85% / 15%)
    trainset, testset = train_test_split(data, test_size=.15)

    # Choose the algorithm to use
    algo = KNNWithMeans()

    # Train on the trainset
    algo.fit(trainset)
    # Predict on the testset
    predictions = algo.test(testset)

    # Print the RMSE
    accuracy.rmse(predictions)

    result = []
    for prediction in predictions:
        # Compute the delta between the prediction and the ground truth
        result.append(prediction.r_ui - prediction.est)

    # Plot the histogram of the delta between predictions and ground truth
    plt.hist(result, 100)

    plt.show()
def KNNPred(data):  #KNN Means algorithm
    print("\nTraining KNN Means model..\n")
    global x_test, y_test, testlen, trainlen, y_train, model_params, X, Y, avg_rat, cold_itm
    options = model_params[0]
    knnModel = KNNWithMeans(sim_options=options)
    knnModel_1 = KNNWithMeans()
    train = data.build_full_trainset()
    knnModel.fit(train)
    print("\nTraining done..\nPrediction started..")
    knnModel_1.fit(train)
    #y_pred_w_m = [knnModel.predict(x_test[i][0], x_test[i][1]).est for i in range(testlen)]
    #y_pred_wo_m = [knnModel_1.predict(x_test[i][0], x_test[i][1]).est for i in range(testlen)]
    y_pred_w_m = [0 for i in range(testlen)]
    y_pred_wo_m = [0 for i in range(testlen)]
    kk = 0
    for i in x_test:
        if i[1] - 1 in cold_itm:
            y_pred_w_m[kk] = avg_rat[i[0] - 1]
            y_pred_wo_m[kk] = avg_rat[i[0] - 1]
        else:
            y_pred_w_m[kk] = knnModel.predict(i[0], i[1]).est
            y_pred_wo_m[kk] = knnModel_1.predict(i[0], i[1]).est
        kk += 1
    #y_pred_train = [knnModel_1.predict(x_train[i][0], x_train[i][1]).est for i in range(trainlen)]
    #y_pred_tot = [knnModel_1.predict(X[i][0], X[i][1]).est for i in range(trainlen+testlen)]
    print("\nPrediction done..\n")
    return [y_pred_w_m, y_pred_wo_m, knnModel,
            knnModel_1]  #, y_pred_train, y_pred_tot
Example #10
 def KNN_train(self,
               k=20,
               options={
                   'name': 'pearson',
                   'user_based': False
               }):
     '''
     seed: int, default 3 - random seed for splitting the train/test sets
     k: int, default 40 - maximum number of neighbors
     options: dict, default {'name': 'pearson', 'user_based': False} - algorithm options; defaults to Pearson similarity with an item-based approach
     '''
     self.algos = []
     df = self.trainDatas
     names = {}  # plain dict holding the per-criterion datasets and models
     r = Reader(rating_scale=(1, 5))
     # Load and split the data; train the prediction models
     total = Dataset.load_from_df(df[['uid', 'iid', 'total']], reader=r)
     total_train = total.build_full_trainset()
     total_algo = KNNWithMeans(k, sim_options=options)
     total_algo.fit(total_train)
     self.algos.append(total_algo)
     for i in range(1, self.no_of_criteria + 1):
         names['c' + str(i)] = Dataset.load_from_df(
             df[['uid', 'iid', 'c' + str(i)]], reader=r)
         names['c' + str(i) +
               '_train'] = names.get('c' + str(i)).build_full_trainset()
         names['algo_c' + str(i)] = KNNWithMeans(k, sim_options=options)
         names.get('algo_c' + str(i)).fit(names.get('c' + str(i) +
                                                    '_train'))
         self.algos.append(names.get('algo_c' + str(i)))
Example #11
class Rater:
    def __init__(self, ratings):
        self.classifier = KNNWithMeans(sim_options={"name": "cosine", "user_based": False})
        self.training_set = None
        self.ratings_dict = None
        self._prepare_data_(ratings)
        self._train_()

    def _prepare_data_(self, ratings):
        self.ratings_dict = {
            "user_id": [item.user_id for item in ratings],
            "movie_id": [item.movie_id for item in ratings],
            "mark": [item.mark for item in ratings]
        }
        df = pd.DataFrame(self.ratings_dict)
        data = Dataset.load_from_df(df[["user_id", "movie_id", "mark"]], Reader(rating_scale=Constants.RATING_SCALE))
        self.training_set = data.build_full_trainset()

    def _train_(self):
        self.classifier.fit(self.training_set)

    def get_ratings(self, user_id):
        predicted_ratings = {}
        for movie_id in self.ratings_dict["movie_id"]:
            prediction = self.classifier.predict(user_id, movie_id)
            predicted_ratings[movie_id] = prediction.est
        return predicted_ratings
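A minimal usage sketch for `Rater`; the `Rating` namedtuple below is a hypothetical stand-in for this project's rating objects, and `Constants.RATING_SCALE` (not shown) is assumed to be a `(min, max)` tuple such as `(1, 5)`:

from collections import namedtuple

# Hypothetical stand-in for the project's rating type.
Rating = namedtuple('Rating', ['user_id', 'movie_id', 'mark'])
ratings = [Rating(1, 10, 4.0), Rating(1, 20, 3.0), Rating(2, 10, 5.0)]

rater = Rater(ratings)
print(rater.get_ratings(user_id=2))  # {movie_id: estimated rating, ...}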
Example #12
def plot_ROC(qNum, k, thresh=[2.5,3,3.5,4]):
    rating_range = 5.0
    trainset, testset = train_test_split(data, test_size=0.1)
    if qNum == 15:  # only Q15 (KNNWithMeans) is handled in this snippet
        model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
    model.fit(trainset)
    predictions = model.test(testset)
    
    for thrs in thresh:
        y = np.array([])
        scores = np.array([])
        for u, i, t, est, d in predictions:
            if t >= thrs:
                t = 1
            else:
                t = 0
            y = np.append(y, t)
            scores = np.append(scores, est / rating_range)
        
        fpr, tpr, thresholds = metrics.roc_curve(y, scores)
        roc_auc = metrics.auc(fpr, tpr)

        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Threshold = '+str(thrs))
        plt.show()
        print("auc = "+str(roc_auc))
Example #13
def test_knn_based(data):
    """
    Parameters
    ----------
    data : dataframe
        Dataframe with columns userId, movieId, and rating in that order.

    Returns
    -------
    test_mse : float
        The mean squared error for the knn based algorithm.

    """
    reader = Reader(rating_scale=(1, 5))
    knn_data = Dataset.load_from_df(data, reader)
    trainset, testset = train_test_split(knn_data,
                                         test_size=.10,
                                         random_state=24)
    algo = KNNWithMeans(k=5,
                        sim_options={
                            'name': 'pearson_baseline',
                            'user_based': True
                        })
    algo.fit(trainset)
    predictions = algo.test(testset)
    test_mse = accuracy.mse(predictions, verbose=False)
    return test_mse
Example #14
    def fit(self, trainset):
        """Model fitting for KNN with significance weighting

        Calls the parent class fit method and then generates the overlap matrix
        needed by the significance weighting.

        :param trainset:
        :return: self
        """

        # Call parent class function
        KNNWithMeans.fit(self, trainset)
        # Create an "overlap" matrix counting the number of items that
        # pairs of users have in common.
        ur_data = trainset.ur
        n_d = len(ur_data)
        overlap = np.zeros([n_d, n_d], np.double)
        # See the creation of the "freq" matrix in the "similarities.pyx" file.
        # Use overlap matrix to update the sim matrix, discounting by the significance weight factor.
        self.ur_data = ur_data
        self.overlap = np.zeros([n_d, n_d], np.int64)

        for u in range(n_d):
            for v in range(n_d):
                if (u != v):
                    overlap[u, v] = self.sig_weight(u, v)
        self.sim = overlap * self.sim
        return self
Example #15
def DisplayGraphDelta(data):
    """
        Plot the delta between prediction and ground truth
    """
    # Create train and test sets (75% / 25%)
    trainset, testset = train_test_split(data, test_size=.25)

    algo = KNNWithMeans()

    # Train on the trainset
    algo.fit(trainset)
    # Predict on the testset
    predictions = algo.test(testset)

    # Print the RMSE
    accuracy.rmse(predictions)

    #print(predictions)

    result = []
    for prediction in predictions:
        print(prediction)
        # Compute the delta between the prediction and the ground truth
        result.append(prediction.r_ui - prediction.est)

    # Plot the histogram of the delta between predictions and ground truth
    print(len(result))
    plt.hist(result, 100)
    plt.show()
Example #16
def train():

    # TODO put in real data here when we have collected enough
    ratings_dict = {
        "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
        "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
        "rating": [1, 0, 0, 0, 1, 0, 1, 1, 1],
    }

    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(0, 1))

    # Load the ratings DataFrame into a Surprise Dataset
    data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)

    trainingSet = data.build_full_trainset()

    # To use item-based cosine similarity
    sim_options = {
        "name": "cosine",
        "user_based": False,  # Compute  similarities between items
    }
    algo = KNNWithMeans(sim_options=sim_options)

    algo.fit(trainingSet)

    return algo
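A quick sanity check on the returned model, using ids from the toy `ratings_dict` above:

algo = train()
# Predict user "E"'s rating for item 2 (a pair absent from the toy data).
pred = algo.predict('E', 2)
print(pred.est)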
Example #17
    def CFM(self):
        u_id = []
        I_id = []
        r_ui_ = np.array([])
        _est = np.array([])

        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
        algo.fit(self.trainset)

        for uid in (self.list):
            lids = self.data[self.data.uid == uid]
            a = self.data[self.data.uid == uid]

            for i in range(1, len(a) + 1):  # iterate over every row of a
                lid = lids[i - 1:i].lid.values[0]
                r_ui = lids[i - 1:i].rate.values[0]
                pred = algo.predict(uid, lid, r_ui, verbose=True)
                u_id.append(int(pred.uid))
                I_id.append(int(pred.iid))
                r_ui_ = np.append(r_ui_, pred.r_ui)
                _est = np.append(_est, pred.est)

        self.df_est = pd.DataFrame({
            'uid': u_id,
            'Iid': I_id,
            'r_ui': r_ui_,
            'est': _est
        })
        self.arr = self.df_est['uid'].unique()

        self.CFWM_ndcg_ = self.Calculate_NDCG()
Example #18
def load_data():
    data = Dataset.load_builtin('ml-100k')
    # similarity options
    sim_options = {"name": "msd", "user_based": False}

    param_grid = {
        "n_epochs": [5, 10],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6]
    }

    # algorithm
    algo = KNNWithMeans(sim_options=sim_options)

    # computation
    training_set = data.build_full_trainset()

    algo.fit(training_set)

    # GRID SEARCH, MATRIX FACTORIZATION
    print("Grid search over SVD hyperparameters")
    gs = GridSearchCV(SVD, param_grid=param_grid, measures=["rmse"], cv=3)
    gs.fit(data)

    print(gs.best_score['rmse'])
Example #19
def rank_predictions(model_name):

    k_KNN = 22 
    k_NNMF = 20
    k_MF = 26

    if model_name == 'KNN':
        sim_options = {
            'name': 'pearson_baseline',
            'shrinkage': 0
        }
        model = KNNWithMeans(k_KNN, sim_options=sim_options)
    elif model_name == 'NNMF':
        model = NMF(n_factors= k_NNMF)
    else:
        model = SVD(n_factors = k_MF)

    precision_arr = []
    recall_arr = []
    for t in range(1, 26):
        kf = KFold(n_splits=10)
        print(t)
        p = []
        r = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            p.append(sum(prec for prec in precisions.values()) / len(precisions))
            r.append(sum(rec for rec in recalls.values()) / len(recalls))
            
        precision_arr.append(np.mean(np.array(p)))
        recall_arr.append(np.mean(np.array(r)))

    # precision vs t
    plt.plot(list(range(1, 26)), precision_arr)
    plt.xlabel("Size")
    plt.ylabel("Precision")
    plt.title("The average precision plot using " + model_name)
    plt.show()
    
    # recall vs t
    plt.plot(list(range(1, 26)), recall_arr)
    plt.xlabel("Size")
    plt.ylabel("Recall")
    plt.title("The average recall plot using " + model_name)
    plt.show()
    
    # precision vs recall 
    plt.plot(recall_arr, precision_arr)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("The average precision and recall plot using " + model_name)
    plt.show()


    return precision_arr, recall_arr 
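Note: `precision_recall` is not defined in this snippet; it is presumably a variant of the `precision_recall_at_k` helper sketched after Example #7, with `t` acting as the relevance cutoff.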
Example #20
def get_rec_sys_resources(df_reviews):
    sim_options = {'name': 'pearson', 'user_based': False}
    algo = KNNWithMeans(sim_options=sim_options)

    # load csv to build trainset, required to recommend
    cols = ['reviewerID', 'asin', 'overall']
    trainset, testset = train_test_from_df(df_reviews, cols, test_size=0.2)

    algo.fit(trainset)
    return algo, algo.compute_similarities(), trainset, testset
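`train_test_from_df` is not shown here. A plausible sketch using Surprise's DataFrame loader; the 1-5 rating scale for Amazon's `overall` column is an assumption:

from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

def train_test_from_df(df, cols, test_size=0.2):
    # Hypothetical helper: wrap a raw reviews DataFrame into a
    # Surprise train/test split.
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[cols], reader)
    return train_test_split(data, test_size=test_size)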
Example #21
def train():
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(book_rating_ds[['user', 'item', 'rating']],
                                reader)
    sim_options = {"name": "cosine", "user_based": False}
    model = KNNWithMeans(sim_options=sim_options)
    training_set = data.build_full_trainset()
    model.fit(training_set)
    # export the model
    model_path = os.path.join(PICKLES_PATH, "rec.pkl")
    joblib.dump(model, model_path, compress=True)
    def run(self):  # run the model
        ratings = pd.read_csv('rating_final.csv')
        ratings_dict = {"userID": list(ratings.userID), "placeID": list(ratings.placeID), "rating": list(ratings.rating)}
        df = pd.DataFrame(ratings_dict)
        reader = Reader(rating_scale=(0, 2))
        data = Dataset.load_from_df(df[["userID", "placeID", "rating"]], reader)

        # To use item-based cosine similarity
        sim_options = {
            "name": "cosine",
            "user_based": True,  # Compute similarities between users
            "min_support": 9
        }
        # define a cross-validation iterator
        kf = KFold(n_splits=5)
        algo = KNNWithMeans(sim_options=sim_options)
        places = list(df['placeID'].unique())
        ordered = ArrayList()
        for i in places:
            total=0
            for trainset, testset in kf.split(data): #finds result for each fold
                # train algorithm.
                algo.fit(trainset)
                #test algorithm
                #predictions = algo.test(testset)
                # Compute and print Root Mean Squared Error
                #accuracy.rmse(predictions, verbose=True)

                #gets predicted rating for each place
                prediction = algo.predict(self.user, i, verbose=False)
                total+=prediction.est
            ordered.append(i, total/5) #we find average of estimate for each fold

        ordered.sort()
        highest = ordered.inArray[ordered.count - 5:ordered.count]

        place = pd.read_csv('geoplaces2.csv')

        #placedf = pd.DataFrame({"placeID": list(place.placeID), "name": list(place.name)})
        count = 0
        finalRec=ArrayList()
        for i in range(len(highest) - 1, -1, -1):
            count += 1
            name = list(place[place["placeID"].unique() == highest[i].id]['name'])
            finalRec.append(count, name[0])

        #printing accuracy score
        out = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
        mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
        print(mean_rmse)

        return finalRec.inArray
def train_surprise_model():
    # import reduced dataset:
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    df = df[['user_key', 'game_key', 'rating']]

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build trainset from the whole dataset:
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    # Parameters:
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 10
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    # Run fit:
    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    ### Test: is it possible to exchange the sim matrix?
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    a = algo.predict(93681, 100007)
    algo.sim = sim_matrix_imported
    b = algo.predict(93681, 100007)

    # We now need to save the similarity matrix somewhere:
    sim_matrix = algo.sim
    pd.DataFrame(sim_matrix).to_csv(
        '../Data/Recommender/sim_matrix-myKNNWithMeans_item_based_model')

    # Save the precomputed model:
    dump.dump('../Data/Recommender/myKNNWithMeans_item_based_model', algo)
Example #24
def binary_value(data, threshold):
    trainset, testset = train_test_split(data, test_size=.1)
    
    algo = KNNWithMeans(k = 30)
    algo.fit(trainset)
    predictions = algo.test(testset)
    
    like0 = []  # actual labels
    like = []   # predicted labels
    for row in range(len(predictions)):
        like.append( 1 if predictions[row][3] > threshold else 0)
        like0.append(1 if predictions[row][2] > threshold else 0)
    #predictions[row][3] -> predict value
    #predictions[row][2] -> real value
    return like0, like
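The two binary lists returned by `binary_value` slot directly into scikit-learn's classification metrics. A usage sketch, assuming `data` is an already-loaded Surprise dataset:

from sklearn.metrics import confusion_matrix, precision_score, recall_score

actual, predicted = binary_value(data, threshold=3.5)
print(confusion_matrix(actual, predicted))
print('precision:', precision_score(actual, predicted))
print('recall:', recall_score(actual, predicted))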
    def CFM(self):
        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
        algo.fit(self.trainset)

        for uid in (self.list):
            lids = self.data[self.data.uid == uid]
            a = self.data[self.data.uid == uid]

            for i in range(1, len(a) + 1):  # iterate over every row of a
                lid = lids[i - 1:i].lid.values[0]
                r_ui = lids[i - 1:i].rate.values[0]
                pred = algo.predict(uid, lid, r_ui, verbose=True)

        return pred
Example #26
def trim_performance(qNum,maxk=0): 
    pop, unpop, highVar = trimMovies()
    
    if maxk == 0:
        if 12 <= qNum <= 14:
            maxk = 100
        elif 19 <= qNum <= 21:
            maxk = 50

    trim_Model = {
        12: (pop, 'KNNWithMeans'),
        13: (unpop, 'KNNWithMeans'),
        14: (highVar, 'KNNWithMeans'),
        19: (pop, 'NMF'),
        20: (unpop, 'NMF'),
        21: (highVar, 'NMF'),
    }
    trimSet, modelName = trim_Model[qNum]
    
    kf = KFold(n_splits=10)
    RMSE = [] 
    for k in range(2, maxk + 1, 2):
        print('-' * 20 + 'k = ' + str(k) + ' ' + '-' * 20)
        
        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
        elif modelName == 'NMF':
            model = NMF(n_factors=k)

        subRMSE = [] 
        temp = 1
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            testSet = list(filter(lambda x: int(x[1]) in trimSet, testSet))
            print("Split " + str(temp) + ": test set size after trimming: %d", len(testSet))
            temp += 1
            predictions = model.test(testSet)
            subRMSE.append(accuracy.rmse(predictions, verbose=True))
        RMSE.append(np.mean(subRMSE))

    plt.figure()
    plt.plot(list(range(2, maxk+1, 2)), RMSE)
    plt.xlabel("k")
    plt.ylabel("Average RMSE")
    plt.title("Q"+str(qNum)+": Average RMSE Along k")
    plt.show()
    print(min(RMSE))
    return min(RMSE)
def solve_item_item(pathw):
    reader = Reader(line_format='user item rating timestamp', sep=',')
    data = Dataset.load_from_file(pathw, reader=reader)
    algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)
    # Print the recommended items for each user
    for uid, user_ratings in top_n.items():
        if uid == '615':
            # print(uid, [iid for (iid, _) in user_ratings])
            return [iid for (iid, _) in user_ratings]
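`get_top_n` is not defined in this snippet. A minimal sketch, adapted from the top-N recommendation recipe in the Surprise FAQ:

from collections import defaultdict

def get_top_n(predictions, n=10):
    # Map each user id to their n highest-estimated (item id, rating) pairs.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n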
Example #28
    def CFM(self):
        kf = KFold(n_splits=5)
        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)

        for trainset, testset in kf.split(self.data):
            algo.fit(trainset)
            predictions = algo.test(testset)

            precisions, recalls = self.precision_recall_at_k(predictions)

            P = sum(prec for prec in precisions.values()) / len(precisions)
            R = sum(rec for rec in recalls.values()) / len(recalls)
            F1 = 2 * P * R / (P + R)

            print("Precision : ", P)
            print("Recall    : ", R)
            print("F1        : ", F1)
def ComputeCollaborativeFiltering_User_User(recipe_df, train_rating_df, pd, benchmark, knnmeans=False):
    print("\n###### Compute CollaborativeFiltering_User_User ######")
    df = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'recipe_id', 'rating']], reader)
    trainSet, testSet = train_test_split(data, test_size=.2, random_state=0)

    # compute similarities between users
    sim_options = {'name': 'cosine', 'user_based': True}

    if knnmeans:
        algo = KNNWithMeans(sim_options=sim_options, verbose=False)
    else:
        algo = KNNBasic(sim_options=sim_options, verbose=False)
    algo.fit(trainSet)
    predictions = algo.test(testSet)

    Evaluators.RunAllEvals(predictions, benchmark)
Example #30
def algoProdToProd():
    reader2 = Reader(rating_scale=(0, productTable['Frequency'].max()))
    data2 = Dataset.load_from_df(
        productTable[["Product_ID1", "Product_ID2", "Frequency"]], reader2)

    # To use item-based cosine similarity
    sim_options = {
        "name": "cosine",
        "user_based": False,  # Compute  similarities between items
    }

    algo2 = KNNWithMeans(sim_options=sim_options)

    trainingSet2 = data2.build_full_trainset()

    algo2.fit(trainingSet2)

    return algo2
Example #31
from surprise import KNNWithMeans
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
import os

# Path to the data file
file_path = os.path.expanduser('mydata.csv')
# Tell the Reader how the text file is formatted
reader = Reader(line_format='user item rating', sep=',')
# Load the data
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50, sim_options={'user_based': False})  # use only the k most similar neighbors in the computation
algo.fit(trainset)

# we can now query for specific predictions
uid = str(5)  # raw user id
iid = str(1)  # raw item id

# get a prediction for specific users and items.
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)  # rating of user-5 to item-1

#----------------------------
uid = str(5)  # raw user id
iid = str(5)  # raw item id
# get a prediction for specific users and items.
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)