Example #1
def final_model(data):
    """Pickles the collaborative filtering recommendation system model for repeat customers.

    Args:
    data -- a dataframe containing user id, item id, and ratings columns in that order.
    """
    # Creates a user ratings surprise matrix for fitting model
    user_ratings_matrix = surprise_df(data)

    # Splits dataset into train and test datasets to generate predictions
    train_set, test_set = train_test_split(user_ratings_matrix,
                                           test_size=0.2,
                                           random_state=19)

    # Best params determined using GridSearchCV
    params = {'n_factors': 10, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}

    svdpp = SVDpp(n_factors=params['n_factors'],
                  n_epochs=params['n_epochs'],
                  lr_all=params['lr_all'],
                  reg_all=params['reg_all'])

    svdpp.fit(train_set)
    predictions = svdpp.test(test_set)

    # Use surprise wrapper to pickle model
    dump.dump('repeat_customer_model',
              predictions=predictions,
              algo=svdpp,
              verbose=0)
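
The pickled model can later be restored with the same Surprise helper; a minimal sketch, assuming the file written above (the user and item ids are placeholders):

from surprise import dump

# dump.load returns the (predictions, algo) pair saved by dump.dump
predictions, algo = dump.load('repeat_customer_model')
print(algo.predict(uid='some_user', iid='some_item').est)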
Example #2
def trainSVD_surprise3D(
        training_data,
        colorlabels,
        plot=True,
        savefig="figures/"):
    # algo = SVD(n_factors=4, n_epochs=1000, biased=True)
    # algo = SVD(n_factors=20, n_epochs=500, biased=False)
    algo = SVDpp(n_factors=10, n_epochs=1000)
    algo.fit(training_data)
    U = algo.pu
    if plot:
        fig = plt.figure(figsize=(8, 8))
        # ax = fig.add_subplot(1,1,1)
        ax = fig.add_subplot(111, projection='3d')
        ax.set_xlabel('First', fontsize=15)
        ax.set_ylabel('Second', fontsize=15)
        ax.set_zlabel('Third', fontsize=15)
        ax.set_title('Reduced SVD', fontsize=20)
        scatter = ax.scatter(
            U[:, 0], U[:, 1], U[:, 2], c=colorlabels, s=10, alpha=0.7
        )  #explore labeling colors with features like demographics, age
        ax.grid()
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label("state")
        if savefig:
            plt.savefig(savefig + "svd_counties_3D")
        plt.show()
Example #3
def svdPP(data):  #SVDPP algorithm
    print("\nTraining SVDPP model..\n")
    global x_test, y_test, testlen, trainlen, model_params, x_train, y_train, X, Y, avg_rat, cold_itm
    p1, p2, p3 = [
        model_params[1]['n_epochs'], model_params[1]['lr_all'],
        model_params[1]['reg_all']
    ]
    svdModel = SVDpp(n_epochs=p1, lr_all=p2, reg_all=p3)
    svdModel.fit(data.build_full_trainset())
    print("\nTraining done..\nPrediction started..")
    test = [(x_test[i][0], x_test[i][1], y_test[i]) for i in range(testlen)]
    #train_=[(x_train[i][0],x_train[i][1],y_train[i]) for i in range(trainlen)]
    #total_=[(X[i][0],X[i][1],Y[i]) for i in range(trainlen+testlen)]
    predict = svdModel.test(test)

    #trainset, testset = t_t_s(data, test_size=.25)
    svdModel_1 = SVDpp()
    svdModel_1.fit(data.build_full_trainset())
    predict1 = svdModel_1.test(test)
    #predict_train = svdModel_1.test(train_)
    #predict_tot = svdModel_1.test(total_)
    # Surprise predictions are (uid, iid, r_ui, est, details) tuples
    usrA = [int(i[0]) - 1 for i in predict]
    itmA = [int(i[1]) - 1 for i in predict]
    res = [i[3] for i in predict]
    res1 = [i[3] for i in predict1]
    # Fall back to the user's average rating for cold-start items
    for i in range(testlen):
        if itmA[i] in cold_itm:
            res[i] = avg_rat[usrA[i]]
            res1[i] = avg_rat[usrA[i]]
    #restrain=[i[3] for i in predict_train]
    print("\nPrediction done..\n")
    return [res, res1, svdModel, svdModel_1]  #,restrain, predict_tot
Example #4
def SVDPP(PointFrame, RecommendNum=10, TypeNum=5):
    OutUserList = []
    OutFundList = []
    PointFrameList = []
    UserType = 0
    # Split the rating matrix into TypeNum user types:
    for Type in range(TypeNum):
        PointFrameList.append(PointFrame.loc[PointFrame.Type == Type])
    # Score each user type separately:
    for Frame in PointFrameList:
        Frame = Frame.loc[:, 'User':'******']
        UserList = Frame.User.unique()
        FundList = Frame.FundCode.unique()
        UserType = UserType + 1
        reader = Reader(rating_scale=(0, 2))
        data = Dataset.load_from_df(Frame, reader=reader).build_full_trainset()
        if UserType == 4:
            model = SVDpp(n_factors=5)
        else:
            model = SVDpp()
        model.fit(data)
        for User in UserList:
            UserPointList = []
            for Fund in FundList:
                UserPointList.append(model.predict(User, Fund).est)
            RecommendList = np.argsort(UserPointList)[::-1][0:RecommendNum]
            for FundIndex in RecommendList:
                OutUserList.append(User)
                OutFundList.append(FundList[FundIndex])
    OutFrame = pd.DataFrame({
        "User": OutUserList,
        "RecommendFundCode": OutFundList
    })
    return OutFrame
Example #5
class RecommenderSVDpp(Recommender):
    def __init__(self, recommendation_dataset: RecommendationDataSet):
        super(RecommenderSVDpp, self).__init__(recommendation_dataset.movies)
        self.algorithm = SVDpp()
        self.recommendation_dataset = recommendation_dataset

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20):
        # get dataset 
        new_user_id, full_dataset = self.recommendation_dataset.get_dataset_with_extended_user(watched)
        inner_user_id = full_dataset.to_inner_uid(new_user_id)

        # after extending the dataset, we need to retrain our model on the whole
        # dataset that now includes the new user.
        self.algorithm.fit(full_dataset)

        # watched movies
        watched = {full_dataset.to_inner_iid(key): value for key,value in watched.items()}

        # Calculate for all similar user, predictions
        test_items = [
            self.algorithm.predict(new_user_id, full_dataset.to_raw_iid(i))
            for i in range(0, full_dataset.n_items)
            if i not in watched
        ]

        topn_items = [i[0] for i in get_top_n(test_items, n=k, minimum_rating=1.0)[new_user_id]]
        return self.movies.get_movie_by_movie_ids(topn_items)
Example #6
def time_location_model(df):
    """
        Shows the performance of model based on just bias
    """
    lower = df['date_dist_rating'].min()
    upper = df['date_dist_rating'].max()
    df = df.drop(columns=["rating", "dist_rating", "date_rating"], axis=1)

    reader = Reader(rating_scale=(lower, upper))  #TODO figure out

    data = surprise.dataset.Dataset.load_from_df(df=df, reader=reader)

    ts = data.build_full_trainset()
    # raw-to-inner id mappings (private Trainset attributes)
    dusers = ts._raw2inner_id_users
    ditems = ts._raw2inner_id_items

    trainset, testset = train_test_split(data)

    algo = SVDpp()
    algo.fit(trainset)

    # testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
Example #7
def top_ten_df(df):
    '''
    Inputs:
    df (pandas DataFrame) -- the dataframe to train on. NOTE: use f.df_samp_unique_vals() to get a smaller DF if you don't have enough memory to run the full DF.

    Outputs:
    top_ten_df (pandas DataFrame) -- a dataframe with the top ten predictions for every user in your original dataframe.
    '''

    data = f.read_data_surprise(df)  # use f.df_samp_unique_vals() for a smaller DF if memory is tight

    # First train an SVD++ algorithm on the entire dataset (choose 6x name filter)
    trainset = data.build_full_trainset()
    algo = SVDpp()  # tuned params: n_epochs=18, lr_all=0.01, reg_all=0.175
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()  # NOTE: heavy, this takes the most RAM
    predictions = algo.test(testset)

    # Create a dictionary of top-10 predictions per user
    top_n = f.get_top_n(predictions, n=10)

    # Turn the dictionary into a DataFrame
    top_ten_df = pd.DataFrame(top_n)

    return top_ten_df
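
The f.get_top_n helper used above is not shown here; a minimal sketch in the style of the Surprise FAQ, mapping each user to their n highest-estimated items:

from collections import defaultdict

def get_top_n(predictions, n=10):
    # Collect (item id, estimated rating) pairs per user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Keep only each user's n highest estimates
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n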
Example #8
def SVDpp_calculation(data, trainset, testset, cv):
    start = time.time()
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    cross_validate_svdpp_dict = cross_validate(algo, data, measures=['RMSE'], cv=cv, verbose=True)
    end = time.time()
    elapsed = end - start

    return elapsed, cross_validate_svdpp_dict
Example #9
def svdpp(trainset, testset):
    # Matrix factorization - SVD++
    print("\n" + "-" * 5 + " SVD++ algorithm using surprise package " +
          "-" * 5)
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example #10
class TrainModel:

    # def __init__(self, method='als', n_epochs=20, sim_option='pearson_baseline'):
    #
    #     self.algo = KNNBasic(bsl_options={'method': method,'n_epochs': n_epochs},
    #                          sim_options={'name': sim_option, 'user_based': False})
    def __init__(self, lr_all=0.006, n_epochs=40):
        self.algo = SVDpp(lr_all=lr_all, n_epochs=n_epochs)
        self.reader = Reader(rating_scale=(0, 1))
        self.filename = 'trained_model.pkl'

    def read_from_df(self, dataframe, user_col, item_col, rating_col):
        data = Dataset.load_from_df(
            dataframe[[user_col, item_col, rating_col]], self.reader)
        trainset = data.build_full_trainset()
        return trainset

    def train_mod(self, dataframe, user_col, item_col, rating_col):
        self.algo.fit(
            self.read_from_df(dataframe, user_col, item_col, rating_col))

    def dump_model(self, predictions):
        saved_ent = dump.dump(self.filename,
                              algo=self.algo,
                              predictions=predictions)
        return saved_ent

    def load_model(self):
        predictions, loaded_ent = dump.load(self.filename)
        return predictions, loaded_ent

    def get_user_pred(self,
                      user_id,
                      dataframe,
                      user_col,
                      item_col,
                      rating_col,
                      n=2):
        data = Dataset.load_from_df(
            dataframe[[user_col, item_col, rating_col]], self.reader)
        testset = data.build_full_trainset().build_anti_testset()
        predictions = self.algo.test(testset)
        top_n = dict()
        for uid, iid, _, est, _ in predictions:
            if uid == user_id: top_n[iid] = est
        top_n = sorted(top_n.items(), key=lambda kv: kv[1], reverse=True)
        return predictions, top_n[:n]

    def get_user_pred_stable(self, user_id, predictions, n=2):
        top_n = dict()
        for uid, iid, _, est, _ in predictions:
            if uid == user_id: top_n[iid] = est
        top_n = sorted(top_n.items(), key=lambda kv: kv[1], reverse=True)
        return top_n[:n]
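
A hypothetical usage sketch for TrainModel, assuming `df` is a ratings DataFrame with columns 'user', 'item', and 'rating' on the 0-1 scale the Reader expects:

tm = TrainModel()
tm.train_mod(df, 'user', 'item', 'rating')
# Rank unseen items for one user, then persist and reload the model
predictions, top2 = tm.get_user_pred('u1', df, 'user', 'item', 'rating', n=2)
tm.dump_model(predictions)
predictions, algo = tm.load_model()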
Example #11
class SvdPP(RecommenderBase):

    """
        SVDpp algorithm.
        Actually woring bad, just a draft
    """

    def __init__(self, URM):
        # the Surprise trainset is built in fit()
        pass
        # double check if training set is built fine for sgd
        # for u, i, r in self.trainset.all_ratings():
        #     a = 1

    def fit(self, urm, n_factors=20, n_epochs=20, lr_all=0.007, reg_all=0.02, init_mean=0,
            init_std_dev=0.1, verbose=True):
        # create the training set
        r, c = urm.nonzero()
        ones = np.ones(len(r), dtype=np.int32)
        d = np.vstack((r, c, ones)).transpose()
        df = pd.DataFrame(d)
        df.columns = ['userID', 'itemID', 'rating']
        reader = Reader()
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
        self.trainset = data.build_full_trainset()
        print('train set built')

        # fit
        self.algo = SVDpp(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all,
                          init_mean=init_mean, init_std_dev=init_std_dev, verbose=verbose)
        self.algo.fit(self.trainset)

    def recommend(self, userid, N=10, urm=None, filter_already_liked=True, with_scores=True, items_to_exclude=[]):
        if len(items_to_exclude) > 1:
            raise NotImplementedError('Items to exclude functionality is not implemented yet')

        r = np.empty([1])
        for i in range(d.N_TRACKS):
            p = self.algo.predict(userid, i)
            r = np.array([p[3]]) if i == 0 else np.concatenate((r, np.array([p[3]])))

        if filter_already_liked:
            if urm is None:
                raise ValueError('Please provide a URM in order to filter items already liked')
            else:
                r[urm.getrow(userid).nonzero()[1]] = 0

        l = [userid]
        ind = np.argpartition(r, -N)[-N:]
        for i in ind:
            if with_scores:
                l.append((i, r[i]))
            else:
                l.append(i)
        return l
Example #12
 def create_model(self):
     n = 1000000
     raw_data = self.get_ratings()[:n].fillna(0)[["userId", "id", "rating"]]
     reader = Reader()
     data = Dataset.load_from_df(raw_data, reader)
     svdpp = SVDpp()
     trainset = data.build_full_trainset()
     svdpp.fit(trainset)
     filename = "C:/datasets/the-movies-dataset/models/collaborative_based/coll_svdpp.sav"
     joblib.dump(svdpp, filename)
Example #13
def svdpp(dataset):
    start = time.time()
    algo = SVDpp()
    kf = KFold(n_splits=5)
    for trainset, testset in kf.split(dataset):
        algo.fit(trainset)
        predictions = algo.test(testset)
        acc = accuracy.rmse(predictions, verbose=True)
    end = time.time()
    print('svdpp took (minutes):', (end - start) / 60)
    return acc
Example #14
def model(train_set, test_set):
    params = {'n_factors': 3, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}

    svdpp = SVDpp(n_factors=params['n_factors'],
                  n_epochs=params['n_epochs'],
                  lr_all=params['lr_all'],
                  reg_all=params['reg_all'])
    svdpp.fit(train_set)

    predictions = svdpp.test(test_set)
    rmse = accuracy.rmse(predictions, verbose=False)

    return predictions, rmse
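
Hard-coded parameter dicts like the one above are typically found with a grid search; a minimal sketch using Surprise's GridSearchCV, with illustrative ranges and assuming `data` is an already-loaded surprise Dataset:

from surprise import SVDpp
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [3, 10], 'n_epochs': [20, 50],
              'lr_all': [0.005, 0.01], 'reg_all': [0.02, 0.1]}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)
gs.fit(data)
print(gs.best_score['rmse'], gs.best_params['rmse'])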
Example #15
def trainSVD_surprise(
        training_data,
        colorlabels,
        plot=True,
        simplify=False,
        savefig="figures/"):
    # algo = SVD(n_factors=4, n_epochs=1000, biased=True)
    # algo = SVD(n_factors=20, n_epochs=500, biased=False)
    algo = SVDpp(n_factors=3, n_epochs=1000)
    algo.fit(training_data)
    U = algo.pu
    if plot:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(1, 1, 1)
        ax.set_xlabel('First', fontsize=15)
        ax.set_ylabel('Second', fontsize=15)
        ax.set_title('Reduced SVD', fontsize=20)
        scatter = ax.scatter(
            U[:, 0], U[:, 1], c=colorlabels, s=10, alpha=0.7
        )  #explore labeling colors with features like demographics, age
        ax.grid()
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label("state")
        if savefig:
            plt.savefig(savefig + "svd_counties")
        plt.show()

    if simplify:
        U = U.transpose()
        A = np.linalg.svd(U)[0]
        U_proj = np.dot(A[:, :2].transpose(), U)
        # Rescale dimensions
        U_proj /= U_proj.std(axis=1).reshape(2, 1)
        if plot:
            fig = plt.figure(figsize=(8, 8))
            ax = fig.add_subplot(1, 1, 1)
            ax.set_xlabel('First', fontsize=15)
            ax.set_ylabel('Second', fontsize=15)
            ax.set_title('Reduced SVD', fontsize=20)
            scatter = ax.scatter(U_proj[0], U_proj[1], c=colorlabels, s=10)
            ax.grid()
            cbar = fig.colorbar(scatter, ax=ax)
            cbar.set_label("state")
            if savefig:
                plt.savefig(savefig + "svd_counties_simplfied")
            plt.show()
        return U_proj

    return U
Example #16
def predict():
    global top_n
    global user_id
    print("--predict start--------------------------------")

    # dataset import
    rating_data = pd.DataFrame(get_default_ratings())

    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(df=rating_data, reader=reader)

    trainset_2, testset_2 = train_test_split(data, test_size=0.3)

    algo = SVDpp()
    predictions = algo.fit(trainset_2).test(testset_2)

    top_n = get_top_n(predictions, n=10)

    print("--predict end--------------------------------")
def fit_model(mlr_df):
    algo = SVDpp()
    # Object to parse the data
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(mlr_df[['userId', 'id', 'rating']], reader)
    trainset = data.build_full_trainset()
    PREDICTOR = algo.fit(trainset)
    return PREDICTOR
Example #18
    def run_colab_filter(self):
        # A reader is still needed but only the rating_scale param is required.
        reader = Reader(rating_scale=(1, 4))

        # The columns must correspond to user id, item id and ratings (in that order).
        data = Dataset.load_from_df(
            self.df20[['user_id', 'route_id', 'rating']], reader)

        # Retrieve the trainset.
        trainset = data.build_full_trainset()

        # Then predict ratings for all pairs (u, i) that are NOT in the training set.
        testset = trainset.build_anti_testset()

        algo_tuned = SVDpp(n_factors=20)
        algo_tuned.fit(trainset)

        iid = self.df20['route_id'].unique()
        #user_id = 200128311 #mine, trad, alpine, intermediate
        #user_id = 110596403 #boulder-er
        #user_id = 200272475 #boulder-er, advanced
        #user_id = 200077815 #michaels, trad, alpine, intermediate
        user_id = 106540415  #mixed climber, alpine climber, advanced
        # routes the user has already rated
        iid_me = self.df20.loc[self.df20['user_id'] == user_id, 'route_id']
        iids_to_pred = np.setdiff1d(iid, iid_me)

        testset = [[user_id, iid, 2] for iid in iids_to_pred]
        predictions_tuned = algo_tuned.test(testset)

        dump.dump(file_name='SVD_tuned.p',
                  predictions=predictions_tuned,
                  algo=algo_tuned)

        pred_ratings_tuned = np.array([pred.est for pred in predictions_tuned])

        i_max = np.argpartition(pred_ratings_tuned, -20)[-20:]
        i_max = i_max[np.argsort(-pred_ratings_tuned[i_max])]

        iid = iids_to_pred[i_max]

        #top 20 recommended climbs
        self.df_top_climbs_mf = pd.DataFrame(iid, pred_ratings_tuned[i_max])
        self.df_top_climbs_mf = self.df_top_climbs_mf.reset_index()

        self.df_top_climbs_mf.columns = ['predicted rating', 'route id']
Example #19
def SVD_pp():
    algo = SVDpp()

    # Define the k-fold cross-validation iterator, k=3
    kf = KFold(n_splits=3)
    for trainset, testset in kf.split(data):
        # Train and predict
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Compute RMSE
        accuracy.rmse(predictions, verbose=True)  # verbose prints the result for the current fold; default False

    uid = str(196)
    iid = str(302)
    # Print the predicted rating of uid for iid
    pred = algo.predict(uid, iid, r_ui=4, verbose=True)

    time2 = time.time()
    print(time2 - time1)
Example #20
def train_collaborative_explicit(user_id):
    try:
        db = getDb()
        if (db):
            print("### Start training collaborative explicit")
            collaboratives = db.collaboratives
            data = pd.DataFrame(
                list(
                    collaboratives.find({
                        "customer": ObjectId(user_id),
                        "explicit": True
                    })))
            data = data[['userId', 'itemId', 'feedBack']]
            data = data.rename(columns={
                'userId': 'user',
                'itemId': 'item',
                'feedBack': 'rating'
            })
            lower_rating = data['rating'].min()
            upper_rating = data['rating'].max()

            reader = Reader(rating_scale=(lower_rating, upper_rating))

            data = Dataset.load_from_df(data[["user", "item", "rating"]],
                                        reader)
            svdpp = SVDpp(verbose=True, n_epochs=5)
            svdpp.fit(data.build_full_trainset())
            file_name = os.path.expanduser('models/' + user_id +
                                           '_collaborative_explicit')
            dump.dump(file_name, algo=svdpp)

            # pickle.dump(matrix, open(DUMPED_MODEL + user_id + "_content.pickle", "wb"))
            print("### Training collaborative explicit complete")
            channel.basic_publish(
                '', STATUS_QUEUE,
                'complete|' + user_id + '|collaborative_explicit')
            print(" [x] Sent to {0}: complete_{1}".format(
                STATUS_QUEUE, user_id))
        else:
            raise Exception("Database not found")
    except Exception as exc:
        raise Exception("Unable to train collaborative explicit model") from exc
Example #21
def surprise_SVDpp(train_file, test_file):
    """
    Svd++ with Surprise library.
    Compute the predictions on a test_set after training on a train_set using the method Svd++  from Surprise.
    Args:
        train_file (string): path to created test file
        test_file (string): path to created train file
    Hyperparameters:
        n_factors : The number of factors.
        n_epochs : The number of iteration of the SGD procedure
        lr_'x': The learning rate for 'x'
        reg_'x' : The regularization term for 'x'
    'x':
        bi : The item biases
        bu : The user biases
        qi : The item factors
        yj : The (implicit) item factors
        pu : The user factors


    Returns:
        numpy array: predictions
    """
    print("SVDpp")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    # Algorithm

    algo = SVDpp(n_epochs=40, n_factors=100, lr_all=0.01, reg_all=0.01)
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        pred[i] = val
    return pred
Example #22
    def evaluate_on_test(self, train_set, test_set):
        """
        Evaluate the algorithm on the test set after running it on the test set
        :param train_set:
        :param test_set:
        :return: RMSE value on test set
        """
        if train_set is not None and test_set is not None:
            print("Evaluate RMSE on test data")
            self.LOG_HANDLE.info("Evaluate RMSE on test data")

            # http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp
            algo = SVDpp()

            # Train the algorithm on the trainset, and predict ratings for the testset
            algo.fit(train_set)
            predictions = algo.test(test_set)

            # Then compute RMSE
            return accuracy.rmse(predictions)
Example #23
    def train_best_model_generate_ratings_test(self, ratings_set, test_set):
        """
        Train the best model (with minimum AMSE) on the complete ratings set and then compute the ratings for the test set
        :param ratings_set: The complete ratings data set
        :param test_set: The streams for the users for which ratings are not yet available
        :return: A data frame of the form user, stream, predicted rating
        """
        if ratings_set and test_set:
            print(
                "Training the best model and generating the ratings for the test data set"
            )
            self.LOG_HANDLE.info(
                "Training the best model and generating the ratings for the test data set"
            )

            algo = SVDpp(**model_params.svdpp_best_params)
            algo.fit(ratings_set)

            predictions = algo.test(test_set)
            return predictions
Example #24
def svdpp(train, test, ids, Xtest, Xids):
    """
    Extension of svd taking the implicit ratings into account
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('SVD++')
    algo = SVDpp(n_factors=100,
                 n_epochs=10,
                 lr_all=0.0015,
                 reg_all=0.05,
                 random_state=15)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
Example #25
class SurpriseRecommender(Recommender):
    name = 'surprise-svdpp'

    def train(self, data):
        ratings_dict = {'itemID': data[:,1],
                        'userID': data[:,0],
                        'rating': data[:,2]}
        df = pd.DataFrame(ratings_dict)

        reader = Reader(rating_scale=(0, 1))

        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader).build_full_trainset()
        # self.algo = KNNBasic(verbose=False)
        self.algo = SVDpp(verbose=True)
        self.algo.fit(data)

    def rate(self, user, movie):
        return self.algo.test([[user, movie, 0]])[0].est

    def rate_bool(self, user, movie):
        return self.rate(user, movie) > 0.5
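
A hypothetical usage sketch, assuming the Recommender base class needs no constructor arguments and `ratings` holds integer user, item, rating columns on the 0-1 scale:

import numpy as np

ratings = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 1], [2, 1, 1]])
rec = SurpriseRecommender()
rec.train(ratings)
print(rec.rate(1, 1), rec.rate_bool(1, 1))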
Example #26
def SVDPPThreadFuc(Frame):
    OutUserList = []
    OutFundList = []
    Frame = Frame.loc[:, 'User':'******']
    UserList = Frame.User.unique()
    FundList = Frame.FundCode.unique()
    reader = Reader(rating_scale=(0, 2))
    data = Dataset.load_from_df(Frame, reader=reader).build_full_trainset()
    model = SVDpp()
    model.fit(data)
    for User in UserList:
        UserPointList = []
        for Fund in FundList:
            UserPointList.append(model.predict(User, Fund).est)
        RecommendList = np.argsort(UserPointList)[::-1][0:10]
        for FundIndex in RecommendList:
            OutUserList.append(User)
            OutFundList.append(FundList[FundIndex])
    OutFrame = pd.DataFrame({
        "User": OutUserList,
        "RecommendFundCode": OutFundList
    })
    return OutFrame
Example #27
def main():
    print('Reading in and chopping data...')
    df = pd.read_csv(rus_master_loc)
    df_chopped = ah.rus_chop(df, 5, 10)
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    data = Dataset.load_from_df(df_chopped[['user', 'route', 'num_stars']],
                                reader=reader)

    print('Training on full data...')
    trainset = data.build_full_trainset()
    algo = SVDpp(n_epochs=100, reg_all=.1, lr_all=.006)
    algo.fit(trainset)

    print('Writing relevant route_ids...')
    algo_rids = pd.DataFrame(df_chopped.groupby('route').count().index)
    algo_rids.to_csv(algo_rids_write_loc, index=None)

    print('Pickling algo...')
    pickle.dump(algo, open(pickle_write_loc, 'wb'))

    print('Done, pickled algo is here: {}'.format(pickle_write_loc))
    print(
        'Route IDs contained in algo are here: {}'.format(algo_rids_write_loc))
Example #28
def svd_model(df):
    """ Apply SVD.
    """
    df = pd.melt(df,
                 id_vars='smiles',
                 value_vars=list(df.columns[1:]),
                 var_name='Target',
                 value_name='TargetValue')

    mark = df.TargetValue.isna()
    unknown = df.loc[mark]
    known = df.loc[~mark]

    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(known[['smiles', 'Target', 'TargetValue']],
                                reader)

    kf = KFold(n_splits=3, random_state=57)

    algo = SVDpp(n_factors=12, reg_all=0.003, lr_all=0.006, random_state=132)

    for trainset, testset in kf.split(data):

        algo.fit(trainset)
        predictions = algo.test(testset)

        rmse = round(accuracy.rmse(predictions, verbose=True), 3)

        print('RMSE of SVD model for cross validation: ' + str(rmse))

    result = unknown.copy()
    result['ToxicProb'] = result.apply(
        lambda x: algo.predict(x.smiles, x.Target).est, axis=1)
    result = result.drop(columns='TargetValue')

    return result
Example #29
class RecommenderSVDppSimilarUsers(Recommender):
    """ 
        Instead of building new dataset when the new user is in, we get similar users,
        and based on that try to get similar movies
    """
    def __init__(self, movies):
        super(RecommenderSVDppSimilarUsers, self).__init__(movies)
        self.algorithm = SVDpp()

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20, k_inner_item=10):
        # get dataset
        full_dataset = self.algorithm.trainset

        # watched movies
        watched = {
            full_dataset.to_inner_iid(key): value
            for key, value in watched.items()
        }

        # get similar users
        similar_users = self.get_similar_user_ids(watched, k=k_inner_item)

        # Calculate for all similar user, predictions
        candidates = defaultdict(float)
        for inner_move_id in range(0, full_dataset.n_items):
            if inner_move_id not in watched:
                movie_id = full_dataset.to_raw_iid(inner_move_id)
                for inner_user_id, similarity in similar_users.items():
                    prediction = self.algorithm.predict(
                        full_dataset.to_raw_uid(inner_user_id), movie_id)
                    candidates[movie_id] += similarity * prediction.est

        # heapq.nlargest(k, candidates.items(), key=itemgetter(1))
        return self.movies.get_movie_by_movie_ids(
            heapq.nlargest(k, candidates, key=candidates.get))
Example #30
 def SVDpp(self, n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.02):
     """
     An extension of Singular Value Decomposition algorithm that takes
     implicit ratings into account.
     Args:
         n_factors: Number of latent features, factors
         n_epochs: Number of iterations of the optimization loop
         lr_all: The learning rate for all parameters
         reg_all: The regularization term for all parameters
     Returns:
         predictions_df: The predictions of the model on the test data in
             Pandas Data Frame format
     """
     algorithm = SVDpp(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, 
                       reg_all=reg_all)
     predictions = algorithm.fit(self.train_data).test(self.test_data)
     predictions_df = self.data.test_df.copy()
     predictions_df['Rating'] = [x.est for x in predictions]
     if self.test_purpose: 
         self.evalueate_model(predictions_df['Rating'], 'Surprise SVDpp')
     return predictions_df