def user_factorization(data_raw, user_clusters, params):
    n_factors = params["LOCAL_U_NMF_K"]
    user_df = pd.DataFrame()
    for i in range(user_clusters):
        u_i = data_raw[data_raw["user cluster"] == i]
        reader = surprise.Reader(rating_scale=(1, 5))
        dataset = surprise.Dataset.load_from_df(
            u_i[["User", "Movie", "Prediction"]], reader)
        trainset = dataset.build_full_trainset()
        algo = NMF(n_factors=n_factors,
                   n_epochs=params["LOCAL_U_NMF_EPOCHS"],
                   verbose=True)
        algo.fit(trainset)
        testset = trainset.build_testset()
        preds = algo.test(testset)
        predictions_train = pd.DataFrame(preds)
        testset = trainset.build_anti_testset()
        preds = algo.test(testset)
        predictions_rest = pd.DataFrame(preds)
        user_df = pd.concat([user_df, predictions_train, predictions_rest],
                            ignore_index=False,
                            copy=False)
    all_u_m = get_all_u_m()
    user_df = all_u_m.merge(user_df, how="left", on=["uid", "iid"])
    user_df = user_df[["uid", "iid", "est"]]
    logging.info("return from user_factorization")
    return user_df
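get_all_u_m() is referenced above but not defined in this snippet; a minimal sketch of what it plausibly provides, assuming the goal is one row per (user, movie) pair keyed by the surprise prediction column names uid and iid (the helper body and the id ranges below are assumptions):

import itertools
import pandas as pd

def get_all_u_m(n_users=10000, n_movies=1000):
    # Hypothetical: the full Cartesian grid of user and movie ids, so the
    # left-merge above yields an estimate slot for every pair.
    pairs = itertools.product(range(1, n_users + 1), range(1, n_movies + 1))
    return pd.DataFrame(list(pairs), columns=["uid", "iid"])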
 def build_trainset(self):
     '''
     Build the trainset from ratings_df to be used by the <surprise.prediction_algorithms.algo_base.AlgoBase>.fit()
     '''
     reader = surprise.Reader(rating_scale=(1, 10))
     data = surprise.Dataset.load_from_df(self.ratings_df[['user_id', 'isbn', 'rating']], reader)
     self.trainset = data.build_full_trainset()
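A brief usage sketch for the method above; the owning class name (here RatingsModel) and the choice of algorithm are assumptions, not part of the snippet:

import surprise

model = RatingsModel()        # hypothetical class holding ratings_df
model.build_trainset()
algo = surprise.SVD()         # any AlgoBase subclass works here
algo.fit(model.trainset)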
def item_factorization(data_raw, item_clusters, user_df, params):
    n_factors = params["LOCAL_I_NMF_K"]
    item_df = pd.DataFrame()
    for i in range(item_clusters):
        i_i = data_raw[data_raw["item cluster"] == i]
        reader = surprise.Reader(rating_scale=(1, 5))
        dataset = surprise.Dataset.load_from_df(
            i_i[["User", "Movie", "Prediction"]], reader)
        trainset = dataset.build_full_trainset()
        algo = NMF(n_factors=n_factors,
                   n_epochs=params["LOCAL_I_NMF_EPOCHS"],
                   verbose=True)
        algo.fit(trainset)
        testset = trainset.build_testset()
        preds = algo.test(testset)
        predictions_train = pd.DataFrame(preds)
        testset = trainset.build_anti_testset()
        preds = algo.test(testset)
        predictions_rest = pd.DataFrame(preds)
        item_df = pd.concat([item_df, predictions_train, predictions_rest],
                            ignore_index=False,
                            copy=False)
    item_df = user_df[["uid", "iid"]].merge(item_df,
                                            how="left",
                                            on=["uid", "iid"])
    item_df["est"].loc[item_df["est"].isnull()] = 0
    logging.info("return from item_factorization")
    return item_df
Example #4
    def get_surprise_format_data(self):
        # Write the dataset to a temp file; collect lines in a list to
        # avoid quadratic string concatenation.
        delim = ";"
        lines = []
        for rating_tuple in tqdm(self.data):
            lines.append(str(int(rating_tuple[0])) + delim +
                         str(int(rating_tuple[1])) + delim +
                         str(rating_tuple[2]) + '\n')

        with open("temp_file.txt", "w") as f:
            f.write("".join(lines))

        rating_scale = (1, 5)
        if self.hyper_params['dataset'] == 'ratebeer': rating_scale = (1, 20)

        reader = surprise.Reader(line_format="user item rating",
                                 sep=delim,
                                 rating_scale=rating_scale)
        dataset = surprise.Dataset.load_from_file(file_path="temp_file.txt",
                                                  reader=reader)
        trainset = dataset.build_full_trainset()

        # Delete temp file
        os.remove("temp_file.txt")

        return trainset
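The temp-file round trip above can be avoided; a sketch of the same trainset built in memory with Dataset.load_from_df, assuming self.data holds (user, item, rating) tuples as the loop implies:

import pandas as pd
import surprise

def get_surprise_format_data_df(self):
    # Same trainset, no temp file.
    df = pd.DataFrame(
        [(int(u), int(i), r) for u, i, r in self.data],
        columns=["user", "item", "rating"])
    rating_scale = (1, 20) if self.hyper_params['dataset'] == 'ratebeer' else (1, 5)
    reader = surprise.Reader(rating_scale=rating_scale)
    return surprise.Dataset.load_from_df(df, reader).build_full_trainset()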
def test_predict(rating_true):
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()
    ).build_full_trainset()
    svd.fit(train_set)

    preds = predict(svd, rating_true)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    assert preds["userID"].dtypes == rating_true["userID"].dtypes
    assert preds["itemID"].dtypes == rating_true["itemID"].dtypes
    user = rating_true.iloc[0]["userID"]
    item = rating_true.iloc[0]["itemID"]
    assert preds[(preds["userID"] == user) & (preds["itemID"] == item)][
        "prediction"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)

    preds = predict(
        svd,
        rating_true.rename(columns={"userID": "uid", "itemID": "iid"}),
        usercol="uid",
        itemcol="iid",
        predcol="pred",
    )
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes
    user = rating_true.iloc[1]["userID"]
    item = rating_true.iloc[1]["itemID"]
    assert preds[(preds["uid"] == user) & (preds["iid"] == item)][
        "pred"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)
Example #6
def get_sim_matrix(load_sim_matrix):
    if load_sim_matrix is False:
        raw = pd.read_csv('books_metadata/ratings.csv')
        raw = raw[raw['book_id'] <= 1900]
        raw.drop_duplicates(inplace=True)
        print('we have', str(raw.shape[0]), 'ratings')
        print('the number of unique users we have is:',
              len(raw.user_id.unique()))
        print('the number of unique books we have is:',
              len(raw.book_id.unique()))

        rawTrain = raw[['user_id', 'book_id', 'rating']]
        rawTrain, rawHoldout = train_test_split(rawTrain, test_size=0.25)
        reader = surprise.Reader(rating_scale=(1, 5))
        data = surprise.Dataset.load_from_df(rawTrain, reader)

        sim_options = {'name': 'cosine', 'user_based': False}
        collabKNN = surprise.KNNWithMeans(k=100, sim_options=sim_options)
        kSplit = surprise.model_selection.split.KFold(n_splits=2,
                                                      shuffle=False)
        for trainset, testset in kSplit.split(data):
            collabKNN.fit(trainset)
            predictionsKNN = collabKNN.test(testset)
            surprise.accuracy.rmse(predictionsKNN, verbose=True)

        sim_matrix = collabKNN.compute_similarities()
        with open('books_sim_matrix', 'wb') as output:
            pickle.dump(sim_matrix, output, protocol=pickle.HIGHEST_PROTOCOL)
        return sim_matrix

    with open('books_sim_matrix', 'rb') as infile:  # don't shadow the builtin input
        sim_matrix = pickle.load(infile)
        return sim_matrix
Example #7
def learn(id):
    print(id)
    dataset = getData()
    # Build the dataset.
    df = pd.DataFrame(dataset)
    # Reader converts the raw data into the form surprise expects;
    # rating_scale=(min, max) gives the rating bounds.
    reader = sp.Reader(rating_scale=(0.0, 5))
    # Column names in the dictionary: load_from_df treats the first as
    # the user id, the second as the item id, and the third as the
    # rating, in that order, when building the dataset.
    col_list = ['user_id', 'wine_id', 'points']

    data = sp.Dataset.load_from_df(df[col_list], reader)

    # Model to train
    model = sp.KNNBasic(sim_options={'name': 'pearson'})
    # Train it.
    trainset = data.build_full_trainset()
    model.fit(trainset)
    # Note: get_neighbors expects an inner id; raw and inner ids are assumed to match here.
    result = model.get_neighbors(id, k=5)
    print(result)
    rec_list = list()
    for r in result:
        rec_list.append(str(dataset['wine_id'][r]))
        print(dataset['wine_id'][r])
    winelist = ','.join(rec_list)
    return winelist
Example #8
def makeCustomDataFile(data_file):

    # combined_data_1.txt stores 4499 movieIDs.
    custom_data_file = open("/Users/limjungmin/Netflix_Recommender/u.data", 'w')

    # cnt = 0 : debug counter
    for line in data_file:

        if ":" in line:
            movieID = line.split(":")[0]
            #print(movieID)
            #cnt+=1
        else :

            info = line.split(",")

            userID = info[0]
            rating = info[1]
            date = info[2].split('\n')[0]

            # don't shadow the builtin str
            row = userID + ";" + movieID + ";" + rating + "\r\n"
            custom_data_file.write(row)

            #if cnt > 50 : break

    print("make Custom Data File Done")

    reader = surprise.Reader(line_format='user item rating', sep=';')
    data = surprise.Dataset.load_from_file('/Users/limjungmin/Netflix_Recommender/u.data', reader=reader)
    df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])
    del df["id"]

    print(df.head(10))

    return data
Example #9
    def fit(self, X, y):
        """Fit the surprise model to data.
        
        Parameters
        ----------
        X : pandas DataFrame
            Table containing user IDs and item IDs.
        y : pandas Series
            Scores.
            
        Returns
        -------
        self
            The fit estimator.
        """

        # Create a pandas DataFrame in surprise format
        X_train = X[[self._user_col, self._item_col]].copy()
        X_train['score'] = y

        # Compute the score range
        if self._score_range[0] is None:
            self._score_range[0] = y.min()
        if self._score_range[1] is None:
            self._score_range[1] = y.max()

        # Create a surprise Dataset from the dataframe
        reader = surprise.Reader(rating_scale=self._score_range)
        dataset = surprise.Dataset.load_from_df(X_train, reader)
        dataset = dataset.build_full_trainset()

        # Fit the model
        self._model = self._surprise_model(**self._kwargs)
        self._model.fit(dataset)
        return self
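Illustrative usage of the wrapper; the class name SurpriseEstimator and its constructor signature are assumptions, not part of the snippet:

import pandas as pd
import surprise

X = pd.DataFrame({'user': [1, 1, 2], 'item': [10, 20, 10]})
y = pd.Series([4.0, 3.5, 5.0])

est = SurpriseEstimator(surprise.SVD)   # hypothetical constructor
est.fit(X, y)                           # builds the trainset and fits
y_pred = est.predict(X)                 # see the predict() method in Example #19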
Example #10
    def __init__(self, df_rating, test_ratio=None, df_id_name_table=None, rating_scale=(1, 5)):
        """
        Initialize collaborative filtering data class
        :param df_rating: pandas dataframe containing columns: 'userID', 'itemID', 'rating' in correct order
        :param df_id_name_table: table to convert itemID to readable item name (like movie title). 
                                 dataframe containg columns: 'itemID' and 'itemName' in correct order
        :return: None
        Eg: 
            df_id_name_table = df_movie[['movieId', 'title']].\
                rename(index=str, columns={'movieID':'itemID', 'title':'itemName'})
            cfdata_example = CFData(data, df_id_name_table)
            cfdata_example.convert_name_to_id('Toy Story (1995)')
            cfdata_example.convert_id_to_name(1)
        """      
        reader = surp.Reader(rating_scale=rating_scale)
        rating_data = surp.Dataset.load_from_df(df_rating, reader)

        if test_ratio is not None:
            self.trainset, self.testset = surp.model_selection.train_test_split(data=rating_data, test_size=test_ratio)
        else:
            self.trainset = rating_data.build_full_trainset()

        # self.__dict_id_to_name: id_1: [name1_1, name1_2...], id_2: [name2_1, name2_2....]
        # self.__dict_name_to_id: name1: [id1_1, id1_2...], name2: [id2_1, id2_2...]
        if df_id_name_table is not None:
            self.__dict_id_to_name = df_id_name_table.groupby('itemID')['itemName'].apply(lambda x: x.tolist()).to_dict()
            self.__dict_name_to_id = df_id_name_table.groupby('itemName')['itemID'].apply(lambda x: x.tolist()).to_dict()
Example #11
def test_compute_rating_predictions(python_data):
    rating_true, _, _ = python_data(binary_rating=False)
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()).build_full_trainset()
    svd.fit(train_set)

    preds = compute_rating_predictions(svd, rating_true)
    assert set(preds.columns) == {'userID', 'itemID', 'prediction'}
    assert preds['userID'].dtypes == rating_true['userID'].dtypes
    assert preds['itemID'].dtypes == rating_true['itemID'].dtypes
    user = rating_true.iloc[0]['userID']
    item = rating_true.iloc[0]['itemID']
    assert preds[(preds['userID'] == user) & (preds['itemID'] == item)]['prediction'].values == \
           pytest.approx(svd.predict(user, item).est, rel=TOL)

    preds = compute_rating_predictions(svd,
                                       rating_true.rename(columns={
                                           'userID': 'uid',
                                           'itemID': 'iid'
                                       }),
                                       usercol='uid',
                                       itemcol='iid',
                                       predcol='pred')
    assert set(preds.columns) == {'uid', 'iid', 'pred'}
    assert preds['uid'].dtypes == rating_true['userID'].dtypes
    assert preds['iid'].dtypes == rating_true['itemID'].dtypes
    user = rating_true.iloc[1]['userID']
    item = rating_true.iloc[1]['itemID']
    assert preds[(preds['uid'] == user) & (preds['iid'] == item)]['pred'].values == \
           pytest.approx(svd.predict(user, item).est, rel=TOL)
Example #12
    def tune(self, train_class, **options):
        """Discover the optimal values for the parameters to tune, using the indicated surprise library train class.
        Uses the surprise library grid search cross-validate (surprise.model_selection.GridSearchCV) class.

        :param train_class: An instance of the surprise.prediction_algorithms.algo_base.AlgoBase class.
        :param options: Any additional options (key-word arguments) to be passed to the grid search algorithm.
        :return:
        """
        self._print_params()

        df = helpers.read_to_df(self.input_path)
        ratings = surprise.Dataset.load_from_df(df, surprise.Reader())

        grid_search = surprise.model_selection.search.GridSearchCV(
            algo_class=train_class,
            param_grid={
                **self.params,
                **options
            },
            measures=['rmse'],
            n_jobs=-1,  # enable parallel execution
        )
        grid_search.fit(ratings)

        print('RMSE:', grid_search.best_score['rmse'])
        print('Results:')
        for param, val in grid_search.best_params['rmse'].items():
            print('  - {param}: {val}'.format(param=param, val=val))
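For reference, a minimal stand-alone sketch of the surprise GridSearchCV pattern this method wraps (the grid values are illustrative):

import surprise
from surprise.model_selection import GridSearchCV

# ratings: a surprise.Dataset, e.g. from Dataset.load_from_df(df, Reader())
gs = GridSearchCV(
    surprise.SVD,
    param_grid={'n_factors': [50, 100], 'lr_all': [0.002, 0.005]},
    measures=['rmse'],
    n_jobs=-1,
)
gs.fit(ratings)
print('RMSE:', gs.best_score['rmse'])
print(gs.best_params['rmse'])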
def train(ratings, k_neighbors, k_folds):
    """
    Train a model and return it. Then we can use the model and evaluate it elsewhere
    @param ratings dataframe pandas dataframe to train on, with columns UserId, MovieId, Ratings
    @param k_neighbors number of neighbors to examine
    @param k_folds number of folds for cross validation
    @returns tuple of (algo, testset)
    We can call methods such as `test` on the returned algo object
    """

    train_data, test_data = cv.train_test_split(ratings, test_size=0.20)
    reader = sp.Reader(rating_scale=(1, 5))

    train_dataset = sp.Dataset.load_from_df(train_data, reader)
    test_dataset = sp.Dataset.load_from_df(test_data, reader)

    similarity_options = {'name': 'pearson', 'user_based': False}
    algo = sp.KNNWithMeans(sim_options=similarity_options,
                           k=k_neighbors,
                           min_k=5)

    # Dataset.split()/folds() and algo.train() were removed from surprise;
    # use model_selection.KFold and fit() instead.
    kf = sp.model_selection.KFold(n_splits=k_folds)
    for _trainset, _ in kf.split(train_dataset):
        algo.fit(_trainset)

    testset = test_dataset.build_full_trainset().build_testset()
    return (algo, testset)
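A hedged usage sketch, assuming ratings is the three-column DataFrame the docstring describes:

import surprise as sp

algo, testset = train(ratings, k_neighbors=40, k_folds=3)
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)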
def step5_surprise():
    name_list, movie_list, rating_dic = data_to_dic()
    # print(rating_dic)
    # Build the dataset.
    df = pd.DataFrame(rating_dic)
    # rating_scale: the range of ratings present in the data
    reader = surprise.Reader(rating_scale=(0.0, 5.0))
    
    # Names of the lists held in the dictionary
    col_list = ['user_id', 'item_id', 'rating']
    data = surprise.Dataset.load_from_df(df[col_list], reader)

    trainset = data.build_full_trainset()

    # Train.
    # Choose the similarity measure
    # option1 = {'name': 'msd'}
    # option2 = {'name': 'cosine'}
    option3 = {'name': 'pearson'}

    # Create the object that will build the recommendation list
    algo = surprise.KNNBasic(sim_options=option3)
    algo.fit(trainset)

    # Get movie recommendations for the user '소이현'.
    index = name_list.index('소이현')
    result = algo.get_neighbors(index, k=3)

    for r1 in result:
        # r1 starts at 1, so subtract 1.
        print(movie_list[r1-1])
def train_model(M, N, K, eta, reg, Y, eps=0.0001, max_epochs=300):
    """
    Given a training data matrix Y containing rows (i, j, Y_ij)
    where Y_ij is user i's rating on movie j, learns an
    M x K matrix U and N x K matrix V such that rating Y_ij is approximated
    by (UV^T)_ij.

    Uses a learning rate of <eta> and regularization of <reg>. Stops after
    <max_epochs> epochs, or once the magnitude of the decrease in regularized
    MSE between epochs is smaller than a fraction <eps> of the decrease in
    MSE after the first epoch.

    Returns a tuple (U, V, err) consisting of U, V, and the unregularized MSE
    of the model.
    """
    df = pd.DataFrame(Y)
    df = df.sort_values(1)
    model = surprise.SVD()
    reader = surprise.Reader(rating_scale=(1, 5))
    data = surprise.Dataset.load_from_df(df[[0, 1, 2]], reader)

    trainset = data.build_full_trainset()

    model.fit(trainset)

    return (model, get_err(model, Y), trainset)
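The factor matrices named in the original docstring live on the fitted model as pu and qi; a sketch reconstructing the dense estimate matrix, noting that surprise's SVD is biased by default, so the global mean and bias terms must be added back:

import numpy as np

model, err, trainset = train_model(M, N, K, eta, reg, Y)
U, V = model.pu, model.qi         # shapes (n_users, K'), (n_items, K'), inner-id order
est = (trainset.global_mean
       + model.bu[:, None]        # user biases
       + model.bi[None, :]        # item biases
       + U @ V.T)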
Example #16
 def __init__(self):
     module_dir = inspect.getfile(inspect.currentframe())
     self.base_dir = "/".join(module_dir.split('/')[:-2])
     vote2_dir = self.base_dir + '/data/votes2'
     df = pd.read_csv(vote2_dir,
                      sep=' ',
                      names=['VN_id', 'user_id', 'vote', 'date'])
     self.user_ser = (df.groupby('user_id').count()[['VN_id']]['VN_id'] >
                      15)
     self.ser = (df.groupby('VN_id').count()[['user_id']]['user_id'] > 20)
     # grab users with >15 votes and games with >20 votes
     item_mask = df['VN_id'].isin(self.ser.keys()[self.ser])
     user_mask = df['user_id'].isin(self.user_ser.keys()[self.user_ser])
     self.high_user_votes_df = df[item_mask & user_mask]
     # the problem with this is that it ends up taking around 50MB of memory, which isn't too much but it's not sustainable for multiple instances.
     self.ccmodel = surprise.prediction_algorithms.co_clustering.CoClustering(
         n_cltr_u=3, n_cltr_i=5, n_epochs=5, verbose=True)
     # coclustering was chosen because of the tradeoffs of speed and useful predictions it could bring.
     # the clusters above were determined through cross-validation
     self.reader = surprise.Reader(rating_scale=(10, 100))
     #change this in future, deprecated
     data = surprise.dataset.Dataset.load_from_df(
         self.high_user_votes_df[['user_id', 'VN_id', 'vote']], self.reader)
     self.trainset = data.build_full_trainset()
     self.vnc = VndbConnection()
Example #17
def get_matrix_factorization(ratings, meta_data, n_user, n_movies):
    # Matrix factorization
    algo = surprise.SVD(n_factors=50, biased=False)
    reader = surprise.Reader(rating_scale=(0.5, 5))
    surprise_data = surprise.Dataset.load_from_df(
        ratings[["userId", "movieId", "rating"]],
        reader).build_full_trainset()
    algo.fit(surprise_data)

    pred = algo.test(surprise_data.build_testset())
    print("MSE: ", surprise.accuracy.mse(pred))
    print("RMSE: ", surprise.accuracy.rmse(pred))

    # With biased=False, the estimate is just pu . qi^T (no bias terms to add).
    ranking_matrix = np.dot(algo.pu, algo.qi.T)
    # ranking_matrix = np.clip(ranking_matrix, 0.5, 5)

    # movie_idx_to_id = [surprise_data.to_raw_iid(x) for x in movies_to_pick]
    movie_idx_to_id = [surprise_data.to_raw_iid(x) for x in range(n_movies)]
    features_matrix_factorization = algo.pu
    print("Means: ", np.mean(features_matrix_factorization),
          np.mean(algo.qi.T))
    print("Feature STD:", np.std(features_matrix_factorization),
          np.std(algo.qi))
    print("Full Matrix Shape", np.shape(ranking_matrix), "rankinG_shape",
          np.shape(ranking_matrix))

    return ranking_matrix, features_matrix_factorization, movie_idx_to_id
Example #18
def getUserBaseData(user, addr, raw, count):
    # when importing from a DF, you only need to specify the scale of the ratings.
    reader = surprise.Reader(rating_scale=(1, 4))
    #into surprise:
    dataframe = surprise.Dataset.load_from_df(raw, reader)
    trainset = dataframe.build_full_trainset()

    algo = surprise.SVD()
    algo.fit(trainset)

    iids = raw['around'].unique()
    iids_visited = raw.loc[raw['user'] == user, 'around']
    iids_to_pred = np.setdiff1d(iids, iids_visited)  # stores not yet visited (set difference)
    # Build a testset from the stores this user has not visited
    testset = [[user, iid, 4.] for iid in iids_to_pred]
    predictions = algo.test(testset)
    # print(surprise.accuracy.rmse(predictions))
    pred_ratings = np.array([pred.est for pred in predictions])
    # print(len(pred_ratings))
    if len(pred_ratings) < count:
        i_max = pred_ratings.argsort()[::-1]
    else:
        i_max = pred_ratings.argsort()[::-1][:count]
    #i_max = pred_ratings.argmax()
    iid = iids_to_pred[i_max]
    results = {}
    results_ids = []
    for i, m in zip(iid, i_max):
        # print('{0} : {1}'.format(i,pred_ratings[m]))
        results[i] = pred_ratings[m]
        results_ids.append(i)
    print(results_ids)
    return results_ids
Example #19
    def predict(self, X, y=None):
        """Predict the scores using the surprise model.
        
        Parameters
        ----------
        X : pandas DataFrame
            Table containing user IDs and item IDs.
            
        Returns
        -------
        y_pred : pandas Series
            Predicted scores.        
        """

        # Check if model has been fit
        if self._model is None:
            raise RuntimeError('model has not been fit')

        # Create a pandas DataFrame in surprise format
        X_test = X[[self._user_col, self._item_col]].copy()
        X_test['score'] = np.nan

        # Create a surprise Testset from the dataframe
        reader = surprise.Reader(rating_scale=self._score_range)
        testset = surprise.Dataset.load_from_df(X_test, reader)
        testset = testset.build_full_trainset().build_testset()

        # Use the surprise model to predict scores; test() returns a list of
        # Prediction namedtuples whose fourth field is the estimate.
        preds = self._model.test(testset)
        preds = [pred[3] for pred in preds]
        return pd.Series(data=np.array(preds), index=X.index)
def train(args):
    """Training script taking parsed command line / SageMaker variable arguments
    """
    input_files = [
        os.path.join(args.train, file) for file in os.listdir(args.train)
    ]
    if len(input_files) == 0:
        raise ValueError((
            "There are no files in {}.\n" +
            "This usually indicates that the channel ({}) was incorrectly specified,\n"
            +
            "the data specification in S3 was incorrectly specified or the role specified\n"
            + "does not have permission to access the data.").format(
                args.train, "train"))
    train_df = pd.concat(
        [pd.read_csv(file, engine="python") for file in input_files])
    train_data = surprise.Dataset.load_from_df(
        train_df,
        surprise.Reader(rating_scale=(1, 5)))  # line_format only applies when loading from file
    algo = surprise.SVD()

    # Note: Quality metrics like this can be exposed to SageMaker if wanted, see:
    # https://sagemaker.readthedocs.io/en/stable/overview.html#training-metrics
    results = surprise.model_selection.cross_validate(
        algo,
        train_data,
        measures=("RMSE", "MAE"),
        verbose=True,
        cv=args.cross_validation_folds)

    # The main mission of our script is to train a model and then save it to file:
    algo.fit(train_data.build_full_trainset())
    surprise.dump.dump(os.path.join(args.model_dir, ALGO_FILE_NAME), algo=algo)
Example #21
def read_data_surprise(df,
                       minstar=1,
                       maxstar=3,
                       col1='user_id',
                       col2='route',
                       col3='rating'):
    '''
    Produces a surprise library data object from the original dataframe

    ---Parameters---

    df (Pandas DataFrame)
    minstar (int) minimum star rating possible in the dataset (default 1)
    maxstar (int) maximum star rating possible in the dataset (default 3)
    col1 (string) column name that MUST correspond to the users in the df
    col2 (string) column name that MUST correspond to the items in the df
    col3 (string) column name that corresponds to the ratings of the items in the df

    ---Returns---
    surprise library data object to manipulate later

    '''
    # need to specify the rating_scale of stars (default 1-3 stars)
    reader = sp.Reader(rating_scale=(minstar, maxstar))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = sp.Dataset.load_from_df(df[[col1, col2, col3]], reader)

    return data
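A short usage sketch (the algorithm choice is illustrative):

import surprise as sp

data = read_data_surprise(df)            # df with user_id, route, rating columns
trainset = data.build_full_trainset()
algo = sp.SVD()
algo.fit(trainset)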
Example #22
def do_nmf(data_raw, impute_params):
    reader = surprise.Reader(rating_scale=(1, 5))
    dataset = surprise.Dataset.load_from_df(
        data_raw[["User", "Movie", "Prediction"]], reader)
    trainset = dataset.build_full_trainset()

    algo = NMF(n_factors=impute_params["FACTORS"],
               n_epochs=impute_params["EPOCHS"],
               verbose=True)
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    predictions = pd.DataFrame(predictions)

    predictions.rename(columns={
        "uid": "User",
        "iid": "Movie",
        "est": "Prediction"
    },
                       inplace=True)
    predictions = predictions[["User", "Movie", "Prediction"]]

    data = pd.concat([data_raw, predictions], ignore_index=True)
    data = data.pivot(index="User", columns="Movie",
                      values="Prediction").to_numpy()
    return data
Example #23
    def infer_votes(self, round=False, type='nmf', n_factors=8, **kwargs):
        """
        Return a list of inferred votes for missing data.
        """
        import surprise

        algo_class = {
            'knn': surprise.KNNBasic,
            'knn-means': surprise.KNNWithMeans,
            'knn-baseline': surprise.KNNBaseline,
            'knn-zscore': surprise.KNNWithZScore,
            'nmf': surprise.NMF,
            'svd': surprise.SVD,
            'svd++': surprise.SVDpp,
            'baseline': surprise.BaselineOnly,
        }[type]

        reader = surprise.Reader(rating_scale=(-1, 1))
        data = surprise.Dataset.load_from_df(self.votes, reader)
        train = data.build_full_trainset()

        kwargs.setdefault('verbose', False)
        algo = algo_class(n_factors=n_factors, **kwargs)
        algo.fit(train)

        predictions = []
        for uid, iid in missing_votes(self.pivot_table):
            prediction = algo.predict(uid, iid)
            predictions.append((uid, iid, prediction.est))

        if round:
            predictions = np.round(predictions)

        return pd.DataFrame(predictions, columns=['user', 'comment', 'vote'])
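missing_votes() is not shown here; a plausible sketch, assuming self.pivot_table is a users-by-comments DataFrame with NaN marking absent votes (the body below is an assumption):

def missing_votes(pivot_table):
    # Yield (user, comment) pairs for every empty cell.
    for uid, row in pivot_table.iterrows():
        for iid in pivot_table.columns[row.isna()]:
            yield uid, iid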
def svd_surprise(k=20, epochs=20, learning_rate=0.005, bias=True, test_fraction=0.0):
    """
    Performs SVD on the ratings data using surprise.

    """
    # Load the data
    reader = surprise.Reader(rating_scale=(1, 5), sep='\t')
    data = surprise.Dataset.load_from_file('data/data.txt', reader)

    if test_fraction == 0.0:
        # Train on the full dataset; the 25% "test" split below overlaps the
        # training data, so the reported RMSE is only a rough in-sample check.
        _, test_set = surprise.model_selection.train_test_split(data, test_size=0.25)
        train_set = data.build_full_trainset()
    else:
        # Split the data into a training set and test set
        train_set, test_set = surprise.model_selection.train_test_split(data, test_size=test_fraction)

    # Declare the model
    model = surprise.SVD(n_factors=k, n_epochs=epochs, lr_all=learning_rate, biased=bias)

    # Train the model on the data
    model.fit(train_set)
    predictions = model.test(test_set)

    # Print the accuracy of the predictions
    print("SVD Test RMSE: " + str(surprise.accuracy.rmse(predictions, verbose=False)))

    # Return U, V, the user bias terms, and the movie bias terms
    return model.pu, model.qi, model.bu, model.bi
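A sketch of turning the returned factors back into score estimates; the global mean is not returned by svd_surprise, so the matrix below is only complete up to that constant (noted in the comment):

import numpy as np

U, V, bu, bi = svd_surprise(k=20, epochs=20)
# Estimates up to the global mean mu: r_hat[u, i] = mu + bu[u] + bi[i] + U[u] . V[i]
scores = bu[:, None] + bi[None, :] + U @ V.T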
Example #25
def train(where, k):
    # df_to_dict = recur_dictify(pd.read_pickle('../../../data/over_10review_stores.pkl'))
    # store_list = []  # list to hold the stores
    # user_set = set()  # set to hold the users
    #
    # # repeat for every store
    # for store_key in df_to_dict:
    #     store_list.append(store_key)
    #
    #     for user_key in df_to_dict[store_key]:
    #         user_set.add(user_key)
    #
    # user_list = list(user_set)

    df = pd.read_pickle("../../data/dic_to_train_stores.pkl")
    reader = surprise.Reader(rating_scale=(1, 5))

    col_list = ['store_id', 'user_id', 'score']
    data = surprise.Dataset.load_from_df(df[col_list], reader)
    # Train
    trainset = data.build_full_trainset()
    option = {'name': 'pearson'}
    algo = surprise.KNNBasic(sim_options=option)

    algo.fit(trainset)

    # Recommend restaurants similar to the given one.
    # where = input('store id : ')
    print("\n")

    user_list = pd.read_pickle(
        "../../data/Item_based_user_list.pkl")[0].tolist()
    store_list = pd.read_pickle(
        "../../data/Item_based_store_list.pkl")[0].tolist()
    # user_list = dff.user.unique().tolist()
    # store_list = dff.store.unique().tolist()

    index = store_list.index(int(where))
    print('store_idx : ', index)
    print("\n")

    result = algo.get_neighbors(index, k=k)  # k=10
    print(where, "- similar restaurants:")
    print(result)
    print("\n")

    # Recommend users based on the restaurants.
    print("Friends recommended to you for rating", where, ":", "\n")
    recommend_user_list = []
    for r1 in result:
        # Note: r1 is an inner id from get_neighbors; this lookup assumes raw
        # store_id values line up with inner ids.
        max_rating = data.df[data.df["store_id"] == r1]["score"].max()
        user_id = data.df[(data.df["score"] == max_rating)
                          & (data.df["store_id"] == r1)]["user_id"].values

        for user in user_id:
            recommend_user_list.append(user_list[user])
            # print(user_list[user])
    return recommend_user_list
def transform_surprise(data):
    """
    Transform the data into a Surprise format.
    @param data: the data in a Pandas format.
    @return: the data in a Surprise format.
    """
    reader = spr.Reader(rating_scale=(1, 5))
    data_surprise = spr.Dataset.load_from_df(data[['User', 'Movie', 'Rating']], reader)
    return data_surprise
    def surprise_fit(self):
        reader = surprise.Reader(rating_scale=(0, 10))
        data = surprise.Dataset.load_from_df(
            self.df[['userID', 'itemID', 'rating']], reader)
        trainset = data.build_full_trainset()

        start = time.time()
        self.algo.fit(trainset)
        end = time.time()
Example #28
 def train_predict(self):
     self.algo = surprise.prediction_algorithms.SVD()
     # Train
     reader = surprise.Reader(rating_scale=self.dataset.rating_scale)
     data = surprise.Dataset.load_from_df(self.dataset.surprise_ratings_df[["userID", "itemID", "rating"]], reader)
     surprise.model_selection.cross_validate(self.algo, data, measures=["RMSE"], cv=5, verbose=True)
     
     # Predict user ratings
     self.compute_user_predictions()
Example #29
def pandas_to_surprise(data, pred=False):
    """Function that converts pandas dataframe into surprise data
    or directly the testset if the submission sample is given and pred set to True"""
    reader = spr.Reader(rating_scale=(1, 5))
    data_spr = spr.Dataset.load_from_df(data[['User', 'Movie', 'Rating']], reader)
    if pred:
        data_spr = data_spr.build_full_trainset().build_testset()
        data_spr = sorted(data_spr, key=lambda x: (x[1], x[0]))
    return data_spr
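Both modes in use; train_df and sample_submission are assumed DataFrames with User, Movie, and Rating columns:

data_spr = pandas_to_surprise(train_df)                     # surprise Dataset for training
trainset = data_spr.build_full_trainset()
testset = pandas_to_surprise(sample_submission, pred=True)  # ordered testset, per the docstring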
Example #30
def convert_to_df(ds, rating_range):
    ratings_dict = {
        "uid": list(ds[:, 0].astype(np.int)),
        "vid": list(ds[:, 1].astype(np.int)),
        "r": list(ds[:, 2])
    }
    reader = sp.Reader(rating_scale=rating_range)
    df = pd.DataFrame(ratings_dict)
    return sp.Dataset.load_from_df(df[['uid', 'vid', 'r']], reader)