def svd_ratings_predicate(observed_ratings_df,
                          truth_ratings_df,
                          fold='0',
                          phase='eval'):
    """
    pmf_ratings Predicates
    """
    print("SVD predicates")
    svd_model = SVD()
    reader = Reader(rating_scale=(0.2, 1))
    train_dataset = Dataset.load_from_df(df=observed_ratings_df.reset_index(
    ).loc[:, ['userId', 'movieId', 'rating']],
                                         reader=reader)
    svd_model.fit(train_dataset.build_full_trainset())

    # make predictions
    predictions = pd.DataFrame(index=truth_ratings_df.index,
                               columns=['rating'])

    # truth_ratings_df is indexed by (userId, movieId); predict each pair
    for (uid, iid), _ in truth_ratings_df.loc[:, ['rating']].iterrows():
        predictions.loc[(uid, iid), 'rating'] = svd_model.predict(uid, iid).est

    write(predictions, 'svd_rating_obs', fold, phase)
Example #2
    def estimate_preference(self, user_id, item_id):

        """
        Estimate the preference value by a specific user.
        :param user_id: Id of the user to recommend.
        :param item_id: Id of the item to recommend.
        :return: The estimate preference by the sepecific recommender.
        """

        # Training data:
        df_ratings = self.rating_data_model.df_ratings
        # A reader is still needed, but only the rating_scale param is required.
        reader = Reader(rating_scale=(self.rating_data_model.get_min_preference(),
                                      self.rating_data_model.get_max_preference()))
        # The columns must correspond to user id, item id and ratings (in that order);
        # load_from_df is a classmethod, so call it on Dataset directly.
        train_data = Dataset.load_from_df(df_ratings[['user_id', 'item_id', 'rating']], reader)
        trainset = train_data.build_full_trainset()

        # Train the recommendation model:
        self.model.fit(trainset)

        # predict() accepts raw ids and returns a Prediction; .est is the estimated rating.
        return float(self.model.predict(user_id, item_id).est)
Example #3
mtarix_toGO['Norm_Tot_Amnt'] = (mtarix_toGO['Mean_amount'] - min_amt) / max_amt
#lower_bound = min(mtarix_toGO['Log_Mean_Amount'])
#upper_bound = max(mtarix_toGO['Log_Mean_Amount'])
#print lower_bound
#print upper_bound
# Remove the outliers
dfx = mtarix_toGO[mtarix_toGO['Norm_Tot_Amnt'] <= 0.4]
lower_bound = min(dfx['Norm_Tot_Amnt'])
upper_bound = max(dfx['Norm_Tot_Amnt'])
print('Lower Bound normalized spending =', lower_bound)
print('Upper Bound normalized spending =', upper_bound)
print('Number of Transactions remaining after removing Outliers:', dfx.shape[0])

# Define the reader with the lower and upper bounds; we are now predicting the
# normalized total amount ('Norm_Tot_Amnt') column.
reader_x = Reader(rating_scale=(lower_bound, upper_bound))
data = Dataset.load_from_df(df=dfx[['CustomerID', 'StockCode', 'Norm_Tot_Amnt']], reader=reader_x)


#for i in range(9):
#    print (data.raw_ratings[0][2] - data.df['Log_Mean_amount'][0])

print('difference in processed and pre-processed dataset =',
      data.raw_ratings[0][2] - data.df['Norm_Tot_Amnt'][0])

import time
start_time = time.time()


#param_grid = {'n_factors':[2,5,10,50],'n_epochs': [10,50,100], 'lr_bu': [0.1,0.01,0.001,0.0001],'lr_bi': [0.1,0.01,0.001,0.0001],'reg_bi': [0.1,0.01,0.001,0.0001],'reg_bu': [0.1,0.01,0.001,0.0001],'reg_qi': [0.1,0.01,0.001,0.0001],'reg_pu': [0.1,0.01,0.001,0.0001]}
param_grid = {'n_factors': [5, 10, 50, 100], 'n_epochs': [5, 10, 20, 50, 100],
              'lr_all': [0.1, 0.01, 0.001], 'reg_all': [0.1, 0.01, 0.001]}
grid_search = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=1)
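
# The grid search above is only constructed; the lines below are a minimal
# sketch (an assumption, not part of the original snippet) of running it and
# reading out the best configuration found for RMSE.
grid_search.fit(data)
print('Best RMSE:', grid_search.best_score['rmse'])
print('Best params:', grid_search.best_params['rmse'])
print('elapsed seconds:', time.time() - start_time)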
Example #4
del df4['geo_point_2d']

df_f = df4.join(df5)

df_i = df_f.loc[df_f["ARRONDISSEMENT"].str.contains('paris') == True]

df_i = df_i.drop(204726)

df_i['ARRONDISSEMENT'] = encoder.fit_transform(df_i['ARRONDISSEMENT'])
df_i['LIEU/ADRESSE'] = encoder.fit_transform(df_i['LIEU/ADRESSE'])
df_i['STADE'] = encoder.fit_transform(df_i['STADE'])
df_a = df_i.loc[df_i["ALLERGIE"] == 1]

reader = Reader(rating_scale=(1, 164151))
df_etude_2 = Dataset.load_from_df(df_a[['LATITUDE', 'LONGITUDE', 'GENRE']],
                                  reader)

X = StandardScaler().fit_transform(df_a)
algo5 = DBSCAN(eps=0.3, min_samples=7).fit(X)
labels = algo5.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
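
# Small hedged addition (not in the original snippet): report the DBSCAN
# clustering summary computed above.
print('Estimated number of clusters:', n_clusters_)
print('Estimated number of noise points:', n_noise_)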

train_2, test_2 = train_test_split(df_etude_2, test_size=.25)
algo = SVD()
predictions_2 = algo.fit(train_2).test(test_2)
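
# Small hedged addition (not in the original snippet): score the SVD
# predictions on the held-out split with Surprise's accuracy helper.
from surprise import accuracy
accuracy.rmse(predictions_2)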

lat = []
lng = []

for i in predictions_2:
Example #5

#%% Most active users -- Check correlation of numbers with rating/time?
(ratings
     .groupby('User')['Recipe']
     .count()
     .sort_values(ascending=False)
     .head(10))

#%% Distribution of Ratings
print(ratings.Rating.describe())
print(set(ratings.Rating))

#%% Build train - test split
reader = Reader(rating_scale=(1, 5))
data   = Dataset.load_from_df(ratings, reader)

random.seed(42)
random.shuffle(data.raw_ratings)
cut_off = int(len(data.raw_ratings) * 0.75)

train_ratings = data.raw_ratings[:cut_off]
test_ratings  = data.raw_ratings[cut_off:]

data.raw_ratings = train_ratings
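
# A minimal sketch (an assumption, not in the original snippet) of how this
# manual split is usually consumed: `data` now holds only the training
# ratings, so build_full_trainset() yields the train set, and the held-out raw
# ratings are converted into a test set.
trainset = data.build_full_trainset()
testset = data.construct_testset(test_ratings)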

#%% Evaluate baseline on all, bias and test error
def evaluator(algo, df, cv_method, verbose = False):
    """
    wrapper to streamline evaluation
    """
from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split
from surprise import accuracy

reader = Reader(rating_scale=(1, 7))
data = Dataset.load_from_df(df_c1[['Smart Card_', 'Class.1_', 'freq']], reader)

# Benchmark several algorithms to find the most effective one for the recommender system
benchmark = []
for algorithm in [
        SVD(),
        NMF(),
        SVDpp(),
        SlopeOne(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering()