def svd_ratings_predicate(observed_ratings_df, truth_ratings_df, fold='0', phase='eval'):
    """
    SVD rating predicates.

    Fit a Surprise ``SVD`` model on the observed ratings, estimate a rating
    for every key in the index of ``truth_ratings_df`` and pass the result to
    ``write`` under the predicate name ``svd_rating_obs``.

    :param observed_ratings_df: training ratings; after ``reset_index()`` it
        must expose ``userId``, ``movieId`` and ``rating`` columns.
    :param truth_ratings_df: frame whose index entries are tuples starting
        with (user id, item id); only its index is read.
    :param fold: fold identifier forwarded to ``write``.
    :param phase: phase identifier forwarded to ``write``.
    """
    print("SVD predicates")
    svd_model = SVD()

    # NOTE(review): the rating scale is hard-coded to (0.2, 1); presumably the
    # ratings were rescaled upstream -- confirm against the data loader.
    reader = Reader(rating_scale=(0.2, 1))
    observed = observed_ratings_df.reset_index().loc[:, ['userId', 'movieId', 'rating']]
    train_dataset = Dataset.load_from_df(df=observed, reader=reader)
    svd_model.fit(train_dataset.build_full_trainset())

    # make predictions
    # One pass over the index keys and a single frame construction; this
    # avoids building a Series per row with iterrows() and a .loc write per
    # cell. Values line up positionally with the truth index.
    estimates = [
        svd_model.predict(key[0], key[1]).est
        for key in truth_ratings_df.index
    ]
    predictions = pd.DataFrame({'rating': estimates}, index=truth_ratings_df.index)

    write(predictions, 'svd_rating_obs', fold, phase)
def estimate_preference(self, user_id, item_id):
    """
    Estimate the preference of a user for an item.

    The model is refit on the full ratings frame each time this method is
    called, and the estimate is then read from the freshly trained model.

    :param user_id: Id of the user to recommend.
    :param item_id: Id of the item to recommend.
    :return: The preference estimated by the specific recommender, as a float.
    """
    data_model = self.rating_data_model
    ratings_frame = data_model.df_ratings

    # A reader is still needed, but only its rating_scale parameter matters.
    scale = (data_model.get_min_preference(), data_model.get_max_preference())
    reader = Reader(rating_scale=scale)
    dataset = Dataset(reader=reader)

    # Column order must be user id, item id, rating.
    loaded = dataset.load_from_df(ratings_frame[['user_id', 'item_id', 'rating']], reader)
    trainset = dataset.construct_trainset(loaded.raw_ratings)

    # Train the recommendation model.
    self.model.fit(trainset)

    # NOTE(review): Surprise documents `estimate` as taking inner ids, and the
    # `[0]` assumes it returns a sequence -- confirm that callers pass inner
    # ids and that the configured model returns (estimate, details).
    estimate = self.model.estimate(u=user_id, i=item_id)
    return float(estimate[0])
# Normalise the mean amount of every row.
# NOTE(review): this divides by max_amt rather than (max_amt - min_amt), so it
# is not a strict min-max scaling -- confirm that this is intended.
mtarix_toGO['Norm_Tot_Amnt'] = (mtarix_toGO['Mean_amount'] - min_amt) / max_amt

# Remove the outliers: keep only rows whose normalised amount is at most 0.4.
dfx = mtarix_toGO[mtarix_toGO['Norm_Tot_Amnt'] <= 0.4]
lower_bound = dfx['Norm_Tot_Amnt'].min()
upper_bound = dfx['Norm_Tot_Amnt'].max()

# Single-argument print(...) with %-formatting emits the same text under
# Python 2 and Python 3.
print('Lower Bound normalized spending = %s' % (lower_bound,))
print('Upper Bound normalized spending = %s' % (upper_bound,))
# Report the size of the filtered frame: the message is about what remains
# after the outliers were removed, not the size of the unfiltered input.
print('Number of Transactions remaining after removing Outliers:: %s' % (dfx.shape[0],))

# Define the reader with upper and lower bounds; the column being predicted is
# now the normalised total amount.
reader_x = Reader(rating_scale=(lower_bound, upper_bound))
data = Dataset.load_from_df(df=dfx[['CustomerID', 'StockCode', 'Norm_Tot_Amnt']], reader=reader_x)

# Sanity check: the first processed rating should equal the first row of the
# source frame. Use positional access, because label 0 may have been filtered
# out together with the outliers.
print('difference in processed and pre-processed dataset =  %s'
      % (data.raw_ratings[0][2] - data.df['Norm_Tot_Amnt'].iloc[0],))

import time
start_time = time.time()

# Alternative, per-parameter search space kept for reference:
# param_grid = {'n_factors': [2, 5, 10, 50], 'n_epochs': [10, 50, 100],
#               'lr_bu': [0.1, 0.01, 0.001, 0.0001], 'lr_bi': [0.1, 0.01, 0.001, 0.0001],
#               'reg_bi': [0.1, 0.01, 0.001, 0.0001], 'reg_bu': [0.1, 0.01, 0.001, 0.0001],
#               'reg_qi': [0.1, 0.01, 0.001, 0.0001], 'reg_pu': [0.1, 0.01, 0.001, 0.0001]}
param_grid = {
    'n_factors': [5, 10, 50, 100],
    'n_epochs': [5, 10, 20, 50, 100],
    'lr_all': [0.1, 0.01, 0.001],
    'reg_all': [0.1, 0.01, 0.001],
}
grid_search = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=1)
del df4['geo_point_2d'] df_f = df4.join(df5) df_i = df_f.loc[df_f["ARRONDISSEMENT"].str.contains('paris') == True] df_i = df_i.drop(204726) df_i['ARRONDISSEMENT'] = encoder.fit_transform(df_i['ARRONDISSEMENT']) df_i['LIEU/ADRESSE'] = encoder.fit_transform(df_i['LIEU/ADRESSE']) df_i['STADE'] = encoder.fit_transform(df_i['STADE']) df_a = df_i.loc[df_i["ALLERGIE"] == 1] reader = Reader(rating_scale=(1, 164151)) df_etude_2 = Dataset.load_from_df(df_a[['LATITUDE', 'LONGITUDE', 'GENRE']], reader) X = StandardScaler().fit_transform(df_a) algo5 = DBSCAN(eps=0.3, min_samples=7).fit(X) labels = algo5.labels_ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) train_2, test_2 = train_test_split(df_etude_2, test_size=.25) algo = SVD() predictions_2 = algo.fit(train_2).test(test_2) lat = [] lng = [] for i in predictions_2:
head(10)
#%% Most active users -- Check correlation of numbers with rating/time?
# Number of recipes per user, largest first, top ten.
(
    ratings
    .groupby('User')['Recipe']
    .count()
    .sort_values(ascending=False)
    .head(10)
)
#%% Distribution of Ratings
print(ratings.Rating.describe())
print(set(ratings.Rating))
#%% Build train - test split
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings, reader)

# Shuffle the raw ratings with a fixed seed so the split is reproducible.
random.seed(42)
random.shuffle(data.raw_ratings)

# The first 75% of the shuffled ratings are used for training, the rest for testing.
cut_off = int(0.75 * len(data.raw_ratings))
train_ratings, test_ratings = data.raw_ratings[:cut_off], data.raw_ratings[cut_off:]

# From here on the dataset only holds the training portion.
data.raw_ratings = train_ratings
#%% Evaluate baseline on all, bias and test error
def evaluator(algo, df, cv_method, verbose = False):
    """ wrapper to streamline evaluation """
from surprise import KNNBasic from surprise import KNNWithMeans from surprise import KNNWithZScore from surprise import KNNBaseline from surprise import SVD from surprise import BaselineOnly from surprise import SVDpp from surprise import NMF from surprise import SlopeOne from surprise import CoClustering from surprise.accuracy import rmse from surprise.model_selection import train_test_split from surprise import accuracy reader = Reader(rating_scale=(1, 7)) data = Dataset.load_from_df(df_c1[['Smart Card_', 'Class.1_', 'freq']], reader) # getting the most effective Algorithm for Recommendation System benchmark = [] for algorithm in [ SVD(), NMF(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()