from surprise import SVD, KNNBasic, KNNWithMeans, KNNBaseline, NMF, SlopeOne, CoClustering, BaselineOnly, NormalPredictor
'''
    "SVD" -- https://en.wikipedia.org/wiki/Singular_value_decomposition
    "KNN" -- https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
    "Centered KNN" -- KNN with mean user ratings considered 
    "KNN with Baseline" -- KNN with baseline considered 
    "NMF" -- https://en.wikipedia.org/wiki/Non-negative_matrix_factorization
    "SlopeOne" -- https://en.wikipedia.org/wiki/Slope_One
    "CoClustering" -- https://en.wikipedia.org/wiki/Biclustering
    "BaselineOnly" -- baseline predicted for specific user/item
    "NormalPredictor" -- predict random rating from normal distribution 

    https://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly

'''
labels = [
    "SVD", "KNN", "Centered KNN", "KNN with Baseline", "NMF", "SlopeOne",
    "CoClustering", "BaselineOnly", "NormalPredictor"
]

algorithms = [
    SVD(),
    KNNBasic(),
    KNNWithMeans(),
    KNNBaseline(),
    NMF(),
    SlopeOne(),
    CoClustering(),
    BaselineOnly(),
    NormalPredictor()
]
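
A minimal sketch of how the labels and algorithm instances above might be benchmarked together (this is not part of the original snippet; it assumes a ratings DataFrame df with user_id, item_id, and rating columns):

import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

# Hypothetical benchmarking loop over the label/algorithm pairs above
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

scores = {}
for label, algorithm in zip(labels, algorithms):
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    scores[label] = results['test_rmse'].mean()

print(pd.Series(scores).sort_values())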
Example #2
df_cust_summary = df.groupby('Cust_id')['Rating'].agg(["count", "mean"])
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.7), 0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

# Build the analogous movie-level filter so drop_movie_list is defined below
df_movie_summary = df.groupby('Movie_id')['Rating'].agg(["count", "mean"])
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(0.7), 0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

df = df[~df['Movie_id'].isin(drop_movie_list)]
df = df[~df['Cust_id'].isin(drop_cust_list)]

#Pivot data
df_p = pd.pivot_table(df, index="Cust_id", columns="Movie_id", values="Rating")

#See which algorithm gives the lowest RMSE value
reader = Reader()
data = Dataset.load_from_df(df[['Cust_id', 'Movie_id', 'Rating']][:100000], reader)
benchmark = []
for algo in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), BaselineOnly(), CoClustering()]:
    results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm'])])
    benchmark.append(tmp)

print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))

##Train and Test split
#reader = Reader()
#data = Dataset.load_from_df(df[['Cust_id', 'Movie_id', 'Rating']], reader)
#trainset, testset = train_test_split(data, test_size = 0.25)
#blo = BaselineOnly()
#blo.fit(trainset)
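
For reference, the commented-out hold-out evaluation above, completed as runnable code (a sketch; it assumes the same filtered df):

from surprise import BaselineOnly, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

reader = Reader()
data = Dataset.load_from_df(df[['Cust_id', 'Movie_id', 'Rating']], reader)
trainset, testset = train_test_split(data, test_size=0.25)
blo = BaselineOnly()
blo.fit(trainset)
predictions = blo.test(testset)
accuracy.rmse(predictions)  # prints the hold-out RMSE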
Example #3
    def set_algo(self):
        self.algo = CoClustering(n_cltr_u=10)
Example #4
def run(algorithm, min_nr_movies_train, min_nr_movies_test, netflix=True):

	if netflix:
		trainset, testset = getNetflix()
	else:
		# The non-Netflix loader was left out of this snippet
		raise NotImplementedError("only the Netflix dataset loader is available")

	if algorithm == 'SVD':
		model = SVD()
	elif algorithm == 'KNNBasic':
		model = KNNBasic()
	elif algorithm == 'KNNWithZScore':
		model = KNNWithZScore()
	elif algorithm == 'NMF':
		model = NMF()
	elif algorithm == 'CoClustering':
		model = CoClustering()
	elif algorithm == 'SlopeOne':
		model = SlopeOne()
	else:
		raise ValueError('unknown algorithm: {}'.format(algorithm))

	model.fit(trainset)
	preds = model.test(testset)

	predsDF = pd.DataFrame(preds)
	predsDF['userID'] = predsDF['uid']
	predsDF['itemID'] = predsDF['iid']
	predsDF['pred'] = predsDF['est']

	# testData is assumed to be loaded elsewhere in the original script
	full = pd.merge(predsDF, testData, on=['userID', 'itemID'], how='inner')

	# Score a prediction as correct when it rounds to the true rating
	out = []
	for pred, rating in zip(round(full['pred']), full['rating']):
		#if (pred + 1 == rating) or (pred - 1 == rating) or (pred == rating):
		if pred == rating:
			out.append(1)
		else:
			out.append(0)

	acc = sum(out) / len(out)

	return 'Model: {}, Accuracy: {}%, Num Movies: {}/{}'.format(algorithm, round(acc * 100, 1), min_nr_movies_train, min_nr_movies_test)

if __name__ == "__main__":

	print(run('CoClustering',10, 5))
	print(run('CoClustering',20, 5))
	print(run('CoClustering',20, 7))
	print(run('CoClustering',20, 10))	

	print(run('KNNBasic',10, 5))
	print(run('KNNBasic',20, 5))
	print(run('KNNBasic',20, 7))
	print(run('KNNBasic',20, 10))

	print(run('KNNWithZScore',10, 5))
	print(run('KNNWithZScore',20, 5))
	print(run('KNNWithZScore',20, 7))
	print(run('KNNWithZScore',20, 10))	

	print(run('NMF',10, 5))
	print(run('NMF',20, 5))
	print(run('NMF',20, 7))
	print(run('NMF',20, 10))	

	print(run('SlopeOne',10, 5))
	print(run('SlopeOne',20, 5))
	print(run('SlopeOne',20, 7))
	print(run('SlopeOne',20, 10))
	
	print(run('SVD',10, 5))
	print(run('SVD',20, 5))
	print(run('SVD',20, 7))
	print(run('SVD',20, 10))
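
getNetflix() and testData are not defined in this snippet; a minimal sketch of what such a loader might look like (the CSV path and column names are assumptions, not the original project's code):

from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

def getNetflix(path='netflix_ratings.csv'):
    # Hypothetical loader; the original presumably also applies the
    # min_nr_movies_train / min_nr_movies_test filters
    ratings = pd.read_csv(path)  # columns: userID, itemID, rating
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['userID', 'itemID', 'rating']], reader)
    return train_test_split(data, test_size=0.25)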
Example #5
def best_pred():
    review['새주소'] = review['장소'] + "*" + review['주소']
    review2 = review.drop([
        '장소', '주소', '위도', '경도', '분류', '대분류', '주소1', '주소2', '방문횟수', '년도', '월',
        '계절'
    ],
                          axis=1)
    review2 = review2[['이름', '새주소', '별점']]

    # Reduce the dimensionality of the dataset:
    # exclude places and users with too few ratings
    min_ratings = 50
    filter_review = review2['새주소'].value_counts() > min_ratings
    filter_review = filter_review[filter_review].index.tolist()

    min_user_ratings = 50
    filter_users = review2['이름'].value_counts() > min_user_ratings
    filter_users = filter_users[filter_users].index.tolist()

    review_new = review2[(review2['새주소'].isin(filter_review))
                         & (review2['이름'].isin(filter_users))]

    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(review_new[['이름', '새주소', '별점']], reader)

    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross-validation on the algorithm being iterated
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

    surprise_results = pd.DataFrame(benchmark).set_index(
        'Algorithm').sort_values('test_rmse')

    # Train and Predict
    # CoClustering gave the best RMSE, so use it for training and
    # prediction, together with alternating least squares (ALS)
    algo = NMF()
    cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

    # Use train_test_split() to sample a trainset and a testset,
    # and use the RMSE accuracy metric.
    # fit() trains the algorithm on the trainset, and test() returns the
    # predictions generated from the testset
    trainset, testset = train_test_split(data, test_size=0.25)
    # algo = BaselineOnly(bsl_options=bsl_options)
    algo = NMF()
    predictions = algo.fit(trainset).test(testset)

    # dump.dump('./dump_file',predictions, algo)
    # predictions, algo = dump.load('./dump_file')

    trainset = algo.trainset

    # To inspect the predictions closely, build a DataFrame of all predictions

    def get_Iu(uid):
        try:
            return len(trainset.ur[trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(iid):
        try:
            return len(trainset.ir[trainset.to_inner_iid(iid)])
        except ValueError:
            return 0

    df = pd.DataFrame(predictions,
                      columns=['uid', 'iid', 'rui', 'est', 'details'])
    df['Iu'] = df.uid.apply(get_Iu)
    df['Ui'] = df.iid.apply(get_Ui)
    df['err'] = abs(df.est - df.rui)

    predictions = df.sort_values(by='err').drop_duplicates('iid')

    best_predictions = predictions[:100].copy()
    worst_predictions = predictions[-10:]

    # tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],index=['Algorithm']))
    best_predictions['iid'] = best_predictions.iid.str.split('*').str[0]

    sql = "insert into rec(rec_uid, rec_iid, rec_rui, rec_est) values(:rec_uid, :rec_iid, :rec_rui, :rec_est)"
    data = best_predictions[['uid', 'iid', 'rui', 'est']]
    data.columns = ['rec_uid', 'rec_iid', 'rec_rui', 'rec_est']
    cursor.close()
    conn.close()
    return data
Example #6
class Surprise():
    def train(self, algo='SVD', like=True, test='cv', local=False):

        if local:
            csv_path = os.path.join(os.path.dirname(__file__),
                                    "data/preprocessed")
            self.recipes = pd.read_csv(f"{csv_path}/recipe_pp.csv")
            self.reviews = pd.read_csv(f"{csv_path}/review_pp.csv")
        else:
            self.recipes = storage.import_file('data/preprocessed',
                                               'recipe_pp.csv')
            self.reviews = storage.import_file('data/preprocessed',
                                               'review_pp.csv')

        if like:
            self.target = 'liked'
            self.s_min = 0
            self.s_max = 1
        else:
            self.target = 'rating'
            self.s_min = 1
            self.s_max = 5

        reader = Reader(rating_scale=(self.s_min, self.s_max))

        self.relevant_data = self.reviews[[
            'user_id', 'recipe_id', self.target
        ]]
        model_data = Dataset.load_from_df(self.relevant_data, reader)

        # Algos

        if algo == 'NormalPredictor':
            self.algorithm = NormalPredictor()

        elif algo == 'BaselineOnly':
            self.algorithm = BaselineOnly()

        elif algo == 'KNNBasic':
            self.algorithm = KNNBasic()

        elif algo == 'KNNWithMeans':
            self.algorithm = KNNWithMeans()

        elif algo == 'KNNWithZScore':
            self.algorithm = KNNWithZScore()

        elif algo == 'KNNBaseline':
            self.algorithm = KNNBaseline()

        elif algo == 'SVD':
            params = {
                'n_epochs': 20,
                'n_factors': 100,
                'lr_all': 0.002,
                'reg_all': 0.02
            }
            self.algorithm = SVD(**params)  # Tuned with svd_grid

        elif algo == 'SVDpp':
            self.algorithm = SVDpp()

        elif algo == 'NMF':
            self.algorithm = NMF()

        elif algo == 'SlopeOne':
            self.algorithm = SlopeOne()

        elif algo == 'CoClustering':
            self.algorithm = CoClustering()

        else:
            raise ValueError(f"unknown algo: {algo}")

        if test == 'cv':
            cv_results = cross_validate(self.algorithm,
                                        model_data,
                                        measures=['RMSE', 'MAE'],
                                        cv=5,
                                        verbose=True)
            rmse = np.round(cv_results['test_rmse'].mean(), 3)
            mae = np.round(cv_results['test_mae'].mean(), 3)
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        elif test == 'svd_grid':
            param_grid = {
                'n_epochs': [10, 20],
                'n_factors': [100, 200],
                'lr_all': [0.001, 0.002],
                'reg_all': [0.01, 0.02]
            }
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(model_data)
            rmse = gs.best_score['rmse']
            mae = gs.best_score['mae']
            print(gs.best_params['rmse'], gs.best_params['mae'])
            self.algorithm = gs.best_estimator['rmse']
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        else:
            train, test = train_test_split(model_data,
                                           test_size=0.3,
                                           random_state=42)
            self.algorithm.fit(train)
            predictions = self.algorithm.test(test)
            rmse = np.round(accuracy.rmse(predictions), 3)
            mae = np.round(accuracy.mae(predictions), 3)

        return rmse, mae

    def predict(self, user_id):

        inputs = self.relevant_data[self.relevant_data['user_id'] == user_id] \
                 .merge(self.recipes, on="recipe_id", how="left")[['recipe_id', 'name', self.target]]

        display(inputs)

        user_recipes = self.relevant_data[self.relevant_data['user_id'] ==
                                          user_id].recipe_id.unique()
        recipe_list = self.relevant_data[
            self.relevant_data['user_id'] != user_id].recipe_id.unique()
        predictions = [
            self.algorithm.predict(user_id, rec) for rec in recipe_list
            if rec not in list(user_recipes)
        ]

        pdf = pd.DataFrame(predictions,
                           columns=[
                               'user_id', 'recipe_id', self.target,
                               f'rec_{self.target}', 'details'
                           ])
        pdf = pdf.drop(columns=[self.target, 'details'])
        pdf = pdf.sort_values(f'rec_{self.target}', ascending=False)

        rec_target = pdf[f'rec_{self.target}']
        pdf['rec_score'] = (rec_target - self.s_min) / (self.s_max -
                                                        self.s_min)

        outputs = pdf.merge(self.recipes, on="recipe_id", how="left")[[
            'recipe_id', 'name', f'rec_{self.target}', 'rec_score'
        ]]

        display(outputs.head(10))

        return outputs
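
A possible way to use this wrapper (a sketch; the user id is made up, and the data paths are assumed to be configured as above):

# Train with 5-fold cross-validation on the local CSVs, then recommend
model = Surprise()
rmse, mae = model.train(algo='SVD', like=True, test='cv', local=True)
print(rmse, mae)
outputs = model.predict(user_id=123)  # hypothetical user id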
Example #7
print('Here are the top 5 recommendations based on Slope-One! ')

for i in range(5):

    movieRecc2 = topMovies2[i]
    movieRawID2 = movieRecc2[0]
    movieName2 = movie[movieRawID2]
    print(str(i + 1) + '. ' + movieName2)


#############predictions using Co-Clustering
print('')
print('Making more recommendations...')


algo3 = CoClustering()
algo3.fit(trainset)

predictions3 = algo3.test(testset)
dictMovies3 = get_top_n(predictions3)
topMovies3 = dictMovies3.get(672)

print('')
print('Here are the top 5 recommendations based on Co-Clustering! ')

for i in range(5):

    movieRecc3 = topMovies3[i]
    movieRawID3 = movieRecc3[0]
    movieName3 = movie[movieRawID3]
    print(str(i + 1) + '. ' + movieName3)
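
The get_top_n helper used above is not shown in this snippet; the version from the surprise FAQ looks like this (reproduced here as the likely definition):

from collections import defaultdict

def get_top_n(predictions, n=10):
    # Map each user id to a list of (raw item id, estimated rating) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Sort each user's predictions and keep the n highest estimates
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n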
Example #8
from surprise import SVD, CoClustering, NMF
from surprise import KNNBasic, KNNWithMeans
from numpy import tensordot
from numpy.linalg import norm
from itertools import product
from PIL import Image

sns.set()
pd.set_option('display.expand_frame_repr', False)
labelencoder_PID = LabelEncoder()
labelencoder_UID = LabelEncoder()

svdModel = SVD(n_factors = 20, n_epochs = 10, biased=True)
sim_options = {'name': 'msd', 'user_based': False}
knnBasicModel = KNNBasic(k =10, sim_options=sim_options)
coCluster = CoClustering(n_cltr_u = 10, n_cltr_i = 10)
nmfModel = NMF(n_factors = 10, n_epochs = 40, biased=True)
predictionsEn = []
listOfBooksReadByTop10 = []
recBooksEn = []

def getDataFromFile():
    df = pd.read_csv("goodbooks-10k/ratings.csv")
    threeabvrating = df[df["rating"]>=3]
    books = threeabvrating.groupby("book_id").agg({"user_id":"count", "rating" : "mean"}).reset_index().rename(columns = {"user_id" : "count_users","rating": "avg_rating"})
    sorted_bks = books.sort_values(by=['count_users', 'avg_rating'], ascending=False).reset_index()
    top500famousBookIds = sorted_bks.book_id.unique()[:500]
    famousBooks = df[df['book_id'].isin(top500famousBookIds)]
    countUsers = famousBooks.groupby("user_id").agg({"rating" : "count"}).reset_index().rename(columns = {"rating":"count"})
    top500users =  countUsers.sort_values(by = "count", ascending=False).reset_index().user_id.unique()[:500]
    df = famousBooks[famousBooks['user_id'].isin(top500users)].reset_index()
    return df
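
A sketch of how the filtered ratings could feed the models built above (assumed usage, not part of the original snippet):

from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

ratings = getDataFromFile()
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.25)

# Fit each of the pre-configured models on the same trainset
for model in (svdModel, knnBasicModel, coCluster, nmfModel):
    model.fit(trainset)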
Example #9
models.append('KNN with cosine')


# ## CoClustering Implementation

# In[88]:


# We'll use the CoClustering algorithm.
from surprise import CoClustering

df_CoClustering = df_final_user_repo_star_v3.copy(deep=True)
dataCoClustering = Dataset.load_from_df(df_CoClustering, reader)


coClustering = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20)

# Train the algorithm on the trainset, and predict ratings for the testset
trainsetcoClustering  = dataCoClustering.build_full_trainset()

coClustering.fit(trainsetcoClustering)

testcoClustering = trainsetcoClustering.build_anti_testset()
predictionscoClustering = coClustering.test(testcoClustering)

rmseCoClustering = accuracy.rmse(predictionscoClustering)


listOfRMSE.append(rmseCoClustering)
models.append('CoClustering')
Example #10
#%%
# Use the new parameters with the sampled training data
algo_svdpp = SVDpp(lr_all = 0.01, reg_all = 0.15)
fit_rmse(algo_svdpp, tr_dat)
algo_svdpp.fit(tr_dat.build_full_trainset())

#%%
algo_svdpp_new = SVDpp(lr_all = 0.01, reg_all = 0.1)
fit_rmse(algo_svdpp_new, tr_dat)

#%%
output(algo_svdpp, "SVDpp_lr0.01_reg0.15.csv")
output(algo_svdpp_new, "SVDpp_lr0.01_reg0.10.csv")
#%%
# CoClustering
algo_cc = CoClustering()
fit_rmse(algo_cc, tr_dat)
output(algo_cc, "CoClustering.csv")

#%%
# KNNWithMeans
algo_knnwm = KNNWithMeans(k = 40, sim_options = {'name': 'cosine', 'user_based': False})
fit_rmse(algo_knnwm, samp_dat)

# Gridsearch 
param_grid_knnwm = {'k': [30, 40, 50], 
                    'sim_options': {'name':['cosine', 'pearson'],
                                    'user_based':[False]}}
gs_knnwm = GridSearchCV(KNNWithMeans, param_grid_knnwm, measures = ['rmse','mae'], cv = 3)
gs_knnwm.fit(samp_dat)
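
After fitting, the grid-search results can be read back and the best model refit on the full data (a short sketch using surprise's GridSearchCV attributes, assuming samp_dat is the Dataset used above):

print(gs_knnwm.best_score['rmse'])    # best cross-validated RMSE
print(gs_knnwm.best_params['rmse'])   # parameter combination that achieved it

# Refit the best estimator on the full sampled dataset
algo_knnwm_best = gs_knnwm.best_estimator['rmse']
algo_knnwm_best.fit(samp_dat.build_full_trainset())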
Example #11
    def getRecommendations(self,
                           IDUser,
                           method=9,
                           similarityMeasure=1,
                           isUserBased="Yes"):
        conn = sqlite3.connect(DATABASE_NAME)
        df = pd.read_sql_query(
            "SELECT userID, glassID, relativeRating FROM ratings", conn)

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            df[['userID', 'glassID', 'relativeRating']], reader)

        trainset = data.build_full_trainset()

        isUserBased = True if (isUserBased == "Yes") else False
        if similarityMeasure == 1:
            similarityMeasure = "cosine"
        elif similarityMeasure == 2:
            similarityMeasure = "pearson"
        else:
            similarityMeasure = "pearson_baseline"

        sim_options = {'name': similarityMeasure, 'user_based': isUserBased}

        if method == 1:
            algo = SVD()
        elif method == 2:
            algo = SlopeOne()
        elif method == 3:
            algo = NMF()
        elif method == 4:
            algo = NormalPredictor()
        elif method == 5:
            algo = KNNBaseline(sim_options=sim_options)
        elif method == 6:
            algo = KNNBasic(sim_options=sim_options)
        elif method == 7:
            algo = KNNWithMeans(sim_options=sim_options)
        elif method == 8:
            algo = KNNWithZScore(sim_options=sim_options)
        elif method == 9:
            algo = BaselineOnly()
        else:
            algo = CoClustering()

        algo.fit(trainset)

        predicted_rows = []

        totalGlass = df['glassID'].max()

        glassPivot = df.pivot_table(index='glassID',
                                    columns='userID',
                                    values='relativeRating')

        for iid in range(1, totalGlass + 1):
            try:
                isNan = pd.isna(glassPivot.loc[iid, IDUser])
            except KeyError:  # glass or user missing from the pivot table
                continue

            if isNan:
                prediction = algo.predict(IDUser, iid, verbose=False)
                predicted_rows.append({
                    'glassID': iid,
                    'estimatedRating': prediction.est
                })

        predictions = pd.DataFrame(predicted_rows,
                                   columns=['glassID', 'estimatedRating'])

        predictions = predictions.sort_values('estimatedRating',
                                              ascending=False)
        recommendationList = predictions[
            predictions['estimatedRating'] > 3]['glassID'].head(50).tolist()

        conn.close()

        return recommendationList
Example #12
from surprise import KNNBasic
bsl_options = {
    'method': 'als',
    'n_epochs': 10,
}
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBasic(k=5, bsl_options=bsl_options, sim_options=sim_options)

# In[26]:

bsl_options = {'method': 'sgd', 'lr': 0.01}
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBasic(k=5, bsl_options=bsl_options, sim_options=sim_options)

# In[22]: SlopeOne Algo

from surprise import SVD, SlopeOne

algo = SlopeOne()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

# In[6]: CoClustering

from surprise import CoClustering
from surprise.model_selection import cross_validate

algo = CoClustering(n_cltr_u=3, n_cltr_i=3)

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)
Example #13
    def train_surprise(self,
                       model_type,
                       trainset,
                       testset,
                       k_recommend,
                       sql_db,
                       k_fold,
                       knowledge,
                       model_name,
                       result_name,
                       system_eval=False):

        knn_user_based = self.config['SURPRISE_KNN'].getboolean(
            'knn_user_based')
        knn_similarity = self.config['SURPRISE_KNN']['knn_similarity']
        sim_options = {'name': knn_similarity, 'user_based': knn_user_based}
        verbose_switch = self.config['DEFAULT'].getboolean('verbose_switch')
        # Select the model to use
        if (model_type == "svd"):
            # Get configuration values
            svd_grid_search = self.config['SURPRISE_SVD'].getboolean(
                'svd_grid_search')
            svd_grid_metric = self.config['SURPRISE_SVD']['svd_grid_metric']
            svd_n_factors = int(self.config['SURPRISE_SVD']['svd_n_factors'])
            svd_n_epochs = int(self.config['SURPRISE_SVD']['svd_n_epochs'])
            svd_biased = self.config['SURPRISE_SVD'].getboolean('svd_biased')
            svd_init_mean = float(self.config['SURPRISE_SVD']['svd_init_mean'])
            svd_init_std_dev = float(
                self.config['SURPRISE_SVD']['svd_init_std_dev'])
            svd_lr_all = float(self.config['SURPRISE_SVD']['svd_lr_all'])
            svd_reg_all = float(self.config['SURPRISE_SVD']['svd_reg_all'])

            if (self.common_functions.validate_available_sql_data(
                    'svd_params', sql_db) == True):
                results = pd.read_sql_query('select * from svd_params;',
                                            sql_db,
                                            index_col='index')
                real_results = results[(results["knowledge"] == knowledge)
                                       & (results["algorithm"] == "svd")]
                if (real_results.empty == False):
                    svd_n_factors = int(real_results.iloc[0]['svd_n_factors'])
                    svd_n_epochs = int(real_results.iloc[0]['svd_n_epochs'])
                    svd_init_std_dev = float(
                        real_results.iloc[0]['svd_init_std_dev'])
                    svd_lr_all = float(real_results.iloc[0]['svd_lr_all'])
                    svd_reg_all = float(real_results.iloc[0]['svd_reg_all'])

            algo = SVD(n_factors=svd_n_factors,
                       n_epochs=svd_n_epochs,
                       biased=svd_biased,
                       init_mean=svd_init_mean,
                       init_std_dev=svd_init_std_dev,
                       lr_all=svd_lr_all,
                       reg_all=svd_reg_all,
                       verbose=verbose_switch)

        elif (model_type == "SVDpp"):
            # Get configuration values
            svdpp_grid_search = self.config['SURPRISE_SVDPP'].getboolean(
                'svdpp_grid_search')
            svdpp_grid_metric = self.config['SURPRISE_SVDPP'][
                'svdpp_grid_metric']
            svdpp_n_factors = int(
                self.config['SURPRISE_SVDPP']['svdpp_n_factors'])
            svdpp_n_epochs = int(
                self.config['SURPRISE_SVDPP']['svdpp_n_epochs'])
            svdpp_init_mean = float(
                self.config['SURPRISE_SVDPP']['svdpp_init_mean'])
            svdpp_init_std_dev = float(
                self.config['SURPRISE_SVDPP']['svdpp_init_std_dev'])
            svdpp_lr_all = float(self.config['SURPRISE_SVDPP']['svdpp_lr_all'])
            svdpp_reg_all = float(
                self.config['SURPRISE_SVDPP']['svdpp_reg_all'])

            if (self.common_functions.validate_available_sql_data(
                    'svdpp_params', sql_db) == True):
                results = pd.read_sql_query('select * from svdpp_params;',
                                            sql_db,
                                            index_col='index')
                real_results = results[(results["knowledge"] == knowledge)
                                       & (results["algorithm"] == "svdpp")]
                if (real_results.empty == False):
                    svdpp_n_factors = int(
                        real_results.iloc[0]['svdpp_n_factors'])
                    svdpp_n_epochs = int(
                        real_results.iloc[0]['svdpp_n_epochs'])
                    svdpp_init_std_dev = float(
                        real_results.iloc[0]['svdpp_init_std_dev'])
                    svdpp_lr_all = float(real_results.iloc[0]['svdpp_lr_all'])
                    svdpp_reg_all = float(
                        real_results.iloc[0]['svdpp_reg_all'])

            algo = SVDpp(n_factors=svdpp_n_factors,
                         n_epochs=svdpp_n_epochs,
                         init_mean=svdpp_init_mean,
                         init_std_dev=svdpp_init_std_dev,
                         lr_all=svdpp_lr_all,
                         reg_all=svdpp_reg_all,
                         verbose=verbose_switch)

        elif (model_type == "NMF"):
            # Get configuration values
            nmf_grid_search = self.config['SURPRISE_NMF'].getboolean(
                'nmf_grid_search')
            nmf_grid_metric = self.config['SURPRISE_NMF']['nmf_grid_metric']
            nmf_n_factors = int(self.config['SURPRISE_NMF']['nmf_n_factors'])
            nmf_n_epochs = int(self.config['SURPRISE_NMF']['nmf_n_epochs'])
            nmf_biased = self.config['SURPRISE_NMF'].getboolean('nmf_biased')
            nmf_reg_pu = float(self.config['SURPRISE_NMF']['nmf_reg_pu'])
            nmf_reg_qi = float(self.config['SURPRISE_NMF']['nmf_reg_qi'])
            nmf_reg_bu = float(self.config['SURPRISE_NMF']['nmf_reg_bu'])
            nmf_reg_bi = float(self.config['SURPRISE_NMF']['nmf_reg_bi'])
            nmf_lr_bu = float(self.config['SURPRISE_NMF']['nmf_lr_bu'])
            nmf_lr_bi = float(self.config['SURPRISE_NMF']['nmf_lr_bi'])
            nmf_init_low = float(self.config['SURPRISE_NMF']['nmf_init_low'])
            nmf_init_high = int(self.config['SURPRISE_NMF']['nmf_init_high'])

            if (self.common_functions.validate_available_sql_data(
                    'nmf_params', sql_db) == True):
                results = pd.read_sql_query('select * from nmf_params;',
                                            sql_db,
                                            index_col='index')
                real_results = results[(results["knowledge"] == knowledge)
                                       & (results["algorithm"] == "nmf")]
                if (real_results.empty == False):
                    nmf_n_factors = int(real_results.iloc[0]['nmf_n_factors'])
                    nmf_n_epochs = int(real_results.iloc[0]['nmf_n_epochs'])
                    nmf_reg_pu = float(real_results.iloc[0]['nmf_reg_pu'])
                    nmf_reg_qi = float(real_results.iloc[0]['nmf_reg_qi'])
                    nmf_init_low = float(real_results.iloc[0]['nmf_init_low'])

            algo = NMF(n_factors=nmf_n_factors,
                       n_epochs=nmf_n_epochs,
                       biased=nmf_biased,
                       reg_pu=nmf_reg_pu,
                       reg_qi=nmf_reg_qi,
                       reg_bu=nmf_reg_bu,
                       reg_bi=nmf_reg_bi,
                       lr_bu=nmf_lr_bu,
                       lr_bi=nmf_lr_bi,
                       init_low=nmf_init_low,
                       init_high=nmf_init_high,
                       verbose=verbose_switch)

        elif (model_type == "NormalPredictor"):
            algo = NormalPredictor()

        elif (model_type == "BaselineOnly"):
            algo = BaselineOnly(verbose=verbose_switch)

        elif (model_type == "KNNBasic"):
            # Get configuration values
            knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
            knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
            knn_grid_search = self.config['SURPRISE_KNN'].getboolean(
                'knn_grid_search')
            knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']

            if (self.common_functions.validate_available_sql_data(
                    'knnbasic_params', sql_db) == True):
                results = pd.read_sql_query('select * from knnbasic_params;',
                                            sql_db,
                                            index_col='index')
                real_results = results[(results["knowledge"] == knowledge)
                                       & (results["algorithm"] == "knnbasic")]
                if (real_results.empty == False):
                    knn_k = int(real_results.iloc[0]['knn_k'])
                    knn_min_k = int(real_results.iloc[0]['knn_min_k'])

            algo = KNNBasic(k=knn_k,
                            min_k=knn_min_k,
                            sim_options=sim_options,
                            verbose=verbose_switch)

        elif (model_type == "KNNWithMeans"):
            # Get configuration values
            knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
            knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
            knn_grid_search = self.config['SURPRISE_KNN'].getboolean(
                'knn_grid_search')
            knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']

            if (self.common_functions.validate_available_sql_data(
                    'knnwithmeans_params', sql_db) == True):
                results = pd.read_sql_query(
                    'select * from knnwithmeans_params;',
                    sql_db,
                    index_col='index')
                real_results = results[(results["knowledge"] == knowledge) & (
                    results["algorithm"] == "knnwithmeans")]
                if (real_results.empty == False):
                    knn_k = int(real_results.iloc[0]['knn_k'])
                    knn_min_k = int(real_results.iloc[0]['knn_min_k'])

            algo = KNNWithMeans(k=knn_k,
                                min_k=knn_min_k,
                                sim_options=sim_options,
                                verbose=verbose_switch)

        elif (model_type == "KNNWithZScore"):
            # Get configuration values
            knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
            knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
            knn_grid_search = self.config['SURPRISE_KNN'].getboolean(
                'knn_grid_search')
            knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']

            if (self.common_functions.validate_available_sql_data(
                    'knnwithzscore_params', sql_db) == True):
                results = pd.read_sql_query(
                    'select * from knnwithzscore_params;',
                    sql_db,
                    index_col='index')
                real_results = results[(results["knowledge"] == knowledge) & (
                    results["algorithm"] == "knnwithzscore")]
                if (real_results.empty == False):
                    knn_k = int(real_results.iloc[0]['knn_k'])
                    knn_min_k = int(real_results.iloc[0]['knn_min_k'])

            algo = KNNWithZScore(k=knn_k,
                                 min_k=knn_min_k,
                                 sim_options=sim_options,
                                 verbose=verbose_switch)

        elif (model_type == "KNNBaseline"):
            # Get configuration values
            knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
            knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
            knn_grid_search = self.config['SURPRISE_KNN'].getboolean(
                'knn_grid_search')
            knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']

            if (self.common_functions.validate_available_sql_data(
                    'knnbaseline_params', sql_db) == True):
                results = pd.read_sql_query(
                    'select * from knnbaseline_params;',
                    sql_db,
                    index_col='index')
                real_results = results[(results["knowledge"] == knowledge) &
                                       (results["algorithm"] == "knnbaseline")]
                if (real_results.empty == False):
                    knn_k = int(real_results.iloc[0]['knn_k'])
                    knn_min_k = int(real_results.iloc[0]['knn_min_k'])

            algo = KNNBaseline(k=knn_k,
                               min_k=knn_min_k,
                               sim_options=sim_options,
                               verbose=verbose_switch)

        elif (model_type == "SlopeOne"):
            algo = SlopeOne()

        elif (model_type == "CoClustering"):
            # Get configuration values
            cc_grid_search = self.config['SURPRISE_COCLUSTERING'].getboolean(
                'cc_grid_search')
            cc_grid_metric = self.config['SURPRISE_COCLUSTERING'][
                'cc_grid_metric']
            cc_n_cltr_u = int(
                self.config['SURPRISE_COCLUSTERING']['cc_n_cltr_u'])
            cc_n_cltr_i = int(
                self.config['SURPRISE_COCLUSTERING']['cc_n_cltr_i'])
            cc_n_epochs = int(
                self.config['SURPRISE_COCLUSTERING']['cc_n_epochs'])

            if (self.common_functions.validate_available_sql_data(
                    'coclustering_params', sql_db) == True):
                results = pd.read_sql_query(
                    'select * from coclustering_params;',
                    sql_db,
                    index_col='index')
                real_results = results[(results["knowledge"] == knowledge) & (
                    results["algorithm"] == "coclustering")]
                if (real_results.empty == False):
                    cc_n_cltr_u = int(real_results.iloc[0]['cc_n_cltr_u'])
                    cc_n_cltr_i = int(real_results.iloc[0]['cc_n_cltr_i'])
                    cc_n_epochs = int(real_results.iloc[0]['cc_n_epochs'])

            algo = CoClustering(n_cltr_u=cc_n_cltr_u,
                                n_cltr_i=cc_n_cltr_i,
                                n_epochs=cc_n_epochs,
                                verbose=verbose_switch)
        else:
            return {
                "status": False,
                "result": "Defined model_type does not exist"
            }

        st = default_timer()
        print("STARTING to train model: " + str(model_name))
        algo.fit(trainset)
        train_model_runtime = default_timer() - st
        # Store the processing time in the database
        self.common_functions.save_process_time(
            st,
            event=str(model_name) + "_training",
            description="Time for model to be trained on dataset")

        # Save the model
        # Create the directory if it does not exist
        if (os.path.isdir(self.models_path + model_name) == False):
            try:
                os.makedirs(self.models_path + model_name)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    return {"status": False, "result": e}
        # Store the model on the file system
        #file_name =  self.models_path+model_name+"/model"
        #dump.dump(file_name, algo=algo)

        st = default_timer()
        print("STARTING to generate predictions with the trained model: " +
              str(model_name))
        predictions = algo.test(testset)
        runtime = default_timer() - st

        print(
            "Total runtime for generating Surprise predictions:",
            round(runtime, 2))
        self.common_functions.save_process_time(
            st,
            event=str(model_name) + "_generate_recommendations",
            description="Time for predictions to be generated using the model")

        # Save the predictions for hybridization
        # Create the directory if it does not exist
        if (os.path.isdir(self.models_path + model_name + "/predictions/" +
                          str(k_fold)) == False):
            try:
                os.makedirs(self.models_path + model_name + "/predictions/" +
                            str(k_fold))
            except OSError as e:
                if e.errno != errno.EEXIST:
                    return {"status": False, "result": e}

        # Store the predictions for hybridization
        eval_result = pd.DataFrame(
            [{
                'user_id': uid,
                'item_id': iid,
                'r_ui': true_r,
                'est': est
            } for uid, iid, true_r, est, _ in predictions],
            columns=['user_id', 'item_id', 'r_ui', 'est'])
        eval_result.to_csv(path_or_buf=self.models_path + model_name +
                           "/predictions/" + str(k_fold) + "/predictions.csv",
                           encoding='latin1',
                           sep=';',
                           index=False)

        # ---------------------------

        if (system_eval == False):
            # Process and evaluate the recommendations for the model
            st = default_timer()
            print("STARTING to evaluate recommendations with model: " +
                  str(model_name))
            process_evaluate_result = self.evaluation.surprise_process_evaluate(
                predictions,
                knowledge,
                model_name,
                result_name,
                train_model_runtime,
                k_recommend,
                sql_db,
                k_fold,
                is_surprise=True)
            # Store the processing time in the database
            self.common_functions.save_process_time(
                st,
                event=str(model_name) + "_evaluate_model",
                description="Time for model to be evaluated in test dataset")
            if (process_evaluate_result["status"] == True):
                del (process_evaluate_result)
                return {"status": True, "result": ""}
            else:
                del (process_evaluate_result)
                return {
                    "status": False,
                    "result":
                    "content_explicit could not be executed correctly"
                }
        else:
            print("decide what to do")
            #result_model.save(self.models_path+model)

        return {"status": True, "result": ""}
Example #14
file_path6 = os.path.expanduser('~/Downloads/CS5344 Project/surprise/surprise/data/ratingsProcessed6m.csv')

reader = Reader(line_format='user item rating', sep=',')

data6 = Dataset.load_from_file(file_path6, reader=reader)

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset6, testset6 = train_test_split(data6, test_size=.25)

# Choose the algorithm used to compute RMSE: each assignment below overrides
# the previous one, so uncomment exactly one
# algo = SVD()
# algo = BaselineOnly()
# algo = KNNBasic()
# algo = SlopeOne()
# algo = CoClustering()
# algo = SVDpp()
# algo = NMF()
algo = NormalPredictor()

# Train the algorithm on the trainset, and predict ratings for the testset
start = time.time()
algo.fit(trainset6)
predictions = algo.test(testset6)
accuracy.rmse(predictions)
end = time.time()

elapsed = end - start
print(elapsed)

Example #15
# Fit the grid search (gs and data come from earlier in the original script)
gs.fit(data)

# * print the essentials of what just happened
print("Best Score\n", gs.best_score)
print("Best Params\n", gs.best_params)
print("Best Estimators\n", gs.best_estimator)
print("Best Index\n", gs.best_index)
print("Results Dicts: \n")
results_df = pd.DataFrame.from_dict(gs.cv_results)
print(results_df)

# * define a cross-validation iterator
kf = KFold(n_splits=5)

# * Choosing Co-Clustering as algorithm
algo = CoClustering()

# * Train the algorithm on the trainset, and predict ratings for the testset
for trainset, testset in kf.split(data):
    predictions = algo.fit(trainset).test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    accuracy.mse(predictions)
    accuracy.fcp(predictions)
    print("Precision: ",
          sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall: ", sum(rec for rec in recalls.values()) / len(recalls))

df = pd.DataFrame(predictions, columns=["uid", "iid", "rui", "est", "details"])
df["err"] = abs(df.est - df.rui)
Example #16
    def __init__(self, module_type, baseline_type, cf_type, similar, sim_type, params):
        assert baseline_type in {"ALS", "SGD", "default"}
        assert cf_type in {None, "base_user", "base_item"}
        assert similar in {None, "COSINE", "cosine", "MSD", "msd", "PEARSON", "pearson",
                           "PEARSON_BASELINE", "pearson_baseline", "JACCARD", "jaccard",
                           "EUCLIDEAN", "euclidean"}
        assert sim_type in {None, "default"}
        self.module_type = module_type
        self.baseline_type = baseline_type
        self.cf_type = cf_type
        self.similar = similar
        self.sim_type = sim_type
        self.bu = None
        self.bi = None
        self.sim = None
        if self.baseline_type == "ALS":
            bsl_options = {'method': params["bsl_options"].get("method", 'als'),
                           'n_epochs': params["bsl_options"].get("n_epochs", 10),
                           'reg_u': params["bsl_options"].get("reg_u", 15),
                           'reg_i': params["bsl_options"].get("reg_i", 10)
                           }
        elif self.baseline_type == "SGD":
            bsl_options = {'method':  params["bsl_options"].get("method", 'sgd'),
                           'n_epochs': params["bsl_options"].get("n_epochs", 20),
                           'reg': params["bsl_options"].get("reg", 0.02),
                           'learning_rate': params["bsl_options"].get("learning_rate", 0.005)
                           }
        else:   # library defaults
            bsl_options = {}
        params["sim_options"] = {}

        if self.cf_type == "base_user":
            params["sim_options"]["user_based"] = True
        elif self.cf_type == "base_item":
            params["sim_options"]["item_based"] = False
        else:
            params["sim_options"]["user_based"] = True

        if self.similar == "COSINE" or self.similar == "cosine":
            params["sim_options"]["name"] = "cosine"
        elif self.similar == "MSD" or self.similar == "msd":
            params["sim_options"]["name"] = "msd"
        elif self.similar == "PEARSON" or self.similar == "pearson":
            params["sim_options"]["name"] = "pearson"
        elif self.similar == "PEARSON_BASELINE" or self.similar == "pearson_baseline":
            params["sim_options"]["name"] = "pearson_baseline"
        elif self.similar == "JACCARD" or self.similar == "jaccard":
            params["sim_options"]["name"] = "jaccard"
        elif self.similar == "EUCLIDEAN" or self.similar == "euclidean":
            params["sim_options"]["name"] = "euclidean"
        else:
            params["sim_options"]["name"] = "msd"

        if self.sim_type == "default":
            sim_options = {}
        else:
            sim_options = {"name": params["sim_options"].get("name", "MSD"),
                           "user_based": params["sim_options"].get("user_based", True),
                           "min_support": params["sim_options"].get("min_support", 5),
                           "shrinkage": params["sim_options"].get("shrinkage", 100)
                           }

            """
            'name':要使用的相似性名称,如similarities模块中所定义 。默认值为'MSD'。
            'user_based':将计算用户之间还是项目之间的相似性。这对预测算法的性能有巨大影响。默认值为True。
            'min_support':相似度不为零的最小公共项数('user_based' 为'True'时)或最小公共用户数('user_based'为 'False'时)。
            简单地说,如果 |Iuv|<min_support 然后 sim(u,v)=0。项目也是如此。
            'shrinkage':
            """
        if self.module_type == "KNNmeans":
            # Like KNNBasic, but also accounts for each user's or item's mean rating
            self.model = KNNWithMeans(k=params.get("k", 40),
                                      min_k=params.get("min_k", 1),
                                      sim_options=sim_options,
                                      verbose=params.get("verbose", True))
        elif self.module_type == "KNNzscore":
            # Adds the idea of a z-score normalization
            self.model = KNNWithZScore(k=params.get("k", 40),
                                       min_k=params.get("min_k", 1),
                                       sim_options=sim_options,
                                       verbose=params.get("verbose", True))
        elif self.module_type == "KNNbase":
            # Unlike KNNWithMeans, uses baseline estimates (biases) rather than means
            self.model = KNNBaseline(k=params.get("k", 40),
                                     min_k=params.get("min_k", 1),   # 最少的邻居个数
                                     sim_options=sim_options,
                                     bsl_options=bsl_options,
                                     verbose=params.get("verbose", True))
        elif self.module_type == "KNNbasic":
            # The most basic KNN algorithm; can be user-based or item-based
            self.model = KNNBasic(k=params.get("k", 40),
                                  min_k=params.get("min_k", 1),
                                  sim_options=sim_options,
                                  verbose=params.get("verbose", True))
        elif self.module_type == "SVD":
            self.model = SVD(n_factors=params.get("n_factors", 100),
                             n_epochs=params.get("n_epochs", 20),
                             init_mean=params.get("init_mean", 0),
                             init_std_dev=params.get("init_std_dev", 0.1),
                             lr_all=params.get("lr_all", 0.005),
                             reg_all=params.get("reg_all", 0.02),
                             lr_bu=params.get("lr_bu", None),
                             lr_bi=params.get("lr_bi", None),
                             lr_pu=params.get("lr_pu", None),
                             lr_qi=params.get("lr_qi", None),
                             reg_bu=params.get("reg_bu", None),
                             reg_bi=params.get("reg_bi", None),
                             reg_pu=params.get("reg_pu", None),
                             reg_qi=params.get("reg_qi", None),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False)
                             )
            """
            n_factors –因素数。默认值为100。
            n_epochs – SGD过程的迭代次数。默认值为 20。
            偏见(bool)–是否使用基线(或偏见)。请参阅上面的注释。默认值为True。
            init_mean –因子向量初始化的正态分布平均值。默认值为0。
            init_std_dev –因子向量初始化的正态分布的标准偏差。默认值为0.1。
            lr_all –所有参数的学习率。默认值为0.005。
            reg_all –所有参数的正则项。默认值为 0.02。
            lr_bu –的学习率bu。lr_all如果设置优先 。默认值为None。
            lr_bi –的学习率bi。lr_all如果设置优先 。默认值为None。
            lr_pu –的学习率pu。lr_all如果设置优先 。默认值为None。
            lr_qi –的学习率qi。lr_all如果设置优先 。默认值为None。
            reg_bu –的正则化术语bu。reg_all如果设置优先。默认值为None。
            reg_bi –的正则化术语bi。reg_all如果设置优先。默认值为None。
            reg_pu –的正则化术语pu。reg_all如果设置优先。默认值为None。
            reg_qi –的正则化术语qi。reg_all如果设置优先。默认值为None。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。
            如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用 fit()。
            如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为 None。
            详细 –如果True,则打印当前纪元。默认值为False。
            """
        elif self.module_type == "SVDpp":
            self.model = SVDpp(n_factors=params.get("n_factors", 100),
                               n_epochs=params.get("n_epochs", 20),
                               init_mean=params.get("init_mean", 0),
                               init_std_dev=params.get("init_std_dev", 0.1),
                               lr_all=params.get("lr_all", 0.005),
                               reg_all=params.get("reg_all", 0.02),
                               lr_bu=params.get("lr_bu", None),
                               lr_bi=params.get("lr_bi", None),
                               lr_pu=params.get("lr_pu", None),
                               lr_qi=params.get("lr_qi", None),
                               reg_bu=params.get("reg_bu", None),
                               reg_bi=params.get("reg_bi", None),
                               reg_pu=params.get("reg_pu", None),
                               reg_qi=params.get("reg_qi", None),
                               random_state=params.get("random_state", None),
                               verbose=params.get("verbose", False))
            """
            n_factors –因素数。默认值为20。
            n_epochs – SGD过程的迭代次数。默认值为
            20。
            init_mean –因子向量初始化的正态分布平均值。默认值为0。
            init_std_dev –因子向量初始化的正态分布的标准偏差。默认值为0
            .1。
            lr_all –所有参数的学习率。默认值为0
            .007。
            reg_all –所有参数的正则项。默认值为
            0.02。
            lr_bu –的学习率bu。lr_all如果设置优先 。默认值为None。
            lr_bi –的学习率bi。lr_all如果设置优先 。默认值为None。
            lr_pu –的学习率pu。lr_all如果设置优先 。默认值为None。
            lr_qi –的学习率qi。lr_all如果设置优先 。默认值为None。
            lr_yj –的学习率yj。lr_all如果设置优先 。默认值为None。
            reg_bu –的正则化术语bu。reg_all如果设置优先。默认值为None。
            reg_bi –的正则化术语bi。reg_all如果设置优先。默认值为None。
            reg_pu –的正则化术语pu。reg_all如果设置优先。默认值为None。
            reg_qi –的正则化术语qi。reg_all如果设置优先。默认值为None。
            reg_yj –的正则化术语yj。reg_all如果设置优先。默认值为None。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用
            fit()。如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为
            None。
            详细 –如果True,则打印当前纪元。默认值为False。
            """
        elif self.module_type == "NMF":
            # Non-negative matrix factorization: the p and q factor matrices are constrained to be non-negative
            self.model = NMF(n_factors=params.get("n_factors", 100),
                             n_epochs=params.get("n_epochs", 20),
                             init_mean=params.get("init_mean", 0),
                             init_std_dev=params.get("init_std_dev", 0.1),
                             lr_all=params.get("lr_all", 0.005),
                             reg_all=params.get("reg_all", 0.02),
                             lr_bu=params.get("lr_bu", None),
                             lr_bi=params.get("lr_bi", None),
                             lr_pu=params.get("lr_pu", None),
                             lr_qi=params.get("lr_qi", None),
                             reg_bu=params.get("reg_bu", None),
                             reg_bi=params.get("reg_bi", None),
                             reg_pu=params.get("reg_pu", None),
                             reg_qi=params.get("reg_qi", None),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False))

            """
            n_factors –因素数。默认值为15。
            n_epochs – SGD过程的迭代次数。默认值为 50。
            偏见(bool)–是否使用基线(或偏见)。默认值为 False。
            reg_pu –用户的正则化术语λu。默认值为 0.06。
            reg_qi –项目的正规化术语λi。默认值为 0.06。
            reg_bu –的正则化术语bu。仅与偏置版本相关。默认值为0.02。
            reg_bi –的正则化术语bi。仅与偏置版本相关。默认值为0.02。
            lr_bu –的学习率bu。仅与偏置版本相关。默认值为0.005。
            lr_bi –的学习率bi。仅与偏置版本相关。默认值为0.005。
            init_low –因子的随机初始化的下限。必须大于0以确保非负因素。默认值为 0。
            init_high –因子的随机初始化的上限。默认值为1。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。
            如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用 fit()。
            如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为 None。
            详细 –如果True,则打印当前纪元。默认值为False。
            """
        elif self.module_type == "SlopeOne":
            self.model = SlopeOne(**params)

        elif self.module_type == "cc":
            # Clustering-based collaborative filtering
            self.model = CoClustering(n_cltr_u=params.get("n_cltr_u", 3),
                                      n_cltr_i=params.get("n_cltr_i", 3),
                                      n_epochs=params.get("n_epochs", 20),
                                      random_state=params.get("random_state", None),
                                      verbose=params.get("verbose",False)
                                      )
            """
            n_cltr_u(int)–用户集群的数量。默认值为3。
            n_cltr_i(int)–项目集群的数量。默认值为3。
            n_epochs(int)–优化循环的迭代次数。默认值为 20。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。
            如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用 fit()。
            如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为 None。
            详细(bool)–如果为True,则将打印当前纪元。默认值为 False。
            """

        elif self.module_type == "BaselineOnly":
            # Uses only the baseline estimate; does not model individual user preferences
            self.model = BaselineOnly(bsl_options=bsl_options, verbose=True)

        elif self.module_type == "Np":
            # Random prediction: assumes test ratings follow a normal distribution and samples predictions from it
            self.model = NormalPredictor()
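
A hypothetical way to instantiate this wrapper (the enclosing class name is not shown in the snippet, so "Recommender" is assumed, and trainset is a surprise Trainset built elsewhere):

params = {"k": 20, "min_k": 2, "bsl_options": {}, "verbose": False}
rec = Recommender(module_type="KNNbase",
                  baseline_type="ALS",
                  cf_type="base_item",
                  similar="pearson_baseline",
                  sim_type=None,
                  params=params)
rec.model.fit(trainset)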
Example #17
#              'n_epochs': [10,20,30,40,50,60,70,80,90,100]}


# Evaluate the model with 5-fold cross validation
#data.split(5)

#grid_search = GridSearch(CoClustering, param_grid, measures=['RMSE'])
#grid_search.evaluate(data)
#print ("after grid_search.evaluate(data)")
#print_perf(perf)

#results_df = pd.DataFrame.from_dict(grid_search.cv_results)
#print(results_df) """

# create a co-clustering algorithm
algo = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=100)
algo.fit(trainset)

# use the trained algorithm to predict ratings for every user in the test set
f = open('testOutput.csv', 'w')
f.write("test_id,rating\n")
for i in range(len(dftest)):
    prediction = algo.predict(dftest.at[i, 'user_id'],
                              dftest.at[i, 'business_id'],
                              r_ui=4,
                              verbose=True)
    predRating = prediction.est
    f.write(str(i) + "," + str(predRating) + '\n')

f.close()
Example #18
from surprise.model_selection import cross_validate, GridSearchCV

# SVD test
svd = SVD()
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)      # MSE 0.052

param_svd = {'n_factors': [50, 100], 'lr_all': [0.003, 0.005],
             'reg_all': [0.05, 0.1, 0.5]}
gs = GridSearchCV(SVD, param_svd, measures=['RMSE'], cv=5)
gs.fit(data) # RMSE 0.2272 ~ 0.2284, after many tests notice 0.2272 is a benchmark, 100, 0.003, 0.1

# Co-clustering test
coc = CoClustering()
cross_validate(coc, data, measures=['RMSE'], cv=5, verbose=True)     # MSE 0.053

param_coc = {'n_cltr_u': [3, 5, 7], 'n_cltr_i': [3, 5, 7],
             'n_epochs': [10, 20]}
gs = GridSearchCV(CoClustering, param_coc, measures=['RMSE'], cv=5)
gs.fit(data)  # generally worse than SVD here, especially for larger cluster numbers

# Non-negative Matrix Factorization
nmf = NMF()
cross_validate(nmf, data, measures=['RMSE'], cv=5, verbose=True)    # MSE 0.053

param_nmf = {'n_factors': [5, 15], 'reg_qi': [0.06, 0.1], 'biased': [True], 'reg_pu': [0.06, 0.1], 'n_epochs': [20, 50]}
gs = GridSearchCV(NMF, param_nmf, measures=['RMSE'], cv=5)
Exemple #19
0
    def train(self, algo='SVD', like=True, test='cv', local=False):

        if local:
            csv_path = os.path.join(os.path.dirname(__file__),
                                    "data/preprocessed")
            self.recipes = pd.read_csv(f"{csv_path}/recipe_pp.csv")
            self.reviews = pd.read_csv(f"{csv_path}/review_pp.csv")
        else:
            self.recipes = storage.import_file('data/preprocessed',
                                               'recipe_pp.csv')
            self.reviews = storage.import_file('data/preprocessed',
                                               'review_pp.csv')

        if like:
            self.target = 'liked'
            self.s_min = 0
            self.s_max = 1
        else:
            self.target = 'rating'
            self.s_min = 1
            self.s_max = 5

        reader = Reader(rating_scale=(self.s_min, self.s_max))

        self.relevant_data = self.reviews[[
            'user_id', 'recipe_id', self.target
        ]]
        model_data = Dataset.load_from_df(self.relevant_data, reader)

        # Algos

        if algo == 'NormalPredictor':
            self.algorithm = NormalPredictor()

        elif algo == 'BaselineOnly':
            self.algorithm = BaselineOnly()

        elif algo == 'KNNBasic':
            self.algorithm = KNNBasic()

        elif algo == 'KNNWithMeans':
            self.algorithm = KNNWithMeans()

        elif algo == 'KNNWithZScore':
            self.algorithm = KNNWithZScore()

        elif algo == 'KNNBaseline':
            self.algorithm = KNNBaseline()

        elif algo == 'SVD':
            params = {
                'n_epochs': 20,
                'n_factors': 100,
                'lr_all': 0.002,
                'reg_all': 0.02
            }
            self.algorithm = SVD(**params)  # Tuned with svd_grid

        elif algo == 'SVDpp':
            self.algorithm = SVDpp()

        elif algo == 'NMF':
            self.algorithm = NMF()

        elif algo == 'SlopeOne':
            self.algorithm = SlopeOne()

        elif algo == 'CoClustering':
            self.algorithm = CoClustering()

        if test == 'cv':
            cv_results = cross_validate(self.algorithm,
                                        model_data,
                                        measures=['RMSE', 'MAE'],
                                        cv=5,
                                        verbose=True)
            rmse = np.round(cv_results['test_rmse'].mean(), 3)
            mae = np.round(cv_results['test_mae'].mean(), 3)
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        elif test == 'svd_grid':
            param_grid = {
                'n_epochs': [10, 20],
                'n_factors': [100, 200],
                'lr_all': [0.001, 0.002],
                'reg_all': [0.01, 0.02]
            }
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(model_data)
            rmse = gs.best_score['rmse']
            mae = gs.best_score['mae']
            print(gs.best_params['rmse'], gs.best_params['mae'])
            self.algorithm = gs.best_estimator['rmse']
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        else:
            train, test = train_test_split(model_data,
                                           test_size=0.3,
                                           random_state=42)
            self.algorithm.fit(train)
            predictions = self.algorithm.test(test)
            rmse = np.round(accuracy.rmse(predictions), 3)
            mae = np.round(accuracy.mae(predictions), 3)

        return rmse, mae
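    # Usage sketch (the enclosing class is not shown in this snippet, so the
    # name `Recommender` below is hypothetical):
    #   rec = Recommender()
    #   rmse, mae = rec.train(algo='SVD', like=True, test='cv', local=True)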
Exemple #20
0
predictions = algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)
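# The "unbiased accuracy on B" pattern above typically comes from splitting the
# raw ratings into a tuning set A and a held-out set B beforehand, roughly like
# this sketch (assuming `data` is the full Dataset and `algo` was tuned on A):
#
#   import random
#   raw_ratings = data.raw_ratings
#   random.shuffle(raw_ratings)
#   threshold = int(0.9 * len(raw_ratings))
#   data.raw_ratings = raw_ratings[:threshold]                  # set A
#   testset = data.construct_testset(raw_ratings[threshold:])   # set B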


# comparing models:
# Spot Check Algorithms
models = []
models.append(('BaselineOnly', BaselineOnly()))
models.append(('KNNBasic', KNNBasic()))
models.append(('KNNWithMeans', KNNWithMeans()))
models.append(('KNNWithZScore', KNNWithZScore()))
models.append(('SVD', SVD()))
models.append(('NMF', NMF()))
models.append(('SlopeOne', SlopeOne()))
models.append(('CoClustering', CoClustering()))

# evaluate each model in turn
results = []
names = []
for name, model in models:

    # define a cross-validation iterator
    kf = KFold(n_splits=3)
    algo = model
    fold_rmses = []
    for trainset, testset in kf.split(data):
        # train and test the algorithm
        algo.fit(trainset)
        predictions = algo.test(testset)
        # compute and print Root Mean Squared Error for this fold
        fold_rmses.append(accuracy.rmse(predictions, verbose=True))
    # keep the per-fold scores so the models can be compared afterwards
    results.append(fold_rmses)
    names.append(name)
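# A minimal summary step (an addition, assuming `names` and `results` were
# populated as above): mean RMSE per algorithm across the folds.
for name, fold_rmses in zip(names, results):
    print('%s: mean RMSE = %.4f' % (name, sum(fold_rmses) / len(fold_rmses)))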
Exemple #21
0
def surprise_algorithms_print_perf():
    print('Surprise Algorithms (final results table)...')
    print('Which dataset do you want to use?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding workaround so the input file can be read without errors (Python 2 only).
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=5)

    # NormalPredictor
    algo_normal_predictor = NormalPredictor()
    perf_normal_predictor = evaluate(algo_normal_predictor,
                                     data,
                                     measures=['RMSE', 'MAE'],
                                     verbose=False)

    # SVD
    algo_svd = SVD()
    perf_svd = evaluate(algo_svd,
                        data,
                        measures=['RMSE', 'MAE'],
                        verbose=False)

    # BaselineOnly
    algo_baseline_only = BaselineOnly()
    perf_baseline_only = evaluate(algo_baseline_only,
                                  data,
                                  measures=['RMSE', 'MAE'],
                                  verbose=False)

    # SVDpp
    algo_svdpp = SVDpp()
    perf_svdpp = evaluate(algo_svdpp,
                          data,
                          measures=['RMSE', 'MAE'],
                          verbose=False)

    # NMF
    algo_nmf = NMF()
    perf_nmf = evaluate(algo_nmf,
                        data,
                        measures=['RMSE', 'MAE'],
                        verbose=False)

    # SlopeOne
    algo_slope_one = SlopeOne()
    perf_slope_one = evaluate(algo_slope_one,
                              data,
                              measures=['RMSE', 'MAE'],
                              verbose=False)

    # CoClustering
    algo_coclustering = CoClustering()
    perf_coclustering = evaluate(algo_coclustering,
                                 data,
                                 measures=['RMSE', 'MAE'],
                                 verbose=False)
    """Segmento que utiliza KNN para el analisis:
        'k' Es el numero maximo de vecinos a tomar en cuenta para la agregacion
        'min_k' El numero minimo de vecinos a tomar en cuenta para la agregacion.
            Si no hay suficientes vecinos,la predicción se establece en la media global de todas las calificaciones
        'sim_options' son las opciones de similitud que utiliza el knn
        'bsl_options' configuracion de las estimaciones de base"""

    k = 40
    min_k = 1
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': 0  # 0 is falsy, i.e. item-based similarities
    }
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

    algo_knn_basic = KNNBasic(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_basic = evaluate(algo_knn_basic,
                              data,
                              measures=['RMSE', 'MAE'],
                              verbose=False)

    algo_knn_with_means = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_with_means = evaluate(algo_knn_with_means,
                                   data,
                                   measures=['RMSE', 'MAE'],
                                   verbose=False)

    algo_knn_base_line = KNNBaseline(k=k,
                                     min_k=min_k,
                                     sim_options=sim_options,
                                     bsl_options=bsl_options)
    perf_knn_base_line = evaluate(algo_knn_base_line,
                                  data,
                                  measures=['RMSE', 'MAE'],
                                  verbose=False)
    """Imprimiendo resultados de los algoritmos"""
    print('')
    print('Printing results from algorithms...')
    print('- Normal predictor')
    print_perf(perf_normal_predictor)
    print('')
    print('- Normal SVD')
    print_perf(perf_svd)
    print('')
    print('- Normal Baseline Only')
    print_perf(perf_baseline_only)
    print('')
    print('- Normal SVD++')
    print_perf(perf_svdpp)
    print('')
    print('- Normal NMF')
    print_perf(perf_nmf)
    print('')
    print('- Normal Slope One')
    print_perf(perf_slope_one)
    print('')
    print('- Normal Co-Clustering')
    print_perf(perf_coclustering)
    print('')
    print('- Normal KNN Basic')
    print_perf(perf_knn_basic)
    print('')
    print('- Normal KNN With Means')
    print_perf(perf_knn_with_means)
    print('')
    print('- Normal KNN Base Line')
    print_perf(perf_knn_base_line)
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy
from surprise import SVD, SVDpp, SlopeOne, NMF, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, NormalPredictor, CoClustering

ratings_test = pd.read_csv('/Users/chrisjohanson/Desktop/Capstone 2/ratings.csv').set_index('rating_id')
ratings_test = ratings_test.sample(frac=1)[:200000]
print('----------Data is ready----------')

#Load the dataset, save raw ratings to variable
reader = Reader(rating_scale=(1.0, 5.0))
dataset_test = Dataset.load_from_df(ratings_test, reader)

#put together list of algorithms to test out (1 out of 3 lists total)
algorithms3 = [KNNWithZScore(), BaselineOnly(), CoClustering()]

#create empty list to store results data
benchmark = []

#iterate through each algorithm and save results info to benchmark
for algo in algorithms3:
    results = cross_validate(algo, dataset_test, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    print(f"{algo} complete")

#create df with the results
results_df = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
#save the results as a csv
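# (a minimal completion of the comment above; the output file name is an assumption)
results_df.to_csv('surprise_benchmark_results.csv')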