def __init__(self, data_items, ratings_train, ratings_validation):
    """Set up the strategy: store the data, load the precomputed explanation
    matrix from CSV, and fit the Funk SVD model on the training ratings.
    """
    self.name = 'SVD_Explainable_2'
    self.data_items = data_items
    self.ratings_train = ratings_train
    self.ratings_validation = ratings_validation
    # Precomputed user-based explanation scores, persisted to disk.
    self.explanations_matrix = pd.read_csv('explanation_matrix_user_based.csv')
    # NOTE(review): this SVD variant takes the explanation matrix as its first
    # argument but is still fit on the raw ratings — presumably a custom
    # explainability-regularized SVD; confirm against its definition.
    self.svd = SVD(
        self.explanations_matrix,
        learning_rate=0.005,
        regularization=0.005,
        n_epochs=1000,
        n_factors=15,
        min_rating=1,
        max_rating=2,
        lambda_=0.000,
    )
    self.svd.fit(X=ratings_train, X_val=ratings_validation,
                 early_stopping=True, shuffle=False)
def save_factorized():
    """Fit a Funk SVD on the global `rating` frame and pickle both factor
    matrices (user factors `pu`, item factors `qi`) to disk.
    """
    model = SVD(learning_rate=0.001, regularization=0.005, n_epochs=10000,
                n_factors=15, min_rating=1, max_rating=5)
    model.fit(X=rating)
    print("finish computing factorization")
    saveFileToPickle('user.pkl', model.pu)
    saveFileToPickle('book.pkl', model.qi)
def train_model(df):
    """Split *df* into 80% train / 20% validation, fit a Funk SVD with early
    stopping, and pickle the fitted model to the file 'svd_model'.

    Fix: the original opened the output file manually and only closed it after
    `pickle.dump`, leaking the handle if pickling raised — use `with` so the
    file is always closed.
    """
    train = df.sample(frac=0.8, random_state=7)
    # frac=1.0 keeps all remaining rows; the sample only shuffles them.
    val = df.drop(train.index.tolist()).sample(frac=1.0, random_state=8)
    svd = SVD(learning_rate=0.1, regularization=0.005, n_epochs=10,
              n_factors=10, min_rating=1, max_rating=10)
    svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)
    with open('svd_model', 'wb') as outfile:
        pickle.dump(svd, outfile)
def main():
    """Bagged ensemble of 2-PL IRT and Funk SVD: train `m` bootstrap models of
    each kind, average all 2m probability predictions on the private test set,
    threshold at 0.5, and write the binary predictions back out.
    """
    # Hyperparameters
    m = 5  # Number of bootstrap resamples
    irt_iters, irt_lr = 30, 0.009
    svd_lr, svd_reg, svd_k, svd_iters = 0.01, 0.10, 50, 500

    train_data = load_train_csv("../data")
    val_data = load_valid_csv("../data")
    test_data = load_private_test_csv('../data')

    def to_svd_format(d):
        # Rename the columns to the u_id/i_id/rating schema funk-svd expects.
        return {'u_id': d['user_id'], 'i_id': d['question_id'],
                'rating': d['is_correct']}

    val_svd = to_svd_format(val_data)
    test_svd = to_svd_format(test_data)

    svd_train_resamples = generate_resamples(train_data, m)
    irt_train_resamples = generate_resamples(train_data, m)

    irt_test_pred = []
    svd_test_pred = []
    for irt_sample, svd_sample in zip(irt_train_resamples, svd_train_resamples):
        # Train 2-PL IRT on its own bootstrap resample; only theta/a/beta are
        # needed for prediction.
        theta, a, beta, *_ = irt(irt_sample, val_data, irt_lr, irt_iters)
        irt_test_pred.append(irt_predict(test_data, theta, a, beta)[0])

        # Train Funk SVD on its own resample.
        model = SVD(learning_rate=svd_lr, regularization=svd_reg,
                    n_epochs=svd_iters, n_factors=svd_k,
                    min_rating=0, max_rating=1)
        model.fit(X=pd.DataFrame(to_svd_format(svd_sample)),
                  X_val=pd.DataFrame(val_svd),
                  early_stopping=False, shuffle=False)
        svd_test_pred.append(model.predict(test_svd))

    # Average all 2m per-question predictions and binarize at 0.5.
    test_avg = np.sum(irt_test_pred + svd_test_pred, axis=0) / (2 * m)
    test_data['is_correct'] = [0 if x < 0.5 else 1 for x in test_avg]
    save_private_test_csv(test_data)
def svd(data, svd_data, lr=0.01, reg=0.1, k=10, iters=500):
    """Train a Funk SVD with the given hyperparameters and report accuracies.

    Args:
        data: dict with 'train_data' and 'val_data' (evaluation format).
        svd_data: dict with 'train_svd' and 'val_svd' (u_id/i_id/rating format).
        lr, reg, k, iters: learning rate, regularization, latent factors,
            and number of epochs.

    Returns:
        (train_acc, val_acc) tuple of accuracies from `evaluate`.

    Fix: the original named the fitted model `svd`, shadowing this function's
    own name inside its body — renamed to `model` for clarity.
    """
    train_data, val_data = data['train_data'], data['val_data']
    train_svd, val_svd = svd_data['train_svd'], svd_data['val_svd']
    model = SVD(learning_rate=lr, regularization=reg, n_epochs=iters,
                n_factors=k, min_rating=0, max_rating=1)
    model.fit(X=pd.DataFrame(train_svd), X_val=pd.DataFrame(val_svd),
              early_stopping=False, shuffle=False)
    # Train Accuracy
    train_acc = evaluate(train_data, model.predict(train_svd))
    # Validate Accuracy
    val_acc = evaluate(val_data, model.predict(val_svd))
    return train_acc, val_acc
idx = movie_id.index(row['i_id']) # 找到该位置 movie_count[idx] += 1 # 计数加一 movie_total_rating[idx] += row['rating'] else: # 否则新加入movie_id movie_id.append(row['i_id']) movie_count.append(1) movie_total_rating.append(row['rating']) print('Total movie count:', len(movie_id)) # Funk SVD for item representation train = data[data['u_id'].isin(train_id)] test = data[data['u_id'].isin(test_id)] svd = SVD(learning_rate=1e-3, regularization=0.005, n_epochs=200, n_factors=128, min_rating=0, max_rating=5) svd.fit(X=data, X_val=test, early_stopping=True, shuffle=False) item_matrix = svd.qi def get_feature(input_id): # 根据输入的movie_id得出相应的feature movie_index = np.where(movie_id == input_id) return item_matrix[movie_index] def action_mapping(input_id): ''' convert input movie id to index
def main():
    """One-at-a-time grid search over Funk SVD hyperparameters (learning rate,
    regularization, factor count, epochs), each swept with the others at their
    defaults; plot the sweeps, then train a final model with the best value of
    each and report train/validation/test accuracy.

    Fix: the epochs loop variable was named `iter`, shadowing the builtin —
    renamed to `n_iters`.
    """
    train_data = load_train_csv("../data")
    val_data = load_valid_csv("../data")
    test_data = load_public_test_csv("../data")

    # Reshape into the u_id/i_id/rating schema funk-svd expects.
    train_svd = {'u_id': train_data['user_id'],
                 'i_id': train_data['question_id'],
                 'rating': train_data['is_correct']}
    val_svd = {'u_id': val_data['user_id'],
               'i_id': val_data['question_id'],
               'rating': val_data['is_correct']}
    test_svd = {'u_id': test_data['user_id'],
                'i_id': test_data['question_id'],
                'rating': test_data['is_correct']}
    data = {"train_data": train_data, "val_data": val_data}
    svd_data = {"train_svd": train_svd, "val_svd": val_svd}

    # Candidate values for each hyperparameter sweep.
    lrs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
    regs = [0.001, 0.01, 0.05, 0.1, 0.5, 1]
    ks = [1, 5, 10, 20, 50, 100]
    iters = [10, 50, 100, 500, 1000, 2000]

    lr_train_results, lr_val_results = [], []
    reg_train_results, reg_val_results = [], []
    ks_train_results, ks_val_results = [], []
    iters_train_results, iters_val_results = [], []

    for lr in lrs:
        train_result, val_result = svd(data, svd_data, lr=lr)
        lr_train_results.append(train_result)
        lr_val_results.append(val_result)
    for reg in regs:
        train_result, val_result = svd(data, svd_data, reg=reg)
        reg_train_results.append(train_result)
        reg_val_results.append(val_result)
    for k in ks:
        train_result, val_result = svd(data, svd_data, k=k)
        ks_train_results.append(train_result)
        ks_val_results.append(val_result)
    for n_iters in iters:
        train_result, val_result = svd(data, svd_data, iters=n_iters)
        iters_train_results.append(train_result)
        iters_val_results.append(val_result)

    # Pick the value with the best validation accuracy in each sweep.
    best_lr = lrs[lr_val_results.index(max(lr_val_results))]
    print("Best learning rate: ", best_lr)
    best_reg = regs[reg_val_results.index(max(reg_val_results))]
    print("Best regularization value: ", best_reg)
    best_k = ks[ks_val_results.index(max(ks_val_results))]
    print("Best k: ", best_k)
    best_iter = iters[iters_val_results.index(max(iters_val_results))]
    print("Best iterations: ", best_iter)

    plot(lrs, lr_train_results, lr_val_results, "Learning Rates")
    plot(regs, reg_train_results, reg_val_results, "Regularized Rates")
    plot(ks, ks_train_results, ks_val_results, "K-Values")
    plot(iters, iters_train_results, iters_val_results, "Iterations")

    # Retrain once with all best values combined.
    final_svd = SVD(learning_rate=best_lr, regularization=best_reg,
                    n_epochs=best_iter, n_factors=best_k,
                    min_rating=0, max_rating=1)
    final_svd.fit(X=pd.DataFrame(train_svd), X_val=pd.DataFrame(val_svd),
                  early_stopping=False, shuffle=False)
    # Train Accuracy
    pred = final_svd.predict(train_svd)
    train_acc = evaluate(train_data, pred)
    print("Final Train Accuracy: ", train_acc)
    # Validate Accuracy
    pred = final_svd.predict(val_svd)
    val_acc = evaluate(val_data, pred)
    print("Final Validation Accuracy: ", val_acc)
    # Test Accuracy
    pred = final_svd.predict(test_svd)
    test_acc = evaluate(test_data, pred)
    print("Final Test Accuracy: ", test_acc)
import pandas as pd
import numpy as np
from funk_svd.dataset import fetch_ml20m_ratings
from funk_svd import SVD
from sklearn.metrics import mean_absolute_error

# 80/10/10 train/validation/test split of the MovieLens-20M ratings.
ratings = fetch_ml20m_ratings()
train = ratings.sample(frac=0.8, random_state=7)
holdout = ratings.drop(train.index.tolist())
val = holdout.sample(frac=0.5, random_state=8)
test = holdout.drop(val.index.tolist())

model = SVD(learning_rate=0.001, regularization=0.005, n_epochs=100,
            n_factors=15, min_rating=1, max_rating=5)
model.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

# Report held-out mean absolute error.
mae = mean_absolute_error(test["rating"], model.predict(test))
print("Test MAE: {:.2f}".format(mae))
class SVD_Explainable_2(Strategy):
    """Recommendation strategy combining a Funk SVD model with a precomputed
    user-based explanation matrix.

    Fix vs. original: `get_explanations_matrix` assigned via chained indexing
    (`df.loc[row][col] = ...`), which can silently write to a temporary copy —
    replaced with a single `df.loc[row, col] = ...` assignment. Commented-out
    dead code was removed.
    """

    def __init__(self, data_items, ratings_train, ratings_validation):
        self.name = 'SVD_Explainable_2'
        self.data_items = data_items
        self.ratings_train = ratings_train
        self.ratings_validation = ratings_validation
        # Precomputed output of get_explanations_matrix(), persisted to CSV.
        self.explanations_matrix = pd.read_csv('explanation_matrix_user_based.csv')
        self.svd = SVD(self.explanations_matrix, learning_rate=0.005,
                       regularization=0.005, n_epochs=1000, n_factors=15,
                       min_rating=1, max_rating=2, lambda_=0.000)
        self.svd.fit(X=ratings_train, X_val=ratings_validation,
                     early_stopping=True, shuffle=False)

    def get_users_of_project(self, project):
        """Return ids of users with a positive entry for `project`."""
        users_of_project = self.data_items[project]
        users_of_project = users_of_project[users_of_project > 0].index.values
        return users_of_project

    def get_user_projects(self, user_id):
        """Return projects the user has a positive entry for."""
        known_user_likes = self.data_items.loc[user_id]
        known_user_likes = known_user_likes[known_user_likes > 0].index.values
        return known_user_likes

    def calc_explanation_score_user_based(self, user_id, project, cf_user_user):
        """Fraction of the user's k=50 nearest neighbours who liked `project`."""
        k = 50
        similar_users = cf_user_user.find_k_similar_users(user_id, k=k).index
        user_liked_project = self.get_users_of_project(project)
        return len(np.intersect1d(similar_users, user_liked_project)) / len(similar_users)

    def calc_explanation_score_item_based(self, user_id, project, cf_item_item):
        """Fraction of the k=10 projects most similar to `project` that the
        user already knows."""
        k = 10
        similar_projects = cf_item_item.get_k_similar_projects(project, k=k)
        known_user_projects = self.get_user_projects(user_id)
        return len(np.intersect1d(similar_projects, known_user_projects)) / len(similar_projects)

    def get_explanations_matrix(self):
        """Build the full user x project matrix of user-based explanation
        scores. O(users * projects) — prints progress per user."""
        i = 0
        cf_user_user = CFUserUser(self.data_items)
        explanation_matrix = pd.DataFrame(0, columns=self.data_items.columns,
                                          index=self.data_items.index)
        print(explanation_matrix.shape)
        for user_id in explanation_matrix.index:
            print(i)
            i += 1
            for project in explanation_matrix.columns:
                # Single .loc[row, col] assignment: the original chained
                # .loc[user_id][project] can write to a temporary copy.
                explanation_matrix.loc[user_id, project] = \
                    self.calc_explanation_score_user_based(user_id, project, cf_user_user)
        return explanation_matrix

    def get_recommendations(self, user_index, known_user_projects, k, ip_address):
        """Return the top-k unseen, active projects for the user, ranked by
        unclipped SVD predicted rating. Also caches the full ranking on self
        for get_highest_online_project()."""
        projects_predicted_ratings = \
            [[project, self.svd.predict_pair(user_index, project, clip=False)]
             for project in self.data_items.columns
             if project not in known_user_projects]
        projects_predicted_ratings = sorted(projects_predicted_ratings,
                                            key=lambda pair: pair[1], reverse=True)
        self.projects_predicted_ratings = projects_predicted_ratings
        self.user = user_index
        ranked_projects = [pair[0] for pair in projects_predicted_ratings]
        ranked_projects = self.remove_non_active_projects(ranked_projects)
        return ranked_projects[:k]

    @staticmethod
    def remove_non_active_projects(recommended_projects):
        """Filter out projects listed as non-active by the Recommender module."""
        from Recommender import non_active_projects
        return [project for project in recommended_projects
                if project not in non_active_projects['project'].values]

    @staticmethod
    def remove_unreachable_projects(recommended_projects, ip_address):
        """Keep only projects reachable from the user's geolocated position."""
        user_loc = get_user_loc(ip_address)
        return [project for project in recommended_projects
                if is_project_reachable_to_user(user_loc, project)]

    def get_highest_online_project(self):
        """Return the highest-ranked online project from the cached ranking,
        or a default online recommendation if none is online."""
        from Recommender import is_online_project, recommend_default_online
        online_similar_projects = list(filter(lambda x: is_online_project(x[0]),
                                              self.projects_predicted_ratings))
        if len(online_similar_projects) == 0:
            return recommend_default_online(self.user)
        return online_similar_projects[0][0]
from funk_svd.dataset import fetch_ml_ratings
from funk_svd import SVD
from sklearn.metrics import mean_absolute_error

# MovieLens-100k split into 80% train, 10% validation, 10% test.
ratings = fetch_ml_ratings(variant='100k')
train_set = ratings.sample(frac=0.8, random_state=7)
holdout = ratings.drop(train_set.index.tolist())
val_set = holdout.sample(frac=0.5, random_state=8)
test_set = holdout.drop(val_set.index.tolist())

# Early stopping / shuffling are constructor options in this funk-svd API.
model = SVD(lr=0.001, reg=0.005, n_epochs=100, n_factors=15,
            early_stopping=True, shuffle=False, min_rating=1, max_rating=5)
model.fit(X=train_set, X_val=val_set)

mae = mean_absolute_error(test_set['rating'], model.predict(test_set))
print(f'Test MAE: {mae:.2f}')
import numpy as np
import pandas as pd  # Fix: pd.melt is used below but pandas was never imported
from funk_svd.dataset import fetch_ml_ratings
from funk_svd import SVD
from sklearn.metrics import mean_absolute_error

# MovieLens-100k split into 80% train, 10% validation, 10% test.
df = fetch_ml_ratings(variant='100k')
train = df.sample(frac=0.8, random_state=7)
val = df.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
test = df.drop(train.index.tolist()).drop(val.index.tolist())

svd = SVD(learning_rate=0.001, regularization=0.005, n_epochs=100,
          n_factors=15, min_rating=1, max_rating=5)

df_matrix_original = svd.get_utility_matrix(df)
print("Original Utility Matrix: \n", df_matrix_original.values)

# Getting all u_id and i_id combinations
df_user_item = pd.melt(df_matrix_original.reset_index(drop=False), id_vars='u_id')

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)
pred_test = svd.predict(test)
# Fill every user/item combination with the model's predicted rating.
df_user_item["rating"] = svd.predict(df_user_item)