def get_splib_predictions(name, training_data, testing_data, kwargs):
    '''
    Train one of the surpriseLib-based models and score its predictions.

    name          : name of the algorithm to be used (case-insensitive).
                    'KNN', 'cluster' and 'SVD' are supported.
    training_data : training matrix (must support .tocoo(), .nonzero() and
                    indexing followed by .toarray(); presumably a scipy
                    sparse matrix - TODO confirm). It is NOT used to build
                    the training set when kwargs['loaddata'] is truthy and
                    kwargs['path'] is given, but it is still used to compute
                    the training RMSE whenever it is not None.
    testing_data  : testing matrix (same kind of object) used for
                    calculating the RMSE of the prediction.
    kwargs        : dict of options. 'loaddata' / 'path' select loading the
                    training set from disk; the whole dict is also forwarded
                    to the chosen algorithm, which reads its own keys:
                    SVD:     { 'k_features', 'maxiter', 'lr_pu', 'lr_qi',
                               'reg_bu', 'reg_qi', 'random_seed' }
                    KNN:     { 'n_neigbor', 'min_neigbor', 'similarity' }
                    cluster: { 'user_cluster', 'item_cluster', 'maxiter',
                               'random_seed' }

    Returns (prediction, trainrmse, testrmse). trainrmse is None when
    training_data is None (data loaded from kwargs['path']).

    Exits through sys.exit (SystemExit) when no training data is available
    or when the algorithm name is not supported.
    '''
    # Choose the training set:
    # 1. if 'loaddata' is truthy and 'path' is given, load the data from path
    # 2. else if training_data is given, convert it
    # 3. else there is no available data, so an error is reported
    if kwargs.get('loaddata') and kwargs.get('path') is not None:
        trainset = load_data_forSP(kwargs['path'])
    elif training_data is not None:
        coo = training_data.tocoo(copy=False)
        ratings_sp_df = pd.DataFrame(
            {'item': coo.row, 'user': coo.col, 'rating': coo.data}
        )[['item', 'user', 'rating']].reset_index(drop=True)
        # A reader is still needed but only the rating_scale param is required.
        reader = Reader(rating_scale=(1, 5))
        # The columns must correspond to user id, item id and ratings
        # (in that order).
        data = Dataset.load_from_df(
            ratings_sp_df[['user', 'item', 'rating']], reader)
        # Only the 99% training part of the split is used below.
        trainset, _ = train_test_split(data, test_size=0.01, random_state=0)
    else:
        sys.exit('No input data for surpries!')

    # Train the model.
    algorithm = name.lower()
    if algorithm == 'svd':
        prediction = svd(trainset, kwargs)
    elif algorithm == 'knn':
        prediction = KNN(trainset, kwargs)
    elif algorithm == 'cluster':
        prediction = cluster(trainset, kwargs)
    else:
        # sys.exit accepts a single argument, so build one message string.
        sys.exit('The algorothm %s is not supported yet.' % name)

    test_idx = testing_data.nonzero()
    testrmse = calculate_rmse(
        prediction[test_idx], testing_data[test_idx].toarray()[0])

    # When the training set was loaded from disk there may be no training
    # matrix to score against.
    if training_data is not None:
        train_idx = training_data.nonzero()
        trainrmse = calculate_rmse(
            prediction[train_idx], training_data[train_idx].toarray()[0])
    else:
        trainrmse = None

    return prediction, trainrmse, testrmse
def test_on_dataset(self, testX, testY, model):
    """Predict on the test inputs and return the RMSE of the predictions.

    testX : inputs passed to model.predict.
    testY : expected outputs for testX.
    model : object exposing predict(inputs).

    Returns whatever helpers.calculate_rmse returns for (testY, predictions).
    """
    # Make predictions
    testPredict = model.predict(testX)

    # Calculate error against the targets. Comparing the predictions with
    # the inputs (testX) would leave testY unused and would not measure
    # prediction error.
    # NOTE(review): assumes testY has the same shape as model.predict's
    # output - confirm against the caller.
    rmse = helpers.calculate_rmse(testY, testPredict)
    return rmse
def blending(test):
    '''
    Blend the predicted results of all models with a linear model.

    Data is loaded from the ./dump folder. The rule for naming the predicted
    values on the full matrix and on the test set is as follows:
        full matrix: number of the model + _full + else.npy
        test matrix: number of the model + _test + else.npy
    for example,
        0_full_ALS.npy
        0_test_ALS.npy
    All of these files should be stored in the ./dump directory.

    test : matrix supporting .nonzero() and indexing followed by .toarray()
           (presumably a scipy sparse matrix - TODO confirm); its non-zero
           entries are the targets the blend is fitted on.

    Side effects: prints the test RMSE and the blend weights, calls
    generate_submission with the blended predictions and saves them to
    data/final_x.npy. Returns None.

    Raises ValueError when no model files are found or when the full/test
    files do not pair up by model number.
    '''
    path = 'dump'
    full_files = []
    test_files = []
    for name in os.listdir(path):
        if not name.endswith('.npy'):
            continue
        number, sep, rest = name.partition('_')
        is_model_file = bool(sep) and number.isdecimal()
        if is_model_file and rest.startswith('full'):
            full_files.append((int(number), name))
        elif is_model_file and rest.startswith('test'):
            test_files.append((int(number), name))
        elif name[:1].isdigit():
            print(
                'Wrong naming rule of the file! Please look the </dump> folder and check'
            )

    # os.listdir gives no ordering guarantee; sort both lists by model number
    # so that row k of X and row k of X_test come from the same model.
    full_files.sort()
    test_files.sort()
    if not full_files:
        raise ValueError('No model prediction files found in %r' % path)
    if [num for num, _ in full_files] != [num for num, _ in test_files]:
        raise ValueError(
            'Every model needs exactly one _full and one _test file in %r'
            % path)

    # One row per model.
    X = np.concatenate([
        np.load(os.path.join(path, fname)).reshape(1, -1)
        for _, fname in full_files
    ])
    X_test = np.concatenate([
        np.load(os.path.join(path, fname)).reshape(1, -1)
        for _, fname in test_files
    ])

    y_test = test[test.nonzero()].toarray()[0]
    Num_model = X.shape[0]
    print('\nThe number of model is ', Num_model)

    # Linear blend of the previous models computed on the test set.
    clf = get_linear_blend_clf(X_test, y_test)
    print('RMSE Test: %f' % calculate_rmse(clf.predict(X_test.T), y_test))
    print('Weights of the different models:', clf.coef_)

    # Final predicted labels matrix
    predicted_labels = clf.predict(X.T).reshape(10000, 1000)

    # Generate the CSV submission file
    generate_submission(predicted_labels)
    np.save('data/final_x.npy', predicted_labels)
def train_on_dataset(self, trainX, trainY, model):
    """Optionally train/plot/save the model, then return its training RMSE.

    Behaviour is driven by three booleans in the "LSTM" section of
    self.settings:
      visualize_model     - plot the model structure to model.png and, if
                            training ran, visualize the training history.
      load_existing_model - when true, training is skipped.
      save_training_model - when true, the model is saved via self.save_model.

    trainX : inputs passed to training and to model.predict.
    trainY : expected outputs for trainX.
    model  : object exposing predict(inputs).

    Returns whatever helpers.calculate_rmse returns for (trainY, predictions).
    """
    visualize = self.settings.getboolean("LSTM", "visualize_model")

    # Visualize model structure
    if visualize:
        plot_model(model, to_file='model.png')

    # Model training (skipped when an existing model is loaded)
    trained = not self.settings.getboolean("LSTM", "load_existing_model")
    if trained:
        history = self.train_model(model, trainX, trainY)

    # Visualize model training. There is only a history to show when
    # training actually ran; without this guard `history` is unbound.
    if visualize and trained:
        self.visualize_model_training(history)

    # Save model
    if self.settings.getboolean("LSTM", "save_training_model"):
        self.save_model(model)

    # Make predictions
    trainPredict = model.predict(trainX)

    # Error of the predictions against the targets (not against the inputs).
    # NOTE(review): assumes trainY has the same shape as model.predict's
    # output - confirm against the caller.
    rmse = helpers.calculate_rmse(trainY, trainPredict)
    return rmse
def ALS(train, test, n_features, lambda_user, lambda_item, verbose=1):
    """Alternating Least Squares (ALS) algorithm.

    train, test : rating matrices supporting .nonzero() and indexing followed
                  by .toarray() (presumably scipy sparse, shaped like
                  item_features @ user_features - TODO confirm).
    n_features  : passed to init_MF as the number of latent features.
    lambda_user : regularization passed to update_user_feature.
    lambda_item : regularization passed to update_item_feature.
    verbose     : 1 prints the errors after every epoch; 0 prints only the
                  errors of the last epoch that ran.

    Results are cached: when both feature files for this parameter set exist
    in ALSdump/, they are loaded, their errors are printed and they are
    returned without training. Otherwise the features are trained for at
    most 20 epochs, stopping early once the train RMSE increases or changes
    by less than 1e-5, and saved to ALSdump/.

    Returns (user_features, item_features).
    """
    print(
        '\nStarting ALS with n_features = %d, lambda_user = %f, lambda_item = %f'
        % (n_features, lambda_user, lambda_item))

    n_epochs = 20

    user_features_file_path = 'ALSdump/user_features_%s_%s_%s_%s.npy' \
        % (n_epochs, n_features, lambda_user, lambda_item)
    item_features_file_path = 'ALSdump/item_features_%s_%s_%s_%s.npy' \
        % (n_epochs, n_features, lambda_user, lambda_item)

    # The rated positions and their values do not change during training,
    # so extract them once.
    train_idx = train.nonzero()
    train_values = train[train_idx].toarray()[0]
    test_idx = test.nonzero()
    test_values = test[test_idx].toarray()[0]

    def compute_errors(user_feats, item_feats):
        """Return (train_rmse, test_rmse) for the given factorization."""
        prediction = np.dot(item_feats, user_feats)
        return (
            helpers.calculate_rmse(prediction[train_idx], train_values),
            helpers.calculate_rmse(prediction[test_idx], test_values),
        )

    if (os.path.exists(user_features_file_path)
            and os.path.exists(item_features_file_path)):
        user_features = np.load(user_features_file_path)
        item_features = np.load(item_features_file_path)

        train_rmse, test_rmse = compute_errors(user_features, item_features)
        print("Train error: %f, test error: %f" % (train_rmse, test_rmse))

        return user_features, item_features

    user_features, item_features = init_MF(train, n_features)

    _, nz_row_colindices, nz_col_rowindices = helpers.build_index_groups(
        train)
    _, nz_user_itemindices = map(list, zip(*nz_col_rowindices))
    nnz_items_per_user = [len(i) for i in nz_user_itemindices]
    _, nz_item_userindices = map(list, zip(*nz_row_colindices))
    nnz_users_per_item = [len(i) for i in nz_item_userindices]

    prev_train_rmse = 100

    for it in range(n_epochs):
        user_features = update_user_feature(train, item_features, lambda_user,
                                            nnz_items_per_user,
                                            nz_user_itemindices)
        item_features = update_item_feature(train, user_features, lambda_item,
                                            nnz_users_per_item,
                                            nz_item_userindices)

        train_rmse, test_rmse = compute_errors(user_features, item_features)

        if verbose == 1:
            print("[Epoch %d / %d] train error: %f, test error: %f" %
                  (it + 1, n_epochs, train_rmse, test_rmse))

        # Stop when the train error gets worse or stops moving.
        if (train_rmse > prev_train_rmse
                or abs(train_rmse - prev_train_rmse) < 1e-5):
            if verbose == 1:
                print('Algorithm has converged!')
            break

        prev_train_rmse = train_rmse

    if verbose == 0:
        print("[Epoch %d / %d] train error: %f, test error: %f" %
              (it + 1, n_epochs, train_rmse, test_rmse))

    # Make sure the cache folder exists so the trained features are not lost
    # to a FileNotFoundError at the very end.
    os.makedirs(os.path.dirname(user_features_file_path), exist_ok=True)
    np.save(user_features_file_path, user_features)
    np.save(item_features_file_path, item_features)

    return user_features, item_features
def matrix_factorization_SGD(train, test, n_features, lambda_user,
                             lambda_item, fold_num):
    """Matrix factorization by SGD.

    train, test : rating matrices supporting .nonzero(), element access
                  train[d, n] and indexing by a nonzero() result
                  (presumably scipy sparse, items x users - TODO confirm).
    n_features  : passed to init_MF as the number of latent features.
    lambda_user : regularization weight for the user features.
    lambda_item : regularization weight for the item features.
    fold_num    : fold number; only used in the log line and the cache
                  file names.

    Results are cached in Learningdata/: when both feature files for this
    parameter set exist they are loaded and returned with their errors.
    Otherwise the model is trained for at most 50 epochs with a step size
    that is divided by 1.2 every epoch, stopping early once the train RMSE
    increases or changes by less than 1e-5.

    Returns (user_features, item_features, train_rmse, test_rmse). After
    training, the two errors are those recorded at the last epoch that
    passed the convergence check (100 for both if the very first epoch
    already triggers the stop).
    """
    print(
        '--->Starting SGD with n_features = %d, lambda_user = %f, lambda_item = %f, the %d-th fold'
        % (n_features, lambda_user, lambda_item, fold_num))

    # define parameters
    gamma = 0.01
    n_epochs = 50
    prev_train_rmse = 100
    prev_test_rmse = 100

    user_features_file_path = 'Learningdata/user_features_%s_%s_%s_%s_%s_%s.npy' \
        % ('sgd', fold_num, n_epochs, n_features, lambda_user, lambda_item)
    item_features_file_path = 'Learningdata/item_features_%s_%s_%s_%s_%s_%s.npy' \
        % ('sgd', fold_num, n_epochs, n_features, lambda_user, lambda_item)

    # The rated positions and their values do not change during training,
    # so extract them once.
    train_idx = train.nonzero()
    train_values = train[train_idx][0]
    test_idx = test.nonzero()
    test_values = test[test_idx][0]

    def compute_errors(user_feats, item_feats):
        """Return (train_rmse, test_rmse) for the given factorization."""
        prediction = np.dot(item_feats, user_feats)
        return (
            helpers.calculate_rmse(prediction[train_idx], train_values),
            helpers.calculate_rmse(prediction[test_idx], test_values),
        )

    if (os.path.exists(user_features_file_path)
            and os.path.exists(item_features_file_path)):
        user_features = np.load(user_features_file_path)
        item_features = np.load(item_features_file_path)

        train_rmse, test_rmse = compute_errors(user_features, item_features)
        print(" Train error: %f, test error: %f" % (train_rmse, test_rmse))

        return user_features, item_features, train_rmse, test_rmse

    # Only an already-existing folder is expected here; any other OSError
    # (e.g. permissions) should surface now rather than after training.
    try:
        os.mkdir('Learningdata/')
        print(
            '\n The folder, "Learningdata", is created to store learning results.\n'
        )
    except FileExistsError:
        print('\n New data will be saved to the "Learningdata" folder. \n')

    # set seed
    np.random.seed(988)

    # init matrix
    user_features, item_features = init_MF(train, n_features)

    # find the non-zero ratings indices
    nz_train = list(zip(*train_idx))

    for it in range(n_epochs):
        # shuffle the training rating indices
        np.random.shuffle(nz_train)

        # decrease step size
        gamma /= 1.2

        for d, n in nz_train:
            # update W_d (item_features[d, :]) and Z_n (user_features[:, n])
            # NOTE: if these are NumPy arrays, item_info is a view, so the
            # user update below already sees the updated item vector. This
            # matches the original update order and is kept on purpose.
            item_info = item_features[d, :]
            user_info = user_features[:, n]
            err = train[d, n] - user_info @ item_info

            # calculate the gradient and update
            item_features[d, :] += gamma * (err * user_info -
                                            lambda_item * item_info)
            user_features[:, n] += gamma * (err * item_info -
                                            lambda_user * user_info)

        # evaluate the train and test errors
        train_rmse, test_rmse = compute_errors(user_features, item_features)
        print(" [Epoch %d / %d] train error: %f, test error: %f" %
              (it + 1, n_epochs, train_rmse, test_rmse))

        # Stop when the train error gets worse or stops moving.
        if (train_rmse > prev_train_rmse
                or abs(train_rmse - prev_train_rmse) < 1e-5):
            print('SGD Algorithm has converged!')
            break

        prev_train_rmse = train_rmse
        prev_test_rmse = test_rmse

    np.save(user_features_file_path, user_features)
    np.save(item_features_file_path, item_features)

    # output the final rmse
    return user_features, item_features, prev_train_rmse, prev_test_rmse
from helpers import generate_data, LinearRegression, calculate_rmse

# Sampling interval [l_bound, r_bound] and sample count for the fitted data
l_bound = 0
r_bound = 100
n = 1000

# Fit a linear regression on freshly generated data
data = generate_data(l_bound, r_bound, n)
linreg = LinearRegression()
linreg.fit(data)

# Evaluate the fitted line b[0] + b[1] * x on an evenly spaced grid
xx = np.linspace(l_bound, r_bound, n)
intercept, slope = linreg.b[0], linreg.b[1]
yy = np.array(intercept + slope * xx)

# Score the model on a second, ten times smaller sample
check_data = generate_data(l_bound, r_bound, n // 10)
check_x = check_data[:, 0]
actual_y = check_data[:, 1]
pred_x = [[value] for value in check_x]  # one single-feature row per point
pred_y = linreg.predict(pred_x)

rmse = calculate_rmse(actual_y, pred_y)
print('rmse:', rmse)

# Line, fitted data, check data and the predictions for the check data
plt.figure(1)
plt.plot(xx, yy.T, color='tab:blue')
plt.scatter(data[:, 0], data[:, 1], color='c')
plt.scatter(check_x, actual_y, color='r')
plt.scatter(pred_x, pred_y, color='m')
plt.show()
# Hold out a test set (p_test=0.1) from the ratings
print('\nSplitting the data in train and test sets...')
train, test = split_data(ratings, p_test=0.1)

# Collect the ALS predictions for every value in n_features_array
# (presumably one model per value, i.e. 30 models - verify against
# get_ALS_predictions)
als_outputs = get_ALS_predictions(
    ratings,
    train,
    test,
    n_features_array=range(1, 31),
    lambda_user=0.2,
    lambda_item=0.02,
)
X, X_train, y_train, X_test, y_test = als_outputs

# Fit the linear blend of those models on the test-set predictions
clf = get_linear_blend_clf(X_test, y_test)

train_predictions = clf.predict(X_train.T)
print('\nRMSE Train: %f' % calculate_rmse(train_predictions, y_train))
test_predictions = clf.predict(X_test.T)
print('RMSE Test: %f' % calculate_rmse(test_predictions, y_test))
print('Weights of the different models:', clf.coef_)

# Blend the full prediction matrices and limit the result to [1, 5]
predicted_labels = clf.predict(X.T).reshape(ratings.shape)
above_max = predicted_labels > 5
predicted_labels[above_max] = 5
below_min = predicted_labels < 1
predicted_labels[below_min] = 1

# Write the CSV submission file
generate_submission(predicted_labels)
# data range and number of points
l_bound = 0
r_bound = 1000
n = 1000

data = generate_data3d(l_bound, r_bound, n)

linreg = LinearRegression()
linreg.fit(data)

# Check predictions on a second sample: every column but the last is a
# feature, the last column is the target
check_data = generate_data3d(l_bound, r_bound, n)
pred_x = check_data[:, :-1]
actual_y = check_data[:, -1]

pred_y = linreg.predict(pred_x)
rmse = calculate_rmse(actual_y, pred_y)
print('rmse lin:', rmse)

# Same check for ridge regression (second fit argument is 0.01)
rigreg = RidgeRegression()
rigreg.fit(data, 0.01)
rigpred_y = rigreg.predict(pred_x)
rmse_reg = calculate_rmse(actual_y, rigpred_y)
print('rmse reg:', rmse_reg)

make_coef_plot(data, check_data)

fig = plt.figure()
# Figure.gca(projection=...) was deprecated in Matplotlib 3.4 and later
# removed; create the 3-D axes explicitly instead.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data[:, 0], data[:, 1], data[:, 2], label='data')