Example No. 1
import sys

import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

# load_data_forSP, svd, KNN, cluster and calculate_rmse are project-local helpers.


def get_splib_predictions(name, training_data, testing_data, kwargs):
    '''
    name           :  name of the algorithm to be used. Currently 'svd', 'knn' and 'cluster' are supported.

    training_data  :  training data. If 'loaddata' is set and 'path' is given in kwargs, this data is ignored.

    testing_data   :  testing data used for calculating the RMSE of the predictions.

    kwargs         :  kwargs for surpriseLib; different algorithms need different args.
                          SVD: {'k_features', 'maxiter', 'lr_pu', 'lr_qi', 'reg_bu', 'reg_qi', 'random_seed'}
                          KNN: {'n_neigbor', 'min_neigbor', 'similarity'}
                          cluster: {'user_cluster', 'item_cluster', 'maxiter', 'random_seed'}
                      'random_seed' is the seed used when training the SVD and cluster models.
    '''
    # 1. if "load data" is in the kwargs and is True, and the path is also given, the program will load data from the path
    # 2. else if data is given, use the given data
    # 3. else there is no avaliable data, so an error will be reported
    # The data is treated as a training set    
    if (kwargs.get('loaddata') is not None) and (kwargs.get('path') is not None):
        if kwargs['loaddata']:
            trainset = load_data_forSP(kwargs['path'])
    elif training_data is not None:
        coo = training_data.tocoo(copy = False)
        ratings_sp_df = pd.DataFrame({'item': coo.row, 'user': coo.col, 'rating': coo.data})[['item', 'user', 'rating']].reset_index(drop=True)   
        # A reader is still needed, but only the rating_scale param is required.
        reader = Reader(rating_scale=(1, 5))
        # The columns must correspond to user id, item id and ratings (in that order).
        data = Dataset.load_from_df(ratings_sp_df[['user', 'item', 'rating']], reader)  
        trainset, _ = train_test_split(data, test_size=0.01, random_state=0)  # the held-out split is unused here
        
    else:
        sys.exit('No input data for surprise!')
        
    # train the model
    if name.lower() == 'svd':
        prediction = svd(trainset, kwargs)

    elif name.lower() == 'knn':
        prediction = KNN(trainset, kwargs)

    elif name.lower() == 'cluster':
        prediction = cluster(trainset, kwargs)

    else:
        sys.exit('The algorithm %s is not supported yet.' % name)
    
    
    testrmse  = calculate_rmse( prediction[testing_data.nonzero()], testing_data[testing_data.nonzero()].toarray()[0] )
    trainrmse = calculate_rmse( prediction[training_data.nonzero()], training_data[training_data.nonzero()].toarray()[0] )
    
    return prediction, trainrmse, testrmse
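
A minimal usage sketch (hypothetical names: train_matrix and test_matrix are assumed to be scipy sparse matrices, as implied by the .tocoo() and .nonzero() calls above; the kwarg values are made up, and the keys follow the docstring spelling):

# Hypothetical usage of the KNN backend.
knn_kwargs = {'n_neigbor': 40, 'min_neigbor': 1, 'similarity': 'pearson'}
prediction, train_rmse, test_rmse = get_splib_predictions(
    'knn', train_matrix, test_matrix, knn_kwargs)
print('KNN train RMSE: %f, test RMSE: %f' % (train_rmse, test_rmse))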
    
Example No. 2
    def test_on_dataset(self, testX, testY, model):
        # Make predictions
        testPredict = model.predict(testX)

        # Calculate error against the ground-truth targets
        rmse = helpers.calculate_rmse(testY, testPredict)

        return rmse
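
helpers.calculate_rmse itself does not appear on this page; a minimal sketch of such a helper, assuming it simply takes two equal-length arrays:

import numpy as np

def calculate_rmse(predictions, targets):
    # Root-mean-square error between two equal-length arrays. A sketch:
    # the real helper may differ, e.g. in argument order.
    predictions = np.asarray(predictions, dtype=float).ravel()
    targets = np.asarray(targets, dtype=float).ravel()
    return np.sqrt(np.mean((predictions - targets) ** 2))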
Example No. 3
def blending(test):
    '''
    This function blends the predicted results of all models.
    Data is loaded from the ./dump folder.

    The rule for naming the predictions on the full matrix and on the test set is as follows:
    full matrix:  model number + _full + anything + .npy
    test matrix:  model number + _test + anything + .npy
    For example,  0_full_ALS.npy
                  0_test_ALS.npy
    All of these files should be stored in the ./dump directory.
    '''
    X = np.zeros((1, 10000000))
    X_test = test[test.nonzero()].toarray()
    path = 'dump'
    files = os.listdir(path)
    for name in files:
        for i in range(100):
            if name.endswith('.npy') and name.startswith('%s_' % i):
                if name.startswith('%s_full' % (i)):
                    temp1 = np.load(os.path.join(path, name)).reshape(1, -1)
                    X = np.concatenate((X, temp1))
                elif name.startswith('%s_test' % (i)):
                    temp2 = np.load(os.path.join(path, name)).reshape(1, -1)
                    X_test = np.concatenate((X_test, temp2))
                else:
                    print('File "%s" does not follow the naming rule! '
                          'Please check the ./dump folder.' % name)
    X = np.delete(X, 0, 0)
    X_test = np.delete(X_test, 0, 0)
    y_test = test[test.nonzero()].toarray()[0]
    Num_model = X.shape[0]
    print('\nThe number of models is', Num_model)

    # Linear blend of the previous models computed on the test set.
    clf = get_linear_blend_clf(X_test, y_test)
    print('RMSE Test: %f' % calculate_rmse(clf.predict(X_test.T), y_test))
    print('Weights of the different models:', clf.coef_)

    # Final predicted labels matrix
    predicted_labels = clf.predict(X.T).reshape(10000, 1000)
    # Generate the CSV submission file
    generate_submission(predicted_labels)
    np.save('data/final_x.npy', predicted_labels)
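
get_linear_blend_clf is defined elsewhere in the project; a plausible sketch, assuming a plain least-squares blend fitted with scikit-learn (X_test holds one row of predictions per model, which is why callers pass X_test.T):

from sklearn.linear_model import LinearRegression

def get_linear_blend_clf(X_test, y_test):
    # Fit blending weights on the held-out test ratings. X_test is
    # (n_models, n_ratings), so transpose to (samples, features).
    clf = LinearRegression()
    clf.fit(X_test.T, y_test)
    return clf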
Example No. 4
    def train_on_dataset(self, trainX, trainY, model):
        # Visualize model structure
        if self.settings.getboolean("LSTM", "visualize_model"):
            plot_model(model, to_file='model.png')

        # Model training
        if not self.settings.getboolean("LSTM", "load_existing_model"):
            history = self.train_model(model, trainX, trainY)

            # Visualize model training
            if self.settings.getboolean("LSTM", "visualize_model"):
                self.visualize_model_training(history)

            # Save model
            if self.settings.getboolean("LSTM", "save_training_model"):
                self.save_model(model)

        # Make predictions
        trainPredict = model.predict(trainX)
        # Training error against the ground-truth targets
        rmse = helpers.calculate_rmse(trainY, trainPredict)

        return rmse
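
self.settings behaves like a configparser.ConfigParser here; a hypothetical configuration covering the three flags read above (the key names come from the getboolean calls, the values are made up):

import configparser

settings = configparser.ConfigParser()
settings.read_string('''
[LSTM]
visualize_model = no
load_existing_model = no
save_training_model = yes
''')
print(settings.getboolean('LSTM', 'save_training_model'))  # True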
Example No. 5
def ALS(train, test, n_features, lambda_user, lambda_item, verbose=1):
    """Alternating Least Squares (ALS) algorithm."""
    print(
        '\nStarting ALS with n_features = %d, lambda_user = %f, lambda_item = %f'
        % (n_features, lambda_user, lambda_item))

    n_epochs = 20

    user_features_file_path = 'ALSdump/user_features_%s_%s_%s_%s.npy' \
        % (n_epochs, n_features, lambda_user, lambda_item)

    item_features_file_path = 'ALSdump/item_features_%s_%s_%s_%s.npy' \
        % (n_epochs, n_features, lambda_user, lambda_item)

    if (os.path.exists(user_features_file_path)
            and os.path.exists(item_features_file_path)):
        user_features = np.load(user_features_file_path)
        item_features = np.load(item_features_file_path)

        train_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[train.nonzero()],
            train[train.nonzero()].toarray()[0])

        test_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[test.nonzero()],
            test[test.nonzero()].toarray()[0])

        print("Train error: %f, test error: %f" % (train_rmse, test_rmse))

        return user_features, item_features

    user_features, item_features = init_MF(train, n_features)

    nz_row, nz_col = test.nonzero()
    nz_test = list(zip(nz_row, nz_col))
    nz_train, nz_row_colindices, nz_col_rowindices = helpers.build_index_groups(
        train)
    _, nz_user_itemindices = map(list, zip(*nz_col_rowindices))
    nnz_items_per_user = [len(i) for i in nz_user_itemindices]
    _, nz_item_userindices = map(list, zip(*nz_row_colindices))
    nnz_users_per_item = [len(i) for i in nz_item_userindices]

    prev_train_rmse = 100
    for it in range(n_epochs):
        user_features = update_user_feature(train, item_features, lambda_user,
                                            nnz_items_per_user,
                                            nz_user_itemindices)

        item_features = update_item_feature(train, user_features, lambda_item,
                                            nnz_users_per_item,
                                            nz_item_userindices)

        train_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[train.nonzero()],
            train[train.nonzero()].toarray()[0])

        test_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[test.nonzero()],
            test[test.nonzero()].toarray()[0])

        if verbose == 1:
            print("[Epoch %d / %d] train error: %f, test error: %f" %
                  (it + 1, n_epochs, train_rmse, test_rmse))

        if (train_rmse > prev_train_rmse
                or abs(train_rmse - prev_train_rmse) < 1e-5):
            if verbose == 1:
                print('Algorithm has converged!')
            break
        prev_train_rmse = train_rmse

    if verbose == 0:
        print("[Epoch %d / %d] train error: %f, test error: %f" %
              (it + 1, n_epochs, train_rmse, test_rmse))

    os.makedirs('ALSdump', exist_ok=True)  # make sure the dump folder exists
    np.save(user_features_file_path, user_features)
    np.save(item_features_file_path, item_features)

    return user_features, item_features
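
update_user_feature and update_item_feature are project helpers not shown here. A sketch of the user half-step, under the shape conventions visible above (item_features is n_items x k, user_features is k x n_users) and assuming train is a scipy sparse matrix and nz_user_itemindices lists, in user order, the item indices rated by each user:

import numpy as np

def update_user_feature(train, item_features, lambda_user,
                        nnz_items_per_user, nz_user_itemindices):
    # Re-solve every user column with the item features held fixed:
    # (W.T W + lambda_user * n_u * I) z_u = W.T x_u, a ridge normal equation.
    n_users = train.shape[1]
    k = item_features.shape[1]
    user_features = np.zeros((k, n_users))
    for user, item_indices in enumerate(nz_user_itemindices):
        W = item_features[item_indices, :]              # (n_u, k)
        ratings_col = train[:, user].toarray().ravel()  # dense user column
        x = ratings_col[item_indices]                   # observed ratings only
        A = W.T @ W + lambda_user * nnz_items_per_user[user] * np.eye(k)
        user_features[:, user] = np.linalg.solve(A, W.T @ x)
    return user_features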
Example No. 6
def matrix_factorization_SGD(train, test, n_features, lambda_user, lambda_item,
                             fold_num):
    """matrix factorization by SGD."""

    print(
        '--->Starting SGD with n_features = %d, lambda_user = %f, lambda_item = %f, the %d-th fold'
        % (n_features, lambda_user, lambda_item, fold_num))

    # define parameters
    gamma = 0.01
    n_epochs = 50
    prev_train_rmse = 100
    prev_test_rmse = 100

    user_features_file_path = 'Learningdata/user_features_%s_%s_%s_%s_%s_%s.npy'  \
                                % ('sgd', fold_num, n_epochs, n_features, lambda_user, lambda_item)

    item_features_file_path = 'Learningdata/item_features_%s_%s_%s_%s_%s_%s.npy'  \
                                % ('sgd', fold_num, n_epochs, n_features, lambda_user, lambda_item)

    if (os.path.exists(user_features_file_path)
            and os.path.exists(item_features_file_path)):

        user_features = np.load(user_features_file_path)
        item_features = np.load(item_features_file_path)

        train_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[train.nonzero()],
            train[train.nonzero()].toarray()[0])

        test_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[test.nonzero()],
            test[test.nonzero()].toarray()[0])

        print("      Train error: %f, test error: %f" %
              (train_rmse, test_rmse))

        return user_features, item_features, train_rmse, test_rmse

    else:
        # Create the folder for the learning results if it does not exist yet.
        if not os.path.isdir('Learningdata/'):
            os.mkdir('Learningdata/')
            print('\nThe folder "Learningdata" was created to store learning results.\n')
        else:
            print('\nNew data will be saved to the "Learningdata" folder.\n')

    # set seed
    np.random.seed(988)

    # init matrix
    user_features, item_features = init_MF(train, n_features)

    # find the non-zero ratings indices in the training set
    nz_row, nz_col = train.nonzero()
    nz_train = list(zip(nz_row, nz_col))

    for it in range(n_epochs):
        # shuffle the training rating indices
        np.random.shuffle(nz_train)

        # decrease step size
        gamma /= 1.2

        for d, n in nz_train:
            # update W_d (item_features[d, :]) and Z_n (user_features[:, n])
            item_info = item_features[d, :]
            user_info = user_features[:, n]
            err = train[d, n] - user_info @ item_info

            # calculate the gradient and update
            item_features[d, :] += gamma * (err * user_info -
                                            lambda_item * item_info)
            user_features[:, n] += gamma * (err * item_info -
                                            lambda_user * user_info)

        # evaluate the train error
        # evaluate the train error
        train_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[train.nonzero()],
            train[train.nonzero()].toarray()[0])

        # evaluate the test error
        test_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[test.nonzero()],
            test[test.nonzero()].toarray()[0])
        print("      [Epoch %d / %d] train error: %f, test error: %f" %
              (it + 1, n_epochs, train_rmse, test_rmse))
        if (train_rmse > prev_train_rmse
                or abs(train_rmse - prev_train_rmse) < 1e-5):
            print('SGD Algorithm has converged!')
            break

        prev_train_rmse = train_rmse
        prev_test_rmse = test_rmse

    np.save(user_features_file_path, user_features)
    np.save(item_features_file_path, item_features)

    return user_features, item_features, prev_train_rmse, prev_test_rmse  # output the final rmse
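
init_MF is also a project helper; a minimal sketch, assuming plain uniform random initialization with the shapes used by the SGD loop above (item_features is n_items x k, user_features is k x n_users):

import numpy as np

def init_MF(train, n_features):
    # Uniform random factors; the real init_MF may scale these,
    # e.g. by the global mean rating.
    n_items, n_users = train.shape
    user_features = np.random.rand(n_features, n_users)
    item_features = np.random.rand(n_items, n_features)
    return user_features, item_features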
Example No. 7
import numpy as np
import matplotlib.pyplot as plt

from helpers import generate_data, LinearRegression, calculate_rmse

# data range and number of points
l_bound = 0
r_bound = 100
n = 1000
data = generate_data(l_bound, r_bound, n)

linreg = LinearRegression()
linreg.fit(data)

# Find regression line
xx = np.linspace(l_bound, r_bound, n)
yy = np.array(linreg.b[0] + linreg.b[1] * xx)

# Check predictions
check_data = generate_data(l_bound, r_bound, n // 10)
pred_x = [[x] for x in check_data[:, 0]]
actual_y = check_data[:, 1]
pred_y = linreg.predict(pred_x)

rmse = calculate_rmse(actual_y, pred_y)
print('rmse:', rmse)

plt.figure(1)
plt.plot(xx, yy.T, color='tab:blue')
plt.scatter(data[:, 0], data[:, 1], color='c')
plt.scatter(check_data[:, 0], check_data[:, 1], color='r')
plt.scatter(pred_x, pred_y, color='m')
plt.show()
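
The helpers.LinearRegression class is not shown; a minimal sketch consistent with its usage here and in the 3D example below (fit takes an array whose last column is the target, and b holds the intercept followed by the slopes):

import numpy as np

class LinearRegression:
    # Ordinary least squares, inferred from its usage on this page;
    # the real helpers implementation may differ.
    def fit(self, data):
        # data: (n, d+1) array, last column is the target.
        X = np.column_stack([np.ones(len(data)), data[:, :-1]])
        self.b, *_ = np.linalg.lstsq(X, data[:, -1], rcond=None)
        return self

    def predict(self, xs):
        xs = np.atleast_2d(np.asarray(xs, dtype=float))
        return self.b[0] + xs @ self.b[1:]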
Example No. 8
# Split in training and testing sets
print('\nSplitting the data in train and test sets...')
train, test = split_data(ratings, p_test=0.1)

# Generate predictions for 6 different models
X, X_train, y_train, X_test, y_test = get_ALS_predictions(
    ratings,
    train,
    test,
    n_features_array=range(1, 31),
    lambda_user=0.2,
    lambda_item=0.02
)

# Linear blend of the previous models computed on the test set.
clf = get_linear_blend_clf(X_test, y_test)

print('\nRMSE Train: %f' % calculate_rmse(clf.predict(X_train.T), y_train))

print('RMSE Test: %f' % calculate_rmse(clf.predict(X_test.T), y_test))

print('Weights of the different models:', clf.coef_)

# Final predicted labels matrix
predicted_labels = clf.predict(X.T).reshape(ratings.shape)
predicted_labels[predicted_labels > 5] = 5
predicted_labels[predicted_labels < 1] = 1

# Generate the CSV submission file
generate_submission(predicted_labels)
Example No. 9
import matplotlib.pyplot as plt

# generate_data3d, LinearRegression, RidgeRegression, calculate_rmse and
# make_coef_plot are assumed to come from the same local helpers module
# as in the previous examples.

# data range and number of points
l_bound = 0
r_bound = 1000
n = 1000
data = generate_data3d(l_bound, r_bound, n)

linreg = LinearRegression()
linreg.fit(data)

# Check predictions
check_data = generate_data3d(l_bound, r_bound, n)
pred_x = check_data[:, :-1]
actual_y = check_data[:, -1]
pred_y = linreg.predict(pred_x)

rmse = calculate_rmse(actual_y, pred_y)
print('rmse lin:', rmse)

rigreg = RidgeRegression()
rigreg.fit(data, 0.01)

rigpred_y = rigreg.predict(pred_x)

rmse_reg = calculate_rmse(actual_y, rigpred_y)
print('rmse reg:', rmse_reg)

make_coef_plot(data, check_data)

fig = plt.figure()
ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) was removed in matplotlib 3.6
ax.scatter(data[:, 0], data[:, 1], data[:, 2], label='data')
ax.legend()
plt.show()
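
RidgeRegression is likewise a local helper; a sketch assuming the closed-form ridge solution, with the 0.01 argument from rigreg.fit(data, 0.01) as the penalty strength and the intercept left unpenalized:

import numpy as np

class RidgeRegression:
    # Closed-form ridge regression; a sketch inferred from its usage above.
    def fit(self, data, lam):
        X = np.column_stack([np.ones(len(data)), data[:, :-1]])
        y = data[:, -1]
        penalty = lam * np.eye(X.shape[1])
        penalty[0, 0] = 0.0  # do not penalize the intercept
        self.b = np.linalg.solve(X.T @ X + penalty, X.T @ y)
        return self

    def predict(self, xs):
        xs = np.atleast_2d(np.asarray(xs, dtype=float))
        return self.b[0] + xs @ self.b[1:]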