        Error_train_fs[k] = np.square(y_train - m.predict(
            X_train[:, selected_features])).sum() / y_train.shape[0]
        Error_test_fs[k] = np.square(y_test - m.predict(
            X_test[:, selected_features])).sum() / y_test.shape[0]

        figure(k)
        suptitle("Cross-validation fold #{}".format(k + 1),
                 fontsize=12,
                 fontweight='bold')
        subplot(1, 2, 1)
        plot(range(1, len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')

        subplot(1, 3, 3)
        bmplot(attributeNames, range(1, features_record.shape[1]),
               -features_record[:, 1:])
        clim(-1.5, 0)
        xlabel('Iteration')
        savefig(
            "project-2-forward-selection-squared-error-{}-cv-fold.png".format(
                k + 1),
            bbox_inches='tight')

    print('Cross validation fold {0}/{1}'.format(k + 1, K))
    print('Train indices: {0}'.format(train_index))
    print('Test indices: {0}'.format(test_index))
    print('Features no: {0}'.format(selected_features.size))
    print("Weights: {}\n".format(m.coef_))

    k += 1
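
Every example on this page calls the course-toolbox helpers feature_selector_lr and bmplot without showing their source. As a point of reference, here is a minimal sketch of the interface the calls above assume (greedy forward selection scored by internal cross-validation); this is an assumption for illustration, not the toolbox implementation, which may differ in detail:

import numpy as np
import sklearn.linear_model as lm
from sklearn.model_selection import cross_val_score

def feature_selector_lr_sketch(X, y, cvf=5):
    # Greedy forward selection: starting from no features, repeatedly add the
    # single feature that most improves the cvf-fold CV squared error.
    M = X.shape[1]
    selected = []
    features_record = np.zeros((M, 1))              # column j: mask after step j
    loss_record = [np.square(y - y.mean()).mean()]  # baseline: predict the mean
    while True:
        best_loss, best_f = loss_record[-1], None
        for f in set(range(M)) - set(selected):
            mse = -cross_val_score(lm.LinearRegression(), X[:, selected + [f]],
                                   y, cv=cvf,
                                   scoring='neg_mean_squared_error').mean()
            if mse < best_loss:
                best_loss, best_f = mse, f
        if best_f is None:
            break
        selected.append(best_f)
        mask = np.zeros((M, 1))
        mask[selected] = 1
        features_record = np.hstack((features_record, mask))
        loss_record.append(best_loss)
    return np.asarray(selected), features_record, np.asarray(loss_record)
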
Example #2
import numpy as np
import sklearn.linear_model as lm
from sklearn import model_selection
from matplotlib.pyplot import (figure, subplot, plot, title, xlabel, ylabel,
                               clim, show)
from math import ceil
from toolbox_02450 import feature_selector_lr, bmplot  # course toolbox helpers

def forwardSelection(X, y, N, K, attributeNames, classNames):
    # Add offset attribute
    X2 = np.concatenate((np.ones((X.shape[0],1)),X),1)
    attributeNames2 = [u'Offset']+attributeNames
    M2 = len(attributeNames)+1
    
    
    # Square attribute 2 in place (the second original attribute; column 0 is
    # the offset), i.e. apply a quadratic transform to that feature.
    X2[:,2] = np.power(X2[:,2],2)
    
    ## Crossvalidation
    # Create crossvalidation partition for evaluation

    CV = model_selection.KFold(K, shuffle=True)
    
    # Initialize variables
    Features = np.zeros((M2,K))
    Error_train = np.empty((K,1))
    Error_test = np.empty((K,1))
    Error_train_fs = np.empty((K,1))
    Error_test_fs = np.empty((K,1))
    Error_train_nofeatures = np.empty((K,1))
    Error_test_nofeatures = np.empty((K,1))
    
    k=0
    for train_index, test_index in CV.split(X2):
        
        # extract training and test set for current CV fold
        X_train = X2[train_index]
        y_train = y[train_index]
        X_test = X2[test_index]
        y_test = y[test_index]
        internal_cross_validation = 5
        
        
        
        # Compute squared error without using the input data at all
        Error_train_nofeatures[k] = np.square(y_train-y_train.mean()).sum()/y_train.shape[0]
        Error_test_nofeatures[k] = np.square(y_test-y_test.mean()).sum()/y_test.shape[0]
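        # (Each per-fold baseline equals the variance of y in that fold; the
        # sums of these baselines are the denominators of the R^2 scores
        # printed after the loop.)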
        
        # Compute squared error with all features selected (no feature selection)
        m = lm.LinearRegression().fit(X_train, y_train)
        Error_train[k] = np.square(y_train-m.predict(X_train)).sum()/y_train.shape[0]
        Error_test[k] = np.square(y_test-m.predict(X_test)).sum()/y_test.shape[0]


        # Compute squared error with feature subset selection
        selected_features, features_record, loss_record = feature_selector_lr(X_train, y_train, internal_cross_validation)
        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        m = lm.LinearRegression().fit(X_train[:, selected_features], y_train)
        Error_train_fs[k] = np.square(y_train-m.predict(X_train[:,selected_features])).sum()/y_train.shape[0]
        Error_test_fs[k] = np.square(y_test-m.predict(X_test[:,selected_features])).sum()/y_test.shape[0]

        
        figure()
        subplot(1,2,1)
        plot(range(1,len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')    
        
        subplot(1,3,3)
        bmplot(attributeNames2, range(1,features_record.shape[1]), -features_record[:,1:])
        clim(-1.5,0)
        xlabel('Iteration')
    
        print('Cross validation fold {0}/{1}'.format(k+1,K))
    
        k+=1
    
    
    # Display results
    print('\n')
    print('Linear regression without feature selection:\n')
    print('- Training error: {0}'.format(Error_train.mean()))
    print('- Test error:     {0}'.format(Error_test.mean()))
    print('- R^2 train:     {0}'.format((Error_train_nofeatures.sum()-Error_train.sum())/Error_train_nofeatures.sum()))
    print('- R^2 test:     {0}'.format((Error_test_nofeatures.sum()-Error_test.sum())/Error_test_nofeatures.sum()))
    print('\n')
    print('Linear regression with feature selection:\n')
    print('- Training error: {0}'.format(Error_train_fs.mean()))
    print('- Test error:     {0}'.format(Error_test_fs.mean()))
    print('- R^2 train:     {0}'.format((Error_train_nofeatures.sum()-Error_train_fs.sum())/Error_train_nofeatures.sum()))
    print('- R^2 test:     {0}'.format((Error_test_nofeatures.sum()-Error_test_fs.sum())/Error_test_nofeatures.sum()))
    
    figure()
    subplot(1,3,2)
    bmplot(attributeNames2, range(1,Features.shape[1]+1), -Features)
    clim(-1.5,0)
    xlabel('Crossvalidation fold')
    ylabel('Attribute')
    
    # Inspect selected feature coefficients effect on the entire dataset and
    # plot the fitted model residual error as function of each attribute to
    # inspect for systematic structure in the residual
    f=2 # cross-validation fold to inspect
    ff=Features[:,f-1].nonzero()[0]
    m = lm.LinearRegression().fit(X2[:,ff], y)
    
    y_est= m.predict(X2[:,ff])
    residual=y-y_est
    
    figure()
    title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))
    # One residual scatter per selected attribute; a regression residual plot
    # does not depend on class labels, so classNames is not used here.
    for i in range(len(ff)):
        subplot(2, int(ceil(len(ff) / 2.0)), i + 1)
        plot(X2[:, ff[i]], residual, '.')
        xlabel(attributeNames2[ff[i]])
        ylabel('residual error')
    
    
    show()    
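
A hypothetical call, just to make the expected shapes concrete (the names and data here are invented for illustration): X is N x M, y has length N, and attributeNames lists one name per column of X:

import numpy as np

rng = np.random.RandomState(0)
N, M = 200, 4
X = rng.randn(N, M)
y = 3 * X[:, 0] - 2 * X[:, 2] + 0.1 * rng.randn(N)  # synthetic target
forwardSelection(X, y, N, K=5,
                 attributeNames=['a1', 'a2', 'a3', 'a4'],
                 classNames=[])
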
Example #3
import numpy as np
import matplotlib.pyplot as plt
import neurolab as nl
import sklearn.linear_model as lm
from sklearn import model_selection
from matplotlib.pyplot import (figure, subplot, plot, title, xlabel, ylabel,
                               clim, show)
from toolbox_02450 import feature_selector_lr, bmplot  # course toolbox helpers
# split_train_test and significant_differnece are project helpers defined
# elsewhere in this repository.

def linear_reg(input_matrix, index, outer_cross_number, inner_cross_number):
    X, y = split_train_test(input_matrix, index)
    N, M = X.shape
    K = outer_cross_number

    attributeNames = [
        'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight (lbs)',
        'Acceleration (MPH)', 'Model year', 'Origin'
    ]
    # Drop the target attribute's name; copy the list first so the original
    # is not mutated (plain assignment would only alias it).
    temp = attributeNames[index]
    attributeNamesShorter = list(attributeNames)
    attributeNamesShorter.remove(temp)

    neurons = 1
    learning_goal = 25
    max_epochs = 64
    show_error_freq = 65
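    # neurolab training settings: 'learning_goal' is the error goal at which
    # training stops early, 'max_epochs' caps the iterations, and
    # 'show_error_freq' controls how often training progress is printed.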

    CV = model_selection.KFold(K, shuffle=True)

    Features = np.zeros((M, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_mean = np.empty((K, 1))
    Error_test_mean = np.empty((K, 1))
    Error_train_nn = np.empty((K, 1))
    Error_test_nn = np.empty((K, 1))
    k = 0
    for train_index, test_index in CV.split(X):
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        internal_cross_validation = inner_cross_number

        Error_train_mean[k] = np.square(
            y_train - y_train.mean()).sum() / y_train.shape[0]
        Error_test_mean[k] = np.square(y_test -
                                       y_test.mean()).sum() / y_test.shape[0]

        m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
        Error_train[k] = np.square(y_train -
                                   m.predict(X_train)).sum() / y_train.shape[0]
        Error_test[k] = np.square(y_test -
                                  m.predict(X_test)).sum() / y_test.shape[0]
        textout = ''
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation, display=textout)

        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        if len(selected_features) == 0:
            print(
                'No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).'
            )
        else:
            m = lm.LinearRegression(fit_intercept=True).fit(
                X_train[:, selected_features], y_train)
            Error_train_fs[k] = np.square(y_train - m.predict(
                X_train[:, selected_features])).sum() / y_train.shape[0]
            Error_test_fs[k] = np.square(y_test - m.predict(
                X_test[:, selected_features])).sum() / y_test.shape[0]

            # neurolab expects 2-D (N x 1) target arrays
            y_train_2 = y_train.reshape(-1, 1)
            y_test_2 = y_test.reshape(-1, 1)
            ann = nl.net.newff(
                [[-3, 3]] * M, [neurons, 1],
                [nl.trans.TanSig(), nl.trans.PureLin()])
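            # newff arguments: per-input ranges ([-3, 3] for each of the M
            # attributes, so inputs are assumed roughly standardized), layer
            # sizes [neurons, 1], and transfer functions (tanh hidden layer,
            # linear output for regression).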

            ann.train(X_train,
                      y_train_2,
                      goal=learning_goal,
                      epochs=max_epochs,
                      show=show_error_freq)
            y_est_train = ann.sim(X_train)
            y_est_test = ann.sim(X_test)

            Error_train_nn[k] = np.square(y_est_train -
                                          y_train_2).sum() / y_train.shape[0]
            Error_test_nn[k] = np.square(y_est_test -
                                         y_test_2).sum() / y_test.shape[0]

            figure()
            subplot(2, 1, 1)
            plot(y_train_2, y_est_train, '.')
            subplot(2, 1, 2)
            plot(y_test_2, y_est_test, '.')
            xlabel('MPG (true, normalized)')
            ylabel('MPG (estimated, normalized)')

        print('Cross validation fold {0}/{1}'.format(k + 1, K))
        print('Features no: {0}\n'.format(selected_features.size))

        k += 1

        figure(k)
        subplot(1, 2, 1)
        plot(range(1, len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')

        subplot(1, 3, 3)
        bmplot(attributeNamesShorter, range(1, features_record.shape[1]),
               -features_record[:, 1:])
        clim(-1.5, 0)
        xlabel('Iteration')

    print('Feature_select vs. ANN:')
    significant_differnece(Error_1=Error_test_fs, Error_2=Error_test_nn, K=K)
    print('Mean vs. ANN:')
    significant_differnece(Error_1=Error_test_mean, Error_2=Error_test_nn, K=K)
    print('Linear vs. ANN:')
    significant_differnece(Error_1=Error_test, Error_2=Error_test_nn, K=K)

    figure()
    plt.boxplot(np.hstack(
        (Error_test_nn, Error_test_fs, Error_test, Error_train_mean)))
    title('Normalized input/output')
    xlabel('ANN vs. Feature_selected vs. clean vs. mean')
    ylabel('Mean squared error')

    show()
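
significant_differnece is another project helper whose source is not shown. Given that it receives two vectors of per-fold test errors plus K, a plausible reading is a paired comparison over the K folds; below is a minimal sketch under that assumption (the function name, and the choice of a paired t-test, are assumptions, not the actual helper):

import numpy as np
from scipy import stats

def significant_difference_sketch(Error_1, Error_2, K):
    # Paired t-test on per-fold errors: are the two models' test errors
    # significantly different across the K cross-validation folds?
    e1 = np.asarray(Error_1).ravel()
    e2 = np.asarray(Error_2).ravel()
    z = e1 - e2
    t_stat, p_value = stats.ttest_rel(e1, e2)
    ci = stats.t.interval(0.95, K - 1, loc=z.mean(), scale=stats.sem(z))
    print('mean difference: {0:.4f}, 95% CI: {1}, p = {2:.4f}'.format(
        z.mean(), ci, p_value))
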
if addCombinations:
    plt.xticks(range(len(selected_features) + 1),
               ["", "Ca", "Si", "Al", "Ba", "K | Si"])
    plt.xlabel('iteration (attribute added)', fontsize=12)
else:
    plt.xticks(range(len(selected_features) + 1))
    plt.xlabel('iteration')

plt.ylabel('R^2 (crossvalidation)', fontsize=12)
plt.ylim(0, 1)
plt.grid(True)

if not addCombinations:
    plt.subplot(1, 3, 3)
    #Add the constant (no feature) evaluation to data
    bmplot(labels, range(features_record.shape[1]), -features_record)
    plt.clim(-1.5, 0)
    plt.xlabel('Iteration')

filename = "reg"

if addCombinations:
    filename += "_allCombi"

filename += "_Trans"

if transformMean:
    filename += "_Mean"

if transformStd:
    filename += "_Std"
    else:
        m = lm.LinearRegression(fit_intercept=True).fit(
            X_train[:, selected_features], y_train)
        Error_train_fs[k] = np.square(y_train - m.predict(
            X_train[:, selected_features])).sum() / y_train.shape[0]
        Error_test_fs[k] = np.square(y_test - m.predict(
            X_test[:, selected_features])).sum() / y_test.shape[0]

        figure(k)
        subplot(1, 2, 1)
        plot(range(1, len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')

        subplot(1, 3, 3)
        bmplot(label, range(1, features_record.shape[1]),
               -features_record[:, 1:])
        clim(-1.5, 0)
        xlabel('Iteration')

    print('Cross validation fold {0}/{1}'.format(k + 1, K))
    print('Train indices: {0}'.format(train_index))
    print('Test indices: {0}'.format(test_index))
    print('Features no: {0}\n'.format(selected_features.size))

    k += 1

# Display results
print('\n')
print('Linear regression without feature selection:\n')
print('- Training error: {0}'.format(Error_train.mean()))
print('- Test error:     {0}'.format(Error_test.mean()))
print('- R^2 train:     {0}'.format((Error_train_nofeatures.sum()-Error_train.sum())/Error_train_nofeatures.sum()))
print('- R^2 test:     {0}'.format((Error_test_nofeatures.sum()-Error_test.sum())/Error_test_nofeatures.sum()))

print('Linear regression with feature selection:\n')
print('- Training error: {0}'.format(Error_train_fs.mean()))
print('- Test error:     {0}'.format(Error_test_fs.mean()))
print('- R^2 train:     {0}'.format((Error_train_nofeatures.sum()-Error_train_fs.sum())/Error_train_nofeatures.sum()))
print('- R^2 test:     {0}'.format((Error_test_nofeatures.sum()-Error_test_fs.sum())/Error_test_nofeatures.sum()))

figure(k)
subplot(1,3,2)
bmplot(attributeNames, range(1,Features.shape[1]+1), -Features)
clim(-1.5,0)
xlabel('Crossvalidation fold')
ylabel('Attribute')

f=2 # cross-validation fold to inspect
ff=Features[:,f-1].nonzero()[0]
m = lm.LinearRegression().fit(X[:,ff], y)

# print "ff: " + str(ff)
# params = attributeNames[ff]
# coefficients = m.coef_
#
# for ind in range(len(ff)):
#     print params[ind] + ": " + str(coefficients[ind])
    # Compute squared error with feature subset selection
    selected_features, features_record, loss_record = feature_selector_lr(X_train, y_train, internal_cross_validation)
    Features[selected_features,k]=1
    # .. alternatively you could use module sklearn.feature_selection
    m = lm.LinearRegression().fit(X_train[:,selected_features], y_train)
    Error_train_fs[k] = np.square(y_train-m.predict(X_train[:,selected_features])).sum()/y_train.shape[0]
    Error_test_fs[k] = np.square(y_test-m.predict(X_test[:,selected_features])).sum()/y_test.shape[0]

    figure(k)
    subplot(1,2,1)
    plot(range(1,len(loss_record)), loss_record[1:])
    xlabel('Iteration')
    ylabel('Squared error (crossvalidation)')    
    
    subplot(1,3,3)
    bmplot(attributeNames, range(1,features_record.shape[1]), -features_record[:,1:])
    clim(-1.5,0)
    xlabel('Iteration')

    print('Cross validation fold {0}/{1}'.format(k+1,K))
    print('Train indices: {0}'.format(train_index))
    print('Test indices: {0}'.format(test_index))
    print('Features no: {0}\n'.format(selected_features.size))

    k+=1


# Display results
print('\n')
print('Linear regression without feature selection:\n')
print('- Training error: {0}'.format(Error_train.mean()))