def test_linear(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)

    truth = asarray(fit_models(LR, X, y))
    betas = LinearRegression().fit(X, y).betas.toarray()
    assert allclose(truth, betas)

    truth = asarray(fit_models(LR, X, y, fit_intercept=False))
    betas = LinearRegression(fit_intercept=False).fit(X, y).betas.toarray()
    assert allclose(truth, betas)
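
These tests compare a distributed LinearRegression against per-target reference fits; the fit_models/LR helpers are not shown in this collection. A minimal, self-contained sketch of the same comparison pattern, assuming scikit-learn as the reference implementation:

import numpy as np
from sklearn.linear_model import LinearRegression as SkLR

X = np.random.randn(10, 2)
Y = np.random.randn(10, 4)  # four targets, one regression per column

# Fit one reference model per target and stack the coefficients, as the
# fit_models helper above presumably does.
betas = np.vstack([SkLR().fit(X, Y[:, i]).coef_ for i in range(Y.shape[1])])
assert betas.shape == (4, 2)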
Example #2
def __build_model(self, var_idx, method='forward'):
    linear_reg = LinearRegression(gradient_descent=False)
    if method == 'forward':
        candidate_features = self.__best_features + [var_idx]
    elif method == 'backward':
        candidate_features = deepcopy(self.__best_features)
        candidate_features.remove(var_idx)
    X = self.X[:, candidate_features]
    linear_reg.fit(X, self.y)
    y_preds = [linear_reg.predict(x) for x in X]
    return calculate_r2(self.y, y_preds)
def test_predict_and_score(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)

    model = LinearRegression().fit(X, y)

    yhat = model.predict(X).toarray()
    rsq = model.score(X, y).toarray()
    truth = hstack([yhat, rsq[:, newaxis]])

    result = model.predict_and_score(X, y).toarray()

    assert allclose(truth, result)
def test_predict(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)

    truth = asarray(predict_models(LR, X, y))
    predictions = LinearRegression().fit(X, y).predict(X).toarray()
    assert allclose(truth, predictions)
def test_score(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)

    truth = asarray(score_models(LR, X, y))
    scores = LinearRegression().fit(X, y).score(X, y).toarray()
    assert allclose(truth, scores)
def single_linear_regression_model(single_linear_regression_data):
    linear_regression_model = LinearRegression(
        independent_vars=single_linear_regression_data["independent_vars"],
        dependent_var=single_linear_regression_data["dependent_var"],
        iterations=10000,
        learning_rate=0.001,
        train_split=0.7,
        seed=123,
    )
    return linear_regression_model
def test_betas_and_scores(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)

    true_betas = asarray(fit_models(LR, X, y))
    true_scores = asarray(score_models(LR, X, y))
    truth = hstack([true_betas, true_scores[:, newaxis]])

    result = LinearRegression().fit(X, y).betas_and_scores.toarray()

    assert allclose(truth, result)
def build_tree(x_data, y_data):
    feature, split_val = best_split(x_data, y_data)
    node = TreeNode(feature, split_val)
    if feature is None:
        node.model = LinearRegression()
        node.model.fit(x_data, y_data)
    else:
        idx = (x_data[:, feature] <= split_val)
        node.left = build_tree(x_data[idx], y_data[idx])
        node.right = build_tree(x_data[~idx], y_data[~idx])
    return node
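
build_tree fits a plain LinearRegression at each leaf; a matching predict would walk the splits down to a leaf model. A sketch only, assuming TreeNode follows the conventions above (leaf models stored on node.model, absent on internal nodes):

def tree_predict(node, x):
    # Leaves carry a fitted linear model; internal nodes route by their split.
    model = getattr(node, 'model', None)
    if model is not None:
        return model.predict(x.reshape(1, -1))
    if x[node.feature] <= node.split_val:
        return tree_predict(node.left, x)
    return tree_predict(node.right, x)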
Example #10
def build_models(self, df):
    self.n, self.p = df.shape
    performances = []
    for k in range(1, self.p):
        for var_combo in itertools.combinations(df.columns[:-1], k):
            linear_reg = LinearRegression()
            X = np.asarray(df[list(var_combo)])
            self.y = np.asarray(df.iloc[:, -1])
            linear_reg.fit(X, self.y)
            y_preds = [linear_reg.predict(x) for x in X]
            adj_r2, aic, bic, r2, rss = self.__calculate_criterions(
                y_preds, k)
            performances.append([var_combo, k, aic, bic, rss, r2, adj_r2])
    # Build the summary once, after all subsets have been evaluated.
    col_names = [
        'subset', 'num_of_variables', 'aic', 'bic', 'rss', 'r2', 'adj_r2'
    ]
    self.models_summary = pd.DataFrame(performances, columns=col_names)
    self.__visualize_best_subset_performance()
def linear_regression(self):
    data_preprocessor = DataPreProcessor(self.df_variable, self.pd_variable)  # preprocess.DataPreProcessor instance
    data_preprocessor_for_task = DataPreProcessorForTask(data_preprocessor)  # wrap it in preprocess.DataPreProcessorForTask
    data_preprocessor_for_task.preprocess_for_linear_regression()  # preprocessing specific to linear regression
    regression_pd_variable = data_preprocessor.pd_variable
    regression_df_variable = data_preprocessor.df_variable

    linear_regression = LinearRegression(regression_df_variable)  # regression.LinearRegression on the preprocessed DataFrame
    linear_regression.regression()  # run the regression
    coefficients = linear_regression.coefficients  # fitted coefficients
    intercept = linear_regression.intercept  # fitted intercept
    liner_equation = linear_regression.liner_equation  # fitted linear equation (attribute name as defined by the class)
    mean_squared_error = linear_regression.mean_squared_error  # regression MSE

    messagebox.showinfo("liner_equation", liner_equation)  # show the linear equation in a message box
    messagebox.showinfo("mean_squared_error", mean_squared_error)  # show the MSE in a message box

    print(coefficients)
    print(intercept)
    print(liner_equation)
    print(mean_squared_error)
Example #12
def run(self):
    x, y = self.__readData()
    model = LinearRegression()
    model.fit(x, y)
    y_predicted = [model.sumForRow(row) for row in x]

    # this plot is just to make sure we get values of a linear function
    plt.scatter(y, y_predicted, c='r')
    plt.show()

    mean_error = model.error(y_predicted, y)
    return mean_error
Example #13
def regression():
    # LinearRegression
    resultString = ''
    input1 = ''
    input2 = ''
    output1 = ''

    linear_regression = LinearRegression()
    if request.method == "POST":
        input1 = request.form.get('input1')
        print("input1", input1)
        input_x_list = linear_regression.regression_string_2_list(input1)
        print("input_x_list", input_x_list)

        input2 = request.form.get('input2')
        print("input2", input2)
        input_y_list = linear_regression.regression_string_2_list(input2)
        print("input_y_list", input_y_list)

        linear_regression.model(input_x_list, input_y_list)

        output1 = request.form.get('output1')
        print("output1", output1)
        predict_x_list = linear_regression.regression_string_2_list(output1)
        print("predict_x_list", predict_x_list)

        predict_y_list = linear_regression.predict(predict_x_list)
        print("predict_y_list", predict_y_list)

        resultString = ', '.join(str(y) for y in predict_y_list)
        print("resultString", resultString)
        return render_template('regression.html', title='Regression', resultString=resultString,
                               input1=input1, input2=input2, output1=output1)

    return render_template('regression.html', title='Regression', resultString=resultString,
                           input1=input1, input2=input2, output1=output1)
Example #14
def main():
    X, y = make_regression(n_samples=100, n_features=1, noise=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    n_samples, n_features = np.shape(X)

    model = LinearRegression(n_iterations=100)

    model.fit(X_train, y_train)

    # Training error plot
    n = len(model.training_errors)
    training, = plt.plot(range(n), model.training_errors, label="Training Error")
    plt.legend(handles=[training])
    plt.title("Error Plot")
    plt.ylabel('Mean Squared Error')
    plt.xlabel('Iterations')
    plt.show()

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print ("Mean squared error: %s" % (mse))

    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    # Plot the results
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    plt.plot(366 * X, y_pred_line, color='b', linewidth=2, label="Prediction")
    plt.suptitle("Linear Regression")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2), ("Training data", "Test data"), loc='lower right')
    plt.show()
def main():
    X, y = make_regression(n_samples=100, n_features=1, noise=20)

    x_train, x_test, y_train, y_test = DataManipulation().train_test_split(
        X, y, test_size=0.4)
    n_samples, n_features = X.shape
    model = LinearRegression()
    model.fit(x_train, y_train)

    n = len(model.errors)
    training = plt.plot(range(n), model.errors, label='Training Errors')
    plt.title('Error plot')
    plt.xlabel('Iteration')
    plt.ylabel('Mean Squared Error')
    plt.show()

    y_pred = model.predict(x_test)

    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    # Plot the results
    m1 = plt.scatter(366 * x_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * x_test, y_test, color=cmap(0.5), s=10)
    plt.plot(366 * X,
             y_pred_line,
             color='black',
             linewidth=2,
             label="Prediction")
    plt.suptitle("Linear Regression")
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2), ("Training data", "Test data"), loc='lower right')
    plt.show()
Example #16
data = pd.read_csv('house.csv')

# Scale each feature to [0, 1] by its maximum (and avoid shadowing the built-in max).
size_max = data['size'].max()
data['size'] = data['size'].apply(lambda x: x / size_max)

bedroom_max = data['bedroom'].max()
data['bedroom'] = data['bedroom'].apply(lambda x: x / bedroom_max)

size = data['size'].values
bedroom = data['bedroom'].values
price = data['price'].values

X = np.array([np.ones(len(size)), size, bedroom]).T
Y = np.array(price)

regression = LinearRegression(alpha=0.006, iteration=1000, feature_count=2)
regression.fit(X, Y)
regression.plot()

while True:
    size = int(input("Enter size of house:"))
    bedroom = int(input("Enter number of bedroom:"))
    # Scale user input the same way as the training features before predicting.
    print("price:", int(regression.predict(np.array([1, size / size_max, bedroom / bedroom_max]))))
Example #17
def __get_full_model_r2(self):
    linear_reg = LinearRegression(gradient_descent=False)
    linear_reg.fit(self.X, self.y)
    y_preds = [linear_reg.predict(x) for x in self.X]
    return calculate_r2(self.y, y_preds)
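
Several snippets here score fits with a calculate_r2 helper that is not shown; a standard R² along these lines would fit (a sketch, not the repository's actual code):

import numpy as np

def calculate_r2(y_true, y_preds):
    y_true = np.asarray(y_true, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    ss_res = np.sum((y_true - y_preds) ** 2)        # residual sum of squares
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)  # total sum of squares
    return 1.0 - ss_res / ss_tot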
floor_max = data['floor'].max()
data['floor'] = data['floor'].apply(lambda x: x / floor_max)

top_floor_max = data['top_floor'].max()
data['top_floor'] = data['top_floor'].apply(lambda x: x / top_floor_max)

price = data['price'].values

X = np.array([np.ones(len(price)), data['size'].values, data['room'].values,
              data['year'].values, data['floor'].values, data['top_floor'].values]).T
Y = np.array(price)

regression = LinearRegression(alpha=0.000001, iteration=300, feature_count=5)
regression.fit(X, Y)
regression.plot()

# Expected price followed by the model's prediction, for two sample listings.
# Note: inputs to predict should be scaled the same way as the training columns.
print("price:", "13800000")
print("price:", int(regression.predict(np.array([1, 45, 2, 1977, 5, 5]))))
print()
print("price:", "15333333")
print("price:", int(regression.predict(np.array([1, 40, 2, 1967, 1, 4]))))

Example #19
# Fake training set for property rental with feature scaling
training_set_rent = np.array([
    [(1 - 2.5) / 5, (500 - 500) / 1000, (24000 - 50000) / 100000],
    [(1 - 2.5) / 5, (1000 - 500) / 1000, (22000 - 50000) / 100000],
    [(1 - 2.5) / 5, (1000 - 500) / 1000, (22000 - 50000) / 100000],
    [(1 - 2.5) / 5, (1000 - 500) / 1000, (22000 - 50000) / 100000],
    [(2 - 2.5) / 5, (500 - 500) / 1000, (32000 - 50000) / 100000],
    [(2 - 2.5) / 5, (1000 - 500) / 1000, (29000 - 50000) / 100000],
    [(3 - 2.5) / 5, (500 - 500) / 1000, (40000 - 50000) / 100000],
    [(1 - 2.5) / 5, (500 - 500) / 1000, (24000 - 50000) / 100000],
    [(1 - 2.5) / 5, (1000 - 500) / 1000, (22000 - 50000) / 100000],
    [(2 - 2.5) / 5, (500 - 500) / 1000, (32000 - 50000) / 100000],
    [(2 - 2.5) / 5, (1000 - 500) / 1000, (29000 - 50000) / 100000],
    [(3 - 2.5) / 5, (500 - 500) / 1000, (40000 - 50000) / 100000],
])

# Create and train LR with gradient
linear = LinearRegression(training_set_rent)
linear.train_gradient()

linear.show_info()

# Try to predict something
print(linear.hypothesis([(1 - 2.5) / 5, (1000 - 500) / 1000]) * 100000 + 50000)

# Create and train LR with normal equation
linear_with_normal = LinearRegression(training_set_rent)
linear_with_normal.normal_equation()

linear_with_normal.show_info()
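
The hand-written constants above implement (value - offset) / range scaling per column; the same scaling can be computed once and applied to raw rows, as in this sketch (offsets and ranges read off the constants above):

import numpy as np

raw = np.array([
    [1, 500, 24000],
    [2, 1000, 29000],
    [3, 500, 40000],
], dtype=float)
center = np.array([2.5, 500.0, 50000.0])   # per-column offsets
scale = np.array([5.0, 1000.0, 100000.0])  # per-column ranges
scaled = (raw - center) / scale            # reproduces rows of training_set_rent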
Example #20
def main():
    #from sklearn.datasets import load_boston
    #boston = load_boston()
    #print(boston.data.shape)
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG if DEBUG else logging.INFO)
    dataFile = "data/housing.data"

    col_names = ["crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "b", "lstat", "medv"]
    train_df = pd.read_csv(dataFile, names = col_names, delim_whitespace = True)
    test_df = train_df.iloc[::7, :]
    train_df.drop(train_df.index[::7], inplace=True)

    train_df_features = train_df.iloc[:, :-1]
    train_df_targets = train_df.iloc[:, -1]
    test_df_features = test_df.iloc[:, :-1]
    test_df_targets = test_df.iloc[:, -1]

    # Data analysis
    print("Data analysis:")
    print("No. of attributes: ", len(train_df.iloc[0]))
    print("No. of features usable for classifcation: ", len(train_df.iloc[0])-1)
    print("Size of training data: ", len(train_df))
    print("Size of testing data: ", len(test_df))
    print("Histogram of attributes will be shown at the end of generating all results")

    print("\nPearson correlations:")
    target_col = col_names[-1]
    for col in col_names:
        if col.lower() == 'chas': # categorical. Also, see dtypes
            continue
        print("Correlation of %s with target(%s): %f" % (col, target_col, train_df[[col, target_col]].corr(method='pearson').iloc[0,1]))

    normalizer = DataFrameStdNormalizer(train_df_features)
    train_df_features_normalized = normalizer.get_normalized_data(train_df_features)
    test_df_features_normalized = normalizer.get_normalized_data(test_df_features)

    print("\n*********************Linear Regression*******************")
    regmodel = LinearRegression()
    eval = ModelEvaluator(regmodel)
    regmodel.train(train_df_features_normalized, train_df_targets)
    trainingError = eval.mean_squared_error(train_df_features_normalized, train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" % eval.mean_squared_error(test_df_features_normalized, test_df_targets))

    print("\n***********Ridge regression with lambda 0.01m 0.1, 1.0***************")
    for lambdaval in (0.01, 0.1, 1.0):
        regmodel = RidgeRegression(lambdaval)
        eval = ModelEvaluator(regmodel)
        regmodel.train(train_df_features_normalized, train_df_targets)
        trainingError = eval.mean_squared_error(train_df_features_normalized, train_df_targets)
        testingError = eval.mean_squared_error(test_df_features_normalized, test_df_targets)
        print("Ridge regression model with lambda = %f" % lambdaval)
        print("Mean squared error on training data = %f" % trainingError)
        print("Mean squared error on test data = %f" % testingError)
        print("")

    print("\n*********************Cross Validation*******************")
    # Shuffle data (use the shuffled frame, not the original, for the splits)
    shuffled_train_df = train_df.reindex(np.random.permutation(train_df.index))
    shuffled_train_df_features = shuffled_train_df.iloc[:, :-1]
    shuffled_train_df_targets = shuffled_train_df.iloc[:, -1]
    shuffled_train_df_features_normalized = (DataFrameStdNormalizer(shuffled_train_df_features)).get_normalized_data(shuffled_train_df_features)

    lambda_error_map = {}
    chunksize = len(train_df) // 10  # integer division: slice bounds must be ints
    for i in range(0, 6):
        lambdaval = 10.0 / (10 ** i)
        # 10-fold cross validation
        mean_cv_error = 0
        regmodel = RidgeRegression(lambdaval)
        eval = ModelEvaluator(regmodel)
        for fold in range(0, 10):
            start, stop = fold * chunksize, (fold + 1) * chunksize
            test_df_cv = shuffled_train_df_features_normalized.iloc[start:stop]
            test_df_cv_targets = shuffled_train_df_targets.iloc[start:stop]

            train_df_cv = shuffled_train_df_features_normalized.drop(shuffled_train_df_features_normalized.index[start:stop])
            train_df_cv_targets = shuffled_train_df_targets.drop(shuffled_train_df_targets.index[start:stop])
            regmodel.train(train_df_cv, train_df_cv_targets)
            mean_cv_error += eval.mean_squared_error(test_df_cv, test_df_cv_targets)
        mean_cv_error /= 10
        print("MSE for lambda %f = %f" % (lambdaval, mean_cv_error))
        lambda_error_map[lambdaval] = mean_cv_error

    lambdabest = min(lambda_error_map, key=lambda_error_map.get)
    print("Lowest MSE for lambda = %f" % lambdabest)
    regmodel = RidgeRegression(lambdabest)
    regmodel.train(train_df_features_normalized, train_df_targets)
    eval = ModelEvaluator(regmodel)
    test_meansquarederror = eval.mean_squared_error(test_df_features_normalized, test_df_targets)
    print("Test error for model with lambda %f = %f" % (lambdabest, test_meansquarederror))
    print("")

    print("\n*********************Feature Selection*******************")
    print("*********************i. Max correlation*******************")
    target_col = col_names[-1]
    corr = {}
    for col in col_names:
        if col.lower() == 'chas': # categorical. Also, see dtypes
            continue
        corr[col] = abs(train_df[[col, target_col]].corr(method='pearson').iloc[0,1])
    maxcorrcols = heapq.nlargest(5, corr, key=corr.get)[1:]
    print("Selecting the following coluns with max correlation: ")
    print(maxcorrcols)
    # These columns are taken from the unnormalized features, so name them accordingly.
    train_df_features_maxcorr = train_df_features[maxcorrcols]
    regmodel = LinearRegression()
    regmodel.train(train_df_features_maxcorr, train_df_targets)
    eval = ModelEvaluator(regmodel)
    trainingError = eval.mean_squared_error(train_df_features[maxcorrcols], train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" % eval.mean_squared_error(test_df_features[maxcorrcols], test_df_targets))

    print("*******************ii. Max correlation with residue*****************")
    residue = train_df_targets.copy(deep=True)
    cols = []
    regmodel = LinearRegression()
    eval = ModelEvaluator(regmodel)
    for i in range(0, 4):
        corr = {}
        for col in col_names:
            if col.lower() in ('medv', 'chas') or col in cols: # categorical. Also, see dtypes
                continue
            # corr[col] = train_df[[col]].corrwith(residue).iloc[0]
            corr[col] = abs(pd.concat([train_df[[col]], residue], axis = 1).corr(method='pearson').iloc[0,1])
        maxcorrcol = max(corr, key=corr.get)
        cols.append(maxcorrcol)
        print("Taking cols: %s" % maxcorrcol)
        regmodel.train(train_df_features[cols], train_df_targets)
        # Update the residue; use a fresh index variable so the outer loop's is untouched.
        for j in range(0, len(residue)):
            residue.at[residue.index[j]] = train_df_targets.iloc[j] - regmodel.predict(train_df_features[cols].iloc[j])
        print("Mean squared error on train data: %f" % eval.mean_squared_error(train_df_features[cols], train_df_targets))
    print("Mean squared error on test data: %f" % eval.mean_squared_error(test_df_features[cols], test_df_targets))

    print("*********************iii. All 4 feature combinations*******************")
    bestcols = None
    besttrainmse = float('inf')
    regmodel = LinearRegression()
    eval = ModelEvaluator(regmodel)
    for cols in (list(x) for x in itertools.combinations(train_df_features_normalized.columns, 4)):
        regmodel.train(train_df_features_normalized[cols], train_df_targets)
        mse_train = eval.mean_squared_error(train_df_features_normalized[cols], train_df_targets)
        if mse_train < besttrainmse:
            bestcols = cols
            besttrainmse = mse_train
    print("Best training MSE = %f for columns:" % besttrainmse)
    print(bestcols)
    regmodel.train(train_df_features_normalized[bestcols], train_df_targets)
    print("Testing MSE of this model: %f" % eval.mean_squared_error(test_df_features_normalized[bestcols], test_df_targets))

    print("\n*********************Feature Expansion*******************")
    df_train_featuregen = train_df_features_normalized.copy(deep=True)
    df_test_featuregen = test_df_features_normalized.copy(deep=True)
    # Add pairwise products and per-column squares as generated features.
    for cols in list(list(x) for x in itertools.combinations(train_df_features_normalized.columns, 2)) + [[col, col] for col in train_df_features_normalized.columns]:
        df_train_featuregen[cols[0]+cols[1]] = df_train_featuregen[cols[0]] * df_train_featuregen[cols[1]]
        df_test_featuregen[cols[0]+cols[1]] = df_test_featuregen[cols[0]] * df_test_featuregen[cols[1]]
    regmodel = LinearRegression()
    regmodel.train(df_train_featuregen, train_df_targets)
    eval = ModelEvaluator(regmodel)
    trainingError = eval.mean_squared_error(df_train_featuregen, train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" % eval.mean_squared_error(df_test_featuregen, test_df_targets))

    print("\n******************************** Showing histogram of attributes********************************")
    Histogrammer.plot_histgram_of_features(train_df, 3, 5)
    print("\nClose window to terminate")
    plt.show()
    return
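
The manual chunk arithmetic in the cross-validation loop above is easy to get wrong; numpy's array_split yields the same k nearly-equal folds from shuffled row indices (a sketch of the idea, not this script's code):

import numpy as np

def kfold_indices(n_rows, k=10, seed=0):
    rng = np.random.default_rng(seed)
    order = rng.permutation(n_rows)
    return np.array_split(order, k)  # k nearly-equal folds of row indices

# Each fold serves once as the held-out set; the remaining indices train.
folds = kfold_indices(100, k=10)
assert sum(len(f) for f in folds) == 100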
Example #21
def test_simple_linear_regression(self):
    self._run_simple_linear_regression_for_model(LinearRegression())
Example #22
def test_linear_regression():
    # define sample data
    xs = np.linspace(0, 2, 50).astype(np.float32)[:, np.newaxis]
    noise = np.random.normal(0, 0.5, xs.shape).astype(np.float32)
    ys = xs + noise
    # define figure
    fig = plt.figure('linear regression')
    # show the sample data
    ax = fig.add_subplot(1, 1, 1, title='linear regression')
    ax.scatter(xs, ys, c='r', label='sample points')
    # show real curve
    _ys = xs
    ax.plot(xs, _ys, 'g', label='real curve')
    # define linear regression operation
    sgd_optimizer = GradientDescentOptimizer(0.008, 1)
    linear_regression = LinearRegression(xs, ys, sgd_optimizer)
    # start regression
    ax.plot(xs, linear_regression.predict(xs), 'b', label='regression curve')
    plt.legend(loc='upper left')
    plt.ion()
    for step in range(50):
        plt.pause(0.01)
        linear_regression.regress_once()
        ax.lines.pop()
        ax.plot(xs,
                linear_regression.predict(xs),
                'b',
                label='regression curve')
        print('step {}: loss = {}'.format(step,
                                          linear_regression.compute_loss()))

    print('\n', '-' * 20, 'optimize end', '-' * 20, '\n')

    print('weights : \n{}'.format(linear_regression.weights))
    print('loss : \n{}'.format(linear_regression.compute_loss()))
    print('-' * 50, '\n')

    print('compute directly: ')
    linear_regression.weights = LinearRegression.compute_weights(xs, ys)
    print('weights: \n{}'.format(linear_regression.weights))
    print('loss: \n{}'.format(linear_regression.compute_loss()))
    print('-' * 50, '\n')

    print('real value: ')
    linear_regression.weights = np.array([1, 0], dtype=np.float32)[:,
                                                                   np.newaxis]
    print('weights: \n{}'.format(
        np.array([1, 0], dtype=np.float32)[:, np.newaxis]))
    print('loss : \n{}'.format(linear_regression.compute_loss()))
    print('-' * 50, '\n')

    ax.plot(xs, linear_regression.predict(xs), 'g', label='real curve')

    plt.ioff()
    plt.savefig('linear regression.png')
    plt.show()
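
LinearRegression.compute_weights above returns closed-form weights; given the two-element weight vector (slope, then intercept) used in this test, a bias-augmented least-squares solve would look like this sketch (an assumption about its internals):

import numpy as np

def compute_weights(xs, ys):
    # Append a bias column, then solve the least-squares problem directly.
    X = np.hstack([xs, np.ones((xs.shape[0], 1), dtype=xs.dtype)])
    w, *_ = np.linalg.lstsq(X, ys, rcond=None)
    return w  # shape (2, 1): slope and intercept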
"""
  @author Victor I. Afolabi
  A.I. Engineer & Software developer
  [email protected]
  Created on 26 August, 2017 @ 9:33 PM.
  Copyright (c) 2017. victor. All rights reserved.
"""

# Create a LinearRegression object
from regression import LinearRegression
import numpy as np

data = np.genfromtxt('data.csv', delimiter=',')
num_iter = 1000

clf = LinearRegression(learning_rate=1e-4)
clf.fit(data=data, num_iter=num_iter)
print('After {:,} iterations. m = {:.2f} and b = {:.2f}'.format(num_iter, clf.m, clf.b))
def linear_solve(x_data, y_data):
    model = LinearRegression()
    model.fit(x_data, y_data)
    return model.predict(x_data)
import pandas as pd
import numpy as np
from regression import LinearRegression

df = pd.read_csv("/Users/yliang/data/trunk1/spark/assembly/target/tmp/LinearRegressionSuite/datasetWithDenseFeature2/part-00000", header = None)
X = np.array(df[df.columns[1:3]])
y = np.array(df[df.columns[0]])
lir = LinearRegression(fit_intercept=True, alpha=2.3, max_iter=100, tol=1e-06, standardization=False,
					   lower_bound=[-np.inf, 6.0, -np.inf], upper_bound=[0.0, 10.0, np.inf])
lir.fit(X, y)
print("coefficients = " + str(lir.coef_))
print("intercept = " + str(lir.intercept_))
def __test_bootstrap_fit():
    # A small implementation of a test case
    from regression import LinearRegression

    N_bs = 1000

    # Initial values
    n = 200
    noise = 0.2
    np.random.seed(1234)
    test_percent = 0.35

    # Sets up random matrices
    x = np.random.rand(n, 1)

    def func_exact(_x):
        return 2*_x*_x + np.exp(-2*_x) + noise * \
            np.random.randn(_x.shape[0], _x.shape[1])

    y = func_exact(x)

    def design_matrix(_x):
        return np.c_[np.ones(_x.shape), _x, _x*_x]

    # Sets up design matrix
    X = design_matrix(x)

    # Performs regression
    reg = LinearRegression()
    reg.fit(X, y)
    y = y.ravel()
    y_predict = reg.predict(X).ravel()
    print("Regular linear regression")
    print("R2:  {:-20.16f}".format(reg.score(y_predict, y)))
    print("MSE: {:-20.16f}".format(metrics.mse(y, y_predict)))
    print("Beta:      ", reg.coef_.ravel())
    print("var(Beta): ", reg.coef_var.ravel())
    print("")

    # Performs a bootstrap
    print("Bootstrapping")
    bs_reg = BootstrapRegression(x, y, LinearRegression, design_matrix)
    bs_reg.bootstrap(N_bs, test_percent=test_percent)

    print("R2:    {:-20.16f}".format(bs_reg.R2))
    print("MSE:   {:-20.16f}".format(bs_reg.MSE))
    print("Bias^2:{:-20.16f}".format(bs_reg.bias))
    print("Var(y):{:-20.16f}".format(bs_reg.var))
    print("Beta:      ", bs_reg.coef_.ravel())
    print("var(Beta): ", bs_reg.coef_var.ravel())
    print("MSE = Bias^2 + Var(y) = ")
    print("{} = {} + {} = {}".format(bs_reg.MSE, bs_reg.bias, bs_reg.var,
                                     bs_reg.bias + bs_reg.var))
    print("Diff: {}".format(abs(bs_reg.bias + bs_reg.var - bs_reg.MSE)))

    import matplotlib.pyplot as plt
    plt.plot(x.ravel(), y, "o", label="Data")
    plt.plot(x.ravel(), y_predict, "o", 
        label=r"Pred, R^2={:.4f}".format(reg.score(y_predict, y)))
    print (bs_reg.y_pred.shape, bs_reg.y_pred_var.shape)
    plt.errorbar(bs_reg.x_pred_test, bs_reg.y_pred, 
        yerr=np.sqrt(bs_reg.y_pred_var), fmt="o", 
        label=r"Bootstrap Prediction, $R^2={:.4f}$".format(bs_reg.R2))
    plt.xlabel(r"$x$")
    plt.ylabel(r"$y$")
    plt.title(r"$2x^2 + \sigma^2$")
    plt.legend()
    plt.show()
Example #27
def __test_cross_validation_methods():
    # A small implementation of a test case
    from regression import LinearRegression
    import matplotlib.pyplot as plt

    # Initial values
    n = 100
    N_bs = 1000
    k_splits = 4
    test_percent = 0.2
    noise = 0.3
    np.random.seed(1234)

    # Sets up random matrices
    x = np.random.rand(n, 1)

    def func_exact(_x):
        return 2*_x*_x + np.exp(-2*_x) + noise * \
            np.random.randn(_x.shape[0], _x.shape[1])

    y = func_exact(x)

    def design_matrix(_x):
        return np.c_[np.ones(_x.shape), _x, _x * _x]

    # Sets up design matrix
    X = design_matrix(x)

    # Performs regression
    reg = LinearRegression()
    reg.fit(X, y)
    y = y.ravel()
    y_predict = reg.predict(X).ravel()
    print("Regular linear regression")
    print("R2:    {:-20.16f}".format(reg.score(y, y_predict)))
    print("MSE:   {:-20.16f}".format(metrics.mse(y, y_predict)))
    # print (metrics.bias(y, y_predict))
    print("Bias^2:{:-20.16f}".format(metrics.bias2(y, y_predict)))

    # Small plotter
    plt.plot(x, y, "o", label="data")
    plt.plot(x,
             y_predict,
             "o",
             label=r"Pred, $R^2={:.4f}$".format(reg.score(y, y_predict)))

    print("k-fold Cross Validation")
    kfcv = kFoldCrossValidation(x, y, LinearRegression, design_matrix)
    kfcv.cross_validate(k_splits=k_splits, test_percent=test_percent)
    print("R2:    {:-20.16f}".format(kfcv.R2))
    print("MSE:   {:-20.16f}".format(kfcv.MSE))
    print("Bias^2:{:-20.16f}".format(kfcv.bias))
    print("Var(y):{:-20.16f}".format(kfcv.var))
    print("MSE = Bias^2 + Var(y) = ")
    print("{} = {} + {} = {}".format(kfcv.MSE, kfcv.bias, kfcv.var,
                                     kfcv.bias + kfcv.var))
    print("Diff: {}".format(abs(kfcv.bias + kfcv.var - kfcv.MSE)))

    plt.errorbar(kfcv.x_pred_test,
                 kfcv.y_pred,
                 yerr=np.sqrt(kfcv.y_pred_var),
                 fmt="o",
                 label=r"k-fold CV, $R^2={:.4f}$".format(kfcv.R2))

    print("kk Cross Validation")
    kkcv = kkFoldCrossValidation(x, y, LinearRegression, design_matrix)
    kkcv.cross_validate(k_splits=k_splits, test_percent=test_percent)
    print("R2:    {:-20.16f}".format(kkcv.R2))
    print("MSE:   {:-20.16f}".format(kkcv.MSE))
    print("Bias^2:{:-20.16f}".format(kkcv.bias))
    print("Var(y):{:-20.16f}".format(kkcv.var))
    print("MSE = Bias^2 + Var(y) = ")
    print("{} = {} + {} = {}".format(kkcv.MSE, kkcv.bias, kkcv.var,
                                     kkcv.bias + kkcv.var))
    print("Diff: {}".format(abs(kkcv.bias + kkcv.var - kkcv.MSE)))

    plt.errorbar(kkcv.x_pred_test.ravel(),
                 kkcv.y_pred.ravel(),
                 yerr=np.sqrt(kkcv.y_pred_var.ravel()),
                 fmt="o",
                 label=r"kk-fold CV, $R^2={:.4f}$".format(kkcv.R2))

    print("Monte Carlo Cross Validation")
    mccv = MCCrossValidation(x, y, LinearRegression, design_matrix)
    mccv.cross_validate(N_bs, k_splits=k_splits, test_percent=test_percent)
    print("R2:    {:-20.16f}".format(mccv.R2))
    print("MSE:   {:-20.16f}".format(mccv.MSE))
    print("Bias^2:{:-20.16f}".format(mccv.bias))
    print("Var(y):{:-20.16f}".format(mccv.var))
    print("MSE = Bias^2 + Var(y) = ")
    print("{} = {} + {} = {}".format(mccv.MSE, mccv.bias, mccv.var,
                                     mccv.bias + mccv.var))
    print("Diff: {}".format(abs(mccv.bias + mccv.var - mccv.MSE)))

    print("\nCross Validation methods tested.")

    plt.errorbar(mccv.x_pred_test,
                 mccv.y_pred,
                 yerr=np.sqrt(mccv.y_pred_var),
                 fmt="o",
                 label=r"MC CV, $R^2={:.4f}$".format(mccv.R2))

    plt.xlabel(r"$x$")
    plt.ylabel(r"$y$")
    plt.title(r"$y=2x^2$")
    plt.legend()
    plt.show()
        action="store_true")
    parser.add_argument(
        "--loo",
        help=
        "Calculate leave-one-out error. Will have an adverse effect on run-time.",
        action="store_true")
    return parser.parse_args()


if __name__ == '__main__':
    args = parseArguments()
    settings = dict()
    settings['plot'] = not args.noPlot
    settings['loo'] = args.loo

    if args.type == 'linear':
        # Linear Regression
        LinearRegression(calculate_error=settings.get('loo'),
                         plot_now=settings.get('plot'))
    elif args.type == 'logistic':
        # Logistic Regression
        LogisticRegression(calculate_error=settings.get('loo'),
                           plot_now=settings.get('plot'))