# Reshape the samples and the targets into single-column 2-D arrays.
X = X.reshape(-1, 1)
y = y.reshape(-1, 1)
# Create the model and fit it on the training data in one step
# (fit() returns the estimator itself).
model = LinearRegression().fit(X, y)
# Fitted parameters:
# intercept_ --> theta0 and coef_ --> list of [theta1, theta2, ...]
print(model.intercept_, model.coef_)
# Coefficient of determination (R^2) on the training data
print(model.score(X, y))
# Predictions for x = 3.5 and x = 7.0, supplied as a 2x1 column
print(model.predict(np.array([[3.5], [7.0]])))

# Multi-feature training set
fname = join(HOME, path, './multi-feature.txt')
# Each comma-separated column of the file is unpacked into its own array.
X1, X2, y = np.loadtxt(fname, delimiter=',', unpack=True)
# np.c_ places the two feature columns side by side, which already yields
# an (n_samples, 2) matrix, so no additional reshape is required.
X = np.c_[X1, X2]
y = y.reshape(-1, 1)
# NOTE: assuming `model` is scikit-learn's LinearRegression, its `normalize`
# option was deprecated in 1.0 and removed in 1.2, where assigning
# `model.normalize` is silently ignored. Enable it through set_params() only
# when the installed estimator still exposes the parameter.
if 'normalize' in model.get_params():
    model.set_params(normalize=True)
model.fit(X, y)
print(model.intercept_, model.coef_, model.score(X, y))
# Predict for one sample with feature values (1650, 3), given as a 1x2 row.
print(model.predict(np.array([[1650, 3]])))

# Using Polynomial features
# Expand X with a degree-2 PolynomialFeatures transformer (bias column
# included), then refit the same model on the expanded matrix.
poly = PolynomialFeatures(degree=2, include_bias=True)
X_ = poly.fit_transform(X)
model.fit(X_, y)
# Report intercept, coefficients and the training-set score of the new fit.
print(model.intercept_, model.coef_, model.score(X_, y))
Ejemplo n.º 2
0
def linearRegression(dataframe_with_efficacy,
                     efficacy=True,
                     normalize=False,
                     include_position=False,
                     testsize=0.25):
    """Fit a multiple linear regression on a dataframe and report on it.

    Works on a copy of the input: drops bookkeeping columns, one-hot encodes
    the categorical columns, removes the rows flagged by ``sym_ctrl``, fits a
    ``LinearRegression`` on a shuffled training split, prints the
    coefficients (largest first) and a 4-fold cross-validated RMSE, and draws
    a predicted-vs-true scatter plot for the held-out split on a new figure.

    Args:
        dataframe_with_efficacy: DataFrame with at least the columns
            ``rep``, ``field_id``, ``row``, ``range``, ``sym``, ``value`` and
            ``efficacy_in_percent``. One-hot encoding must produce a
            ``sym_ctrl`` column. The input itself is not modified.
        efficacy: If true, the target is ``efficacy_in_percent`` and
            ``value`` is dropped; otherwise the target is ``value`` and
            ``efficacy_in_percent`` is dropped.
        normalize: Request the estimator's legacy ``normalize`` option. It is
            applied only when the installed estimator still exposes that
            parameter; otherwise it is ignored.
        include_position: If true, keep ``row`` and ``range`` as features;
            otherwise drop them.
        testsize: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        The ``LinearRegression`` instance fitted on the training split.
    """
    df = dataframe_with_efficacy.copy()

    df = df.drop(columns=['rep', 'field_id'])

    if not include_position:
        df = df.drop(columns=['row', 'range'])

    df['sym'] = df['sym'].astype('category')
    # get_dummies encodes every object/category column, not only 'sym'.
    df = pd.get_dummies(df)
    # Keep only non-control rows. The explicit bool cast makes the mask valid
    # whether the dummy column is stored as bool or as 0/1 integers.
    df = df.loc[~df['sym_ctrl'].astype(bool)]
    df = df.drop(columns='sym_ctrl')

    if efficacy:
        target = 'efficacy_in_percent'
        df = df.drop(columns=['value'])
    else:
        target = 'value'
        df = df.drop(columns=['efficacy_in_percent'])

    features = df.drop(columns=target).columns

    # create multiple linear regression object
    mlr = LinearRegression(fit_intercept=True)

    # Whether or not to normalize:
    # `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, where
    # assigning `mlr.normalize` is silently ignored. Apply it via set_params()
    # only when the installed estimator still supports the parameter.
    if normalize and 'normalize' in mlr.get_params():
        mlr.set_params(normalize=True)

    # Shuffle and split into train/test sets; `testsize` is the held-out
    # fraction (25% by default).
    x_train, x_test, y_train, y_test = train_test_split(df[features],
                                                        df[target],
                                                        test_size=testsize,
                                                        shuffle=True)

    # fit linear regression
    mlr.fit(x_train, y_train)

    print('features in order of decreasing value of coeficients:')
    print('feature: coefficient value; target: ', target)
    print('--------------------------')

    # Indices that order the coefficients from largest to smallest.
    descending = np.argsort(mlr.coef_)[::-1]
    for feature_name, coefficient in zip(features[descending],
                                         mlr.coef_[descending]):
        print(feature_name, ": ", coefficient)

    # Run the model on the test set, plot the comparison.
    y_prediction = mlr.predict(x_test)
    # Root-mean-square error: square the residuals first, then average, then
    # take the root. (Squaring the mean residual instead would only give the
    # absolute mean error.)
    rmse_model = np.sqrt(np.mean((y_prediction - y_test) ** 2))

    cv4 = cross_val_score(mlr,
                          df[features],
                          df[target],
                          cv=4,
                          scoring="neg_mean_squared_error")
    # The scores are negated MSE values; take magnitudes before the root.
    rmse_model_cross_fold_4 = np.sqrt(np.mean(np.abs(cv4)))

    print('')
    print('')
    print('--------------------------------------')
    print('rmse_for_cross_val_four_times: ',
          np.round(rmse_model_cross_fold_4, 2))
    print('--------------------------------------')
    print('')
    print('')
    print('')
    print('Comparing predicted vs truth value in dataset')
    print('---------------------------------------------------------')

    # Draw on a fresh figure; the figure handle itself is not needed.
    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, y_prediction)
    plt.title('predicted vs test: RMSE = %s [units]' %
              (np.round(rmse_model, 2)))
    plt.ylabel('predicted value for %s' % (target))
    plt.xlabel('test value for %s' % (target))

    return mlr