# --- Single-feature regression ---------------------------------------------
# Turn the feature and target arrays into single-column 2-D arrays before
# handing them to the estimator.
X = X.reshape(-1, 1)
y = y.reshape(-1, 1)

# Build the estimator and fit it on the training data.
model = LinearRegression()
model.fit(X, y)

# intercept_ is theta0; coef_ is the list [theta1, theta2, ...].
print(model.intercept_, model.coef_)

# Coefficient of determination (R^2) measured on the training data itself.
print(model.score(X, y))

# Predictions for x = 3.5 and x = 7.0, passed as a two-row column vector.
print(model.predict(np.array([[3.5], [7.0]])))

# --- Two-feature regression ------------------------------------------------
# Each comma-separated column of the file is unpacked into its own array.
fname = join(HOME, path, './multi-feature.txt')
X1, X2, y = np.loadtxt(fname, delimiter=',', unpack=True)

# Place the two feature arrays side by side as the columns of X.
X = np.column_stack((X1, X2))
y = y.reshape(-1, 1)

# The same estimator object is reused; the flag is set as a plain attribute
# and therefore stays set for the polynomial fit further down as well.
model.normalize = True
model.fit(X, y)
print(model.intercept_, model.coef_, model.score(X, y))

# Prediction for one sample with feature values (1650, 3).
print(model.predict(np.array([[1650, 3]])))

# --- Polynomial features ---------------------------------------------------
# Expand X to degree-2 polynomial terms (bias column included) and refit.
X_ = PolynomialFeatures(degree=2, include_bias=True).fit_transform(X)
model.fit(X_, y)
print(model.intercept_, model.coef_, model.score(X_, y))
def linearRegression(dataframe_with_efficacy, efficacy=True, normalize=False,
                     include_position=False, testsize=0.25):
    """Fit a multiple linear regression on a trial dataframe and report its fit.

    The input frame is copied, bookkeeping columns are dropped, categorical
    columns are one-hot encoded with ``pd.get_dummies``, control rows are
    removed, and a ``LinearRegression`` is fitted on a shuffled train split.
    The function prints the coefficients in decreasing order, prints a 4-fold
    cross-validated RMSE, and draws a predicted-vs-actual scatter plot for the
    held-out test split.

    Parameters
    ----------
    dataframe_with_efficacy : pandas.DataFrame
        Must contain the columns ``rep``, ``field_id``, ``row``, ``range``,
        ``sym``, ``value`` and ``efficacy_in_percent``. The ``sym`` column
        must contain the category ``ctrl`` so that a ``sym_ctrl`` dummy
        column is produced. The caller's frame is not modified.
    efficacy : bool, default True
        If True the target is ``efficacy_in_percent`` and ``value`` is
        dropped; otherwise the target is ``value`` and
        ``efficacy_in_percent`` is dropped.
    normalize : bool, default False
        Stored on the estimator as the attribute ``normalize``.
    include_position : bool, default False
        If False the ``row`` and ``range`` columns are dropped and therefore
        not used as features.
    testsize : float, default 0.25
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    LinearRegression
        The estimator fitted on the training split.

    Notes
    -----
    The train/test split is shuffled and no ``random_state`` is passed, so
    the printed coefficients and the plotted RMSE vary between calls.
    """
    df = dataframe_with_efficacy.copy()
    df = df.drop(columns=['rep', 'field_id'])
    if not include_position:
        df = df.drop(columns=['row', 'range'])

    # One-hot encode; 'sym' is forced to categorical so it always expands
    # into 'sym_<level>' columns.
    df['sym'] = df['sym'].astype('category')
    df = pd.get_dummies(df)

    # Keep only non-control rows, then drop the now-constant indicator.
    # astype(bool) makes the mask work whether get_dummies produced bool or
    # 0/1 integer columns.
    df = df.loc[~df['sym_ctrl'].astype(bool)]
    df = df.drop(columns='sym_ctrl')

    # Pick the target and discard the other outcome column so that it cannot
    # leak into the feature set.
    if efficacy:
        target, unused_outcome = 'efficacy_in_percent', 'value'
    else:
        target, unused_outcome = 'value', 'efficacy_in_percent'
    df = df.drop(columns=[unused_outcome])
    features = df.drop(columns=target).columns

    mlr = LinearRegression(fit_intercept=True)
    # Set as an attribute rather than a constructor argument.
    # NOTE(review): recent scikit-learn releases no longer have a `normalize`
    # parameter on LinearRegression, in which case this attribute is
    # presumably ignored by fit() -- confirm against the installed version.
    mlr.normalize = normalize

    # Hold out `testsize` of the rows for the comparison plot below.
    x_train, x_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=testsize, shuffle=True)
    mlr.fit(x_train, y_train)

    print('features in order of decreasing value of coeficients:')
    print('feature: coefficient value; target: ', target)
    print('--------------------------')
    descending = np.argsort(mlr.coef_)[::-1]
    for feature_name, coefficient in zip(features[descending],
                                         mlr.coef_[descending]):
        print(feature_name, ": ", coefficient)

    # Evaluate on the held-out split. The residuals are squared BEFORE
    # averaging; squaring the mean residual would only give |mean error|.
    y_prediction = mlr.predict(x_test)
    rmse_model = np.sqrt(np.mean((y_prediction - y_test) ** 2))

    # The scorer returns negated MSE values per fold; take magnitudes, average
    # across folds, then take the root.
    cv4 = cross_val_score(mlr, df[features], df[target], cv=4,
                          scoring="neg_mean_squared_error")
    rmse_model_cross_fold_4 = np.sqrt(np.mean(np.abs(cv4)))

    print('')
    print('')
    print('--------------------------------------')
    print('rmse_for_cross_val_four_times: ', np.round(rmse_model_cross_fold_4, 2))
    print('--------------------------------------')
    print('')
    print('')
    print('')
    print('Comparing predicted vs truth value in dataset')
    print('---------------------------------------------------------')

    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, y_prediction)
    plt.title('predicted vs test: RMSE = %s [units]' % (np.round(rmse_model, 2)))
    plt.ylabel('predicted value for %s' % (target))
    plt.xlabel('test value for %s' % (target))
    return mlr