"""Linear, polynomial, ridge, and lasso regression on the Boston housing data.

Fits each model on a 70/30 train/test split and with 10-fold cross-validation,
delegating the fit/score/plot boilerplate to the project-local ``Functions``
and ``Plots`` helper modules.
"""
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

import Functions
import Plots

data = pd.read_csv('housing_data.csv')
feature_cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
X = data[feature_cols]
y = data.MEDV

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3)

# Plain linear regression on the train/test split, with fitted-vs-actual and
# fitted-vs-residual diagnostic plots.
lm = LinearRegression()
predicted = Functions.callClassifierFeatures(
    lm, X_train, y_train, X_test, y_test, feature_cols, 'Linear Regression')
Plots.scatterPlot(predicted, y_test, 'Fitted', 'Actual',
                  'Fitted VS Actual LR', 'green', 'HousingLRScatterPlot')
Plots.residualPlot(predicted, (predicted - y_test), 'Fitted', 'Residual',
                   'Fitted VS Residual LR', 'blue', 'HousingLRResidualPlot')

# Linear regression evaluated with 10-fold cross-validation over the full set.
predicted = Functions.callCrossVal(lm, X, y, 10, 'Linear Regression')
Plots.scatterPlot(predicted, y, 'Fitted', 'Actual',
                  'Fitted VS Actual LR-CV', 'green', 'HousingLRScatterPlotCV')
Plots.residualPlot(predicted, (predicted - y), 'Fitted', 'Residual',
                   'Fitted VS Residual LR-CV', 'blue', 'HousingLRResidualPlotCV')

# Polynomial regression up to degree 6, with and without cross-validation.
Functions.polynomialRegression(lm, X_train, y_train, X_test, y_test, 6,
                               'Linear Regression')
Functions.polynomialRegressionCV(lm, X, y, 10, 6, 'Linear Regression')

# Ridge regression: RidgeCV selects the best alpha from the candidate list.
ridge = linear_model.RidgeCV(alphas=[0.1, 0.01, 0.001])
Functions.callClassifierFeatures(ridge, X_train, y_train, X_test, y_test,
                                 feature_cols, 'Ridge')
print("The tuned alpha value selected for Ridge is: %.4f" % ridge.alpha_)
Functions.callCrossVal(ridge, X, y, 10, 'Ridge')

# Lasso regression: LassoCV selects the best alpha from the candidate list.
lasso = linear_model.LassoCV(alphas=[0.1, 0.01, 0.001])
for num in range(0,5): Plots.plotWorkFlow(one_hot_subset,num, 'actual') feature_cols = [col for col in one_hot_data.columns if col not in ['Size of Backup (GB)']] X = one_hot_data[feature_cols] y = one_hot_data['Size of Backup (GB)'] X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3, random_state=3) #Linear Regression model = LinearRegression() Functions.callClassifierFeatures(model, X_train, y_train, X_test, y_test, feature_cols, 'Linear Regression') pred = model.predict(X_test) Plots.scatterPlot(pred, y_test, 'Fitted','Actual','Fitted VS Actual','green','NetBkpLRFitvsActual') Plots.residualPlot(pred, pred - y_test, 'Fitted','Resuduals','Fitted VS Residual','green','NetBkpLRFitvsResidual') Functions.callCrossVal(model, X, y, 10, 'Linear Regression') #Random Forest model = RandomForestRegressor(n_estimators=20, max_depth=4, max_features='auto') Functions.callClassifier(model, X_train, y_train, X_test, y_test,'Random Forests Initial') model.fit(X_train, y_train) print('Random Forests Initial - RMSE: %.4f' % (np.sqrt(np.sum((model.predict(X_test) - y_test) ** 2)/y_test.size))) clf = RandomForestRegressor() param_dist = {"n_estimators":sp_randint(1, 100), "max_depth": sp_randint(1, 10), "max_features": sp_randint(1, 45),