def get_trained_coefficients():
    """Fit an intercept-free regression on the training data file.

    The training set is loaded with ``get_data(training_data_file)``; only
    the second and third items of its result (features and targets) are
    used. A ``lin`` estimator is built with ``fit_intercept=False`` and
    fitted on them.

    Returns:
        tuple: ``(model, coefficients)`` -- the fitted estimator and its
        ``coef_`` attribute (one coefficient per feature column,
        presumably in the order of the module's ``columns`` list).
    """
    _ignored, features, targets = get_data(training_data_file)

    # ``fit`` hands back the fitted estimator, so construction and
    # training can be chained into one expression.
    fitted = lin(fit_intercept=False).fit(features, targets)

    return fitted, fitted.coef_
# Name the columns of the scaled frame.
scaled_df.columns = [
    'id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'district', 'condition', 'yr_built',
    'yr_renovated', 'zipcode', 'price',
]

# Features: the scaled columns listed in cov.index (presumably the features
# that survived the earlier correlation filter -- confirm against where
# `cov` is built).
x = np.array(pd.DataFrame(scaled_df, columns=cov.index.tolist()))
# Target: the price column of the unscaled frame.
y = np.array(pd.DataFrame(df, columns=['price']))

# Fit the linear regression model.
# The `normalize` keyword was removed from scikit-learn's LinearRegression
# in 1.2 and raises TypeError there; it was passed as False (the old
# default), so leaving it out keeps the same behaviour on every version.
reg = lin(fit_intercept=True, n_jobs=None)
reg.fit(x, y)

# Report the mean cross-validation score (the estimator's default scorer).
# The fold count lives in one place so the average cannot drift from `cv`.
n_folds = 5
accuracy = cross_val_score(reg, x, y, cv=n_folds)
acc = round(sum(accuracy) / n_folds, 2)
print("Linear Regression test file accuracy:" + str(acc))

# Compare predicted prices with the real ones.
y_pred = reg.predict(x)
plt.scatter(y, y_pred)
# Reference line y = x: points on it are perfect predictions.
# NOTE(review): the line spans 0..5,000,000 while the axis labels say
# "$100,000" units -- confirm which unit is intended.
plt.plot([0, 5000000], [0, 5000000], color='black', lw=2, linestyle='solid')
plt.xlabel("real price($100,000)")
plt.ylabel("predicted price($100,000)")
plt.show()
plt.title("Cost V/S Iterations")
plt.show()

print("The optimum parameters for the given data set is: \n", new_params)

# --- Error metrics for the hand-written predictor -------------------------
print("\n\nError values: (Lower is better) : \nMY PREDICTOR:")
predictions = test_features.dot(new_params)

my_rmse = root_mean_sq_err(test_targets, predictions)
print("The root mean square error is: ", my_rmse)

my_r2 = r2_score(test_targets, predictions)
print("The r^2 score is: ", my_r2, "\n\n")

# --- Reference model: a regressor built with `lin`, fitted on the same data -
reg = lin()
reg.fit(train_features, train_targets)
skpred = reg.predict(test_features)

# --- Error metrics for the reference model --------------------------------
print("SKLEARN :")

sk_rmse = root_mean_sq_err(test_targets, skpred)
print("The root mean square error of sklearn is: ", sk_rmse)

sk_r2 = r2_score(test_targets, skpred)
print("The r^2 score of sklearn is: ", sk_r2, "\n\n")

# Error-rate argument handed to simpleAccuracy (presumably a tolerance --
# confirm against its definition).
errRate = 0.3
print(f"The accuracy for the model is {simpleAccuracy(test_targets,predictions,errRate)}")

# Printing first 10 predictions of my model and sklearn side by side to actual values...
#print("ACTUAL \t\t\t\t MY MODEL \t\t\t\t\t SKLEARN\n")
#for i in range(10):
# Join the columns of two_df (presumably the encoded city columns); any
# right-hand column whose label clashes gets the 'city_' suffix.
df_f = df_f.join(two_df, rsuffix='city_')

# Encode State with `le`, expand it with `one` (presumably a label encoder
# and a one-hot encoder -- confirm where they are created), then join the
# resulting columns.
df_f['State'] = le.fit_transform(df_f['State'])
three_df = pd.DataFrame(one.fit_transform(df_f[['State']]).toarray())
df_f = df_f.join(three_df, rsuffix='_state')

# Drop the raw categorical columns, then any row with missing values.
df_f = df_f.drop(['Raw Labor Classification', 'State', 'city'], axis=1)
df_f = df_f.dropna()

# First column is the target, the remaining columns are the features.
# The 0:1 slice keeps y two-dimensional, shape (n_samples, 1).
y = df_f.iloc[:, 0:1].values
x = df_f.iloc[:, 1:].values
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

# Linear Regression ( Very low Accuracy )
# The bare expressions below only display a value in an interactive
# session; when run as a script their results are discarded.
regressor = lin()
regressor.fit(x_train, y_train)
regressor.score(x_test, y_test)
cv_score = cross_val_score(regressor, x_train, y_train, cv=10)
cv_score.mean()
y_pred = regressor.predict(x_test)
forecast_accuracy(y_pred, y_test)

# Random Forest Regressor (good cv, good mape)
# scikit-learn's forest estimators expect a 1-D target and emit a
# DataConversionWarning for a column vector; their predictions are 1-D too.
# Flatten the targets so fit, scoring and forecast_accuracy all see
# (n_samples,) arrays rather than a (n,) vs (n, 1) pair that could
# broadcast to (n, n) inside forecast_accuracy (assumed to do elementwise
# arithmetic -- confirm against its definition).
y_train_flat = y_train.ravel()
y_test_flat = y_test.ravel()

reg = RR(n_estimators=100)
reg.fit(x_train, y_train_flat)
cv_score = cross_val_score(reg, x_train, y_train_flat, cv=10)
cv_score.mean()
y_pred = reg.predict(x_test)
reg.score(x_test, y_test_flat)
forecast_accuracy(y_pred, y_test_flat)