def regression_score(true_data, predict_data):
    assert true_data.shape == predict_data.shape
    if len(true_data.shape) == 1 or true_data.shape[1] == 1:
        return explained_variance_score(true_data, predict_data)
    else:
        return np.mean([explained_variance_score(true_data[:, index],
                                                 predict_data[:, index])
                        for index in range(true_data.shape[1])])
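# Minimal usage sketch (added for illustration, not part of the original
# snippet): for 2-D targets the per-column averaging above should match
# sklearn's built-in multioutput='uniform_average' option, assuming numpy and
# explained_variance_score are imported at module level as the function expects.
import numpy as np
from sklearn.metrics import explained_variance_score

y_true = np.array([[1.0, 2.0], [2.5, -1.0], [4.5, 3.0], [5.0, 7.0]])
y_pred = np.array([[1.0, 1.0], [2.0, -1.0], [5.0, 4.0], [5.0, 6.5]])

manual = regression_score(y_true, y_pred)
builtin = explained_variance_score(y_true, y_pred, multioutput='uniform_average')
assert np.isclose(manual, builtin)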
def test_losses(): """Test loss functions""" y_true, y_pred, _ = make_prediction(binary=True) n_samples = y_true.shape[0] n_classes = np.size(unique_labels(y_true)) # Classification # -------------- with warnings.catch_warnings(True): # Throw deprecated warning assert_equal(zero_one(y_true, y_pred), 13) assert_almost_equal(zero_one(y_true, y_pred, normalize=True), 13 / float(n_samples), 2) assert_almost_equal(zero_one_loss(y_true, y_pred), 13 / float(n_samples), 2) assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 13) assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2) assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2) assert_almost_equal(hamming_loss(y_true, y_pred), 2 * 13. / (n_samples * n_classes), 2) assert_equal(accuracy_score(y_true, y_pred), 1 - zero_one_loss(y_true, y_pred)) assert_equal(accuracy_score(y_true, y_pred, normalize=False), n_samples - zero_one_loss(y_true, y_pred, normalize=False)) with warnings.catch_warnings(True): # Throw deprecated warning assert_equal(zero_one_score(y_true, y_pred), 1 - zero_one_loss(y_true, y_pred)) # Regression # ---------- assert_almost_equal(mean_squared_error(y_true, y_pred), 12.999 / n_samples, 2) assert_almost_equal(mean_squared_error(y_true, y_true), 0.00, 2) # mean_absolute_error and mean_squared_error are equal because # it is a binary problem. assert_almost_equal(mean_absolute_error(y_true, y_pred), 12.999 / n_samples, 2) assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2) assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2) assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0) assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2) assert_almost_equal(r2_score(y_true, y_true), 1.00, 2) assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0) assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
def runRegressor(clf, featureMat, targets, no_of_training_example):
    try:
        clf.fit(featureMat[:no_of_training_example, :],
                targets[:no_of_training_example])
        y_pred = clf.predict(featureMat[no_of_training_example:, :])
        print('Variance Score')
        print(explained_variance_score(targets[no_of_training_example:], y_pred))
        print('Mean absolute error')
        print(mean_absolute_error(targets[no_of_training_example:], y_pred))
        print('Explained variance score')
        print(explained_variance_score(targets[no_of_training_example:], y_pred))
    except Exception as e:
        print(e)
def test_regression_multioutput_array(): y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') evs = explained_variance_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) assert_array_almost_equal(r, [0.95, 0.93], decimal=2) assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) # mean_absolute_error and mean_squared_error are equal because # it is a binary problem. y_true = [[0, 0]]*4 y_pred = [[1, 1]]*4 mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [1., 1.], decimal=2) assert_array_almost_equal(mae, [1., 1.], decimal=2) assert_array_almost_equal(r, [0., 0.], decimal=2) r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values') assert_array_almost_equal(r, [0, -3.5], decimal=2) assert_equal(np.mean(r), r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='uniform_average')) evs = explained_variance_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values') assert_array_almost_equal(evs, [0, -1.25], decimal=2) # Checking for the condition in which both numerator and denominator is # zero. y_true = [[1, 3], [-1, 2]] y_pred = [[1, 4], [-1, 1]] r2 = r2_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(r2, [1., -3.], decimal=2) assert_equal(np.mean(r2), r2_score(y_true, y_pred, multioutput='uniform_average')) evs = explained_variance_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(evs, [1., -3.], decimal=2) assert_equal(np.mean(evs), explained_variance_score(y_true, y_pred)) # Handling msle separately as it does not accept negative inputs. y_true = np.array([[0.5, 1], [1, 2], [7, 6]]) y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]]) msle = mean_squared_log_error(y_true, y_pred, multioutput='raw_values') msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred), multioutput='raw_values') assert_array_almost_equal(msle, msle2, decimal=2)
def EXP_VAR(Y, y, multioutput='uniform_average', Y_full=None, flux_arr=None,
            source_model=None, ss=None, source_model_args=None, method=None):
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method,
                                                   source_model_args)
        try:
            return explained_variance_score(flux_arr[inds], back_trans_flux,
                                            multioutput=multioutput)
        except:
            return float(np.mean(np.var(flux_arr[inds] - back_trans_flux, axis=1) /
                                 np.var(flux_arr[inds], axis=1)))
    else:
        try:
            return explained_variance_score(Y, y, multioutput=multioutput)
        except:
            return float(np.mean(np.var(Y - y, axis=1) / np.var(Y, axis=1)))
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # symmetric
    assert_equal(zero_one(y_true, y_pred), zero_one(y_pred, y_true))
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        mean_squared_error(y_pred, y_true))

    # not symmetric
    assert_true(explained_variance_score(y_true, y_pred) !=
                explained_variance_score(y_pred, y_true))
    assert_true(r2_score(y_true, y_pred) != r2_score(y_pred, y_true))
def svm(X,Y,k): if k > 0: y = Y totalRSS = 0 totalR_sq = 0 totalev = 0 kf = KFold(len(X), n_folds=k) for train_index, test_index in kf: x_train, y_train = X[train_index], y[train_index] x_test,y_test = X[test_index], y[test_index] clf = SVR(kernel='rbf',C=1e5,degree=5) clf.fit(x_train,y_train) pred = clf.predict(x_test) #print("Residual sum of squares: %.5f" #% np.mean((clf.predict(x_test) - y_test) ** 2)) pred = clf.predict(x_test) totalRSS += np.mean((clf.predict(x_test) - y_test) ** 2) totalR_sq += r2_score(y_test, pred) totalev += explained_variance_score(y_test,pred) print("Residual sum of squares: {}".format(float(totalRSS)/float(k))) print('R^2 score: {}'.format(float(totalR_sq)/float(k))) print('explained variance score: {}'.format(float(totalev)/float(k))) else: X = preprocessing.scale(X) cutoff = int(len(X)*.7) x_train,y_train = X[:cutoff], Y[:cutoff] x_test, y_test = X[(cutoff+1):],Y[(cutoff+1):] clf = SVR(kernel='rbf',C=1e5,degree=5) clf.fit(x_train,y_train) pred = clf.predict(x_test) print("Residual sum of squares: %.5f" % np.mean((clf.predict(x_test) - y_test) ** 2)) pred = clf.predict(x_test) # Explained variance score: 1 is perfect prediction print('R^2 score: %.8f' % r2_score(y_test, pred)) print('explained variance score: %.8f' %explained_variance_score(y_test,pred)) return
def main(args): if len(sys.argv) < 2: print("USAGE: python linear_regression.py [feature matrix] [values]") exit(0) X = np.genfromtxt(args[0], delimiter=',') Y = np.genfromtxt(args[1], delimiter=',') X = util.process_X(X) # X = util.item_item_collab_filtering(X, 100, -1) if('dap' in args[0]): X = util.fill_mean2(X) else: X = util.fill_mean(X,-1) print X X = util.variance_threshold(X, 1) kfolds = False if len(args) >= 3: kfolds = True if kfolds: kf = KFold(len(X), n_folds=int(args[2])) for train_index, test_index in kf: x_train, y_train = X[train_index], Y[train_index] x_test,y_test = X[test_index], Y[test_index] regr = linear_model.LinearRegression() regr.fit(x_train,y_train) print("Residual sum of squares: %.5f" % np.mean((regr.predict(x_test) - y_test) ** 2)) pred = regr.predict(x_test) # Explained variance score: 1 is perfect prediction print('Variance score: %.8f' % regr.score(x_test, y_test)) print('R^2 score: %.8f' % r2_score(y_test, pred)) print('explained variance score: %.8f' % explained_variance_score(y_test,pred)) print '\n' else: cutoff = int(len(X)*.7) x_train, y_train = X[:cutoff], Y[:cutoff] x_test,y_test = X[(cutoff+1):], Y[(cutoff+1):] regr = linear_model.LinearRegression() regr.fit(x_train,y_train) print("Residual sum of squares: %.5f" % np.mean((regr.predict(x_test) - y_test) ** 2)) pred = regr.predict(x_test) # Explained variance score: 1 is perfect prediction print('Variance score: %.8f' % regr.score(x_test, y_test)) print('R^2 score: %.8f' % r2_score(y_test, pred)) print('explained variance score: %.8f' %explained_variance_score(y_test,pred))
def plotResults(predicted, expected, output):
    """
    Generate a simple plot demonstrating the results.
    """
    var = metrics.explained_variance_score(expected, predicted)
    mae = metrics.mean_absolute_error(expected, predicted)
    mse = metrics.mean_squared_error(expected, predicted)
    r2 = metrics.r2_score(expected, predicted)
    rms = np.sqrt(np.mean((expected - predicted) ** 2))

    print(output)
    print('Explained variance (best possible score is 1.0, lower values are worse):', var)
    print('Mean Absolute Error (best is 0.0):', mae)
    print('Mean Squared Error (best is 0.0):', mse)
    print('R2 score (best is 1.0):', r2)
    print('RMS:', rms)
    print('\n\n\n')

    title = 'RMS=%.4f, MSE=%.4f, R2=%.3f' % (rms, mse, r2)
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    plt.title(title)
    ax1.scatter(expected, predicted, alpha=0.2, s=5)
    ax1.set_xlabel("Spectroscopic Redshift")
    ax1.set_ylabel("Photo-z")
    ax1.plot([0, 8], [0, 8], '-r')
    ax1.set_xlim(0, 1.1 * expected.max())
    ax1.set_ylim(0, 1.1 * expected.max())
    plt.savefig(output + 'Results.pdf')
    plt.close()
def performance_metric(label, prediction): """Calculate and return the appropriate error performance metric.""" ################################### ### Step 3. YOUR CODE GOES HERE ### ################################### # The following page has a table of scoring functions in sklearn: # http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics # In order to study all of the different performance metrics, I will simply # calculate them all and return a dictionary with all of the results l, p = label, prediction output = collections.OrderedDict() output["explained variance score"] = skmetrics.explained_variance_score( l, p) output["mean absolute error"] = skmetrics.mean_absolute_error(l, p) output["mean squared error"] = skmetrics.mean_squared_error(l, p) output["root mean squared error"] = np.sqrt( skmetrics.mean_squared_error(l, p)) output["median absolute error"] = skmetrics.median_absolute_error(l, p) output["r2 score"] = skmetrics.r2_score(l, p) return output
def across_all_appliances(scores, mains, aggregate_predictions): total_sum_abs_diff = 0.0 for appliance_scores in scores.values(): total_sum_abs_diff += appliance_scores['sum_abs_diff'] # Total energy correctly assigned # See Eq(1) on p5 of Kolter & Johnson 2011 denominator = 2 * np.sum(mains) total_energy_correctly_assigned = 1 - (total_sum_abs_diff / denominator) total_energy_correctly_assigned = float(total_energy_correctly_assigned) # explained variance n = min(len(mains), len(aggregate_predictions)) mains = mains[:n] aggregate_predictions = aggregate_predictions[:n] scores['across all appliances'] = { 'total_energy_correctly_assigned': total_energy_correctly_assigned, 'explained_variance_score': float( metrics.explained_variance_score(mains, aggregate_predictions)), 'mean_absolute_error': float( np.mean( [scores[app]['mean_absolute_error'] for app in scores])), 'relative_error_in_total_energy': float( np.mean( [scores[app]['relative_error_in_total_energy'] for app in scores])), } scores['across all appliances'].update({ metric: float(np.mean([scores[app][metric] for app in scores])) for metric in METRICS['classification'] }) return scores
def actvspred(modelname, predmodel): """ plot the predicted vs. the actual score """ predscores, actualscores, meanerr, rmsqerr = predmodel axmax = int(round(np.max([predscores,actualscores]))) axmin = int(round(np.min([predscores,actualscores]))) # fit line through the scores actualscores2 = actualscores.reshape(subject_num,1) model = lm.LinearRegression() model.fit(actualscores2, predscores) # get explained variance rsqrd = skm.explained_variance_score(actualscores, predscores) x = np.array(range(axmin-5, axmax+6)) y = model.coef_[0]*x+model.intercept_ # plot scatterplot and lines plt.figure() plt.scatter(actualscores,predscores,s=70) plt.plot(x,x,'g',label='optimal model') plt.plot(x,y,'k',label='our model',linewidth=2) plt.xlabel("actual lsas delta") plt.ylabel("predicted lsas delta") plt.title(modelname) plt.axis([axmin-5,axmax+5,axmin-5,axmax+5]) axes = plt.axes() axes.grid(b=True) axes.text(0.05,0.8,"meanerr: %.2f\nrmse: %.2f\nexpl. var: %.2f"%(meanerr,rmsqerr,rsqrd),transform=axes.transAxes) #plt.legend() plt.savefig(os.path.join(outdir,"%s_crossval.png"%modelname),dpi=100,format="png")
def plot_expl_var(y_true, y_pred, vari, lev, label=None):
    expl_var = metrics.explained_variance_score(y_true, y_pred,
                                                multioutput='raw_values')
    plt.plot(unpack(expl_var, vari, axis=0), lev, label=label)
    plt.ylim([np.amax(lev), np.amin(lev)])
    plt.ylabel('$\sigma$')
    plt.title('Explained Variance Regression Score')
def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    #print("Accuracy on training set:")
    #print(clf.score(X_train, y_train))
    #print("Accuracy on testing set:")
    #print(clf.score(X_test, y_test))
    y_predicted = clf.predict(X_test)
    vysledek = (mean_absolute_error(y_test, y_predicted),
                mean_squared_error(y_test, y_predicted),
                r2_score(y_test, y_predicted),
                explained_variance_score(y_test, y_predicted))
    #print("mean_absolute_error:")
    #print(vysledek[0])
    #print("mean_squared_error:")
    #print(vysledek[1])
    #print("r2_score:")
    #print(vysledek[2])
    #print("explained_variance_score:")
    #print(vysledek[3])
    return vysledek
def linreg(y, X):
    # Split the data into training/testing sets
    X_train = X[:-2000]
    X_test = X[-2000:]

    # Split the targets into training/testing sets
    y_train = y[:-2000]
    y_test = y[-2000:]

    # Create linear regression object
    regr = linear_model.LinearRegression(normalize=True)

    # Train the model using the training sets
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    # The intercept
    #print('Intercept: \n', regr.intercept_)
    # The coefficients
    #print('Coefficients: \n', regr.coef_)

    # The mean square error
    print("Residual sum of squares:")
    print(((y_pred - y_test) ** 2).sum())
    #print((((y_test - y_test.mean()) ** 2).sum())/(len(y_test)-1))
    print("Variance:")
    print(y_test.var())

    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.6f' % explained_variance_score(
        y_test, y_pred, multioutput='variance_weighted'))
    return regr
def score(): methods = ['cro_cnn', 'cro_knn', 'cro_svm', 'mon_ann', 'mon_knn', 'mon_svm', 'day_ann', 'day_knn', 'day_svm'] result_tmp1 = np.empty(0) result_tmp2 = np.empty(0) for fx in FX_LIST: data = pd.read_pickle('%s/summary_%s.pkl' % (PREX, fx)) for method in methods: score1 = metrics.mean_squared_error(data['real'], data[method]) result_tmp1 = np.append(result_tmp1, score1) score2 = metrics.explained_variance_score( data['real'], data[method]) result_tmp2 = np.append(result_tmp2, score2) result1 = pd.DataFrame(result_tmp1.reshape(-1, len(methods)), index=FX_LIST, columns=methods) result2 = pd.DataFrame(result_tmp2.reshape(-1, len(methods)), index=FX_LIST, columns=methods) result1.to_pickle('%s/summary_mse.pkl' % PREX) result2.to_pickle('%s/summary_evs.pkl' % PREX) return result1, result2
def print_reg_metrics(y_test, y_pred):
    print('%s %s' % ('metric'.center(20), 'value'.center(12)))
    print('-------------------- ------------')
    print('explained variance: %12.3f' % metrics.explained_variance_score(y_test, y_pred))
    print('mean absolute error: %12.3f' % metrics.mean_absolute_error(y_test, y_pred))
    print('mean squared error: %12.3f' % metrics.mean_squared_error(y_test, y_pred))
    print('R-squared score: %12.3f' % metrics.r2_score(y_test, y_pred))
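# Hypothetical usage sketch (not in the original snippet), assuming
# `from sklearn import metrics` is in scope. The toy values are the ones used
# in the scikit-learn docs, for which explained variance is roughly 0.957.
y_test_demo = [3.0, -0.5, 2.0, 7.0]
y_pred_demo = [2.5, 0.0, 2.0, 8.0]
print_reg_metrics(y_test_demo, y_pred_demo)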
def estimate_performance(file, features_to_discard): methods = [#("Lasso Regression", linear_model.Lasso(alpha = 0.05)), #("Gaussian Processes", gaussian_process.GaussianProcess(theta0=1e-2, corr='absolute_exponential')), ("SVR", svm.SVR(kernel="linear", C=1e3, degree=4))] [_, y_benchmark, num_benchmarks] = preprocess(file, normalize_data, BENCHMARK_LABEL_FIELD, features_to_discard) [X_throughput_combined, y_throughput_combined, num_throughputs] = preprocess(file, normalize_data, THROUGHPUT_LABEL_FIELD, features_to_discard) for name, instance in methods: print("===========================================================================") print("Using method %s" % name) print("===========================================================================") for benchmark_number in range(num_benchmarks): print("-----------------------") print("Estimating for benchmark %s" % benchmark_list[benchmark_number][0]) print("-----------------------") sample_filter = y_benchmark == benchmark_number X_throughput = X_throughput_combined[sample_filter, :] y_throughput = y_throughput_combined[sample_filter] print("Found %d samples, doing two-way CV" % X_throughput.shape[0]) [X_train, y_train, X_test, y_test] = split_data(X_throughput, y_throughput, 2) instance.fit(X_train, y_train) y_pred = instance.predict(X_test) np.set_printoptions(suppress=True) print(y_test[:20]) print(y_pred[:20]) print("Estimator got R2 Score %f" % r2_score(y_test, y_pred)) print("Estimator got Explained Variance %f" % explained_variance_score(y_test, y_pred))
def cross_validate_predictor(data, features, clf_options, output_filename=None): print(clf_options) data_x = data[features].values data_y = data['ddg_exp'].values cv = cross_validation.LeaveOneLabelOut(data['label'].values) clf = ensemble.GradientBoostingRegressor(**clf_options) y_pred_all = [] y_true_all = [] for train, test in cv: x_train = data_x[train] y_train = data_y[train] x_test = data_x[test] y_test = data_y[test] clf.fit(x_train, y_train) probas_ = clf.predict(x_test) y_pred_all.extend(probas_) y_true_all.extend(y_test) results = clf_options.copy() results['n_features'] = len(features) results['features'] = ','.join(features) results['explained_variance_score'] = metrics.explained_variance_score(y_true_all, y_pred_all) results['mean_absolute_error'] = metrics.mean_absolute_error(y_true_all, y_pred_all) results['mean_squared_error'] = metrics.mean_squared_error(y_true_all, y_pred_all) results['r2_score'] = metrics.r2_score(y_true_all, y_pred_all) if output_filename is not None: write_row_to_file(results, output_filename) return results, y_true_all, y_pred_all
def cli(dataset_path, out_file):
    """Train a new model.

    This will train a new model using the provided dataset, trained model
    will be dumped to OUT file.
    """
    data = pd.read_csv(dataset_path)
    data = data.dropna(axis=0)  # Just drop empty values
    X = data[FEATURES]
    y = data['Price']
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=1
    )
    model = HousePricePredictor()
    model.fit(train_X, train_y)
    model.dump(out_file)

    predictions = model.predict(test_X)
    # sklearn's regression metrics expect (y_true, y_pred) in that order;
    # r2_score and explained_variance_score are not symmetric in their arguments.
    print("Mean Absolute Error : " + str(
        mean_absolute_error(test_y, predictions)))
    print("Explained Variance Score :" + str(
        explained_variance_score(test_y, predictions)))
    print("R2 Score :" + str(r2_score(test_y, predictions)))
def print_evaluations(Y_true, Y_pred, classification=True): if classification: report = classification_report(Y_true, Y_pred) logging.info('Classification report:\n%s' % str(report)) cm = confusion_matrix(Y_true, Y_pred) logging.info('Confusion Matrix:\n%s' % str(cm)) # fig = plt.figure() # ax = fig.add_subplot(111) # cax = ax.matshow(cm) # fig.colorbar(cax) # # ax.set_xticklabels(['']+['-1', '0', '1']) # ax.set_yticklabels(['']+['-1', '0', '1']) # # plt.title('Confusion Matrix') # plt.ylabel('True label') # plt.xlabel('Predicted label') # plt.show(block=False) else: var = explained_variance_score(Y_true, Y_pred) logging.info('Explained variance (best=1.0): %f' % var) mae = mean_absolute_error(Y_true, Y_pred) logging.info('Mean absolute error (best=0.0): %f' % mae) mse = mean_squared_error(Y_true, Y_pred) logging.info('Mean squared error (best=0.0): %f' % mse) r2 = r2_score(Y_true, Y_pred) logging.info('R squared score (best=1.0): %f' % r2)
def regression_metrics( csv_test, csv_result, last_or_first ): real_results = [] predicted_results = [] with open(csv_test, 'rb') as csv_test_file: csv_test_reader = csv.reader(csv_test_file, delimiter=',', quotechar='"') for row in csv_test_reader: if last_or_first == 'first_field': real_results.append(float(row.pop(0))) else: real_results.append(float(row.pop())) with open(csv_result, 'rb') as csv_result_file: csv_result_reader = csv.reader(csv_result_file, delimiter=',', quotechar='"') for row in csv_result_reader: if last_or_first == 'first_field': predicted_results.append(float(row.pop(0))) else: predicted_results.append(float(row.pop())) labels = list(set(real_results)) print('Explained variance score: %f' % explained_variance_score(real_results, predicted_results)) print('Mean squared error: %f' % mean_squared_error(real_results, predicted_results)) print('Mean absolute error: %f' % mean_absolute_error(real_results, predicted_results))
def display(reg, reg_name): reg=reg.fit(x_train,y_train) y_pred=reg.predict(x_test) r2 = reg.score(x_test,y_test) lst_reg.append(reg_name) rms = sqrt(mean_squared_error(y_test, y_pred)) #print("The Root mean square error for the Regressor is: "+str(rms)) rms = round(rms,2) lst_rms.append(str(rms)) r2 = r2_score(y_test,y_pred) #print("r squared value: "+str(r2)) r2 = round(r2,2) lst_r2.append(r2) var_score = explained_variance_score(y_test,y_pred) #print("Variance Score: "+str(var_score)) var_score = round(var_score,2) lst_vs.append(var_score) mean_abs_error=mean_absolute_error(y_test,y_pred) #print("Mean Absolute Error: "+str(mean_abs_error)) mean_abs_error = round(mean_abs_error,2) lst_mae.append(mean_abs_error) #print(reg.coef_,reg.intercept_) dic['Regressor'] = lst_reg dic['RMSE'] = lst_rms dic['R Square'] = lst_r2 dic['Var Score'] = lst_vs dic['Mean Abs Err'] = lst_mae
def pearso(name, X1, X2):
    print(X1.shape, X2.shape)
    print(type(X1))
    print("Pearson correlation %s %f" % (name, pearsonr(X1, X2)[0]))
    print("Correct samples %s %f" % (name, 1 - ((X1 - X2) != 0).sum() / float(len(X1))))
    print("RMSE %f" % sqrt(sum((X1 - X2) ** 2) / len(X1)))
    print("Explained Variance Score %f" % metrics.explained_variance_score(X1, X2))
def exp_var(
    rating_true,
    rating_pred,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
):
    """Calculate explained variance.

    Args:
        rating_true (pd.DataFrame): True data. There should be no duplicate (userID, itemID) pairs
        rating_pred (pd.DataFrame): Predicted data. There should be no duplicate (userID, itemID) pairs
        col_user (str): column name for user
        col_item (str): column name for item
        col_rating (str): column name for rating
        col_prediction (str): column name for prediction

    Returns:
        float: Explained variance (best possible score is 1.0, lower values are worse).
    """
    y_true, y_pred = merge_rating_true_pred(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
    )
    return explained_variance_score(y_true, y_pred)
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n = y_true.shape[0]

    assert_equal(zero_one(y_true, y_pred), 13)

    assert_almost_equal(mean_squared_error(y_true, y_pred), 12.999 / n, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
def displayResults(clf, title):
    print("\n\n=== Result of", title, "===")
    y_pred_raw = clf.predict(X_test)
    y_pred = scaler.inverse_transform(y_pred_raw[:20])
    y_true_raw = y_test
    y_true = scaler.inverse_transform(y_true_raw[:20])

    print("\npredicted result, true result")
    for i in range(len(y_true)):
        print(y_pred[i], "\t", y_true[i])

    print("\nr2_score:")
    print(r2_score(y_true_raw, y_pred_raw))
    print("\nexplained_variance_score:")
    print(explained_variance_score(y_true_raw, y_pred_raw))
    print("\nmean_squared_error:")
    print(mean_squared_error(y_true_raw, y_pred_raw))
def test_regression_metrics(n_samples=50):
    y_true = np.arange(n_samples)
    y_pred = y_true + 1

    assert_almost_equal(mean_squared_error(y_true, y_pred), 1.)
    assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.)
    assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2)
    assert_almost_equal(explained_variance_score(y_true, y_pred), 1.)
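# Side note (added, not from the original test): the last two assertions show
# the key difference between the two scores. A constant offset (y_pred =
# y_true + 1) leaves the errors with zero variance, so explained_variance_score
# is exactly 1.0, while r2_score still penalises the bias. A standalone check:
import numpy as np
from sklearn.metrics import explained_variance_score, r2_score

y_true = np.arange(50)
y_pred = y_true + 1
print(explained_variance_score(y_true, y_pred))  # 1.0 -- constant bias is ignored
print(r2_score(y_true, y_pred))                  # ~0.995 -- bias is penalised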
def printMetrics(estimator, X_train, y_train, y_test, y_pred):
    scores = cross_validation.cross_val_score(estimator, X_train, y_train, cv=5)
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
    print("EVS: %.4f" % explained_variance_score(y_test, y_pred))
    print("MAE: %.4f" % mean_absolute_error(y_test, y_pred))
    print("MSE: %.4f" % mean_squared_error(y_test, y_pred))
    print("R2: %.4f" % r2_score(y_test, y_pred))
def score(self, X, y):
    if self.model_type == 'classification':
        yhat = self.predict(X)
        return np.mean(yhat == y)
    elif self.model_type == 'regression':
        yhat = self.predict(X)
        return metrics.explained_variance_score(y, yhat)
    else:
        raise RuntimeError('unknown model type')
def evaluate(y_actual, y_predicted):
    explained_variance = explained_variance_score(y_actual, y_predicted)
    pearson = pearsonr(y_actual, y_predicted)
    rms = sqrt(mean_squared_error(y_actual, y_predicted))
    return (explained_variance, pearson[0], rms)
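# Minimal usage sketch (added for illustration; not in the original snippet),
# assuming the imports the function relies on:
#   from math import sqrt
#   from scipy.stats import pearsonr
#   from sklearn.metrics import explained_variance_score, mean_squared_error
evs, rho, rmse = evaluate([3.0, -0.5, 2.0, 7.0], [2.5, 0.0, 2.0, 8.0])
print('explained variance: %.3f, pearson r: %.3f, rmse: %.3f' % (evs, rho, rmse))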
for train_index, test_index in split(new_data, n_splits=3): # print("TRAIN:", train_index, "TEST:", test_index) new_data_train, new_data_test = new_data.ix[train_index], new_data.ix[ test_index] target_train, target_test = target.ix[train_index], target.ix[test_index] # print(list(map(tuple, np.where(np.isnan(new_data_train))))) # print(new_data_train.ix[[605]]) ###any nan or infinite # print(np.any(np.isnan(new_data_train)),np.all(np.isfinite(new_data_test)), ### estimator.fit(new_data_train, target_train) # print(new_data_test) target_pred = estimator.predict(new_data_test.values) print("r2 score:", r2_score(target_test, target_pred), 'explained variance score:', explained_variance_score(target_test, target_pred), 'mean_squared_error', mean_squared_error(target_test, target_pred), 'mean_absolute_error', mean_absolute_error(target_test, target_pred), 'median_absolute_error', median_absolute_error(target_test, target_pred)) # print(estimator.best_params_, estimator.best_estimator_) # print(estimator.alpha_) # print(estimator.best_estimator_.coef_, estimator.best_estimator_.residues_, estimator.best_estimator_.intercept_) ###samples to see the result of prediction # print(good_data.ix[1624,'y'],estimator.predict(good_data.ix[1624,:].drop('y').values)) # print(good_data.ix[14,'y'],estimator.predict(good_data.ix[14,:].drop('y').values)) # print(good_data.ix[164,'y'],estimator.predict(good_data.ix[164,:].drop('y').values)) # print(good_data.ix[333,'y'],estimator.predict(good_data.ix[333,:].drop('y').values)) # print(good_data.ix[1000,'y'],estimator.predict(good_data.ix[1000,:].drop('y').values))
print('ELASTICNET REGRESSION') print(df1) clf = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,learning_rate = 0.1, loss = 'ls') clf.fit(X_train,y_train) y_pred=clf.predict(X_test) df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}) df1 = df.head(10) print("****************************************************************************") print('GRADIENTBOOST REGRESSION') print(df1) x=[regressor.score(X_train,y_train)*100,rr.score(X_train,y_train)*100,model_lasso.score(X_train,y_train)*100,model_enet.score(X_train,y_train)*100,clf.score(X_train,y_train)*100] y=[regressor.score(X_test, y_test)*100,rr.score(X_test, y_test)*100,model_lasso.score(X_test, y_test)*100,model_enet.score(X_test, y_test)*100,clf.score(X_test, y_test)*100] z=[metrics.mean_absolute_error(y_test, y_predd),metrics.mean_absolute_error(y_test,pred_test_rr),metrics.mean_absolute_error(y_test,pred_test_lasso),metrics.mean_absolute_error(y_test,pred_test_enet),metrics.mean_absolute_error(y_test, y_pred)] h=[metrics.mean_squared_error(y_test, y_predd),metrics.mean_squared_error(y_test,pred_test_rr),metrics.mean_squared_error(y_test,pred_test_lasso),metrics.mean_squared_error(y_test,pred_test_enet),metrics.mean_squared_error(y_test, y_pred)] g=[np.sqrt(metrics.mean_squared_error(y_test, y_predd)),np.sqrt(metrics.mean_squared_error(y_test,pred_test_rr)), np.sqrt(metrics.mean_squared_error(y_test,pred_test_lasso)),np.sqrt(metrics.mean_squared_error(y_test,pred_test_enet)),np.sqrt(metrics.mean_squared_error(y_test, y_pred))] v=[metrics.explained_variance_score(y_test,y_predd),metrics.explained_variance_score(y_test,pred_test_rr),metrics.explained_variance_score(y_test,pred_test_lasso),metrics.explained_variance_score(y_test,pred_test_enet),metrics.explained_variance_score(y_test, y_pred)] print("****************************************************************************") data = pd.DataFrame(np.column_stack([x,y,z,h,g,v]),columns=['Train Score','Test Score','Mean Absolute Error','Mean Squared Error','Root Mean Squared Error','Variance'],index= ['Linear Regression Model:','Ridge Regression Model:','Lasso Regression Model:','ElasticNet Regression Model:','GradientBoosting Regression Model:']) print(data.to_string()) print("****************************************************************************") fig = plt.figure(figsize=(10,5)) fig.add_subplot(3,2,1) plt.scatter(y_test,y_predd) plt.title(" MULTIPLE LINEAR REGRESSION ") plt.ylabel('predicted value') plt.xlabel('Actual price') fig.add_subplot(3,2,2) plt.scatter(y_test,pred_test_rr,color='purple') plt.title("RIDGE REGRESSION ") plt.ylabel('predicted value') plt.xlabel('Actual price');
x, y = shuffle(housing_data.data, housing_data.target, random_state=7) num_training = int(len(x) * 0.8) x_train, y_train = x[:num_training], y[:num_training] x_test, y_test = x[num_training:], y[num_training:] dt_regressor = DecisionTreeRegressor(max_depth=4) dt_regressor.fit(x_train, y_train) ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=400, random_state=7) ab_regressor.fit(x_train, y_train) y_test_pred_dt = dt_regressor.predict(x_test) y_test_pred_ab = ab_regressor.predict(x_test) ab_regressor_mse = round(mean_squared_error(y_test, y_test_pred_ab), 2) ab_regressor_evs = round(explained_variance_score(y_test, y_test_pred_ab), 2) dt_regressor_mse = round(mean_squared_error(y_test, y_test_pred_dt), 2) dt_regressor_evs = round(explained_variance_score(y_test, y_test_pred_dt), 2) print( "\nab_regressor mean_squared_error ={0:.2f}; explained_variance_score={1:.2f}" .format(ab_regressor_mse, ab_regressor_evs)) print( "\ndt_regressor mean_squared_error={0:.2f}; explained_variance_score={1:.2f}" .format(dt_regressor_mse, dt_regressor_evs)) plot_feature_importance(dt_regressor.feature_importances_, 'dt_regressor', housing_data.feature_names) plot_feature_importance(ab_regressor.feature_importances_, 'ab_regressor', housing_data.feature_names)
def test_losses_at_limits():
    # test limit cases
    assert_almost_equal(mean_squared_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2)
    assert_almost_equal(r2_score([0., 1], [0., 1]), 1.00, 2)
def evaluate_regression(y_true, predictions, validation_loss, epoch_counter_train, roc_df): # y_true = np.ravel(np.reshape(y_true, (-1,1))) # predictions = np.ravel(np.reshape(predictions, (-1,1))) y_true = np.nan_to_num(y_true) predictions = np.nan_to_num(predictions) mse = mean_squared_error(y_true, predictions) r2 = r2_score(y_true, predictions) mae = mean_absolute_error(y_true, predictions) error_var = np.var(np.abs(y_true - predictions)) explained_var = explained_variance_score(y_true, predictions) if r2 < 0.: r2 = 0. if explained_var < 0.: explained_var = 0. roc_df.append({ 'epoch': epoch_counter_train, # 'train_loss': np.round(last_train_epoch_loss, 5), 'val_loss': np.round(validation_loss, 5), 'mse': np.round(mse, 2), 'r2': np.round(r2, 2), 'mae': np.round(mae, 2), 'error_var': np.round(error_var, 2), 'explained_var': np.round(explained_var, 2), }) print(pd.DataFrame(roc_df)) pd.DataFrame(roc_df).to_csv(args['OUTPATH'] + 'result_df.csv') summary_writer.add_scalar('performance/mse', mse, epoch) summary_writer.add_scalar('performance/r2', r2, epoch) summary_writer.add_scalar('performance/mae', mae, epoch) summary_writer.add_scalar('performance/error_var', error_var, epoch) summary_writer.add_scalar('performance/explained_var', explained_var, epoch) plt.figure(figsize=(12,12)) plt.title('epoch ' + str(epoch_counter_train) + ' | mae ' + str(np.round(mae, 2)) + ' | r2 ' + str(np.round(r2, 2))) plt.scatter(y_true, predictions, c='darkgreen', s=16, alpha=.4) plt.xscale('log') plt.yscale('log') if args['target_label'] == 'length_of_icu': plt.xlim(1., 1000.) plt.ylim(1., 1000.) if args['target_label'] == 'length_of_stay': plt.xlim(1., 2000.) plt.ylim(1., 2000.) plt.grid(which='both') plt.xlabel('Labels [hours spent in ICU]') plt.ylabel('Predictions [hours spent in ICU]') plt.savefig(args['OUTPATH'] + args['target_label'] + '/predictions/' + 'epoch_' + str(epoch_counter_train) + '.pdf') plt.close() performance_x_vec = np.linspace(0, epoch_counter_train, len(pd.DataFrame(roc_df))) plt.figure() plt.plot(performance_x_vec, pd.DataFrame(roc_df)['mse'], c='darkgreen', label='mse', linewidth=4, alpha=.6) plt.yscale('log') plt.xlabel('epochs') plt.ylabel('MSE Loss') plt.title('Mean Squared Error') plt.ylim(1e2,1e5) plt.grid(which='both') plt.legend() plt.savefig(args['OUTPATH'] + args['target_label'] + 'mse.pdf') plt.close() plt.figure() plt.plot(performance_x_vec, pd.DataFrame(roc_df)['r2'], c='darkgreen', label='r2', linewidth=4, alpha=.6) plt.xlabel('epochs') plt.ylabel('R Squared') plt.title('R Squared') plt.grid() plt.legend() plt.savefig(args['OUTPATH'] + args['target_label'] + 'r2.pdf') plt.close() plt.figure() plt.plot(performance_x_vec, pd.DataFrame(roc_df)['mae'], c='darkgreen', label='mae', linewidth=4, alpha=.6) plt.yscale('log') plt.xlabel('epochs') plt.ylabel('Mean Absolute Error [hours spent in ICU]') plt.title('Mean Absolute Error') plt.ylim(10.,100.) plt.yscale('log') plt.grid(which='both') plt.legend() plt.savefig(args['OUTPATH'] + args['target_label'] + 'mae_epoch' + str(epoch_counter_train) + '.pdf') plt.close() plt.figure() plt.plot(performance_x_vec, pd.DataFrame(roc_df)['explained_var'], c='darkgreen', label='explained_var', linewidth=4, alpha=.6) plt.xlabel('epochs') plt.ylabel('Explained Variance') plt.title('Explained Variance') plt.grid() plt.legend() plt.savefig(args['OUTPATH'] + args['target_label'] + 'explained_var.pdf') plt.close() return roc_df
evaluations = []
STEPS = 400
for i in range(100):
    regressor.train(input_fn=wx_input_fn(X_train, y=y_train), steps=STEPS)
    evaluations.append(regressor.evaluate(input_fn=wx_input_fn(X_val,
                                                               y_val,
                                                               num_epochs=1,
                                                               shuffle=False)))

# (100 x 400 / 2) = 20,000 epochs
evaluations[0]

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [14, 10]

loss_values = [ev['loss'] for ev in evaluations]
training_steps = [ev['global_step'] for ev in evaluations]

plt.scatter(x=training_steps, y=loss_values)
plt.xlabel('Training steps (Epochs = steps / 2)')
plt.ylabel('Loss (SSE)')
plt.show()

pred = regressor.predict(input_fn=wx_input_fn(X_test,
                                              num_epochs=1,
                                              shuffle=False))
predictions = np.array([p['predictions'][0] for p in pred])

print("The Explained Variance: %.2f" % explained_variance_score(
    y_test, predictions))
print("The Mean Absolute Error: %.2f degrees Celsius" % mean_absolute_error(
    y_test, predictions))
print("The Median Absolute Error: %.2f degrees Celsius" % median_absolute_error(
    y_test, predictions))
pre = xgb_train.predict(x_test)
print('Score : ', explained_variance_score(y_test, pre))
print('MAE : ', mean_absolute_error(y_test, pre))

plt.plot(pre, 'r', y_test, 'b')
plt.show()
'''
#--------------------------------------------------
# RandomForest
rf = RandomForestRegressor(n_estimators=1000, random_state=42, max_depth=5)
rf.fit(x_train, y_train)
pre = rf.predict(x_test)
error = abs(pre - y_test)

print('Score : ', explained_variance_score(y_test, pre))
print("MAE : ", round(np.mean(error), 2))

plt.plot(pre, 'r', y_test, 'b')
plt.show()

#------------------------------------------
# Feature importance
feature_list = list('風' '大' '濕' '環' '模' '照')
importance = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2))
                       for feature, importance in zip(feature_list, importance)]
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
print("r2_CV:", r2.mean()) print("MSE_CV:", mean_squared_error.mean()) """ Test/Evaluation """ time3 = time.clock() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3) ridge.fit(X_train, y_train) y_pred = ridge.predict(X_test) time4 = time.clock() print("testing time:", time4 - time3) print("EVS_test:", metrics.explained_variance_score(y_test, y_pred)) print("R2_test", metrics.r2_score(y_test, y_pred)) print("MSE_test:", metrics.mean_squared_error(y_test, y_pred)) print("The weights are:", ridge.coef_) """ Visualization """ fig, ax = plt.subplots() ax.scatter(y, predicted, edgecolors=(0, 0, 0)) ax.plot([y.min(), y.max()], [predicted.min(), predicted.max()], 'k--', lw=4) ax.set_xlabel('Measured') ax.set_ylabel('Predicted') plt.savefig("cv_ridge.png") fig, ax = plt.subplots() ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
plt.ylabel('Predicted values')

# evaluate the model
from sklearn import metrics
print("MAE = ", metrics.mean_absolute_error(y_test, predictions))
print("MSE = ", metrics.mean_squared_error(y_test, predictions))
print("RMSE = ", np.sqrt(metrics.mean_squared_error(y_test, predictions)))

# variance score
print(metrics.explained_variance_score(y_test, predictions))

# Residuals
sns.distplot((y_test - predictions), bins=75)

# mobile or website or length of membership
cdf = pd.DataFrame(lm.coef_, X.columns, columns=['Coeff'])
# Coeff is like a weight 1:26, 1:38, 1:0.19, 1:61.28
# Website is deficient or App is more effective
count = [] plt.ion() for i in range(len(test_data) - 9): x = test_data[i:i + 9] y = test_data[i + 9:i + 10] x = x.reshape(-1, 1, 9) y = y.reshape(-1, 1) out = net(x) loss = loss_func(out, y) print(loss.item()) label.append(y.numpy().reshape(-1)) output.append(out.data.numpy().reshape(-1)) count.append(i) plt.clf() label_icon, = plt.plot(count, label, linewidth=1, color="blue") output_icon, = plt.plot(count, output, linewidth=1, color="red") plt.legend([label_icon, output_icon], ["label", "output"], loc="upper right", fontsize=10) plt.pause(0.01) plt.savefig("./img.pdf") plt.ioff() plt.show() # print(np.shape(label)) # print(np.shape(output)) r2 = r2_score(label, output) variance = explained_variance_score(label, output) print(r2) print(variance)
def compute_metrics(y_true_cts, y_pred_cts, y_true_bin, y_pred_bin, y_pred_score=None): #Linear Regression metrics regression_dict = {} if y_pred_cts is not None: y_true = y_true_cts y_pred = y_pred_cts regression_dict[ 'explained_variance_score'] = metrics.explained_variance_score( y_true, y_pred) #regression_dict['max_error'] = metrics.max_error(y_true, y_pred) regression_dict['mean_absolute_error'] = metrics.mean_absolute_error( y_true, y_pred) regression_dict['mean_squared_error'] = metrics.mean_squared_error( y_true, y_pred) #regression_dict['mean_squared_log_error'] = metrics.mean_squared_log_error(y_true, y_pred) regression_dict[ 'median_absolute_error'] = metrics.median_absolute_error( y_true, y_pred) regression_dict['r2'] = metrics.r2_score(y_true, y_pred) #create DataFrame regression_metrics = pd.DataFrame.from_dict(regression_dict, orient='index') # ============================================================================= #Classification metrics classification_dict = {} if y_pred_bin is not None: y_true = y_true_bin y_pred = y_pred_bin classification_dict['accuracy_score'] = metrics.accuracy_score( y_true, y_pred) #classification_dict['avg_ps'] = metrics.average_precision_score(y_true, y_score) classification_dict['confusion_matrix'] = metrics.confusion_matrix( y_true, y_pred) classification_dict['f1_score'] = metrics.f1_score(y_true, y_pred) classification_dict['precision_score'] = metrics.precision_score( y_true, y_pred) classification_dict['recall_score'] = metrics.recall_score( y_true, y_pred) if y_pred_score is None: y_pred_score = y_pred classification_dict['roc_auc_score'] = metrics.roc_auc_score( y_true, y_pred_score) #classification_dict['roc_curve'] = metrics.roc_curve(y_true, y_score) classification_dict[ 'gini'] = 2 * classification_dict['roc_auc_score'] - 1 classification_dict['sensibility'] = classification_dict[ 'confusion_matrix'][1, 1] / sum( classification_dict['confusion_matrix'][1, :]) classification_dict['specificity'] = classification_dict[ 'confusion_matrix'][0, 0] / sum( classification_dict['confusion_matrix'][0, :]) #create DataFrame classification_metrics = pd.DataFrame.from_dict(classification_dict, orient='index') # ============================================================================= print(classification_metrics) print(regression_metrics) return regression_metrics, classification_metrics
y1_test = test_inputs['target_load'] y2_test = test_inputs['target_imf9'] y3_test = test_inputs['target_imf10'] y4_test = test_inputs['target_imf8'] y5_test = test_inputs['target_imf7'] y1_preds, y2_preds, y3_preds, y4_preds, y5_preds = model.predict( [X_test, aux_test]) # y1_preds, y2_preds, y3_preds, y4_preds = model.predict([X_test, aux_test]) y1_test = y_scaler.inverse_transform(y1_test) y1_preds = y_scaler.inverse_transform(y1_preds) y1_test, y1_preds = flatten_test_predict(y1_test, y1_preds) rmse_predict = RMSE(y1_test, y1_preds) evs = explained_variance_score(y1_test, y1_preds) mae = mean_absolute_error(y1_test, y1_preds) mse = mean_squared_error(y1_test, y1_preds) msle = mean_squared_log_error(y1_test, y1_preds) meae = median_absolute_error(y1_test, y1_preds) r_square = r2_score(y1_test, y1_preds) mape_v = mape(y1_preds.reshape(-1, 1), y1_test.reshape(-1, 1)) print('rmse_predict:', rmse_predict, "evs:", evs, "mae:", mae, "mse:", mse, "msle:", msle, "meae:", meae, "r2:", r_square, "mape", mape_v) store_predict_points( y1_test, y1_preds, output_dir + '/test_mtl_prediction_epochs_' + str(EPOCHS) + '_lag_' + str(time_step_lag) + '.csv')
def foward_chain_cv(self, scoring_metric, greater_is_better=False): i = 1 MAE = [] Exp_var = [] MSE = [] r_squared = [] params_used = {} y_pred_cont = [] y_test_cont = [] y_pred_cont_index = [] split_dates = [] fig = plt.figure(num='{}'.format(self.regressor)) tscv = TimeSeriesSplit(n_splits=self.no_splits) for train_index, test_index in tqdm(tscv.split(X)): X_train, X_test = X.iloc[train_index], X.iloc[test_index] y_train, y_test = y.iloc[train_index], y.iloc[test_index] X_test_index = X_test.index.values.tolist() if self.scalar is not None: # Scale Data scaler_X = self.scalar() scaler_y = self.scalar() scaler_X.fit(X_train) scaler_y.fit(y_train) X_train, X_test = scaler_X.transform(X_train), scaler_X.transform(X_test) y_train, y_test = scaler_y.transform(y_train), scaler_y.transform(y_test) else: X_train, X_test = np.asarray(X_train), np.asarray(X_test) y_train, y_test = np.asarray(y_train), np.asarray(y_test) # Find Best Params best_score, best_params = self.find_optimal_paramters( X_train, y_train, self.regressor, self.parameters, scoring_metric, greater_is_better) self.regressor.set_params(**best_params) self.regressor.fit(X_train, y_train.ravel()) # predict y values y_pred = self.regressor.predict(X_test) if self.scalar is not None: # transform y values back to real scale for assessment y_pred = scaler_y.inverse_transform(y_pred) y_test = scaler_y.inverse_transform(y_test) # compute error metrics params_used[i] = best_params MAE.append(metrics.mean_absolute_error(y_test, y_pred)) Exp_var.append(metrics.explained_variance_score(y_test, y_pred)) MSE.append(metrics.mean_squared_error(y_test, y_pred)) r_squared.append(metrics.r2_score(y_test, y_pred)) # plot y_pred vs y_test y_df = pd.DataFrame(index=pd.to_datetime(X_test_index)) y_pred = y_pred.reshape(len(y_pred), ) y_test = y_test.reshape(len(y_test), ) y_df['y_pred'] = y_pred y_df['y_test'] = y_test # plot the subplots ax = fig.add_subplot(int(sqrt(self.no_splits)), int(sqrt(self.no_splits)+1), i) ax.xaxis.set_major_formatter(DateFormatter('%m-%y')) y_df.plot(title = 'Split{}'.format(i), ax=ax, legend=False) ax.tick_params(axis='x', rotation=45, labelsize=8) if i == 1: fig.legend(loc=4) # convert arrays to list and append continuous y_pred vs y_test y_pred_cont_index = y_pred_cont_index + X_test_index split_dates.append(y_pred_cont_index[-1]) y_pred_list = y_pred.tolist() y_test_list = y_test.tolist() y_pred_cont = y_pred_cont + y_pred_list y_test_cont = y_test_cont + y_test_list i += 1 # Plot the continuous chart y_continuous_df = pd.DataFrame(index=pd.to_datetime(y_pred_cont_index)) y_pred_cont = np.asarray(y_pred_cont) y_test_cont = np.asarray(y_test_cont) y_continuous_df['Model'] = y_pred_cont y_continuous_df['Actual'] = y_test_cont y_continuous_df.plot(title='Running Performance') plt.suptitle(str(self.regressor).split('(')[0]) # add verticle lines to the running total output del split_dates[-1] for date in split_dates: date = datetime.strptime(date, '%m/%d/%Y %H:%M') plt.axvline(x=date, linestyle=':', color='red', linewidth=1, alpha=.8) # Calculate average metrics no_splits = tscv.get_n_splits() avg_mae = sum(MAE) / no_splits avg_exp_var = sum(Exp_var) / no_splits avg_mse = sum(MSE) / no_splits avg_rsquared = sum(r_squared) / no_splits print('\nMAE:{} \nMSE:{} \nExp Var Explained: {}\nr^2: {}\nParams:{}'.format(MAE, MSE, Exp_var, r_squared, params_used)) print('\nAvg MAE:', avg_mae, '\nAverage Explained Variance:', avg_exp_var, '\nAvg MSE:', avg_mse, '\nAvg r^2:', avg_rsquared) print('end') fig.tight_layout() plt.show()
# Plot outputs
import matplotlib.pyplot as plt

plt.scatter(X_test, y_test, color='green')
plt.plot(X_test, y_test_pred, color='black', linewidth=4)
plt.xticks(())
plt.yticks(())
plt.show()

# Measure performance
import sklearn.metrics as sm

print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explained variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))

# Model persistence
import pickle

output_model_file = '3_model_linear_regr.pkl'
with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)

with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)

y_test_pred_new = model_linregr.predict(X_test)
print("\nNew mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred_new), 2))
def evaluateWithMetrics(true, lstm_predict, reg_predict, simple_avg_predict): sum_lstm_evs = 0 sum_reg_evs = 0 sum_simple_avg_evs = 0 sum_lstm_mse = 0 sum_reg_mse = 0 sum_simple_avg_mse = 0 sum_lstm_mae = 0 sum_reg_mae = 0 sum_simple_avg_mae = 0 sum_lstm_r2_score = 0 sum_reg_r2_score = 0 sum_simple_avg_r2_score = 0 lstm_evs = [] reg_evs = [] avg_evs = [] lstm_mse = [] reg_mse = [] avg_mse = [] lstm_mae = [] reg_mae = [] avg_mae = [] lstm_r2_score = [] reg_r2_score = [] avg_r2_score = [] for i in range(true.shape[0]): r = explained_variance_score(true[i], lstm_predict[i]) lstm_evs.append(r) sum_lstm_evs = sum_lstm_evs + r r = explained_variance_score(true[i], reg_predict[i]) reg_evs.append(r) sum_reg_evs = sum_reg_evs + r r = explained_variance_score(true[i], simple_avg_predict[i]) avg_evs.append(r) sum_simple_avg_evs = sum_simple_avg_evs + r r = mean_squared_error(true[i], lstm_predict[i]) lstm_mse.append(r) sum_lstm_mse = sum_lstm_mse + r r = mean_squared_error(true[i], reg_predict[i]) reg_mse.append(r) sum_reg_mse = sum_reg_mse + r r = mean_squared_error(true[i], simple_avg_predict[i]) avg_mse.append(r) sum_simple_avg_mse = sum_simple_avg_mse + r r = mean_absolute_error(true[i], lstm_predict[i]) lstm_mae.append(r) sum_lstm_mae = sum_lstm_mae + r r = mean_absolute_error(true[i], reg_predict[i]) reg_mae.append(r) sum_reg_mae = sum_reg_mae + r r = mean_absolute_error(true[i], simple_avg_predict[i]) avg_mae.append(r) sum_simple_avg_mae = sum_simple_avg_mae + r r = r2_score(true[i], lstm_predict[i]) lstm_r2_score.append(r) sum_lstm_r2_score = sum_lstm_r2_score + r r = r2_score(true[i], reg_predict[i]) reg_r2_score.append(r) sum_reg_r2_score = sum_reg_r2_score + r r = r2_score(true[i], simple_avg_predict[i]) avg_r2_score.append(r) sum_simple_avg_r2_score = sum_simple_avg_r2_score + r print(f'mae:lstm:{mean_absolute_error(true[i], lstm_predict[i])}, avg:{mean_absolute_error(true[i], simple_avg_predict[i])}') plotMetrics2(lstm_evs, reg_evs, avg_evs, lstm_mse, reg_mse, avg_mse, lstm_mae, reg_mae, avg_mae, lstm_r2_score, reg_r2_score, avg_r2_score) avg_lstm_evs = sum_lstm_evs / true.shape[0] avg_reg_evs = sum_reg_evs / true.shape[0] avg_simple_avg_evs = sum_simple_avg_evs / true.shape[0] avg_lstm_mse = sum_lstm_mse / true.shape[0] avg_reg_mse = sum_reg_mse / true.shape[0] avg_simple_avg_mse = sum_simple_avg_mse / true.shape[0] avg_lstm_mae = sum_lstm_mae / true.shape[0] avg_reg_mae = sum_reg_mae / true.shape[0] avg_simple_avg_mae = sum_simple_avg_mae / true.shape[0] avg_lstm_r2_score = sum_lstm_r2_score / true.shape[0] avg_reg_r2_score = sum_reg_r2_score / true.shape[0] avg_simple_avg_r2_score = sum_simple_avg_r2_score / true.shape[0] print(f'explained variance score: lstm:{avg_lstm_evs}, regression:{avg_reg_evs}, simple avg:{avg_simple_avg_evs}') print(f'mean absolute error: lstm:{avg_lstm_mae}, regression:{avg_reg_mae}, simple avg: {avg_simple_avg_mae}') print(f'mean squared error: lstm:{avg_lstm_mse}, regression:{avg_reg_mse}, simple avg: {avg_simple_avg_mse}') print(f'r2 score: lstm:{avg_lstm_r2_score}, regression:{avg_reg_r2_score}, simple avg: {avg_simple_avg_r2_score}')
# Split the data and labels into training and test sets
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    data, target)

# Build a linear regression model
from sklearn.linear_model import LinearRegression

clf = LinearRegression().fit(data_train, target_train)
quality_pre = clf.predict(data_test)

# Evaluation
from sklearn.metrics import mean_squared_error, median_absolute_error, explained_variance_score

print("Mean squared error of the linear regression model:",
      mean_squared_error(target_test, quality_pre))
print("Median absolute error of the linear regression model:",
      median_absolute_error(target_test, quality_pre))
print("Explained variance score of the linear regression model:",
      explained_variance_score(target_test, quality_pre))

# Plot the comparison of true vs. predicted values
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(15, 6))
plt.plot(range(target_test.shape[0]), target_test, linewidth=1.5, linestyle='-')
plt.plot(range(target_test.shape[0]), quality_pre, linewidth=1.5, linestyle='-.')
plt.legend(["true values", "predicted values"])
ilr = parallel_ilr_inference(nb_jobs=args.nb_seeds, train_input=train_input, train_target=train_target, arguments=args)[0] # predict on training mu, var, std, nlpd = \ ilr.meanfield_prediction(input, target, prediction=args.prediction) # metrics from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score mse = mean_squared_error(target, mu) evar = explained_variance_score(target, mu, multioutput='variance_weighted') smse = 1. - r2_score(target, mu, multioutput='variance_weighted') print('TRAIN - EVAR:', evar, 'MSE:', mse, 'SMSE:', smse, 'NLPD:', nlpd.mean(), 'Compnents:', len(ilr.used_labels)) fig, axes = plt.subplots(2, 1) # plot prediction sorter = np.argsort(input, axis=0).flatten() sorted_input, sorted_target = input[sorter, 0], target[sorter, 0] sorted_mu, sorted_std = mu[sorter, 0], std[sorter, 0] axes[0].plot(true_input, true_target, '--k') axes[0].scatter(train_input, train_target, marker='+', s=1.25, color='k')
def generate_metrics(model, partition): r"""Generate model evaluation metrics for all estimators. Parameters ---------- model : alphapy.Model The model object with stored predictions. partition : alphapy.Partition Reference to the dataset. Returns ------- model : alphapy.Model The model object with the completed metrics. Notes ----- AlphaPy takes a brute-force approach to calculating each metric. It calls every scikit-learn function without exception. If the calculation fails for any reason, then the evaluation will still continue without error. References ---------- For more information about model evaluation and the associated metrics, refer to [EVAL]_. .. [EVAL] http://scikit-learn.org/stable/modules/model_evaluation.html """ logger.info('='*80) logger.info("Metrics for: %s", partition) # Extract model paramters. model_type = model.specs['model_type'] # Extract model data. if partition == Partition.train: expected = model.y_train else: expected = model.y_test # Generate Metrics if expected.any(): # Add blended model to the list of algorithms. if len(model.algolist) > 1: algolist = copy(model.algolist) algolist.append('BLEND') else: algolist = model.algolist # get the metrics for each algorithm for algo in algolist: # get predictions for the given algorithm predicted = model.preds[(algo, partition)] # classification metrics if model_type == ModelType.classification: probas = model.probas[(algo, partition)] try: model.metrics[(algo, partition, 'accuracy')] = accuracy_score(expected, predicted) except: logger.info("Accuracy Score not calculated") try: model.metrics[(algo, partition, 'average_precision')] = average_precision_score(expected, probas) except: logger.info("Average Precision Score not calculated") try: model.metrics[(algo, partition, 'balanced_accuracy')] = balanced_accuracy_score(expected, predicted) except: logger.info("Accuracy Score not calculated") try: model.metrics[(algo, partition, 'brier_score_loss')] = brier_score_loss(expected, probas) except: logger.info("Brier Score not calculated") try: model.metrics[(algo, partition, 'cohen_kappa')] = cohen_kappa_score(expected, predicted) except: logger.info("Cohen's Kappa Score not calculated") try: model.metrics[(algo, partition, 'confusion_matrix')] = confusion_matrix(expected, predicted) except: logger.info("Confusion Matrix not calculated") try: model.metrics[(algo, partition, 'f1')] = f1_score(expected, predicted) except: logger.info("F1 Score not calculated") try: model.metrics[(algo, partition, 'neg_log_loss')] = log_loss(expected, probas) except: logger.info("Log Loss not calculated") try: model.metrics[(algo, partition, 'precision')] = precision_score(expected, predicted) except: logger.info("Precision Score not calculated") try: model.metrics[(algo, partition, 'recall')] = recall_score(expected, predicted) except: logger.info("Recall Score not calculated") try: fpr, tpr, _ = roc_curve(expected, probas) model.metrics[(algo, partition, 'roc_auc')] = auc(fpr, tpr) except: logger.info("ROC AUC Score not calculated") # regression metrics elif model_type == ModelType.regression: try: model.metrics[(algo, partition, 'explained_variance')] = explained_variance_score(expected, predicted) except: logger.info("Explained Variance Score not calculated") try: model.metrics[(algo, partition, 'neg_mean_absolute_error')] = mean_absolute_error(expected, predicted) except: logger.info("Mean Absolute Error not calculated") try: model.metrics[(algo, partition, 'neg_median_absolute_error')] = median_absolute_error(expected, predicted) 
except: logger.info("Median Absolute Error not calculated") try: model.metrics[(algo, partition, 'neg_mean_squared_error')] = mean_squared_error(expected, predicted) except: logger.info("Mean Squared Error not calculated") try: model.metrics[(algo, partition, 'neg_mean_squared_log_error')] = mean_squared_log_error(expected, predicted) except: logger.info("Mean Squared Log Error not calculated") try: model.metrics[(algo, partition, 'r2')] = r2_score(expected, predicted) except: logger.info("R-Squared Score not calculated") # log the metrics for each algorithm for algo in model.algolist: logger.info('-'*80) logger.info("Algorithm: %s", algo) metrics = [(k[2], v) for k, v in list(model.metrics.items()) if k[0] == algo and k[1] == partition] for key, value in sorted(metrics): svalue = str(value) svalue.replace('\n', ' ') logger.info("%s: %s", key, svalue) else: logger.info("No labels for generating %s metrics", partition) return model
model.fit(x=X_train, y=y_train,
          validation_data=(X_test, y_test),
          batch_size=128, epochs=400)  # 140

# This DataFrame has two columns: 'loss' (training loss) and 'val_loss'
# (loss on the validation/test data). Comparing the two directly shows
# whether the model is overfitting to the training data. We can simply plot it.
losses = pd.DataFrame(model.history.history)
losses.plot()

#%% Evaluation on the test data
"""Evaluation on Test Data"""
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

predictions = model.predict(X_test)
mean_absolute_error(y_test, predictions)
house['price'].mean()
explained_variance_score(y_test, predictions)

# Our predictions
plt.scatter(y_test, predictions)
# Perfect predictions
plt.plot(y_test, y_test, 'r')
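The comments above describe reading overfitting off the loss/val_loss comparison by eye. A small sketch of one way to quantify the same thing from the losses DataFrame, assuming the column names Keras produces ('loss' and 'val_loss'): if val_loss bottoms out well before the final epoch while loss keeps falling, the model is likely overfitting from that point on.

best_epoch = losses['val_loss'].idxmin()
print(f"val_loss is lowest at epoch {best_epoch} "
      f"(loss={losses['loss'][best_epoch]:.4f}, "
      f"val_loss={losses['val_loss'][best_epoch]:.4f})")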
def plot_predicted_injury_rates(prod_df, accident_df, mines_df):
    """Use simple features to predict accident rates and plot."""

    def get_dates_with_no_nan(df_list):
        """Return a sorted list of index values with no NaNs in any column."""
        common_index = reduce(iand, [set(x.index) for x in df_list])
        for df in df_list:
            has_nulls = df[df.isnull().any(axis=1)].index
            for element in has_nulls:
                if element in common_index:
                    common_index.remove(element)
            # out.add(list(df[df.isnull().any(axis=1)].index))
        return sorted(common_index)

    def create_features_df(injuries, prod, norm_df):
        """Create a dataframe of features to predict accident rates."""
        grouper = pd.Grouper(key="date", freq="q")
        # Get features from the production dataframe.
        prod_cols = ["hours_worked", "employee_count", "coal_production"]
        prod_features = prod.groupby(grouper)[prod_cols].sum()
        exp_df = aggregate_descriptive_stats(injuries, "total_experience")
        size_df = aggregate_descriptive_stats(prod, "employee_count")
        # prod_per_hour = prod_features['coal_production'] / prod_features['hours_worked']
        # prod_features['coal_per_hour'] = prod_per_hour
        df_list = [exp_df, size_df, prod_features, norm_df]
        index = get_dates_with_no_nan(df_list)
        hours = prod_features.loc[index]
        exp = exp_df.loc[index]
        size = size_df.loc[index]
        # Drop the number-of-accidents column from the experience dataframe.
        exp = exp.drop(columns="count")
        out = pd.concat([exp, size, hours], keys=["exp", "size", "prod"], axis=1)
        return out

    plt.clf()
    # Get the features: production and mines restricted to underground coal.
    prod, mines = get_ug_coal_prod_and_mines(prod_df, mines_df)
    injuries = accident_df[is_ug_gc_accidents(accident_df, only_injuries=True)]
    normed = normalize_injuries(injuries, prod, mines)
    # Get experience, mine sizes (by employee count), and hours worked,
    # then combine them into a feature dataframe.
    feature_df = create_features_df(injuries, prod, normed)
    norm = normed.loc[feature_df.index]
    # Get the GC injury rate (injuries per 10^6 hours).
    target = norm["hours_worked"] * 1_000_000
    # Select the most important features.
    select_feats = select_k_best_regression(feature_df, target, k=5, normalize=True)
    X = select_feats.values
    reg = LinearRegression(normalize=True).fit(X, target.values)
    x_pred = reg.predict(X)
    rmse = mean_squared_error(target.values, x_pred, squared=False)
    explained_var = explained_variance_score(target.values, x_pred)
    # Now plot.
    plt.figure(figsize=(5.5, 3.5))
    plt.plot(target.index, target.values, color="b", label="GC injury rate")
    plt.plot(select_feats.index, x_pred, color="r", label="predicted injury rate")
    plt.legend()
    plt.xlabel("Year")
    plt.ylabel("GC Injuries per $10^6$ Hours")
    return plt
y_test.to_csv('y_test.csv')

# In[ ]:
ypred_df = pd.DataFrame(ypred)

# In[352]:
ypred_df.to_csv('ypred.csv')

# In[92]:
from sklearn.metrics import explained_variance_score
explained_variance_score(y_test, ypred)

# In[93]:
from sklearn.metrics import max_error
max_error(y_test, ypred)

# In[94]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, ypred)
r2
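The cells above compute explained_variance_score, max_error, and r2_score on the same predictions. The first two scores differ only when the residuals have a non-zero mean: explained variance ignores a constant bias while R^2 penalizes it. A small self-contained check on toy arrays (not taken from the data above) illustrates the difference:

import numpy as np
from sklearn.metrics import explained_variance_score, r2_score

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = y_true + 0.5  # every prediction is biased upward by 0.5

# Explained variance ignores the constant bias in the residuals...
print(explained_variance_score(y_true, y_pred))  # 1.0
# ...while R^2 penalizes it.
print(r2_score(y_true, y_pred))                  # 0.8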
data = pd.read_csv('./data/employee-perf.csv')
data_x = data[['Aptitude Test Score', 'Interview Score', 'Missed Training Classes']]
data_y = data['Annual Performance Rating']

model = linear_model.LinearRegression()
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y,
                                                    test_size=0.2, random_state=4)
model.fit(x_train, y_train)
preds = model.predict(x_test)
pprint.pprint(pd.DataFrame({'Actual': y_test, 'Predicted': preds}))
   Actual  Predicted
3      90  88.640209
4      85  81.412110
6      94  93.320892

print('MSE, MedAE, R^2, EVS: ' + str([mean_squared_error(y_test, preds),
                                      median_absolute_error(y_test, preds),
                                      r2_score(y_test, preds),
                                      explained_variance_score(y_test, preds)]))
MSE, MedAE, R^2, EVS: [5.0610589164729705, 1.3597910272418403, 0.62664319468642016, 0.8861576085020817]

# Reading in the new, modified employee-perf data.
data2 = pd.read_csv('./data/employee-perf2.csv')

# Predicting the performance score for data2 based on the performance
# scores of the first dataset.
data_x = data[['Aptitude Test Score', 'Interview Score', 'Missed Training Classes']]
data_y = data['Annual Performance Rating']
predict_vars = data2[['Aptitude Test Score', 'Interview Score', 'Missed Training Classes']]
model = linear_model.LinearRegression()
model.fit(x_train, y_train)
preds = model.predict(predict_vars)
# The numbers are slightly off here from what I had in my answer document.
def main():
    horses98 = HorseParserNoHandicaps('./../Data/born98.csv').horses
    horses05 = HorseParserNoHandicaps('./../Data/born05.csv').horses

    races98 = RaceParserNoHandicaps('./../Data/born98.csv').races
    races05 = RaceParserNoHandicaps('./../Data/born05.csv').races

    print ''' HorsesBorn98 Dataset '''
    horses_train_98, horses_test_98 = split_dataset(horses98)

    horses_98_X_train = []
    horses_98_y_train = []
    for h in horses_train_98:
        v, s = compute_vector(h)
        horses_98_X_train.append(v)
        horses_98_y_train.append(s)

    print 'No. of instances in training set:'
    print len(horses_98_X_train)
    print len(horses_98_y_train)
    print ''

    horses_98_X_test = []
    horses_98_y_test = []
    for h in horses_test_98:
        v, s = compute_vector(h)
        horses_98_X_test.append(v)
        horses_98_y_test.append(s)

    print 'No. of instances in testing set:'
    print len(horses_98_X_test)
    print len(horses_98_y_test)
    print ''

    # Create linear regression object
    regr98 = linear_model.LinearRegression()

    # Train the model using the training sets
    regr98.fit(horses_98_X_train, horses_98_y_train)

    # Coefficients
    print 'Coefficients:'
    print regr98.coef_
    print ''

    # score() returns the coefficient of determination (R^2): 1 is perfect prediction
    print 'Variance score:'
    print regr98.score(horses_98_X_test, horses_98_y_test)
    print ''

    print 'Mean absolute error:'
    print mean_absolute_error(horses_98_y_test, regr98.predict(horses_98_X_test))
    print ''

    print 'Explained variance:'
    print explained_variance_score(horses_98_y_test, regr98.predict(horses_98_X_test))
    print ''

    print 'Mean squared error:'
    print mean_squared_error(horses_98_y_test, regr98.predict(horses_98_X_test))
    print ''

    print ''' HorsesBorn05 Dataset '''
    horses_train_05, horses_test_05 = split_dataset(horses05)

    horses_05_X_train = []
    horses_05_y_train = []
    for h in horses_train_05:
        v, s = compute_vector(h)
        horses_05_X_train.append(v)
        horses_05_y_train.append(s)

    print 'No. of instances in training set:'
    print len(horses_05_X_train)
    print len(horses_05_y_train)
    print ''

    horses_05_X_test = []
    horses_05_y_test = []
    for h in horses_test_05:
        v, s = compute_vector(h)
        horses_05_X_test.append(v)
        horses_05_y_test.append(s)

    print 'No. of instances in testing set:'
    print len(horses_05_X_test)
    print len(horses_05_y_test)
    print ''

    # Create linear regression object
    regr05 = linear_model.LinearRegression(fit_intercept=True)

    # Train the model using the training sets
    regr05.fit(horses_05_X_train, horses_05_y_train)

    # Coefficients
    print 'Coefficients:'
    print regr05.coef_
    print ''

    # score() returns the coefficient of determination (R^2): 1 is perfect prediction
    print 'Variance score:'
    print regr05.score(horses_05_X_test, horses_05_y_test)
    print ''

    print 'Mean absolute error:'
    print mean_absolute_error(horses_05_y_test, regr05.predict(horses_05_X_test))
    print ''

    print 'Explained variance:'
    print explained_variance_score(horses_05_y_test, regr05.predict(horses_05_X_test))
    print ''

    print 'Mean squared error:'
    print mean_squared_error(horses_05_y_test, regr05.predict(horses_05_X_test))
    print ''

    print 'R2 score:'
    print r2_score(horses_05_y_test, regr05.predict(horses_05_X_test))
    print ''

    print 'Mean absolute error based on training set:'
    print mean_absolute_error(horses_05_y_train, regr05.predict(horses_05_X_train))
    print ''

    # Plots
    horses_98_y_pred = regr98.predict(horses_98_X_test)
    horses_05_y_pred = regr05.predict(horses_05_X_test)

    plot_speeds(horses_98_y_pred, 'r', 'Predicted Speeds for Horses1998 Test Set')
    plot_speeds(horses_98_y_test, 'r', 'Actual Speeds for Horses1998 Test Set')

    plot_speeds(horses_05_y_pred, 'b', 'Predicted Speeds for Horses2005 Test Set')
    plot_speeds(horses_05_y_test, 'b', 'Actual Speeds for Horses2005 Test Set')
def regression_test(test_data):
    data = pd.read_csv("./Data-assignment-1/Traffic_flow/traffic_flow_data.csv")

    cols = []
    # for i in range((data.shape[1]) // 45):
    #     for j in range(5):
    #         cols.append(45 * i + j + 20)
    # y = data['Segment23_(t+1)']
    # X = data[data.columns[cols]]
    #
    # # get every segment 23
    # y = data['Segment23_(t+1)']
    # data = data.iloc[:, 22::45]
    # X = data
    # print(data.shape[1])
    # new = data.copy()
    # for i in range(45, data.shape[1] - 1):
    #     new[new.columns[i]] = data[data.columns[i]] - data[data.columns[i - 45]]
    # data = new[data.columns[44:]]

    X, y = data.drop('Segment23_(t+1)', axis=1), data['Segment23_(t+1)']
    X_test, y_test = (test_data.drop('Segment23_(t+1)', axis=1),
                      test_data['Segment23_(t+1)'])
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    tuned_parameters = [{'alpha': [i**2 / 100 for i in range(1, 100, 2)]}]
    model = Ridge()
    scoring = {'r2': 'r2'}  # alternative: {'mean_squared_error': 'neg_mean_squared_error'}
    grid = GridSearchCV(model, tuned_parameters, scoring=scoring, refit='r2')
    grid.fit(X_train, y_train)
    print(grid.param_grid)

    results = grid.cv_results_
    graph('Traffic Flow', ['alpha', 'Score'], results, scoring, 'alpha')

    best = grid.best_estimator_
    # best = model.fit(X_train, y_train)
    predictions = best.predict(X_test)
    print('*******************************************************************')
    print("Ridge Regression Traffic Flow")
    print("Mean squared error: {}".format(mean_squared_error(y_test, predictions)))
    print("Explained variance: {}".format(explained_variance_score(y_test, predictions)))

    tuned_parameters = {}
    model = LinearRegression()
    grid = GridSearchCV(model, tuned_parameters)
    grid.fit(X_train, y_train)
    best = grid.best_estimator_
    # best = model.fit(X_train, y_train)
    predictions = best.predict(X_test)
    print('*******************************************************************')
    print("Linear Regression Traffic Flow")
    print("Mean squared error: {}".format(mean_squared_error(y_test, predictions)))
    print("Explained variance: {}".format(explained_variance_score(y_test, predictions)))
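The grid search above ranks and refits on R^2. scikit-learn also exposes explained variance as the built-in scorer string 'explained_variance', so the same alpha grid could be ranked by that metric instead. A minimal sketch, assuming X_train and y_train are prepared as in the function above:

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Score the same alpha grid by explained variance instead of R^2.
grid_ev = GridSearchCV(
    Ridge(),
    param_grid={'alpha': [i**2 / 100 for i in range(1, 100, 2)]},
    scoring='explained_variance',
    refit=True,
)
grid_ev.fit(X_train, y_train)
print(grid_ev.best_params_, grid_ev.best_score_)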
"SVMReg.", "ForestReg.", ] classifiers = [ KNeighborsRegressor(n_neighbors=1, algorithm="auto"), DecisionTreeRegressor(max_depth=5, splitter='best'), MLPRegressor(alpha=1, max_iter=1000), SVR(C=1.0, epsilon=0.2), RandomForestRegressor(n_estimators=100, random_state=0), ] #Comparem les seguents característiques: results = pd.DataFrame( index=['Absolute Error', 'Variance Score', 'Train Cost', 'Test Cost'], columns=names) for name, clf in zip(names, classifiers): t1 = time.time() clf.fit(X_train, y_train) t2 = time.time() y_pred = clf.predict(X_test) t3 = time.time() results.at['Train Cost', name] = round(t2 - t1, 3) results.at['Test Cost', name] = round(t3 - t2, 3) results.at['Absolute Error', name] = mean_absolute_error(y_test, y_pred) results.at['Variance Score', name] = explained_variance_score(y_test, y_pred) print('Results of Regression Classifiers') print(results)
def log_rf(experimentID, run_name, params, X_train, X_test, y_train, y_test):
    import os
    import tempfile

    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import explained_variance_score, max_error
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    from sklearn.metrics import mean_squared_log_error, median_absolute_error
    from sklearn.metrics import r2_score, mean_poisson_deviance
    from sklearn.metrics import mean_gamma_deviance

    with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
        # Create the model, train it, and create predictions.
        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        predictions = rf.predict(X_test)

        # Log the model.
        mlflow.sklearn.log_model(rf, "random-forest-model")

        # Log the params.
        [mlflow.log_param(param, value) for param, value in params.items()]

        # Create metrics.
        exp_var = explained_variance_score(y_test, predictions)
        max_err = max_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        rmse = mean_squared_error(y_test, predictions, squared=False)
        mslogerror = mean_squared_log_error(y_test, predictions)
        medianae = median_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        mean_poisson = mean_poisson_deviance(y_test, predictions)
        mean_gamma = mean_gamma_deviance(y_test, predictions)

        # Print metrics.
        print("  explained variance: {}".format(exp_var))
        print("  max error: {}".format(max_err))
        print("  mae: {}".format(mae))
        print("  mse: {}".format(mse))
        print("  rmse: {}".format(rmse))
        print("  mean squared log error: {}".format(mslogerror))
        print("  median absolute error: {}".format(medianae))
        print("  R2: {}".format(r2))
        print("  mean poisson deviance: {}".format(mean_poisson))
        print("  mean gamma deviance: {}".format(mean_gamma))

        # Log metrics.
        mlflow.log_metric("explained variance", exp_var)
        mlflow.log_metric("max error", max_err)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mean squared log error", mslogerror)
        mlflow.log_metric("median absolute error", medianae)
        mlflow.log_metric("R2", r2)
        mlflow.log_metric("mean poisson deviance", mean_poisson)
        mlflow.log_metric("mean gamma deviance", mean_gamma)

        # Create feature importances.
        importance = pd.DataFrame(
            list(zip(df_pits_races_4_model_encoded.columns, rf.feature_importances_)),
            columns=["Feature", "Importance"]
        ).sort_values("Importance", ascending=False)

        # Log importances using a temporary file.
        temp = tempfile.NamedTemporaryFile(prefix="feature-importance-", suffix=".csv")
        temp_name = temp.name
        try:
            importance.to_csv(temp_name, index=False)
            mlflow.log_artifact(temp_name, "feature-importance.csv")
        finally:
            temp.close()  # Delete the temp file

        # Create the residual plot.
        fig, ax = plt.subplots()
        sns.residplot(predictions, y_test.values.ravel(), lowess=False)
        plt.xlabel("Predicted values pit duration")
        plt.ylabel("Residual")
        plt.title("Residual Plot for pitting")

        # Log residuals using a temporary file.
        temp = tempfile.NamedTemporaryFile(prefix="residuals_pit_model", suffix=".png")
        temp_name = temp.name
        try:
            fig.savefig(temp_name)
            mlflow.log_artifact(temp_name, "residuals_pit_model.png")
        finally:
            temp.close()  # Delete the temp file

        display(fig)

        return run.info.run_uuid
    best_thetas.append(theta)

(lm().fit(X_train, y_train)).coef_

y_predict_50 = X_test.dot(best_thetas[0])
y_predict_2000 = X_test.dot(best_thetas[1])
y_predict_10000 = X_test.dot(best_thetas[2])

for i in range(len(best_thetas)):
    print(f'minibatch size: {minibatch_size[i]}')
    print(f'Coefficients: {best_thetas[i]}')
    print("\n")
    print("Holdout mean squared error: %.2f"
          % metrics.mean_squared_error(y_test, X_test.dot(best_thetas[i])))
    print("Holdout explained variance: %.2f"
          % metrics.explained_variance_score(y_test, X_test.dot(best_thetas[i])))
    print("Holdout r-squared: %.2f"
          % metrics.r2_score(y_test, X_test.dot(best_thetas[i])))
    print("\n")

for epoch in range(n_iterations):
    shuffled_indices = np.random.permutation(m)
    X_b_shuffled = X_train[shuffled_indices]
    y_shuffled = y_train[shuffled_indices]
    for i in range(0, m, minibatch_size):
        xi = X_b_shuffled[i:i + minibatch_size]
        yi = y_shuffled[i:i + minibatch_size]
        gradients = 2 / minibatch_size * np.asarray(xi).T.dot(xi.dot(theta) - yi)
        theta = theta - eta * gradients
        theta_path_mgd.append(theta)
dt_regressor = DecisionTreeRegressor(max_depth=4)
dt_regressor.fit(x_train, y_train)

# Let's boost the decision tree's performance with AdaBoost, using
# 400 estimators and random_state=7.
ab_regressor = AdaBoostRegressor(dt_regressor, n_estimators=400, random_state=7)
ab_regressor.fit(x_train, y_train)

# Performance of the decision tree regressor
y_pred_dt = dt_regressor.predict(x_test)
mse = sm.mean_squared_error(y_test, y_pred_dt)
evs = sm.explained_variance_score(y_test, y_pred_dt)
print("\n#### Decision Tree performance ####")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Performance of the decision tree regressor with AdaBoost
y_pred_dt = ab_regressor.predict(x_test)
mse = sm.mean_squared_error(y_test, y_pred_dt)
evs = sm.explained_variance_score(y_test, y_pred_dt)
print("\n#### Decision Tree performance with AdaBoost ####")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Feature importance
plt.plot(X_test, y_test_pred, color='black', linewidth=4)
plt.xticks(())
plt.yticks(())
plt.show()

# Measure performance
import sklearn.metrics as sm

print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explained variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))

# Model persistence
import pickle as pickle

output_model_file = '3_model_linear_regr.pkl'
with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)

with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)

y_test_pred_new = model_linregr.predict(X_test)
print("\nNew mean absolute error =",
      round(sm.mean_absolute_error(y_test, y_test_pred_new), 2))