def mean_absolute_scaled_error_year_avg(joined_data, historical_data, weather_variable, years_back=19):
    """
    This function considers as "last period" the average temperature, on the same date at the
    same time, over the years present in the historical data (currently up to 19 years for London).
    If the historical data does not contain the data point for any of the datetimes considered,
    the MAE is not calculated and np.nan is returned.
    TODO: instead of stopping the entire calculation, discard the offending data point and
    continue the calculation with the rest of the data.
    """
    def previous_years_avg(dt):
        date_time = datetime.fromtimestamp(dt, tz=timezone.utc)
        return np.average([
            historical_data[weather_variable][n_years_ago(date_time, n)]
            for n in range(1, years_back + 1)
        ])

    try:
        joined_data_without_29_feb = remove_29_feb(joined_data)
        naive_prediction = [
            previous_years_avg(dt) for dt in joined_data_without_29_feb['dt']
        ]
        return [
            mae(joined_data_without_29_feb[weather_variable],
                joined_data_without_29_feb[f't{i}']) /
            mae(joined_data_without_29_feb[weather_variable], naive_prediction)
            for i in range(5, 0, -1)
        ]
    except KeyError as err:
        print(f"{err} not found in historical data")
        return np.nan
def get_results_from_models(model, noises):
    results = [["noises"] + noises, ["mae_alpha"], ["stdae_alpha"], ["mae_beta"],
               ["stdae_beta"], ["r_alpha"], ["r_beta"]]
    for i in range(len(noises)):
        noise = noises[i]
        x_valid, y_valid = generate_synthetic_validation_data(noise)
        x_valid = x_valid.reshape(x_valid.shape[0], x_valid.shape[3],
                                  x_valid.shape[2], x_valid.shape[1])

        # Load the weights trained for this noise level into the model.
        model.load_weights("Weights/BrainCNNWeights_noise_" + str(noise) + ".h5")
        print("Loaded model from disk")

        preds = model.predict(x_valid)
        results[1].append("{0:.2f}".format(100 * mae(preds[:, 0], y_valid[:, 0])))
        results[2].append("{0:.2f}".format(100 * std(abs(y_valid[:, 0] - preds[:, 0]))))
        results[3].append("{0:.2f}".format(100 * mae(preds[:, 1], y_valid[:, 1])))
        results[4].append("{0:.2f}".format(100 * std(abs(y_valid[:, 1] - preds[:, 1]))))
        # pearsonr returns (correlation, p-value); keep only the correlation.
        results[5].append("{0:.2f}".format(pearsonr(preds[:, 0], y_valid[:, 0])[0]))
        results[6].append("{0:.2f}".format(pearsonr(preds[:, 1], y_valid[:, 1])[0]))

    display(HTML(tabulate.tabulate(results, tablefmt='html')))
def predict(self, X, treatment=None, y=None):
    """Predict treatment effects.

    Args:
        X (np.matrix): a feature matrix
        treatment (np.array): a treatment vector
        y (np.array, optional): an optional outcome vector

    Returns:
        (numpy.ndarray): Predictions of treatment effects.
    """
    yhat_c = self.model_c.predict(X)
    yhat_t = self.model_t.predict(X)

    if (y is not None) and (treatment is not None):
        is_treatment = treatment != self.control_name
        logger.info('RMSE (Control): {:.6f}'.format(
            np.sqrt(mse(y[~is_treatment], yhat_c[~is_treatment]))))
        logger.info(' MAE (Control): {:.6f}'.format(
            mae(y[~is_treatment], yhat_c[~is_treatment])))
        logger.info('RMSE (Treatment): {:.6f}'.format(
            np.sqrt(mse(y[is_treatment], yhat_t[is_treatment]))))
        logger.info(' MAE (Treatment): {:.6f}'.format(
            mae(y[is_treatment], yhat_t[is_treatment])))

    return (yhat_t - yhat_c).reshape(-1, 1)
def eval(model, data, set_name, denorm_predictions=True):
    # Predictions
    predictions = model.predict(data.dataset(set_name))
    labels = data.raw_data(set_name)["labels"][:len(predictions)]
    predictions = pd.DataFrame(data=predictions, index=labels.index, columns=labels.columns)
    if denorm_predictions:
        predictions = data.denormalize_labels(predictions)

    # Results
    results = {
        "general": {
            "mae": mae(labels, predictions),
            "mape": mape(labels, predictions),
            "mse": mse(labels, predictions)
        }
    }
    for col in labels.columns:
        results[col] = {
            "mae": mae(labels[col], predictions[col]),
            "mape": mape(labels[col], predictions[col]),
            "mse": mse(labels[col], predictions[col]),
            "tend_acc": tendency_accuracy(labels[col], predictions[col])
        }
    return predictions, results
def test_automl():
    X, y = make_regression(n_samples=N_OBS, n_features=N_FEATURE,
                           n_informative=N_IMP_FEATURE, random_state=RANDOM_SEED)
    X = pd.DataFrame(X, columns=['x{}'.format(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    logging.info(f'X dim: {X.shape}, y dim: {y.shape}')

    X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=.2, random_state=RANDOM_SEED)

    model = AutoLGB(objective='regression', metric='l1')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
    logging.info(f'MAE (LGB): {mae(y_tst, p):.4f}')
    assert mae(y_tst, p) < mae(y_tst, r)

    model = AutoXGB(objective='reg:linear', metric='rmse')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
    logging.info(f'MAE (XGB): {mae(y_tst, p):.4f}')
    assert mae(y_tst, p) < mae(y_tst, r)
def main():
    w2v_sim = sim('w2v_vecs.npy')
    w2vs_sim = sim('w2vs_vecs.npy')
    # dtype=str replaces the removed np.str alias.
    data = np.loadtxt('data/wordsim353/combined.csv', skiprows=1, delimiter=',', dtype=str)

    comps = 0
    w2v_sims = []
    w2vs_sims = []
    gt_sims = []
    for w1, w2, gt_sim in data:
        if w1 in w2v_sim and w2 in w2v_sim and w1 in w2vs_sim and w2 in w2vs_sim:
            comps += 1
            w2v_sims.append(w2v_sim(w1, w2))
            w2vs_sims.append(w2vs_sim(w1, w2))
            gt_sims.append(float(gt_sim) / 10)

    print('word2vec mse:', mse(w2v_sims, gt_sims))
    print('word2vecS mse:', mse(w2vs_sims, gt_sims))
    print('word2vec mae:', mae(w2v_sims, gt_sims))
    print('word2vecS mae:', mae(w2vs_sims, gt_sims))
    print(comps, 'comparisons out of 353')
def predict(self, X, treatment, y=None):
    """Predict treatment effects.

    Args:
        X (np.matrix): a feature matrix
        treatment (np.array): a treatment vector
        y (np.array, optional): an outcome vector

    Returns:
        (numpy.ndarray): Predictions of treatment effects.
    """
    is_treatment = treatment != self.control_name
    w = is_treatment.astype(int)
    X = np.hstack((w.reshape((-1, 1)), X))

    X[:, 0] = 0  # set the treatment column to zero (the control group)
    yhat_c = self.model.predict(X)

    X[:, 0] = 1  # set the treatment column to one (the treatment group)
    yhat_t = self.model.predict(X)

    if y is not None:
        logger.info('RMSE (Control): {:.6f}'.format(
            np.sqrt(mse(y[~is_treatment], yhat_c[~is_treatment]))))
        logger.info(' MAE (Control): {:.6f}'.format(
            mae(y[~is_treatment], yhat_c[~is_treatment])))
        logger.info('RMSE (Treatment): {:.6f}'.format(
            np.sqrt(mse(y[is_treatment], yhat_t[is_treatment]))))
        logger.info(' MAE (Treatment): {:.6f}'.format(
            mae(y[is_treatment], yhat_t[is_treatment])))

    return (yhat_t - yhat_c).reshape(-1, 1)
def evaluate(df, num_points, test=False):
    print('\n ----------------- MODEL EVALUATION ----------------- \n')
    df = df.fillna(0)  # fillna returns a new frame; assign it back

    open_true = df['open_next_day']
    open_pred = df['pred_open_next_day']
    close_true = df['close_next_day']
    close_pred = df['pred_close_next_day']

    if test:
        open_true = open_true[:-1]
        open_pred = open_pred[:-1]
        close_true = close_true[:-1]
        close_pred = close_pred[:-1]

    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))
    ax[0, 0].plot(open_true[-num_points:], open_pred[-num_points:], 'go')
    ax[0, 0].set_title('Open')
    ax[0, 1].plot(close_true[-num_points:], close_pred[-num_points:], 'r^')
    ax[0, 1].set_title('Close')
    ax[1, 0].plot(open_true[-num_points:])
    ax[1, 0].plot(open_pred[-num_points:])
    ax[1, 0].legend(['true', 'prediction'])
    ax[1, 1].plot(close_true[-num_points:])
    ax[1, 1].plot(close_pred[-num_points:])
    ax[1, 1].legend(['true', 'prediction'])
    fig.suptitle('Model Price Predictions')
    plt.show()
    plt.close()

    mae_open = mae(open_true, open_pred)
    mae_close = mae(close_true, close_pred)
    mse_open = mse(open_true, open_pred)
    mse_close = mse(close_true, close_pred)
    r2_open = r2(open_true, open_pred)
    r2_close = r2(close_true, close_pred)

    print('OPEN PRICES')
    print('\t Mean Absolute Error: {}'.format(mae_open))
    print('\t Mean Squared Error: {}'.format(mse_open))
    print('\t R2 Score: {}'.format(r2_open))
    print('CLOSE PRICES')
    print('\t Mean Absolute Error: {}'.format(mae_close))
    print('\t Mean Squared Error: {}'.format(mse_close))
    print('\t R2 Score: {}'.format(r2_close))
    print('')
def plot_lin_regr(self, X_dataset, y_dataset, X_axis, y_axis, color, label,
                  percentual_X=False, percentual_y=False, y_axis_min=None, y_axis_max=None):
    if percentual_X:
        X = X_dataset * 100
    else:
        X = X_dataset
    X = X.reshape(len(X), 1)
    # Samples used to estimate the linear regression
    X_lr = X.reshape(-1, bac.NUM_RUNS)[:, :bac.NUM_SAMPLES].reshape(-1, 1)
    # Used to plot the stretched estimated line
    X_line = [[i] for i in range(0, self.x_max + bac.X_MAX_PADDING)]

    if percentual_y:
        y = y_dataset * 100
    else:
        y = y_dataset
    y = y.reshape(len(y), 1)
    # Samples used to estimate the linear regression
    y_lr = y.reshape(-1, bac.NUM_RUNS)[:, :bac.NUM_SAMPLES].reshape(-1, 1)

    regr = lm.LinearRegression()
    regr.fit(X_lr, y_lr)

    if y_axis == 1:  # Use secondary y axis on X_axis
        tmp_plot, = self.ax2[X_axis].plot(X_line, regr.predict(X_line),
                                          color=color, linewidth=2, label=label)
        # The label could also embed the fit quality, e.g.:
        # label + "\n" + r"$R^2: " + str(regr.score(X, y)) + "$\n"
        #       + r"$MAE: " + str(mae(y, regr.predict(X))) + "$"
        print(label + " R^2: " + str(regr.score(X, y)))
        print(label + " MAE: " + str(mae(y, regr.predict(X))))
        if y_axis_min is not None:
            self.y_axis_min_values[self.ax2[X_axis]] = y_axis_min
        if y_axis_max is not None:
            self.y_axis_max_values[self.ax2[X_axis]] = y_axis_max
    else:  # Use primary y axis on X_axis
        if isinstance(self.axarr, np.ndarray):
            tmp_plot, = self.axarr[X_axis].plot(X_line, regr.predict(X_line),
                                                color=color, linewidth=2, label=label)
            print(label + " R^2: " + str(regr.score(X, y)))
            print(label + " MAE: " + str(mae(y, regr.predict(X))))
            if y_axis_min is not None:
                self.y_axis_min_values[self.axarr[X_axis]] = y_axis_min
            if y_axis_max is not None:
                self.y_axis_max_values[self.axarr[X_axis]] = y_axis_max
        else:
            tmp_plot, = self.axarr.plot(X_line, regr.predict(X_line),
                                        color=color, linewidth=2, label=label)
            print(label + " R^2: " + str(regr.score(X, y)))
            print(label + " MAE: " + str(mae(y, regr.predict(X))))
            if y_axis_min is not None:
                self.y_axis_min_values[self.axarr] = y_axis_min
            if y_axis_max is not None:
                self.y_axis_max_values[self.axarr] = y_axis_max

    self.plots.append(tmp_plot)
def predict(self, X, treatment=None, y=None, return_components=False, verbose=True):
    """Predict treatment effects.

    Args:
        X (np.matrix): a feature matrix
        treatment (np.array): a treatment vector
        y (np.array, optional): an optional outcome vector

    Returns:
        (numpy.ndarray): Predictions of treatment effects.
    """
    yhat_cs = {}
    yhat_ts = {}

    for group in self.t_groups:
        w = (treatment != group).astype(int)
        X_new = np.hstack((w.reshape((-1, 1)), X))
        model_c = self.models_c[group]
        model_t = self.models_t[group]
        yhat_cs[group] = model_c.predict(X_new)
        yhat_ts[group] = model_t.predict(X_new)

    if (y is not None) and (treatment is not None) and verbose:
        for group in self.t_groups:
            logger.info('Error metrics for {}'.format(group))
            logger.info('RMSE (Control): {:.6f}'.format(
                np.sqrt(mse(y[treatment != group], yhat_cs[group][treatment != group]))))
            logger.info(' MAE (Control): {:.6f}'.format(
                mae(y[treatment != group], yhat_cs[group][treatment != group])))
            logger.info('RMSE (Treatment): {:.6f}'.format(
                np.sqrt(mse(y[treatment == group], yhat_ts[group][treatment == group]))))
            logger.info(' MAE (Treatment): {:.6f}'.format(
                mae(y[treatment == group], yhat_ts[group][treatment == group])))

    te = np.zeros((X.shape[0], self.t_groups.shape[0]))
    for i, group in enumerate(self.t_groups):
        te[:, i] = yhat_ts[group] - yhat_cs[group]

    if not return_components:
        return te
    else:
        return te, yhat_cs, yhat_ts
def mase(y_pred, y_true, method='naive', X_test=None, constant=None):
    """
    Mean absolute scaled error.

    MAE of your predictions, normalized by the MAE of a reference method's predictions.

    Parameters
    ----------
    y_pred : sequence
        Predictions you want to compare against a reference method.
    y_true : sequence
        True values.
    method : {'naive', 'exp_smooth', 'mean', 'median', 'constant'}
        The method used to generate y_method, i.e. the reference predictions your
        predictions are compared to.
    X_test : pd.DataFrame, optional
        Must be provided for all methods except 'naive' and 'constant'.
    constant : int, optional
        Must be provided if method is set to 'constant'.

    Returns
    -------
    mase_score : float
        Non-negative score computed as mae(y_true, y_pred) / mae(y_true, y_method).
        For example, if method is 'naive' and the MASE score is 0.25, your method
        is 4 times more accurate than the naive one.
    """
    y_method = y_pred
    if method == 'naive':
        y_method = y_true.shift()
        y_method.fillna(y_method.mean(), inplace=True)

    if method != 'naive':
        if X_test is None:
            raise ValueError('You should provide X_test to evaluate the prediction')
        X_test.drop([label for label in X_test.columns if 'lag_' in label],
                    inplace=True, axis=1)
        if method == 'exp_smooth':
            num_lags = len(X_test.columns)
            y_method = [
                hw.additive(list(lags[1].values), num_lags, 1)[0][0]
                for lags in X_test.iterrows()
            ]
        if method == 'mean':
            y_method = X_test.mean(axis=1).values
        if method == 'median':
            y_method = X_test.median(axis=1).values
        if method == 'constant':
            y_method = np.full(y_true.shape, constant)

    return mae(y_true, y_pred) / mae(y_true, y_method)  # TODO: fix division by zero
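# A minimal usage sketch for mase() above with the 'naive' method, assuming
# pandas, numpy, and sklearn's mean_absolute_error (as mae) are imported as the
# function body requires. The series values are made up purely for illustration.
import pandas as pd

y_true_demo = pd.Series([10.0, 12.0, 11.0, 13.0, 14.0])
y_pred_demo = pd.Series([10.5, 11.8, 11.2, 12.7, 14.1])
# A MASE below 1 means the predictions beat the naive "last value" forecast.
print(mase(y_pred_demo, y_true_demo, method='naive'))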
def get_results(y_test, y_pred):
    y_test_spike, y_pred_spike, y_test_normal, y_pred_normal = split_regions(y_test, y_pred)
    return {
        "rmse_general": mse(y_test, y_pred, squared=False),
        "mae_general": mae(y_test, y_pred),
        "rmse_spike": mse(y_test_spike, y_pred_spike, squared=False),
        "mae_spike": mae(y_test_spike, y_pred_spike),
        "rmse_normal": mse(y_test_normal, y_pred_normal, squared=False),
        "mae_normal": mae(y_test_normal, y_pred_normal),
    }
def get_ave_metrics(predictions, name):
    mse_train_list = []
    mae_train_list = []
    r_train_list = []
    mse_test_list = []
    mae_test_list = []
    r_test_list = []

    for ytr, tr_pred, yte, te_pred in zip(predictions['ytr'], predictions['tr_preds'],
                                          predictions['yte'], predictions['te_preds']):
        mse_train_list.append(mse(ytr, tr_pred))
        mae_train_list.append(mae(ytr, tr_pred))
        r_train_list.append(pearsonr(ytr, tr_pred)[0])
        mse_test_list.append(mse(yte, te_pred))
        mae_test_list.append(mae(yte, te_pred))
        r_test_list.append(pearsonr(yte, te_pred)[0])

    results = {'mse_train_ave': 0, 'mse_train_std': 0,
               'mae_train_ave': 0, 'mae_train_std': 0,
               'pearsonr_train_ave': 0, 'pearsonr_train_std': 0,
               'mse_test_ave': 0, 'mse_test_std': 0,
               'mae_test_ave': 0, 'mae_test_std': 0,
               'pearsonr_test_ave': 0, 'pearsonr_test_std': 0}

    results['mse_train_ave'] = np.average(mse_train_list)
    results['mse_train_std'] = np.std(mse_train_list)
    results['mae_train_ave'] = np.average(mae_train_list)
    results['mae_train_std'] = np.std(mae_train_list)
    r_train_list = np.array(r_train_list).reshape(-1)
    results['pearsonr_train_ave'] = np.average(r_train_list)
    results['pearsonr_train_std'] = np.std(r_train_list)
    results['mse_test_ave'] = np.average(mse_test_list)
    results['mse_test_std'] = np.std(mse_test_list)
    results['mae_test_ave'] = np.average(mae_test_list)
    results['mae_test_std'] = np.std(mae_test_list)
    r_test_list = np.array(r_test_list).reshape(-1)
    results['pearsonr_test_ave'] = np.average(r_test_list)
    results['pearsonr_test_std'] = np.std(r_test_list)

    return pd.DataFrame(results, index=[name]).T
def plot_ts(country):
    '''
    Plot y_true vs y_pred for the given country, using all_data and all_models.
    '''
    version_ = re.sub(r"\.", "_", str(MODEL_VERSION))
    all_data, all_models = pickle.load(
        open(os.path.join("models", f"all_data_model-{version_}.pickle"), "rb"))

    y_true = all_data[country]['y']
    y_pred = all_models[country].predict(all_data[country]['X'])
    all_dates = all_data[country]['dates']

    rmse_ = round(mse(y_true, y_pred, squared=False), 2)
    mae_ = round(mae(y_true, y_pred), 2)
    mape_ = round(mape(y_true, y_pred), 2)

    # Plotly alternative:
    # fig = go.Figure()
    # fig.add_trace(go.Scatter(x=all_dates, y=y_true, name='Actual Revenue'))
    # fig.add_trace(go.Scatter(x=all_dates, y=y_pred, name='Predicted Revenue'))
    # fig.update_layout(title=f"{country.replace('_', ' ').title()}: RMSE:{rmse_}, MAE:{mae_}, MAPE:{mape_}%",
    #                   yaxis_title="Revenue")
    # fig.show()

    plt.figure(figsize=(12, 4))
    plt.title(
        f"Model for {country.replace('_', ' ').title()}: RMSE:{rmse_}, MAE:{mae_}, MAPE:{mape_}%"
    )
    plt.plot(pd.to_datetime(all_dates), y_true, label='Actual Revenue')
    plt.plot(pd.to_datetime(all_dates), y_pred, label='Predicted Revenue')
    plt.legend()
    plt.show()
def model_evaluation_rdg(y_test, y_pred_rdg):
    from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse

    print("\n---- Ridge Regression - Model Evaluation ----")
    print("Mean Absolute Error (MAE): {}".format(mae(y_test, y_pred_rdg)))
    print("Mean Squared Error (MSE): {}".format(mse(y_test, y_pred_rdg)))
    print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(y_test, y_pred_rdg))))
def metric(actual, predicted):
    e_mse = mse(actual, predicted)
    e_mae = mae(actual, predicted)
    e_r2 = r2(actual, predicted)
    # AGM: average of RMSE and MAE, scaled by (1 - R2).
    e_agm = ((sqrt(e_mse) + e_mae) / 2) * (1 - e_r2)
    return e_mse, sqrt(e_mse), e_mae, e_r2, e_agm
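# Illustrative call to metric() above; it assumes mse, mae, and r2 are the
# sklearn metrics and sqrt comes from math, as the function body implies.
# The demo values are arbitrary.
actual_demo = [3.0, 5.0, 7.0, 9.0]
predicted_demo = [2.5, 5.5, 7.5, 8.0]
e_mse, e_rmse, e_mae, e_r2, e_agm = metric(actual_demo, predicted_demo)
print("RMSE=%.3f MAE=%.3f R2=%.3f AGM=%.3f" % (e_rmse, e_mae, e_r2, e_agm))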
def model_eval(y_data, x_data, model):
    y = []
    yhat = []
    for i in range(len(y_data)):
        y.append(y_data[i])
        yhat.append(float(model.predict([[x_data[i]]])))
    return mae(y, yhat)
def model_creation(data, labels, features):
    logging.info('=' * 40)
    X = data[features]
    best_maes = []
    for idx, label in enumerate(labels):
        logging.info('-' * 40)
        y = data[label]
        # The original format string expected a progress value as its second argument; supply it.
        logging.info('Beginning model testing for label {0}. {1}% Complete.'.format(
            label, round(100 * idx / len(labels))))
        # best_model = None
        best_model_mae = 999999999
        for i in range(25):  # previously 15
            train_X, val_X, train_y, val_y = tts(X, y)
            model = RandomForestRegressor()
            model.fit(train_X, train_y)
            val_predictions = model.predict(val_X)
            val_mae = mae(val_predictions, val_y)
            if val_mae < best_model_mae:
                # best_model = model
                best_model_mae = val_mae
                logging.info('**New best model achieved below. Iteration #{}'.format(i))
                logging.info('Validation MAE: {:,.2f}'.format(val_mae))
        best_maes.append(best_model_mae)
    return best_maes
def train_model(x_data, y_data, k=5):
    models = []
    scores = []
    k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
    for train_idx, val_idx in k_fold.split(x_data):
        x_train, y_train = x_data[train_idx, :], y_data[train_idx]
        x_val, y_val = x_data[val_idx, :], y_data[val_idx]

        d_train = lgbm.Dataset(data=x_train, label=y_train)
        d_val = lgbm.Dataset(data=x_val, label=y_val)

        params = {
            'n_estimators': 5000,
            'learning_rate': 0.8,
            'max_depth': 5,
            'boosting_type': 'dart',
            'drop_rate': 0.3,
            'objective': 'regression',
            'metric': 'mae',
            'is_training_metric': True,
            'num_leaves': 200,
            'colsample_bytree': 0.7,
            'subsample': 0.7
        }

        wlist = {'train': d_train, 'eval': d_val}
        model = lgbm.train(params=params, train_set=d_train,
                           valid_sets=d_val, evals_result=wlist)
        models.append(model)
        scores.append(mae(y_val, model.predict(x_val)))

    # Return the fold model with the lowest validation MAE.
    return models[np.argmin(scores)]
def calculate_metrics(target, pred, bins=0, returns=False):
    """Calculate the following metrics:
        * MAE
        * MAPE
        * Percentage of errors less than 30%

    Parameters
    ----------
    target : list or np.array
        array with answers
    pred : list or np.array
        array with predictions
    bins : int
        number of bins in the histogram; if 0 the histogram is not displayed
    returns : bool
        if True, the metrics are returned
    """
    mape = np.abs(target - pred) / target
    mae_val = mae(target, pred)
    perc = np.mean(mape < 0.3) * 100

    print('MAE: {:.4}'.format(mae_val))
    print('MAPE: {:.4}'.format(np.mean(mape)))
    print('Percentage of error less than 30%: {:.4}%'.format(perc))

    if bins:
        plt.figure(figsize=(8, 6))
        sns.distplot(mape, bins=bins)
        plt.title('MAPE hist')
        plt.show()

    if returns:
        return np.mean(mape), mae_val, perc
    return None
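# Hypothetical call to calculate_metrics() above. Targets must be non-zero
# (ideally positive) because MAPE divides by the target values; with bins=0
# no plotting libraries are needed. mae is assumed to be sklearn's
# mean_absolute_error, as in the function body.
import numpy as np

target_demo = np.array([100.0, 200.0, 300.0, 400.0])
pred_demo = np.array([110.0, 190.0, 330.0, 380.0])
mape_demo, mae_demo, perc_demo = calculate_metrics(target_demo, pred_demo, bins=0, returns=True)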
def QC_info(fg_hist, bg_hist, out_filename):
    """Compute QC metrics and print info to out_filename."""
    mean_abs_error = mae(fg_hist, bg_hist)
    chi_stat, chi_pval = power_div(fg_hist, bg_hist)
    gof_stat, gof_pval = power_div(fg_hist, bg_hist, "cressie-read")
    gte_stat, gte_pval = power_div(fg_hist, bg_hist, "log-likelihood")

    with open(out_filename, 'w') as stream:
        if sum(fg_hist) < 1000 or sum(bg_hist) < 1000:
            stream.write("QC tests cannot be ")
            stream.write("computed due to a small number of samples ")
            stream.write("(less than 1000).\n")
        elif chi_pval == -1 or gof_pval == -1:
            stream.write("QC tests cannot be ")
            stream.write("computed due to a large number of values ")
            stream.write("with low frequencies (more than ")
            stream.write("20% of values <=5).\n")
        else:
            stream.write("mean_absolute_error\t%f\n" % mean_abs_error)
            stream.write("chi-square(statistic, pvalue)\t(%f, %f)\n" % (chi_stat, chi_pval))
            stream.write("cressie-read goodness_of_fit(statistic, pvalue)")
            stream.write("\t(%f, %f)\n" % (gof_stat, gof_pval))
            stream.write("G-test goodness_of_fit(statistic, pvalue)")
            stream.write("\t(%f, %f)\n" % (gte_stat, gte_pval))
def eval_reg(y_test, predictions):
    '''
    Function: evaluates a regression model through its main metrics.
    '''
    print("### MEASURES OF REGRESSION MODEL ###")
    print("------------------------------------\n")
    print("R2 = {0:.4f}\n".format(r2_score(y_test, predictions)))  # R2
    print("RMSE = {0:.4f}\n".format(mse(y_test, predictions, squared=False)))  # Root Mean Squared Error
    print("MSE = {0:.4f}\n".format(mse(y_test, predictions, squared=True)))  # Mean Squared Error
    if len(predictions[predictions < 0]) > 0:
        print("MSLE cannot be applied: predictions contain negative values.\n")
    else:
        print("MSLE = {0:.4f}\n".format(msle(y_test, predictions)))  # Mean Squared Log Error
    print("MAE = {0:.4f}\n".format(mae(y_test, predictions)))  # Mean Absolute Error
    print("EVS = {0:.4%}\n".format(evs(y_test, predictions)))  # Explained Variance Score
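# Example call for eval_reg() above; predictions should be a numpy array so the
# negative-value check (predictions[predictions < 0]) works as written. The
# metric aliases (r2_score, mse, msle, mae, evs) are assumed imported elsewhere.
import numpy as np

y_demo = np.array([2.0, 4.0, 6.0])
pred_demo = np.array([2.5, 3.5, 6.5])
eval_reg(y_demo, pred_demo)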
def sklearn_acc(model, test_data, test_target):
    overall_results = model.predict(test_data)
    test_pred = (overall_results > 0.5).astype(int)
    acc_results = [mae(overall_results, test_target),
                   accuracy(test_pred, test_target),
                   f1_score(test_pred, test_target, average='macro')]
    return acc_results
def prediction_eval(prediction, real_data):
    '''
    This function computes and prints four different metrics (MAE, MSE, median absolute
    error, and R2) to evaluate the accuracy of the model.
    prediction and real_data need to have the same size.

    Parameters
    ----------
    prediction : array
        Predicted values.
    real_data : array
        Real data.

    Returns
    -------
    None.
    '''
    from sklearn.metrics import mean_absolute_error as mae
    from sklearn.metrics import mean_squared_error as mse
    from sklearn.metrics import median_absolute_error as medae
    from sklearn.metrics import r2_score as r2

    print("mean_absolute_error : ", mae(real_data, prediction))
    print("mean_squared_error : ", mse(real_data, prediction))
    print("median_absolute_error : ", medae(real_data, prediction))
    print("r2_score : ", r2(real_data, prediction))
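# A small sanity-check call for prediction_eval() above; both arrays are
# equal-length, as the docstring requires, and the values are illustrative only.
import numpy as np

real_demo = np.array([1.0, 2.0, 3.0, 4.0])
pred_demo = np.array([1.1, 1.9, 3.2, 3.8])
prediction_eval(pred_demo, real_demo)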
def bmae(pred, true, types):
    """Group-wise log-MAE: average of log(MAE) computed per type."""
    log_maes = []
    for utype in np.unique(types):
        mask = types == utype
        # Floor the per-type MAE to avoid log(0).
        utype_mae = np.max([mae(true[mask], pred[mask]), 1e-9])
        log_maes.append(np.log(utype_mae))
    return np.mean(log_maes)
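# Illustrative call to bmae() above with hypothetical type labels; assumes
# numpy and sklearn's mean_absolute_error (as mae) are imported.
import numpy as np

pred_demo = np.array([1.0, 2.0, 3.0, 4.0])
true_demo = np.array([1.1, 2.2, 2.9, 4.3])
types_demo = np.array(['typeA', 'typeA', 'typeB', 'typeB'])
print(bmae(pred_demo, true_demo, types_demo))  # mean of per-type log-MAE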
def testing(config_path: Text) -> None:
    config = yaml.safe_load(open(config_path))
    log_target = config["feature_transform"]["log_target"]
    eval_mae = config["test"]["mean_absolute_error"]
    eval_r2 = config["test"]["r2_score"]

    model = load_pickle("stages/model.pkl")
    X_test = pd.read_csv("stages/X_test.csv")
    y_test = pd.read_csv("stages/y_test.csv").iloc[:, 0]

    y_pred = model.predict(X_test)
    if log_target:
        # Undo the log transform so metrics are reported on the original scale.
        y_pred = np.exp(y_pred)
        y_test = np.exp(y_test)

    metrics = {}
    if eval_mae:
        metrics.update({"mean_absolute_error": mae(y_test, y_pred)})
    if eval_r2:
        metrics.update({"R2": r2_score(y_test, y_pred)})
    # metrics.append({"score": "R2", "value": r2_score(y_test, y_pred)})
    # pd.DataFrame(metrics).to_csv("stages/metrics.csv", index=False)

    json.dump(obj=metrics, fp=open("stages/metrics.json", "w"))
def try_fit_predict_RandomForest(train_df, test_df, index_df, savename):
    X_data = train_df.drop(['scalar_coupling_constant'], axis=1).values.astype('float32')
    y_data = train_df['scalar_coupling_constant'].values.astype('float32')
    test_feature = test_df

    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data,
                                                        test_size=0.33, random_state=128)

    # Prediction with a grid-searched RandomForestRegressor
    # (the original comment referred to LGBMRegressor).
    params = {'n_estimators': [500], 'n_jobs': [-1]}
    forest = RandomForestRegressor()
    model = GridSearchCV(forest, params, cv=3)

    print('Start Fitting')
    model.fit(X_train, y_train)

    print('Start Getting Mae')
    prediction_rf_mae = model.predict(X_test)
    Err = mae(y_test, prediction_rf_mae)
    acc_dic[savename] = Err

    print('Start Predicting')
    prediction_rf = model.predict(test_feature)
    index_df['scalar_coupling_constant'] = prediction_rf
    csv_title = 'result_' + savename + '.csv'
    index_df.to_csv(csv_title)

    return prediction_rf
def writing_results(self):
    self.result_file_name = f"{sys.argv[2][0:20]}_results.txt"
    self.result_file = open(self.result_file_name, 'w+', encoding='utf-8')

    print("\nResults given in [ppm]:\n")
    header = "Hydrogen\t%s\t%s\t%10s\t%s\n" % (
        u'Theoretical', '\tExperimental', '\tError', '\tRelative error')
    print(header)

    for i in range(len(self.computedPeaks)):
        print(u"%dH\t%19.4f\t%23.4f\t%10.4f\t%13.4f" %
              (self.atom_numbers[i], self.computedPeaks[i], self.empiricalPeaks[i],
               self.empiricalPeaks[i] - self.computedPeaks[i],
               (self.empiricalPeaks[i] - self.computedPeaks[i]) / self.computedPeaks[i]))
        self.result_file.write(
            u"%dH\t%19.4f\t%23.4f\t%10.4f\t%13.4f\n" %
            (self.atom_numbers[i], self.computedPeaks[i], self.empiricalPeaks[i],
             self.empiricalPeaks[i] - self.computedPeaks[i],
             (self.empiricalPeaks[i] - self.computedPeaks[i]) / self.computedPeaks[i]))

    from sklearn.metrics import mean_absolute_error as mae
    MAE = mae(self.empiricalPeaks, self.computedPeaks)
    print(f"MAE: {MAE} ppm")
    self.result_file.close()
def model_scores(model=None, X_train=None, X_test=None, y_train=None, y_test=None,
                 target_scaler=None, scale_target=True):
    # Copy the data so the originals are not modified.
    X_train_copy, X_test_copy, y_train_copy, y_test_copy = \
        X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()

    # Compute predictions for the scaled or unscaled target variable.
    if scale_target:
        model.fit(X_train_copy, y_train_copy[['target_sc']])
        y_test_copy['predict_sc'] = model.predict(X_test_copy)
        y_test_copy['prediction'] = target_scaler.inverse_transform(y_test_copy['predict_sc'])
    else:
        model.fit(X_train_copy, y_train_copy[target])
        y_test_copy['prediction'] = model.predict(X_test_copy)

    # Compute model quality metrics.
    mape_score = np.mean(np.abs((y_test_copy[target] - y_test_copy['prediction'])
                                / y_test_copy[target])) * 100
    mae_score = mae(y_test_copy[target], y_test_copy['prediction'])
    mse_score = mse(y_test_copy[target], y_test_copy['prediction'], squared=False)  # RMSE
    R2_score = model.score(X_test_copy, y_test_copy['target_sc'])

    return model, mape_score, mae_score, mse_score, R2_score
def model_performance(X_train, X_test, y_train, y_test):
    models = [
        GaussianNB(),
        KNeighborsClassifier(),
        SGDClassifier(),
        BaggingClassifier(),
        DecisionTreeClassifier(),
        LinearSVC(penalty="l1", dual=False),
        SVC()
    ]

    for model in models:
        model.fit(X_train, y_train)
        print(model)
        print('')

        expected = y_test
        predicted = model.predict(X_test)

        # Evaluate the fit of the model
        print("Mean Squared Error: %0.6f" % mse(expected, predicted))
        print("Mean Absolute Error: %0.6f" % mae(expected, predicted))
        print("Coefficient of Determination: %0.6f" % model.score(X_test, y_test))
        print('')
def find_accurracy_on_testset(self, model, X_test, Y_test, clip=False, plot=True):
    results = model.predict(X_test)
    print("-----------------------------------------------------------")
    # Report R2 directly; taking sqrt(R2) is misleading and fails for negative R2.
    print("MSE: " + str(mse(Y_test, results)),
          "MAE: " + str(mae(Y_test, results)),
          "R2: " + str(r2(Y_test, results)))
    print("-----------------------------------------------------------")

    if plot:
        if clip:
            fig, ax = plt.subplots(figsize=(16, 5))
            ax.plot(Y_test.values[0:100], label='True Value')
            ax.plot(results[0:100], label='Predicted Value')
            ax.set_xticks([])
            ax.legend()
            plt.show()
        else:
            fig, ax = plt.subplots(figsize=(16, 5))
            ax.plot(Y_test.values, label='True Value')
            ax.plot(results, label='Predicted Value')
            ax.set_xticks([])
            ax.legend()
            plt.show()

    return None
def run_model(model, X, y, plot=False, save_fig=None):
    models = {
        'lm': linear_model.LinearRegression(),
        'lasso': linear_model.LassoCV(**{'n_jobs': 4, 'n_alphas': 5.0, 'eps': 0.0005,
                                         'max_iter': 5500, 'cv': 10}),
        'lasso_no_CV': linear_model.Lasso(**{'alpha': 0.00088920370018917083}),
        'rf': ensemble.RandomForestRegressor(**rf_params_co2_no),
        'poly2': Pipeline([('poly', PolynomialFeatures(degree=2)),
                           ('linear', linear_model.LinearRegression(fit_intercept=False))]),
        'xgb': xgb.XGBRegressor(**xgb_params_even_larger),
        'svr': SVR(**svr_params),
    }
    estimator = models[model]

    X, y = np.asarray(X), np.asarray(y)
    estimator.fit(X, y)
    predictions = estimator.predict(X)
    MAE = mae(predictions, y)
    print(model + " train error: " + str(MAE))

    if plot:
        plot_parity(y, predictions, save_fig=save_fig)

    return estimator, y, predictions, MAE
# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation

X, x_test, y, y_test = cross_validation.train_test_split(X, y)

reg = DecisionTreeRegressor()
reg.fit(X, y)
mae_dt = mae(reg.predict(x_test), y_test)
print "Decision Tree mean absolute error: {:.2f}".format(mae_dt)

reg = LinearRegression()
reg.fit(X, y)
# Evaluate on the held-out test set, as for the decision tree above.
mae_ln = mae(reg.predict(x_test), y_test)
print "Linear regression mean absolute error: {:.2f}".format(mae_ln)

results = {
    "Linear Regression": mae_ln,
    "Decision Tree": mae_dt
}
X_all = np.asarray(pickle.load(open('datasets/data_' + dataset + '.pckl', 'r')))
y_all = np.asarray(pickle.load(open('datasets/energetics_' + dataset + '.pckl', 'r')))

parplot = True
cv = 5

X_all, y_all = shuffle(X_all, y_all, random_state=42)
X_all, y_all = np.asarray(X_all), np.asarray(y_all)
X, X_test, y, y_test = cross_validation.train_test_split(X_all, y_all, test_size=0.1, random_state=42)

for model in models:
    regressor, y_true, y_pred, MAE_val = run_model(
        model, X, y, plot=parplot,
        save_fig="Results/" + model + "_" + dataset + "_parity.pdf")

    # Keep the test-set predictions in their own variable so the training
    # predictions returned by run_model are still available for outlier removal.
    y_pred_test = regressor.predict(X_test)
    MAE = mae(y_pred_test, y_test)
    print(model + " test error: " + str(MAE))

    # Flag training points whose error exceeds three times the training MAE.
    outliers = []
    for i, (true, pred) in enumerate(zip(y_true, y_pred)):
        error = np.abs(true - pred)
        if error > MAE * 3.0:
            outliers.append(i)
    pickle.dump((outliers, X, y), open("Results/outliers_" + model + "_" + dataset + ".pckl", "wb"))

    print "removing " + str(len(outliers)) + " outliers ..."
    X = np.asarray([value for (i, value) in enumerate(X) if i not in set(outliers)])
    y = np.asarray([value for (i, value) in enumerate(y) if i not in set(outliers)])
    print str(len(y)) + " total samples, " + str(len(y_test)) + " test samples"

    regressor_new, y_true, y_pred, MAE_val = run_model(
        model, X, y, plot=parplot,
        save_fig="Results/" + model + "_" + dataset + "_parity_less_outliers.pdf")
X, y = list(zip(*examples))
X = np.array(X)
y = np.array(y)
del examples

kf = KFold(n=len(X), n_folds=5, shuffle=True, random_state=np.random)
train_index, test_index = next(iter(kf))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
del X
del y

print("fitting model...")
mlp.fit(X_train, y_train)

print("scoring model...")
# print("predicted:", mlp.predict(X_test))
# print("actual:", y_test)
print("R^2 score =", mlp.score(X_test, y_test))
y_pred = mlp.predict(X_test)
print("MSE score =", mse(y_pred, y_test))
print("MAE score =", mae(y_pred, y_test))
print("accuracy_score =", accuracy_score([[round(y[0])] for y in y_pred], y_test))

fn = os.path.join(settings['data-base'], 'nn_tanh3.pickle')
pickle.dump(mlp, open(fn, 'wb'))
def evaluate_prediction(model, X, y):
    y_pred = model.predict(X)
    return "MAE: %.4f" % mae(y, y_pred), "MSE: %.4f" % mse(y, y_pred)
                 n_stable=10,
                 verbose=True)

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# print("fitting model...")
mlp.fit(X_train, y_train)

# print("scoring model...")
# print("predicted:", mlp.predict(X_test))
# print("actual:", y_test)
r2s.append(mlp.score(X_test, y_test))
y_pred = mlp.predict(X_test)
mses.append(mse(y_pred, y_test))
mae_score = mae(y_pred, y_test)
maes.append(mae_score)
# print("MAE score =", mae_score)
accs.append(accuracy_score([[round(y[0])] for y in y_pred], y_test))

mean_mae = np.mean(maes)
mean_mse = np.mean(mses)
mean_r2 = np.mean(r2s)
mean_acc = np.mean(accs)

model = (mean_mse, mean_mae, mean_r2, mean_acc, dropout_rate, regularize,
         learning_rule, kernel_shape[0], kernel_shape[1])
if insert_model(model, mean_r2):
    print("**")

for e in best_models:
    if e is None:
# Prepare the data as features and labels.
features = X
labels = y

# Split the data into training and testing sets
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features, labels, test_size=0.4, random_state=0)

# Create the decision tree regressor object
reg1 = DecisionTreeRegressor()
# Train the decision tree regressor on the training split (features_train, labels_train)
reg1.fit(features_train, labels_train)
# Get the decision tree regressor's mean absolute error, dtr_mae, on the test split
dtr_mae = mae(labels_test, reg1.predict(features_test))
print "Decision Tree mean absolute error: {:.2f}".format(dtr_mae)

# Create the linear regression regressor object
reg2 = LinearRegression()
# Train the linear regression regressor on the training split (features_train, labels_train)
reg2.fit(features_train, labels_train)
# Get the linear regression regressor's mean absolute error, lr_mae, on the test split
lr_mae = mae(labels_test, reg2.predict(features_test))
print "Linear regression mean absolute error: {:.2f}".format(lr_mae)
# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation

# train_test_split(X, y) returns the feature splits first, then the label splits,
# so name the outputs accordingly.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0)

reg1 = DecisionTreeRegressor()
reg1.fit(features_train, labels_train)
print "Decision Tree mean absolute error: {:.2f}".format(mae(labels_test, reg1.predict(features_test)))

reg2 = LinearRegression()
reg2.fit(features_train, labels_train)
print "Linear regression mean absolute error: {:.2f}".format(mae(labels_test, reg2.predict(features_test)))

results = {
    "Linear Regression": mae(labels_test, reg2.predict(features_test)),
    "Decision Tree": mae(labels_test, reg1.predict(features_test))
}