Example #1
def mean_absolute_scaled_error_year_avg(joined_data,
                                        historical_data,
                                        weather_variable,
                                        years_back=19):
    """
    This function considers as "last period" the avg temperature, on the same date at the same time, over the years present in 
    the historical data (currently up to 19 years for London)

    In case the historical data does not contain the data point for any of the datetimes considered, the mae is not calculated
    and np.nan is returned

    TODO: instead of stopping the entire calculation, just discard the offending data point and continue calculation with rest of data
    """
    def previous_years_avg(dt):
        date_time = datetime.fromtimestamp(dt, tz=timezone.utc)
        return np.average([
            historical_data[weather_variable][n_years_ago(date_time, n)]
            for n in range(1, years_back + 1)
        ])

    try:
        joined_data_without_29_feb = remove_29_feb(joined_data)
        naive_prediction = [
            previous_years_avg(dt) for dt in joined_data_without_29_feb['dt']
        ]
        return [
            mae(joined_data_without_29_feb[weather_variable],
                joined_data_without_29_feb[f't{i}']) /
            mae(joined_data_without_29_feb[weather_variable], naive_prediction)
            for i in range(5, 0, -1)
        ]
    except KeyError as err:
        print(f"{err} not found in historical data")
        return np.nan
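
A minimal sketch of the TODO above: instead of aborting on a missing historical point, skip that datetime and score the remaining rows. It assumes the same helpers and data layout as the snippet (n_years_ago, historical_data indexed by datetime); previous_years_avg_or_none is a hypothetical name.

def previous_years_avg_or_none(dt, historical_data, weather_variable, years_back):
    # Returns None instead of raising when any year's data point is missing.
    date_time = datetime.fromtimestamp(dt, tz=timezone.utc)
    try:
        return np.average([
            historical_data[weather_variable][n_years_ago(date_time, n)]
            for n in range(1, years_back + 1)
        ])
    except KeyError:
        return None

# In the caller, keep only rows whose naive prediction exists:
# kept = [(obs, pred) for obs, pred in zip(observed, naive_prediction) if pred is not None]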
Example #2
def get_results_from_models(model, noises):
    results = [["noises"] + noises, ["mae_alpha"], ["stdae_alpha"],
               ["mae_beta"], ["stdae_beta"], ["r_alpha"], ["r_beta"]]
    for noise in noises:
        x_valid, y_valid = generate_synthetic_validation_data(noise)
        x_valid = x_valid.reshape(x_valid.shape[0], x_valid.shape[3],
                                  x_valid.shape[2], x_valid.shape[1])
        # load weights into new model
        model.load_weights("Weights/BrainCNNWeights_noise_" + str(noise) +
                           ".h5")
        print("Loaded model from disk")
        preds = model.predict(x_valid)
        results[1].append("{0:.2f}".format(100 * mae(preds[:, 0], y_valid[:, 0])))
        results[2].append("{0:.2f}".format(100 * std(abs(y_valid[:, 0] - preds[:, 0]))))
        results[3].append("{0:.2f}".format(100 * mae(preds[:, 1], y_valid[:, 1])))
        results[4].append("{0:.2f}".format(100 * std(abs(y_valid[:, 1] - preds[:, 1]))))
        # pearsonr returns (r, p-value); keep only the correlation coefficient
        results[5].append("{0:.2f}".format(pearsonr(preds[:, 0], y_valid[:, 0])[0]))
        results[6].append("{0:.2f}".format(pearsonr(preds[:, 1], y_valid[:, 1])[0]))
    display(HTML(tabulate.tabulate(results, tablefmt='html')))
Example #3
    def predict(self, X, treatment=None, y=None):
        """Predict treatment effects.

        Args:
            X (np.matrix): a feature matrix
            treatment (np.array): a treatment vector
            y (np.array, optional): an optional outcome vector

        Returns:
            (numpy.ndarray): Predictions of treatment effects.
        """
        yhat_c = self.model_c.predict(X)
        yhat_t = self.model_t.predict(X)

        if (y is not None) and (treatment is not None):
            is_treatment = treatment != self.control_name
            logger.info('RMSE (Control): {:.6f}'.format(
                np.sqrt(mse(y[~is_treatment], yhat_c[~is_treatment]))))
            logger.info(' MAE (Control): {:.6f}'.format(
                mae(y[~is_treatment], yhat_c[~is_treatment])))
            logger.info('RMSE (Treatment): {:.6f}'.format(
                np.sqrt(mse(y[is_treatment], yhat_t[is_treatment]))))
            logger.info(' MAE (Treatment): {:.6f}'.format(
                mae(y[is_treatment], yhat_t[is_treatment])))

        return (yhat_t - yhat_c).reshape(-1, 1)
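
A hedged usage sketch for the two-model predict above; the fitted learner and the toy arrays are assumptions, not part of the original code.

import numpy as np

# `learner` is assumed to be a fitted instance of the class above,
# with control_name='control'.
X = np.random.rand(100, 5)
treatment = np.random.choice(['control', 'treatment'], size=100)
y = np.random.rand(100)

te = learner.predict(X, treatment=treatment, y=y)  # also logs per-arm RMSE/MAE
print(te.shape)  # (100, 1): one treatment-effect estimate per row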
Example #4
def eval(model, data, set_name, denorm_predictions=True):
    # predictions
    predictions = model.predict(data.dataset(set_name))
    labels = data.raw_data(set_name)["labels"][:len(predictions)]
    predictions = pd.DataFrame(data=predictions,
                               index=labels.index,
                               columns=labels.columns)

    if denorm_predictions:
        predictions = data.denormalize_labels(predictions)

    # results
    results = {
        "general": {
            "mae": mae(labels, predictions),
            "mape": mape(labels, predictions),
            "mse": mse(labels, predictions)
        }
    }

    for col in labels.columns:
        results[col] = {
            "mae": mae(labels[col], predictions[col]),
            "mape": mape(labels[col], predictions[col]),
            "mse": mse(labels[col], predictions[col]),
            "tend_acc": tendency_accuracy(labels[col], predictions[col])
        }

    return predictions, results
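
tendency_accuracy is not defined in this snippet; a plausible implementation, assuming it measures how often the predicted direction of change matches the true one:

import numpy as np

def tendency_accuracy(labels, predictions):
    # Assumed semantics: fraction of consecutive steps where the predicted
    # change has the same sign as the true change.
    true_dir = np.sign(np.diff(np.asarray(labels, dtype=float)))
    pred_dir = np.sign(np.diff(np.asarray(predictions, dtype=float)))
    return float(np.mean(true_dir == pred_dir))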
Example #5
def test_automl():
    X, y = make_regression(n_samples=N_OBS,
                           n_features=N_FEATURE,
                           n_informative=N_IMP_FEATURE,
                           random_state=RANDOM_SEED)
    X = pd.DataFrame(X, columns=['x{}'.format(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    logging.info(f'X dim: {X.shape}, y dim: {y.shape}')

    X_trn, X_tst, y_trn, y_tst = train_test_split(X,
                                                  y,
                                                  test_size=.2,
                                                  random_state=RANDOM_SEED)

    model = AutoLGB(objective='regression', metric='l1')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) +
         y_trn.min())
    logging.info(f'MAE (LGB): {mae(y_tst, p):.4f}')
    assert mae(y_tst, p) < mae(y_tst, r)

    model = AutoXGB(objective='reg:linear', metric='rmse')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) +
         y_trn.min())
    logging.info(f'MAE (XGB): {mae(y_tst, p):.4f}')
    assert mae(y_tst, p) < mae(y_tst, r)
Example #6
def main():
    w2v_sim = sim('w2v_vecs.npy')
    w2vs_sim = sim('w2vs_vecs.npy')

    data = np.loadtxt('data/wordsim353/combined.csv',
                      skiprows=1,
                      delimiter=',',
                      dtype=str)  # np.str was removed in NumPy 1.24; use the builtin

    comps = 0
    w2v_sims = []
    w2vs_sims = []
    gt_sims = []
    for w1, w2, gt_sim in data:
        if w1 in w2v_sim and w2 in w2v_sim and w1 in w2vs_sim and w2 in w2vs_sim:
            comps += 1
            w2v_sims.append(w2v_sim(w1, w2))
            w2vs_sims.append(w2vs_sim(w1, w2))
            gt_sims.append(float(gt_sim) / 10)

    print('word2vec mse:', mse(w2v_sims, gt_sims))
    print('word2vecS mse:', mse(w2vs_sims, gt_sims))

    print('word2vec mae:', mae(w2v_sims, gt_sims))
    print('word2vecS mae:', mae(w2vs_sims, gt_sims))

    print(comps, 'comparisons out of 353')
Example #7
    def predict(self, X, treatment, y=None):
        """Predict treatment effects.

        Args:
            X (np.matrix): a feature matrix
            treatment (np.array): a treatment vector
            y (np.array, optional): an outcome vector

        Returns:
            (numpy.ndarray): Predictions of treatment effects.
        """
        is_treatment = treatment != self.control_name
        w = is_treatment.astype(int)

        X = np.hstack((w.reshape((-1, 1)), X))

        X[:, 0] = 0  # set the treatment column to zero (the control group)
        yhat_c = self.model.predict(X)

        X[:, 0] = 1  # set the treatment column to one (the treatment group)
        yhat_t = self.model.predict(X)

        if y is not None:
            logger.info('RMSE (Control): {:.6f}'.format(
                np.sqrt(mse(y[~is_treatment], yhat_c[~is_treatment]))))
            logger.info(' MAE (Control): {:.6f}'.format(
                mae(y[~is_treatment], yhat_c[~is_treatment])))
            logger.info('RMSE (Treatment): {:.6f}'.format(
                np.sqrt(mse(y[is_treatment], yhat_t[is_treatment]))))
            logger.info(' MAE (Treatment): {:.6f}'.format(
                mae(y[is_treatment], yhat_t[is_treatment])))

        return (yhat_t - yhat_c).reshape(-1, 1)
Example #8
def evaluate(df, num_points, test=False):
    print('\n ----------------- MODEL EVALUATION ----------------- \n')

    df = df.fillna(0)  # fillna returns a new frame; the result must be assigned

    open_true = df['open_next_day']
    open_pred = df['pred_open_next_day']
    close_true = df['close_next_day']
    close_pred = df['pred_close_next_day']

    if test:
        open_true = open_true[:-1]
        open_pred = open_pred[:-1]
        close_true = close_true[:-1]
        close_pred = close_pred[:-1]

    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))

    ax[0, 0].plot(open_true[-num_points:], open_pred[-num_points:], 'go')
    ax[0, 0].set_title('Open')

    ax[0, 1].plot(close_true[-num_points:], close_pred[-num_points:], 'r^')
    ax[0, 1].set_title('Close')

    ax[1, 0].plot(open_true[-num_points:])
    ax[1, 0].plot(open_pred[-num_points:])
    ax[1, 0].legend(['true', 'prediction'])

    ax[1, 1].plot(close_true[-num_points:])
    ax[1, 1].plot(close_pred[-num_points:])
    ax[1, 1].legend(['true', 'prediction'])

    fig.suptitle('Model Price Predictions')
    plt.show()
    plt.close()

    mae_open = mae(open_true, open_pred)
    mae_close = mae(close_true, close_pred)

    mse_open = mse(open_true, open_pred)
    mse_close = mse(close_true, close_pred)

    r2_open = r2(open_true, open_pred)
    r2_close = r2(close_true, close_pred)

    print('OPEN PRICES')
    print('\t Mean Absolute Error: {}'.format(mae_open))
    print('\t Mean Squared Error: {}'.format(mse_open))
    print('\t R2 Score: {}'.format(r2_open))

    print('CLOSE PRICES')
    print('\t Mean Absolute Error: {}'.format(mae_close))
    print('\t Mean Squared Error: {}'.format(mse_close))
    print('\t R2 Score: {}'.format(r2_close))
    print('')
Example #9
    def plot_lin_regr(self, X_dataset, y_dataset, X_axis, y_axis, color, label, percentual_X=False, percentual_y=False, y_axis_min=None, y_axis_max=None):
        if percentual_X:
            X = X_dataset * 100
        else:
            X = X_dataset

        X = X.reshape(len(X), 1)
        X_lr = X.reshape(-1, bac.NUM_RUNS)[:,:bac.NUM_SAMPLES].reshape(-1, 1) # Samples used to estimate the Linear Regression

        X_line = [[i] for i in range(0, self.x_max + bac.X_MAX_PADDING)] # Used to plot stretched estimated line

        if percentual_y:
            y = y_dataset * 100
        else:
            y = y_dataset

        y = y.reshape(len(y), 1)
        y_lr = y.reshape(-1, bac.NUM_RUNS)[:,:bac.NUM_SAMPLES].reshape(-1, 1) # Samples used to estimate the Linear Regression

        regr = lm.LinearRegression()
        regr.fit(X_lr, y_lr)

        if y_axis == 1: # Use secondary y axis on X_axis
            tmp_plot, = self.ax2[X_axis].plot(X_line, regr.predict(X_line), color=color, linewidth=2, label=label)
            print(label + " R^2: " + str(regr.score(X, y)))
            print(label + " MAE: " + str(mae(y, regr.predict(X))))
            if y_axis_min is not None:
                self.y_axis_min_values[self.ax2[X_axis]] = y_axis_min
            if y_axis_max is not None:
                self.y_axis_max_values[self.ax2[X_axis]] = y_axis_max
        else: # Use primary y axis on X_axis
            if isinstance(self.axarr, np.ndarray):
                tmp_plot, = self.axarr[X_axis].plot(X_line, regr.predict(X_line), color=color, linewidth=2, label=label)
                print(label + " R^2: " + str(regr.score(X, y)))
                print(label + " MAE: " + str(mae(y, regr.predict(X))))
                if y_axis_min is not None:
                    self.y_axis_min_values[self.axarr[X_axis]] = y_axis_min
                if y_axis_max is not None:
                    self.y_axis_max_values[self.axarr[X_axis]] = y_axis_max
            else:
                tmp_plot, = self.axarr.plot(X_line, regr.predict(X_line), color=color, linewidth=2, label=label)
                print(label + " R^2: " + str(regr.score(X, y)))
                print(label + " MAE: " + str(mae(y, regr.predict(X))))
                if y_axis_min is not None:
                    self.y_axis_min_values[self.axarr] = y_axis_min
                if y_axis_max is not None:
                    self.y_axis_max_values[self.axarr] = y_axis_max

        self.plots.append(tmp_plot)
Example #10
    def predict(self,
                X,
                treatment=None,
                y=None,
                return_components=False,
                verbose=True):
        """Predict treatment effects.

        Args:
            X (np.matrix): a feature matrix
            treatment (np.array): a treatment vector
            y (np.array, optional): an optional outcome vector

        Returns:
            (numpy.ndarray): Predictions of treatment effects.
        """
        yhat_cs = {}
        yhat_ts = {}

        for group in self.t_groups:
            w = (treatment != group).astype(int)
            X_new = np.hstack((w.reshape((-1, 1)), X))

            model_c = self.models_c[group]
            model_t = self.models_t[group]
            yhat_cs[group] = model_c.predict(X_new)
            yhat_ts[group] = model_t.predict(X_new)

        if (y is not None) and (treatment is not None) and verbose:
            for group in self.t_groups:
                logger.info('Error metrics for {}'.format(group))
                logger.info('RMSE (Control): {:.6f}'.format(
                    np.sqrt(
                        mse(y[treatment != group],
                            yhat_cs[group][treatment != group]))))
                logger.info(' MAE (Control): {:.6f}'.format(
                    mae(y[treatment != group],
                        yhat_cs[group][treatment != group])))
                logger.info('RMSE (Treatment): {:.6f}'.format(
                    np.sqrt(
                        mse(y[treatment == group],
                            yhat_ts[group][treatment == group]))))
                logger.info(' MAE (Treatment): {:.6f}'.format(
                    mae(y[treatment == group],
                        yhat_ts[group][treatment == group])))

        te = np.zeros((X.shape[0], self.t_groups.shape[0]))
        for i, group in enumerate(self.t_groups):
            te[:, i] = yhat_ts[group] - yhat_cs[group]

        if not return_components:
            return te
        else:
            return te, yhat_cs, yhat_ts
Example #11
def mase(y_pred, y_true, method='naive', X_test=None, constant=None):
    """
    Mean absolute scaled error. MAE error of your predictions, normalized by
    MAE error of different methods predictions.

    Parameters
    -----------
    y_pred : sequence
        Predictions you want to compare to with different methods.
    y_true: sequence
        True values
    method: {'naive', 'exp_smooth', 'mean', 'median', 'constant'}
        The method used to generate y_method which is predictions to compare to
        predictions of your method
    X_test: pd.DataFrame, optional
        Must be provided when using all methods but naive and constant
    constant: int, optional
        Must be provided if method arg is set to constant

    Returns
    --------
    mase_score : float
        The score, computed as mae(y_true, y_pred) / mae(y_true, y_method).
        For example, if method is 'naive' and the MASE score is 0.25, your
        method is four times more accurate than the naive one; scores below
        1 beat the baseline.
    """

    y_method = y_pred
    if method == 'naive':
        y_method = y_true.shift()
        y_method.fillna(y_method.mean(), inplace=True)
    if method != 'naive':
        if X_test is None:
            raise ValueError('X_test must be provided to evaluate this method')
        X_test.drop([label for label in X_test.columns if 'lag_' in label],
                    inplace=True,
                    axis=1)
    if method == 'exp_smooth':
        num_lags = len(X_test.columns)
        y_method = [
            hw.additive(list(lags[1].values), num_lags, 1)[0][0]
            for lags in X_test.iterrows()
        ]
    if method == 'mean':
        y_method = X_test.mean(axis=1).values
    if method == 'median':
        y_method = X_test.median(axis=1).values
    if method == 'constant':
        y_method = np.full(y_true.shape, constant)
    return mae(y_true, y_pred) / mae(y_true,
                                     y_method)  # todo fix division by zero
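
A usage sketch for mase with the 'naive' method; the series values are illustrative only, and y_true must be a pandas Series so that .shift() works.

import pandas as pd

y_true = pd.Series([10.0, 12.0, 13.0, 12.0, 15.0])
y_pred = pd.Series([11.0, 12.5, 12.0, 13.0, 14.0])

score = mase(y_pred, y_true, method='naive')
print(score)  # a score below 1 means the model beats the one-step naive forecast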
Example #12
def get_results(y_test, y_pred):

    y_test_spike, y_pred_spike, y_test_normal, y_pred_normal = split_regions(
        y_test, y_pred)

    return {
        "rmse_general": mse(y_test, y_pred, squared=False),
        "mae_general": mae(y_test, y_pred),
        "rmse_spike": mse(y_test_spike, y_pred_spike, squared=False),
        "mae_spike": mae(y_test_spike, y_pred_spike),
        "rmse_normal": mse(y_test_normal, y_pred_normal, squared=False),
        "mae_normal": mae(y_test_normal, y_pred_normal),
    }
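
split_regions is not defined in this snippet; a plausible stand-in that separates spike points from normal ones by a quantile threshold (the 0.95 cutoff is an assumption):

import numpy as np

def split_regions(y_test, y_pred, quantile=0.95):
    # Hypothetical: true values above the 95th percentile count as spikes.
    y_test, y_pred = np.asarray(y_test), np.asarray(y_pred)
    spike = y_test > np.quantile(y_test, quantile)
    return y_test[spike], y_pred[spike], y_test[~spike], y_pred[~spike]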
Example #13
def get_ave_metrics(predictions, name):
    mse_train_list = []
    mae_train_list = []
    r_train_list = []

    mse_test_list = []
    mae_test_list = []
    r_test_list = []

    for ytr, tr_pred, yte, te_pred in zip(predictions['ytr'], predictions['tr_preds'],
                                          predictions['yte'], predictions['te_preds']):
        mse_train_list.append(mse(ytr, tr_pred))
        mae_train_list.append(mae(ytr, tr_pred))
        r_train_list.append(pearsonr(ytr, tr_pred)[0])

        mse_test_list.append(mse(yte, te_pred))
        mae_test_list.append(mae(yte, te_pred))
        r_test_list.append(pearsonr(yte, te_pred)[0])

    r_train_list = np.array(r_train_list).reshape(-1)
    r_test_list = np.array(r_test_list).reshape(-1)

    results = {'mse_train_ave': np.average(mse_train_list),
               'mse_train_std': np.std(mse_train_list),
               'mae_train_ave': np.average(mae_train_list),
               'mae_train_std': np.std(mae_train_list),
               'pearsonr_train_ave': np.average(r_train_list),
               'pearsonr_train_std': np.std(r_train_list),
               'mse_test_ave': np.average(mse_test_list),
               'mse_test_std': np.std(mse_test_list),
               'mae_test_ave': np.average(mae_test_list),
               'mae_test_std': np.std(mae_test_list),
               'pearsonr_test_ave': np.average(r_test_list),
               'pearsonr_test_std': np.std(r_test_list)}

    return pd.DataFrame(results, index=[name]).T
Example #14
def plot_ts(country):
    '''
    plot out the y_true vs y_pred, given the country, all_data, and all_models
    '''
    version_ = re.sub(r"\.", "_", str(MODEL_VERSION))
    all_data, all_models = pickle.load(
        open(os.path.join("models", f"all_data_model-{version_}.pickle"),
             "rb"))
    y_true = all_data[country]['y']
    y_pred = all_models[country].predict(all_data[country]['X'])
    all_dates = all_data[country]['dates']
    rmse_ = round(mse(y_true, y_pred, squared=False), 2)
    mae_ = round(mae(y_true, y_pred), 2)
    mape_ = round(mape(y_true, y_pred), 2)

    # fig = go.Figure()
    # fig.add_trace(go.Scatter(x=all_dates, y=y_true, name='Actual Revenue'))
    # fig.add_trace(go.Scatter(x=all_dates, y=y_pred, name='Predicted Revenue'))
    #
    # fig.update_layout(title=f"{country.replace('_',' ').title()}: RMSE:{rmse_}, MAE:{mae_}, MAPE:{mape_}%",
    #                   yaxis_title="Revenue")
    # fig.show()
    plt.figure(figsize=(12, 4))
    plt.title(
        f"Model for {country.replace('_',' ').title()}: RMSE:{rmse_}, MAE:{mae_}, MAPE:{mape_}%"
    )
    plt.plot(pd.to_datetime(all_dates), y_true, label='Actual Revenue')
    plt.plot(pd.to_datetime(all_dates), y_pred, label='Predicted Revenue')
    plt.legend()
    plt.show()
Example #15
def model_evaluation_rdg(y_test, y_pred_rdg):
    from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse
    print("\n---- Ridge Regression - Model Evaluation ----")
    print("Mean Absolute Error (MAE): {}".format(mae(y_test, y_pred_rdg)))
    print("Mean Squared Error (MSE): {}".format(mse(y_test, y_pred_rdg)))
    print("Root Mean Squared Error (RMSE): {}".format(
        np.sqrt(mse(y_test, y_pred_rdg))))
Example #16
def metric(actual, predicted):
    e_mse = mse(actual, predicted)
    e_mae = mae(actual, predicted)
    e_r2 = r2(actual, predicted)
    e_agm = ((sqrt(e_mse) + e_mae) / 2) * (1 - e_r2)

    return e_mse, sqrt(e_mse), e_mae, e_r2, e_agm
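
A quick worked call of the combined metric above, assuming the snippet's imports (sklearn's mse/mae/r2 and math's sqrt); the arrays are toy values.

actual = [3.0, 5.0, 7.0]
predicted = [2.5, 5.0, 8.0]
e_mse, rmse, e_mae, e_r2, e_agm = metric(actual, predicted)
# e_agm blends RMSE and MAE, scaled by (1 - R^2): lower is better on all counts.
print(rmse, e_mae, e_r2, e_agm)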
Example #17
def model_eval(y_data, x_data, model):
    y = []
    yhat = []
    for i in range(len(y_data)):
        y.append(y_data[i])
        yhat.append(float(model.predict([[x_data[i]]])))
    return mae(y, yhat)
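
The per-row loop above can be replaced by a single vectorized call; a sketch assuming a scikit-learn style model with one feature:

import numpy as np
from sklearn.metrics import mean_absolute_error as mae

def model_eval_vectorized(y_data, x_data, model):
    # One predict call on an (n, 1) feature matrix instead of n single-row calls.
    yhat = model.predict(np.asarray(x_data).reshape(-1, 1))
    return mae(y_data, yhat)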
Example #18
def model_creation(data, labels, features):
    logging.info('=' * 40)
    X = data[features]

    best_maes = []

    for idx, label in enumerate(labels):
        logging.info('-' * 40)
        y = data[label]
        logging.info('Beginning model testing for label {0}. {1}% Complete.'.format(
            label, round(100 * idx / len(labels))))
        #best_model=None
        best_model_mae = float('inf')
        for i in range(25): # previously 15
            train_X, val_X, train_y, val_y = tts(X, y)

            model = RandomForestRegressor()

            model.fit(train_X, train_y)

            val_predictions = model.predict(val_X)
            val_mae = mae(val_y, val_predictions)

            if val_mae < best_model_mae:
                #best_model=model
                best_model_mae = val_mae
                logging.info('**New best model achieved below. Iteration #{}'.format(i))

            logging.info('Validation MAE: {:,.2f}'.format(val_mae))
        best_maes.append(best_model_mae)
    return best_maes
Example #19
def train_model(x_data, y_data, k=5):
    models = []
    scores = []
    k_fold = KFold(n_splits=k, shuffle=True, random_state=123)

    for train_idx, val_idx in k_fold.split(x_data):
        x_train, y_train = x_data[train_idx, :], y_data[train_idx]
        x_val, y_val = x_data[val_idx, :], y_data[val_idx]

        d_train = lgbm.Dataset(data=x_train, label=y_train)
        d_val = lgbm.Dataset(data=x_val, label=y_val)

        params = {
            'n_estimators': 5000,
            'learning_rate': 0.8,
            'max_depth': 5,
            'boosting_type': 'dart',
            'drop_rate': 0.3,
            'objective': 'regression',
            'metric': 'mae',
            'is_training_metric': True,
            'num_leaves': 200,
            'colsample_bytree': 0.7,
            'subsample': 0.7
        }
        evals_result = {}  # lgbm fills this dict with per-iteration metrics
        model = lgbm.train(params=params,
                           train_set=d_train,
                           valid_sets=[d_train, d_val],
                           valid_names=['train', 'eval'],
                           evals_result=evals_result)
        models.append(model)
        scores.append(mae(y_val, model.predict(x_val)))

    return models[np.argmin(scores)]
Example #20
def calculate_metrics(target, pred, bins=0, returns=False):
    """Calculate following metrics:
    * MAE
    * MAPE
    * Percentage of error less than 30%

    Parameters
    ----------
    target : list or np.array
        array with answers
    pred : list or np.array
        array with predictions
    bins : int
        number of bins in histogram, if 0 the histogram will not be displayed
    returns : bool
        if True, metrics will be returned
    """
    target, pred = np.asarray(target), np.asarray(pred)  # accept lists as documented
    mape = np.abs(target - pred) / target
    mae_val = mae(target, pred)
    perc = np.mean(mape < 0.3) * 100
    print('MAE: {:.4}'.format(mae_val))
    print('MAPE: {:.4}'.format(np.mean(mape)))
    print('Percentage of error less than 30%: {:.4}%'.format(perc))
    if bins:
        plt.figure(figsize=(8, 6))
        sns.distplot(mape, bins=bins)
        plt.title('MAPE hist')
        plt.show()
    if returns:
        return np.mean(mape), mae_val, perc
    return None
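
An example call with illustrative arrays; note target must not contain zeros, since the MAPE computation divides by it.

import numpy as np

target = np.array([100.0, 200.0, 300.0])
pred = np.array([110.0, 190.0, 330.0])
mape_mean, mae_val, perc = calculate_metrics(target, pred, bins=0, returns=True)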
Example #21
def QC_info(fg_hist, bg_hist, out_filename):
    """
    Compute QC metrics and print info to out_filename.
    """
    mean_abs_error = mae(fg_hist, bg_hist)
    chi_stat, chi_pval = power_div(fg_hist, bg_hist)
    gof_stat, gof_pval = power_div(fg_hist, bg_hist, "cressie-read")
    gte_stat, gte_pval = power_div(fg_hist, bg_hist, "log-likelihood")
    with open(out_filename, 'w') as stream:
        if sum(fg_hist) < 1000 or sum(bg_hist) < 1000:
            stream.write("QC tests cannot be ")
            stream.write("computed due to a small number of samples ")
            stream.write("(less than 1000).\n")
        elif chi_pval == -1 or gof_pval == -1:
            stream.write("QC tests cannot be ")
            stream.write("computed due to a large number of values ")
            stream.write("with low frequencies (more than ")
            stream.write("20% of values <=5).\n")
        else:
            stream.write("mean_absolute_error\t%f\n" % mean_abs_error)
            stream.write("chi-square(statistic, pvalue)\t(%f, %f)\n" %
                         (chi_stat, chi_pval))
            stream.write("cressie-read goodness_of_fit(statistic, pvalue)")
            stream.write("\t(%f, %f)\n" % (gof_stat, gof_pval))
            stream.write("G-test goodness_of_fit(statistic, pvalue)")
            stream.write("\t(%f, %f)\n" % (gte_stat, gte_pval))
Example #22
def eval_reg(y_test, predictions):
    '''
    Function:
    Evaluates a regression model through its main metrics
    '''
    print("### MEASURES OF REGRESSION MODEL ###")
    print("------------------------------------\n")

    print("R2 = {0:.4f}\n".format(r2_score(y_test, predictions)))  # R2
    print("RMSE = {0:.4f}\n".format(mse(
        y_test, predictions, squared=False)))  # Root Mean Squared Error
    print("MSE = {0:.4f}\n".format(mse(y_test, predictions,
                                       squared=True)))  # Mean Squared Error

    if len(predictions[predictions < 0]) > 0:
        print(
            "MSLE cannot be applied: predictions contain negative values.\n"
        )
    else:
        print("MSLE = {0:.4f}\n".format(msle(
            y_test, predictions)))  # Mean Squared Log Error

    print("MAE = {0:.4f}\n".format(mae(y_test,
                                       predictions)))  # Mean Absolute Error
    print("EVS = {0:.4%}\n".format(evs(
        y_test, predictions)))  # Explained Variance Score
Example #23
def sklearn_acc(model, test_data, test_target):
    overall_results = model.predict(test_data)
    test_pred = (overall_results > 0.5).astype(int)
    acc_results = [mae(overall_results, test_target), accuracy(test_pred, test_target),
                   f1_score(test_pred, test_target, average='macro')]

    return acc_results
Example #24
    def prediction_eval(prediction, real_data):
        '''  
        This function computes and prints four different metrics (MSE, MAE, R2 and median absolute error)
        to evaluate the accuracy of the model; prediction and real_data must have the same size.
        
        Parameters
        ----------
        prediction : array
            predicted values.
        real_data : array
            real data.
        
        Returns
        -------
        None.
        
        '''
        from sklearn.metrics import mean_absolute_error as mae
        from sklearn.metrics import mean_squared_error as mse
        from sklearn.metrics import median_absolute_error as medae
        from sklearn.metrics import r2_score as r2

        print("mean_absolute_error : ", mae(real_data, prediction))
        print("mean_squared_error : ", mse(real_data, prediction))
        print("median_absolute_error : ", medae(real_data, prediction))
        print("r2_score : ", r2(real_data, prediction))
Example #25
def bmae(pred, true, types):
    log_maes = []
    for utype in np.unique(types):
        mask = types == utype
        utype_mae = np.max([mae(true[mask], pred[mask]), 1e-9])
        log_maes.append(np.log(utype_mae))
    return np.mean(log_maes)
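
bmae averages log(MAE) per group, which resembles the grouped log-MAE metrics used in some Kaggle competitions; a toy call with made-up types:

import numpy as np

true = np.array([1.0, 2.0, 3.0, 4.0])
pred = np.array([1.1, 2.2, 2.9, 4.3])
types = np.array(['a', 'a', 'b', 'b'])
print(bmae(pred, true, types))  # mean over types of log(MAE), floored at 1e-9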
Example #26
def testing(config_path: Text) -> None:

    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)
    log_target = config["feature_transform"]["log_target"]
    eval_mae = config["test"]["mean_absolute_error"]
    eval_r2 = config["test"]["r2_score"]

    model = load_pickle("stages/model.pkl")
    X_test = pd.read_csv("stages/X_test.csv")
    y_test = pd.read_csv("stages/y_test.csv").iloc[:, 0]

    y_pred = model.predict(X_test)

    if log_target:
        y_pred = np.exp(y_pred)
        y_test = np.exp(y_test)

    metrics = {}

    if eval_mae:
        metrics.update({"mean_absolute_error": mae(y_test, y_pred)})

    if eval_r2:
        metrics.update({"R2": r2_score(y_test, y_pred)})
        # metrics.append({"score": "R2", "value": r2_score(y_test, y_pred)})

    # pd.DataFrame(metrics).to_csv("stages/metrics.csv", index=False)

    with open("stages/metrics.json", "w") as metrics_file:
        json.dump(obj=metrics, fp=metrics_file)
Example #27
def try_fit_predict_RandomForest(train_df, test_df, index_df, savename):
    X_data = train_df.drop(['scalar_coupling_constant'], axis=1).values.astype('float32')
    y_data = train_df['scalar_coupling_constant'].values.astype('float32')
    test_feature = test_df

    X_train, X_test, y_train, y_test = train_test_split(X_data , y_data , test_size=0.33, random_state=128)

    # Prediction with RandomForestRegressor (grid search over n_estimators)

    params = {'n_estimators'  : [500], 'n_jobs': [-1]}
    forest = RandomForestRegressor()
    model = GridSearchCV(forest, params, cv = 3)

    print('Start Fitting')
    model.fit(X_train, y_train)
    print('Start Getting Mae')
    prediction_rf_mae =  model.predict(X_test)
    Err = mae(y_test, prediction_rf_mae)
    acc_dic[savename] = Err
    print('Start Predicting')
    prediction_rf =  model.predict(test_feature)

    index_df['scalar_coupling_constant'] = prediction_rf
    csv_title = 'result_' + savename + '.csv'
    index_df.to_csv(csv_title)

    return prediction_rf
Example #28
    def writing_results(self):
        self.result_file_name = f"{sys.argv[2][0:20]}_results.txt"
        self.result_file = open(self.result_file_name, 'w+', encoding='utf-8')
        print("\nResults given in [ppm]:\n")
        header = "Hydrogen\t%s\t%s\t%10s\t%s\n" % (
            u'Theoretical', '\tExperimental', '\tError', '\tRelative error')
        print(header)
        for i in range(len(self.computedPeaks)):
            print(u"%dH\t%19.4f\t%23.4f\t%10.4f\t%13.4f" %
                  (self.atom_numbers[i], self.computedPeaks[i],
                   self.empiricalPeaks[i],
                   self.empiricalPeaks[i] - self.computedPeaks[i],
                   (self.empiricalPeaks[i] - self.computedPeaks[i]) /
                   self.computedPeaks[i]))
            self.result_file.write(
                u"%dH\t%19.4f\t%23.4f\t%10.4f\t%13.4f\n" %
                (self.atom_numbers[i], self.computedPeaks[i],
                 self.empiricalPeaks[i],
                 self.empiricalPeaks[i] - self.computedPeaks[i],
                 (self.empiricalPeaks[i] - self.computedPeaks[i]) /
                 self.computedPeaks[i]))
        from sklearn.metrics import mean_absolute_error as mae
        MAE = mae(self.empiricalPeaks, self.computedPeaks)
        print(f"MAE: {MAE} ppm")
        self.result_file.close()
Example #29
def model_scores(model=None,
                 X_train=None,
                 X_test=None,
                 y_train=None,
                 y_test=None,
                 target_scaler=None,
                 scale_target=True):

    # Create copies of the data so the originals are not modified
    X_train_copy, X_test_copy, y_train_copy, y_test_copy = X_train.copy(
    ), X_test.copy(), y_train.copy(), y_test.copy()

    # Compute predictions for the scaled or unscaled target variable
    if scale_target:
        model.fit(X_train_copy, y_train_copy[['target_sc']])
        y_test_copy['predict_sc'] = model.predict(X_test_copy)
        y_test_copy['prediction'] = target_scaler.inverse_transform(
            y_test_copy['predict_sc'])
    else:
        model.fit(X_train_copy, y_train_copy[target])
        y_test_copy['prediction'] = model.predict(X_test_copy)

    # Compute model quality metrics (note: mse_score uses squared=False, i.e. RMSE)
    mape_score = np.mean(
        np.abs((y_test_copy[target] - y_test_copy['prediction']) /
               y_test_copy[target])) * 100
    mae_score = mae(y_test_copy[target], y_test_copy['prediction'])
    mse_score = mse(y_test_copy[target],
                    y_test_copy['prediction'],
                    squared=False)
    R2_score = model.score(X_test_copy, y_test_copy['target_sc'])

    return model, mape_score, mae_score, mse_score, R2_score
Example #30
def model_performance(X_train, X_test, y_train, y_test):
    models = [
        GaussianNB(),
        KNeighborsClassifier(),
        SGDClassifier(),
        BaggingClassifier(),
        DecisionTreeClassifier(),
        LinearSVC(penalty="l1", dual=False),
        SVC()
    ]

    for model in models:
        model.fit(X_train, y_train)
        print(model)
        print('')

        expected = y_test
        predicted = model.predict(X_test)

        # Evaluate fit of the model
        print("Mean Squared Error: %0.6f" % mse(expected, predicted))
        print("Mean Absolute Error: %0.6f" % mae(expected, predicted))
        print("Coefficient of Determination: %0.6f" %
              model.score(X_test, y_test))
        print('')
Example #31
    def find_accurracy_on_testset(self,
                                  model,
                                  X_test,
                                  Y_test,
                                  clip=False,
                                  plot=True):
        results = model.predict(X_test)
        print("-----------------------------------------------------------")
        print("MSE: " + str(mse(Y_test, results)),
              "MAE: " + str(mae(Y_test, results)),
              "R2: " + str(math.sqrt(r2(Y_test, results))))
        print("-----------------------------------------------------------")

        if plot:
            if clip:
                fig, ax = plt.subplots(figsize=(16, 5))
                ax.plot(Y_test.values[0:100], label='True Value')
                ax.plot(results[0:100], label='Predicted Value')
                ax.set_xticks([])
                ax.legend()
                plt.show()

            else:
                fig, ax = plt.subplots(figsize=(16, 5))
                ax.plot(Y_test.values, label='True Value')
                ax.plot(results, label='Predicted Value')
                ax.set_xticks([])
                ax.legend()
                plt.show()

        return None
Example #32
def run_model(model, X, y, plot=False, save_fig=None):
    models = {'lm': linear_model.LinearRegression(),
              'lasso': linear_model.LassoCV(**{'n_jobs': 4, 'n_alphas': 5.0, 'eps': 0.0005, 'max_iter': 5500, 'cv': 10}),
              'lasso_no_CV': linear_model.Lasso(**{'alpha':0.00088920370018917083}),
              'rf': ensemble.RandomForestRegressor(**rf_params_co2_no),
              'poly2': Pipeline([('poly', PolynomialFeatures(degree=2)),
                                 ('linear', linear_model.LinearRegression(fit_intercept=False))]),
              'xgb' : xgb.XGBRegressor(**xgb_params_even_larger), 
              'svr': SVR(**svr_params),
             }
    estimator = models[model]
    X, y = np.asarray(X), np.asarray(y)

    estimator.fit(X, y)
    predictions = estimator.predict(X)
    MAE = mae(predictions, y)
    print(model+" train error: "+str(MAE))
    
    if plot:
        plot_parity(y, predictions, save_fig = save_fig)
    
    return estimator, y, predictions, MAE
Example #33
# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn.model_selection import train_test_split
X, x_test, y, y_test = train_test_split(X, y)

reg = DecisionTreeRegressor()
reg.fit(X, y)
mae_dt = mae(y_test, reg.predict(x_test))
print("Decision Tree mean absolute error: {:.2f}".format(mae_dt))

reg = LinearRegression()
reg.fit(X, y)
mae_ln = mae(y_test, reg.predict(x_test))
print("Linear regression mean absolute error: {:.2f}".format(mae_ln))

results = {
 "Linear Regression": mae_ln,
 "Decision Tree": mae_dt
}
Example #34
X_all = np.asarray(pickle.load(open('datasets/data_'+dataset+'.pckl','rb')))
y_all = np.asarray(pickle.load(open('datasets/energetics_'+dataset+'.pckl','rb')))

parplot = True
cv = 5

X_all, y_all = shuffle(X_all, y_all, random_state=42)
X_all, y_all = np.asarray(X_all), np.asarray(y_all)

from sklearn.model_selection import train_test_split
X, X_test, y, y_test = train_test_split(X_all, y_all, test_size=0.1, random_state=42)

for model in models:
    regressor, y_true, y_pred, MAE_val = run_model(model, X, y, plot=parplot, save_fig="Results/"+model+"_"+dataset+"_parity.pdf")
    y_pred = regressor.predict(X_test)
    MAE = mae(y_pred, y_test)
    print(model+" test error: "+str(MAE))
    
    outliers = []
    for i, (true, pred) in enumerate(zip(y_true,y_pred)):
        error = np.abs(true-pred)
        if error  > MAE*3.0:
            outliers.append(i)
    pickle.dump((outliers, X, y), open("Results/outliers_"+model+"_"+dataset+".pckl","wb"))
    
    print "removing "+str(len(outliers))+" outliers ..."
    X = np.asarray([value for (i, value) in enumerate(X) if i not in set(outliers)])
    y = np.asarray([value for (i, value) in enumerate(y) if i not in set(outliers)])
    print str(len(y)) + " total samples, " + str(len(y_test)) + " test samples"
    
    regressor_new, y_true, y_pred, MAE_val = run_model(model, X, y, plot=parplot, save_fig="Results/"+model+"_"+dataset+"_parity_less_outliers.pdf")
Example #35
    X, y = list(zip(*examples))
    X = np.array(X)
    y = np.array(y)
    del examples

    kf = KFold(n_splits=5, shuffle=True, random_state=0)  # fixed seed; the original passed the np.random module to the old KFold API
    train_index, test_index = next(kf.split(X))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    del X
    del y

    print("fitting model...")
    mlp.fit(X_train, y_train)

    print("scoring model...")
    # print("predicted:", mlp.predict(X_test))
    # print("actual:", y_test)
    print("R^2 score =", mlp.score(X_test, y_test))
    y_pred = mlp.predict(X_test)
    print("MSE score =", mse(y_pred, y_test))
    print("MAE score =", mae(y_pred, y_test))
    print("accuracy_score =", accuracy_score([[round(y[0])] for y in y_pred], y_test))

    fn = os.path.join(settings['data-base'], 'nn_tanh3.pickle')
    pickle.dump(mlp, open(fn, 'wb'))
Example #36
def evaluate_prediction(model, X, y):
    y_pred = model.predict(X)
    return "MAE: %.4f" % mae(y, y_pred), "MSE: %.4f" % mse(y, y_pred)
Example #37
                n_stable=10,
                verbose=True)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # print("fitting model...")
            mlp.fit(X_train, y_train)

            # print("scoring model...")
            # print("predicted:", mlp.predict(X_test))
            # print("actual:", y_test)
            r2s.append(mlp.score(X_test, y_test))
            y_pred = mlp.predict(X_test)
            mses.append(mse(y_pred, y_test))
            mae_score = mae(y_pred, y_test)
            maes.append(mae_score)
            # print("MAE score =", mae_score)
            accs.append(accuracy_score([[round(y[0])] for y in y_pred], y_test))

        mean_mae = np.mean(maes)
        mean_mse = np.mean(mses)
        mean_r2 = np.mean(r2s)
        mean_acc = np.mean(accs)

        model = (mean_mse, mean_mae, mean_r2, mean_acc, dropout_rate, regularize, learning_rule, kernel_shape[0], kernel_shape[1])

        if insert_model(model, mean_r2):
            print("**")
            for e in best_models:
                if e is None:
# Prepare the data as features and labels.
features = X
labels = y

# split the data into training and testing sets
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.4, random_state=0)

# Create decision tree regressor/algorithm object
reg1 = DecisionTreeRegressor()

# Train the decision tree regressor using the 'trains' ie features_train, labels_train
reg1.fit(features_train, labels_train)

# Get the decision tree regressor Mean Absolute Error, dtr_mae, using the 'tests' ie labels_test and features_test
dtr_mae = mae(labels_test, reg1.predict(features_test))

print "Decision Tree mean absolute error: {:.2f}".format(mae(labels_test, reg1.predict(features_test)))


# Create the linear Regression regressor/algorithm object
reg2 = LinearRegression()

# Train the linear Regression regressor using the 'trains' ie features_train, labels_train
reg2.fit(features_train,labels_train)

# Get the linear Regression regressor Mean Absolute Error, lr_mae, using the 'tests' ie labels_test and features_test
lr_mae = mae(labels_test, reg2.predict(features_test))

print "Linear regression mean absolute error: {:.2f}".format(mae(labels_test, reg2.predict(features_test)))
# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(X, y, test_size=0.4, random_state=0)


reg1 = DecisionTreeRegressor()
reg1.fit(features_train, labels_train)
print("Decision Tree mean absolute error: {:.2f}".format(mae(labels_test, reg1.predict(features_test))))

reg2 = LinearRegression()
reg2.fit(features_train, labels_train)
print("Linear regression mean absolute error: {:.2f}".format(mae(labels_test, reg2.predict(features_test))))

results = {
 "Linear Regression": mae(labels_test, reg2.predict(features_test)),
 "Decision Tree": mae(labels_test, reg1.predict(features_test))
}