Example #1
def train_model(rf, healed_data, target_string):
    visualizer = ResidualsPlot(rf)
    try:
        visualizer.fit(healed_data["train_features"],
                       healed_data["train_target"])
    except Exception as e:
        st.error("Fit error: " + str(e))

    try:
        visualizer.score(healed_data["test_features"],
                         healed_data["test_target"])
    except Exception as e:
        st.error("Score error: " + str(e))

    visualizer.show()
    # save the rendered figure and display it in Streamlit
    plt.savefig("models/rf_reg_eval_" + target_string + ".png")
    st.pyplot(plt.gcf())
    # save model output
    model_output_loc = "models/rf_reg_" + target_string + "_rf_reg_model.pkl"
    with open(model_output_loc, "wb") as model_output:
        pickle.dump(rf, model_output)
    print("saving model to: " + model_output_loc)
    return
Example #2
def uniRegression(p, xLabel, yLabel):
    global image_num
    # Randomly shuffle rows
    p = p.sample(frac=1).reset_index(drop=True)
    # Split train and test: the last 20% of rows (via a negative index) become the test set
    twentyPercent = -1 * round(p.shape[0] * 0.2)
    xCol = p[xLabel].values.reshape(-1, 1)
    X_train = xCol[:twentyPercent]
    X_test = xCol[twentyPercent:]
    y_train = p[yLabel][:twentyPercent].values.reshape(-1, 1)
    y_test = p[yLabel][twentyPercent:].values.reshape(-1, 1)
    # Fit linear regression model
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)
    # Make predictions
    predicted = lr.predict(X_test)
    r2 = r2_score(y_test, predicted)
    mse = mean_squared_error(y_test, predicted)
    # Plot expected vs. predicted
    plt.scatter(X_test, y_test, color='black')
    plt.plot(X_test, predicted, color='blue', linewidth=2)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    # Save before show(), otherwise the figure may be written to disk empty
    plt.savefig(image_path.format(image_num), bbox_inches='tight')
    plt.show()
    image_num += 1
    print("R2 = ", r2)
    print("MSE = ", mse)
    visualizer = ResidualsPlot(lr)
    # Plot residuals
    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.show()  # Finalize and render the figure
Example #3
def residuals():
    X, y = load_concrete()
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    oz = ResidualsPlot(Ridge(), ax=newfig())
    oz.fit(X_train, y_train)
    oz.score(X_test, y_test)
    savefig(oz, "residuals")
Example #4
def residual_plot(model_properties=None, output_path=None):
    '''
    Method that shows the residual plot of the trained model
    '''
    if model_properties is None or output_path is None:
        raise ValueError('Need Model properties and Output path as arguments !')
    estimator = model_properties['estimator']
    X_train = model_properties['X_train']
    y_train = model_properties['y_train']
    X_validation = model_properties['X_validation']
    y_validation = model_properties['y_validation']
    config_map = model_properties['config_map']
    X_scaler = model_properties['X_scaler']
    y_scaler = model_properties['y_scaler']
    X_train[config_map['scale_columns']] = X_scaler.transform(
        X_train[config_map['scale_columns']])
    y_train[config_map['label']] = y_scaler.transform(
        y_train[config_map['label']])
    X_validation[config_map['scale_columns']] = X_scaler.transform(
        X_validation[config_map['scale_columns']])
    y_validation[config_map['label']] = y_scaler.transform(
        y_validation[config_map['label']])
    visualizer = ResidualsPlot(estimator)
    visualizer.fit(X_train.values, y_train.values)
    visualizer.score(X_validation.values, y_validation.values)
    visualizer.poof(outpath=os.path.join(output_path, 'residual_plot.png'))
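    # Note: Yellowbrick 1.x deprecated poof() in favor of show(); on newer versions the
    # equivalent call is visualizer.show(outpath=os.path.join(output_path, 'residual_plot.png')).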
    return None
Example #5
    def residual_plot(lin_model,x_train, y_train, x_test, y_test):
        fig = plt.figure(figsize=(16,12))
        ax = fig.add_subplot(111)
        visualizer = ResidualsPlot(lin_model, ax=ax)

        visualizer.fit(x_train, y_train)  # Fit the training data to the visualizer
        visualizer.score(x_test, y_test)  # Evaluate the model on the test data
        visualizer.show()
Example #6
def plot_residuals(X, y, model, outpath="images/residuals.png", **kwargs):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    _, ax = plt.subplots()

    visualizer = ResidualsPlot(model, ax=ax, **kwargs)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof(outpath=outpath)
Example #8
def linregress(*args):
    #import dependencies
    import sklearn as sk
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()
    from sklearn import feature_selection
    import statsmodels.api as sm
    from patsy import dmatrices
    import numpy as np

    #define arguments
    dataframe = args[0]
    y = args[1]
    xvars = []
    for i in range(2, len(args)):
        xvars.append(args[i])
    x = dataframe[xvars]
    y = dataframe[y]
    #fit the model
    model.fit(x, y)

    #Generate Fit Statistics
    ##prep data for patsy (avoid shadowing the built-in `list`)
    terms = [f' + {item}' for item in xvars]
    string = "".join(terms)
    newstring = string[3:]  # drop the leading ' + '

    ind = args[1]
    ind = ind.strip('"')

    ##Fit the Model
    Y, X = dmatrices(f"{ind} ~ {newstring}",
                     data=dataframe,
                     return_type="dataframe")
    logit = sm.Logit(Y, X)
    logit_result = logit.fit()

    #Print Log Odds
    print("LOG ODDS")
    print(logit_result.summary())
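    # Exponentiated logit coefficients are odds ratios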
    print(np.exp(logit_result.params))

    #Plot the Residuals
    print("\n Residual Plot")
    from sklearn.linear_model import Ridge
    from yellowbrick.datasets import load_concrete
    from yellowbrick.regressor import ResidualsPlot

    model = Ridge()

    visualizer = ResidualsPlot(model, hist=True)
    y2 = y.values.reshape(-1, 1)
    visualizer.fit(x, y2)  # Fit the training data to the visualizer
    visualizer.score(x, y2)  # Evaluate the model on the test data
    visualizer.show()  # Finalize and render the figure
Example #9
def residuals_plot(model, X_test, y_test, road):
    """
    Params
    model : an already-trained model
    X_test : test set features
    y_test : test set labels
    road : output path for the saved plot
    """
    visualizer = ResidualsPlot(model)
    visualizer.score(X_test, y_test)
    visualizer.poof(road)
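# Because the estimator passed in is already trained, is_fitted=True can be used so
# Yellowbrick does not try to refit it; a minimal variant sketch (assumes a Yellowbrick
# version that supports the is_fitted argument):
def residuals_plot_prefitted(model, X_test, y_test, road):
    visualizer = ResidualsPlot(model, is_fitted=True)
    visualizer.score(X_test, y_test)  # test residuals only; fit() would also add train residuals
    visualizer.poof(outpath=road)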
Example #10
def log_residuals_chart(regressor,
                        X_train,
                        X_test,
                        y_train,
                        y_test,
                        experiment=None):
    """Log residuals chart.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        X_test (:obj:`ndarray`):
            | Testing data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        y_test (:obj:`ndarray`):
            | The regression target for testing
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to the currently active and most recent experiment.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            neptune.init('my_workspace/my_project')
            exp = neptune.create_experiment()

            log_residuals_chart(rfr, X_train, X_test, y_train, y_test, experiment=exp)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'
    exp = _validate_experiment(experiment)

    try:
        fig, ax = plt.subplots()
        visualizer = ResidualsPlot(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()
        exp.log_image('charts_sklearn', fig, image_name='Residuals Plot')
        plt.close(fig)
    except Exception as e:
        print('Did not log residuals chart. Error: {}'.format(e))
Example #11
def regression_sanity_check(model, X_train, X_test, y_train, y_test):
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
    plt.sca(ax1)
    visualizer = ResidualsPlot(model, ax=ax1)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    plt.sca(ax2)
    visualizer2 = PredictionError(model, ax=ax2)
    visualizer2.fit(X_train, y_train)
    visualizer2.score(X_test, y_test)
    visualizer.finalize()
    visualizer2.poof()
Example #12
	def visualize_residuals_plot(self, model_info):
		model = model_info['model']	   
		X_train = model_info['X_train']
		X_test = model_info['X_test']
		Y_train = model_info['Y_train']
		Y_test = model_info['Y_test']

		visualizer = ResidualsPlot(model)

		visualizer.fit(X_train, Y_train)  # Fit the training data to the model
		visualizer.score(X_test, Y_test)  # Evaluate the model on the test data
		visualizer.poof()				  # Draw/show/poof the data
Example #13
def test_for_homoscedasticity(X_train, y_train, X_test, y_test):
    """ Plot the data and check for homoscedasticity.
    Arguments:
    X_train (dataframe): examples in the training set
    y_train (dataframe): target in the training set
    X_test (dataframe): examples in the test set
    y_test (dataframe): target in the test set
    """
    lr = LinearRegression()
    visualizer = ResidualsPlot(lr)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    #there should be no clear pattern
    visualizer.poof()
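# As a rough numeric complement to the visual check above, one can test whether the
# residual magnitude grows with the predicted value, which would indicate
# heteroscedasticity; a minimal sketch assuming scipy and numpy are available
# (function name is illustrative):
def rough_homoscedasticity_check(fitted_model, X_test, y_test):
    import numpy as np
    from scipy.stats import spearmanr
    predictions = np.asarray(fitted_model.predict(X_test)).ravel()
    residuals = np.asarray(y_test).ravel() - predictions
    # a strong monotone relation between |residuals| and predictions suggests heteroscedasticity
    rho, p_value = spearmanr(np.abs(residuals), predictions)
    print("Spearman rho = {:.3f}, p = {:.3f}".format(rho, p_value))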
Example #14
def create_residuals_chart(regressor, X_train, X_test, y_train, y_test):
    """Create residuals chart.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        X_test (:obj:`ndarray`):
            | Testing data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        y_test (:obj:`ndarray`):
            | The regression target for testing

    Returns:
        ``neptune.types.File`` object that you can assign to run's ``base_namespace``.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            run = neptune.init(project='my_workspace/my_project')
            run['visuals/residuals'] = npt_utils.create_residuals_chart(rfr, X_train, X_test, y_train, y_test)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'

    chart = None

    try:
        fig, ax = plt.subplots()
        visualizer = ResidualsPlot(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()
        chart = neptune.types.File.as_image(fig)
        plt.close(fig)
    except Exception as e:
        print('Did not log residuals chart. Error: {}'.format(e))

    return chart
Example #15
def plotResidualsAgainstHoldout(df, holdOut_df, task, seed, schema):
    X_train = df[COLUMNS.get(task)].values
    X_test = holdOut_df[COLUMNS.get(task)].values
    y_train = df[TARGETS.get(task)].values
    y_test = holdOut_df[TARGETS.get(task)].values

    # Instantiate the linear model and visualizer
    wrapped_model = LinearRegression()
    visualizer = ResidualsPlot(wrapped_model, title="Residuals for schema {}".format(schema))

    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.show(outpath="figs/residuals_{}_seed{}_{}.png".format(task, seed, schema))
    plt.close()
Example #16
def residuals(ax):
    from sklearn.linear_model import RidgeCV
    from yellowbrick.regressor import ResidualsPlot

    features = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]

    splits = load_data('concrete', cols=features, target='strength', tts=True)
    X_train, X_test, y_train, y_test = splits

    estimator = RidgeCV()
    visualizer = ResidualsPlot(estimator, ax=ax)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    return visualizer
Example #17
    def residuals_plot(self) -> None:
        """Plot the difference between the observed value of the target variable (y)
        and the predicted value (ŷ), i.e. the error of the prediction"""

        visualizer = ResidualsPlot(self.trained_model)
        visualizer.fit(self.X_train,
                       self.y_train)  # Fit the training data to the visualizer
        visualizer.score(self.X_test,
                         self.y_test)  # Evaluate the model on the test data
        save_dir = f"{self.plots_dir}/residuals_plot_{self.model_id}.png"
        visualizer.show(outpath=save_dir)
        if not LOCAL:
            upload_to_s3(save_dir,
                         f'plots/residuals_plot_{self.model_id}.png',
                         bucket=S3_BUCKET_NAME)
        plt.clf()
Example #18
def visualize_pred_residuals(X_train, X_test, y_train, y_test):
    model = linear_model.Ridge(alpha=0.05)
    fitted = model.fit(X_train, y_train)
    visualizer = ResidualsPlot(fitted, size=(1080, 720))
    pred = fitted.predict(X_test)
    r = stats.linregress(pred, y_test)
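    # r[2] is the Pearson correlation coefficient between the predictions and y_test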
    print(r[2])
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof()
    cvr = model_selection.cross_validate(model,
                                         X_test,
                                         y_test,
                                         cv=10,
                                         return_train_score=True)
    print('Training scores:', cvr['train_score'], '\n')
    print('Testing scores:', cvr['test_score'])
Example #19
def my_residual_plot(X_train, y_train, X_test, y_test):
    plt.figure(figsize=(20, 5))
    plt.grid(True)

    visualizer = ResidualsPlot(LinearRegression(), hist=False)

    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data

    ticks = np.arange(1000, max(y_test.values) + 1, 500)

    plt.title("Residuals plot", fontsize=25)
    plt.xlabel("Housing prices", fontsize=15)
    plt.ylabel("Residuals", fontsize=15)

    plt.plot(ticks, np.zeros(len(ticks)), "r", label="zero residual")
    plt.legend()
    plt.show()
Example #20
def generate_ordinal_diagnostics(x, y, current_best_model, label_type,
                                 diagnostic_image_path):
    x = np.array(x)
    y = np.array(y)
    kf = KFold(n_splits=10, shuffle=True)
    guesses = []
    for train_index, test_index in kf.split(x):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = np.array(y)[train_index], np.array(y)[test_index]
        model = current_best_model[0].fit(X_train, y_train)
        for guess in zip(y_test.tolist(), model.predict(X_test).tolist()):
            guesses.append(guess)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    if "VotingClassifier" not in str(current_best_model[0].__class__):
        visualizer = ResidualsPlot(current_best_model[0])
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.poof(outpath=diagnostic_image_path + "/residuals_plot.png")
        plt.clf()
        visualizer = PredictionError(current_best_model[0])
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.poof(outpath=diagnostic_image_path +
                        "/prediction_error.png")
        plt.clf()
    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=2)
    visualizer.fit_transform(x, y)
    print(diagnostic_image_path + "/pca_2.png")
    visualizer.poof(outpath=diagnostic_image_path + "/pca_2.png")
    plt.clf()
    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=3)
    visualizer.fit_transform(x, y)
    visualizer.poof(outpath=diagnostic_image_path + "/pca_3.png")
    plt.clf()
    return {
        "mse": mean_squared_error(*np.array(guesses).transpose()),
        "r2": r2_score(*np.array(guesses).transpose()),
        "mae": median_absolute_error(*np.array(guesses).transpose()),
        "evs": explained_variance_score(*np.array(guesses).transpose()),
        "rmse": np.sqrt(mean_squared_error(*np.array(guesses).transpose()))
    }
Example #21
def showResiduals():
    # Load the data
    df = load_data('concrete')
    feature_names = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]
    target_name = 'strength'

    # Get the X and y data from the DataFrame (as_matrix() was removed in newer pandas; use .values)
    X = df[feature_names].values
    y = df[target_name].values

    # Create the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Instantiate the linear model and visualizer
    ridge = Ridge()
    visualizer = ResidualsPlot(ridge)

    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    g = visualizer.poof()  # Draw/show/poof the data
Example #22
def ridge_regression(X_train, y_train, X_test, y_test, plot):
    """
    Performing a ridge regression with built-in CV and plotting the feature importance
    """
    # Fit the ridge regression
    reg = RidgeCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)
    print("Best score using built-in RidgeCV: %f" % reg.score(X_train, y_train))
    coef = pd.Series(reg.coef_, index=X_train.columns)
    print(
        "Ridge picked "
        + str(sum(coef != 0))
        + " variables and eliminated the other "
        + str(sum(coef == 0))
        + " variables"
    )
    # Extract the feature importance
    imp_coef = coef.sort_values()
    # Plot the feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Ridge Model")
        plt.show()
        # Visualizing the regression
        visualizer = ResidualsPlot(reg, size=(1080, 720))
        visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        visualizer.show()                 # Finalize and render the figure
    # Using the test data to calculate a score
    y_pred = reg.predict(X_test)
    # Return metrics
    return {
        "name": "Ridge Regression",
        "R squared": reg.score(X_test, y_test),
        "R squared training": reg.score(X_train, y_train),
        "RMSE": rmse(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
Example #23
class PrincipalComponentRegressor(Regressor):
    def __init__(self, n_components):
        super().__init__()
        self.n_components = n_components
        self.regressor = LinearRegression()
        self.pca = None

    def fit(self, x_train, y_train, standardize=False):
        self.pca = PCA(self.n_components)
        self.x_train = self.pca.fit_transform(x_train)
        self.y_train = y_train
        self.regressor.fit(self.x_train, self.y_train)
        self._inference()
        return self.regressor.intercept_, self.regressor.coef_, self.p, self.regressor.score(self.x_train, y_train)

    def predict(self, x_test):
        try:
            x_test_transform = self.pca.transform(x_test)
        except ValueError:
            x_test_transform = x_test
        prediction = self.regressor.predict(x_test_transform)
        return prediction

    def residual_plot(self, x_test=None, y_test=None):
        if self.standardize:
            x_test = self.standardizescaler.transform(x_test)
        try:
            self.residual_visualizer = ResidualsPlot(self.regressor)
        except yellowbrick.exceptions.YellowbrickTypeError:
            self.residual_visualizer = ResidualsPlot(self.regressor.regressor)

        self.residual_visualizer.fit(self.x_train, self.y_train)
        if x_test is not None and y_test is not None:
            try:
                self.residual_visualizer.score(x_test, y_test)
            except ValueError:
                x_test = self.pca.transform(x_test)
                self.residual_visualizer.score(x_test, y_test)
        self.residual_visualizer.poof()
Example #24
class RandForestRegressor(Regressor):
    def __init__(self):
        super().__init__()
        self.regressor = RandomForestRegressor()

    def fit(self, x_train, y_train, standardize=False):
        self.standardize = standardize
        if self.standardize:
            self.standardizescaler.fit(x_train)
            x_train = self.standardizescaler.transform(x_train)

        self.x_train = x_train
        self.y_train = y_train
        self.regressor.fit(self.x_train, self.y_train.ravel())
        self._inference()
        return self.rsquared

    def residual_plot(self, x_test=None, y_test=None):
        if self.standardize:
            x_test = self.standardizescaler.transform(x_test)
        try:
            self.residual_visualizer = ResidualsPlot(self.regressor)
        except yellowbrick.exceptions.YellowbrickTypeError:
            self.residual_visualizer = ResidualsPlot(self.regressor.regressor)

        y_train = self.y_train.ravel()
        self.residual_visualizer.fit(self.x_train, y_train)
        if x_test is not None and y_test is not None:
            y_test = y_test.ravel()
            self.residual_visualizer.score(x_test, y_test)
        self.residual_visualizer.poof()

    def predict(self, x_test):
        if self.standardize:
            x_test = self.standardizescaler.transform(x_test)
        return self.regressor.predict(x_test).reshape(-1, 1)
Example #25
mse = np.mean((pred - y_test)**2)

mse  
## calculating score
ridgeReg.score(X_test,y_test) 


from yellowbrick.regressor import ResidualsPlot

# Instantiate the linear model and visualizer
ridge = Ridge()
visualizer = ResidualsPlot(ridge)

visualizer.fit(X_train, y_train)  # Fit the training data to the model
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.poof()

##Apply different algos as on X_train,X_test,y_train,y_test

# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
pred_y = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
Example #26
#load data
visualizations = load_dataset(file_name=config.TRAINING_DATA_FILE)

#set X and y
#adjust X based on feature set to use from config.py (TOP5_FEATURES or FEATURES)
X = visualizations[config.TOP5_FEATURES]
y = visualizations[config.TARGET]

#train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

#yellowbrick ResidualsPlotVisualization visual
visualizer = ResidualsPlot(config.BEST_MODEL)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show(outpath="visualizations/ResidualsPlotVisualization.pdf")
visualizer.show(outpath="visualizations/ResidualsPlotVisualization.png")
visualizer.show()

#yellowbrick prediction error visual
visualizer = PredictionError(config.BEST_MODEL)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show(outpath="visualizations/PredictionErrorVisualization.pdf")
visualizer.show(outpath="visualizations/PredictionErrorVisualization.png")
visualizer.show()
Example #27
lr = LinearRegression().fit(X_train, y_train)

lr.score(X_test, y_test)


### Yellowbrick

from yellowbrick.regressor import PredictionError, ResidualsPlot

## RVF plot

# Run the following together

lr_yb = ResidualsPlot(lr, hist=True)
lr_yb.fit(X_train, y_train)
lr_yb.score(X_test, y_test)
lr_yb.poof()

## Prediction Error plot

lr_yb = PredictionError(lr)  # PredictionError has no hist option
lr_yb.fit(X_train, y_train)
lr_yb.score(X_test, y_test)
lr_yb.poof()



################ Polynomial/Interactions ################


from sklearn.pipeline import make_pipeline
Example #28
def show_residuals_plot(model, X_train, y_train, X_valid, y_valid):
    residuals_plot = ResidualsPlot(model)
    residuals_plot.fit(X_train, y_train)
    residuals_plot.score(X_valid, y_valid)
    residuals_plot.show()
Example #29
    def binary_class(self, type, target, duplicated, sep, exclude,
                     max_runtime_secs):

        img = plt.figure()
        self.write_image(img, 'blank', width=600, height=500)

        self.gstep(0, "Reading Dataset")

        buffer = io.StringIO()
        self.dfo.columns = [c.replace(' ', '_') for c in self.dfo.columns]

        self.gstep(1, "Verify if duplicated")
        self.insert_text(
            "shape",
            str(self.dfo.shape[0]) + ' / ' + str(self.dfo.shape[1]))
        self.get_classes(self.dfo, target)
        self.insert_text("nclasses", str(self.nclasses))
        self.insert_text("allclasses", str(self.allclasses))
        shape_before = self.dfo.shape[0]
        shape_after = shape_before
        if duplicated:
            self.dfo = self.dfo.drop_duplicates(self.dfo.columns)
            shape_after = self.dfo.shape[0]
        if shape_before == shape_after:
            self.insert_text("duplicated", "none")
        else:
            self.insert_text("duplicated", str(shape_after - shape_before))

        if exclude != 'none':
            self.dfo.drop(columns=exclude, inplace=True)

        self.gstep(1, "Detecting high frequency features")
        exclude = self.hi_freq(self.dfo)
        self.dfo.drop(columns=exclude['Feature'], inplace=True)

        hi_freq = self.w_table(data=exclude,
                               border=0,
                               align='left',
                               collapse='collapse',
                               color='black',
                               foot=False)
        self.insert_text("excluded", hi_freq)

        self.gstep(1, "Encoding as sort_by_response")
        self.dfo_encode = self.encode(self.dfo.copy())

        self.gstep(1, "Basic Information")

        df_info = pd.DataFrame()
        for column in self.dfo.columns:
            not_null = int(self.dfo.shape[0] -
                           int(self.dfo[column].isna().sum()))
            dtype = self.dfo[column].dtypes
            df_info = df_info.append(
                {
                    'column': column,
                    'not_null': not_null,
                    'dtype': dtype
                },
                ignore_index=True)
        df_info['not_null'] = df_info['not_null'].apply(lambda x: int(x))
        df_info['percent'] = df_info['not_null'].apply(
            lambda x: float("{:.4f}".format(1 - (x / self.dfo.shape[0]))))
        info_dataset = self.w_table(data=df_info,
                                    border=0,
                                    align='left',
                                    collapse='collapse',
                                    color='black',
                                    foot=False)
        self.insert_text("info_dataset", info_dataset)

        self.gstep(1, "Computing Regression")

        Y = self.dfo_encode[target]
        dfo_num = self.dfo_encode[self.dfo_encode._get_numeric_data().columns]
        X = dfo_num.drop(columns=[target])

        # Create the train and test data
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=42)

        cols = X.columns
        formule = " + ".join(map(str, cols))
        formule = target + " ~ " + formule
        reg = smf.ols(formule, data=dfo_num)
        res = reg.fit()
        self.insert_text('regression', str(res.summary()))

        self.gstep(1, "Unbalance Classes")

        temp = self.dfo[target].value_counts()
        df = pd.DataFrame({target: temp.index, 'values': temp.values})
        plt.figure(figsize=(6, 6))
        plt.title('Data Set - target value - data unbalance\n (' + target +
                  ')')
        sns.set_color_codes("pastel")
        sns.barplot(x=target, y="values", data=df)
        locs, labels = plt.xticks()
        self.write_image(plt, "unbalance", width=500, height=350, crop=True)

        self.gstep(1, "Correlation")

        plt.clf()
        corr = self.dfo_encode.corr()
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        cmap = sns.diverging_palette(230, 20, as_cmap=True)
        plt.figure(figsize=(8, 8))
        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr,
                    mask=mask,
                    cmap=cmap,
                    vmax=1,
                    vmin=-1,
                    center=0,
                    annot=True,
                    square=True,
                    linewidths=1.5,
                    cbar_kws={"shrink": .5})
        self.write_image(plt, "corr", width=0, height=0, crop=True)

        self.gstep(1, "Detecting Multicollinearity with VIF")

        y = self.dfo_encode[target]
        y = y.apply(lambda x: 1 if x == 'yes' else 0)
        X = self.dfo_encode.drop(target, axis=1)
        X = X[X._get_numeric_data().columns]
        X = X.fillna(0)
        X = X.dropna()
        vif = [
            variance_inflation_factor(X.values, i) for i in range(X.shape[1])
        ]
        cols = X.columns
        cols = cols[cols != target]
        df_m = pd.DataFrame({'cols': cols, 'vif': vif})
        df_m['significant'] = ''
        df_m['significant'] = df_m['vif'].apply(self.parse_values)
        m_vif = self.w_table(data=df_m,
                             border=0,
                             align='left',
                             collapse='collapse',
                             color='black',
                             foot=False)
        self.insert_text("vif", str(m_vif))

        i = 2
        text = ''
        text2 = ''
        for column in self.dfo.columns:
            feature = self.dfo[column].describe()
            text = text + '<option value="' + str(
                i) + '"> ' + column + ' </option>n\t\t\t\t\t\t\t\t'
            text2 = text2 + "\n\t\t\t\t\t\t\t\t\t\t} else if (selectedValue == '" + str(
                i
            ) + "') {\n\t\t\t\t\t\t\t\tdivElement.innerHTML = '" + pd.DataFrame(
                feature).to_html().replace('\n', '') + "';\n\t\t\t\t\t\t\t\t"
            i = i + 1
        text2 = text2 + '\n\t\t\t\t\t\t\t\t};'
        self.insert_text('vif_desc_option', text)
        self.insert_text('vif_desc_table', text2)

        self.gstep(1, "Residual Analysis")

        plt.clf()
        model = Ridge()
        visualizer = ResidualsPlot(model, hist=False, qqplot=True)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        self.write_image(plt, "residual1", width=500, height=350, crop=True)
        plt.clf()
        visualizer = ResidualsPlot(model, hist=True, qqplot=False)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        self.write_image(plt, "residual2", width=500, height=350, crop=True)

        self.gstep(1, "Initializing H2O")
        h2o.init()
        self.gstep(1, "Parsing Data Frame")
        df = h2o.H2OFrame(self.dfo_encode)
        self.gstep(1, "Training Auto Machine Learning")
        train, valid, test = df.split_frame(ratios=[0.7, 0.2], seed=1234)
        x = train.columns
        y = target
        x.remove(y)
        train[y] = train[y].asfactor()
        test[y] = test[y].asfactor()
        aml = H2OAutoML(max_models=20,
                        max_runtime_secs=max_runtime_secs,
                        seed=1,
                        include_algos=[
                            "GLM", "DeepLearning", "DRF", "xGBoost",
                            "StackedEnsemble"
                        ],
                        balance_classes=True)
        aml.train(x=x, y=y, training_frame=train)

        lb = h2o.automl.get_leaderboard(aml, extra_columns='ALL')
        lb = lb.as_data_frame()
        lb = lb.drop(columns=['rmse', 'mse', 'predict_time_per_row_ms'])
        text = self.w_table(lb)
        self.insert_text('auto_ml_results', text)
        self.write_image(aml.varimp_heatmap(),
                         'var_imp_model',
                         width=450,
                         height=400,
                         crop=True)

        self.gstep(1, "AML - Partial Dependence")

        i = 101
        text = ''
        text2 = ''
        for column in tqdm(self.dfo.columns):
            feature = self.dfo[column].describe()
            text = text + '<option value="' + str(
                i) + '"> ' + column + ' </option>n\t\t\t\t\t\t\t\t'
            text2 = text2 + "\n\t\t\t\t\t\t\t\t\t\t} else if (selectedValue2 == '" + str(
                i
            ) + "'){\n\t\t\t\t\t\t\t\tdivElement2.innerHTML = '<img src=\"images/img_aml_pd_" + str(
                i) + ".png\">';\n\t\t\t\t\t\t\t\t"
            self.write_image(aml.pd_multi_plot(valid, column),
                             'aml_pd_' + str(i),
                             width=600,
                             height=500)
            i = i + 1
        text2 = text2 + '\n\t\t\t\t\t\t\t\t};'
        self.insert_text('aml_pd_option', text)
        self.insert_text('aml_pd_image', text2)

        self.gstep(1, "Training (GLM) Generalized Linear Model to Ensemble")

        nfolds = 5
        family = "binomial"

        amlr_glm = H2OGeneralizedLinearEstimator(
            family=family,
            nfolds=nfolds,
            lambda_=0,
            max_runtime_secs=max_runtime_secs,
            balance_classes=True,
            fold_assignment="Modulo",
            compute_p_values=True,
            keep_cross_validation_predictions=True,
            remove_collinear_columns=True)
        amlr_glm.train(x, y, training_frame=train)

        self.gstep(1, "Training (DRF) Distributed Random Forest to Ensemble")
        amlr_rf = H2ORandomForestEstimator(
            ntrees=50,
            nfolds=nfolds,
            fold_assignment="Modulo",
            max_runtime_secs=max_runtime_secs,
            balance_classes=True,
            keep_cross_validation_predictions=True,
            seed=1)
        amlr_rf.train(x=x, y=y, training_frame=train)

        self.gstep(
            1, "Training (GBM) Gradient Boosting Estimator Model to Ensemble")
        amlr_gbm = H2OGradientBoostingEstimator(
            nfolds=nfolds,
            seed=1111,
            balance_classes=True,
            fold_assignment="Modulo",
            max_runtime_secs=max_runtime_secs,
            keep_cross_validation_predictions=True)
        amlr_gbm.train(x=x, y=y, training_frame=train)

        self.gstep(1, "Training xGBoost Model to Ensemble")
        amlr_xgb = H2OXGBoostEstimator(booster='dart',
                                       nfolds=nfolds,
                                       normalize_type="tree",
                                       fold_assignment="Modulo",
                                       max_runtime_secs=max_runtime_secs,
                                       keep_cross_validation_predictions=True,
                                       seed=1234)
        amlr_xgb.train(x=x, y=y, training_frame=train, validation_frame=valid)

        self.gstep(1, "Training Deep Learning Model to Ensemble")

        family = "bernoulli"
        dl_model = H2ODeepLearningEstimator(distribution=family,
                                            hidden=[1],
                                            epochs=1000,
                                            train_samples_per_iteration=-1,
                                            reproducible=True,
                                            activation="Tanh",
                                            single_node_mode=False,
                                            balance_classes=True,
                                            force_load_balance=False,
                                            seed=23123,
                                            tweedie_power=1.5,
                                            max_runtime_secs=max_runtime_secs,
                                            score_training_samples=0,
                                            score_validation_samples=0,
                                            stopping_rounds=0)
        dl_model.train(x=x, y=y, training_frame=train)

        self.gstep(1, "Training Ensemble")
        ensemble = H2OStackedEnsembleEstimator(
            model_id="amlr_ensemble",
            base_models=[amlr_gbm, amlr_rf, amlr_xgb, amlr_glm])
        ensemble.train(x=x, y=y, training_frame=train)

        i = 201
        text = ''
        text2 = ''
        self.gstep(1, "Ensemble - (ICE) Individual Conditional Expectation")
        for column in tqdm(self.dfo.columns):
            feature = self.dfo[column].describe()
            text = text + '<option value="' + str(
                i) + '"> ' + column + ' </option>n\t\t\t\t\t\t\t\t'
            text2 = text2 + "\n\t\t\t\t\t\t\t\t\t\t} else if (selectedValue3 == '" + str(
                i
            ) + "'){\n\t\t\t\t\t\t\t\tdivElement3.innerHTML = '<img src=\"images/img_ice_pd_" + str(
                i) + ".png\">';\n\t\t\t\t\t\t\t\t"
            self.write_image(ensemble.ice_plot(valid, column),
                             'ice_pd_' + str(i),
                             width=600,
                             height=500)
            i = i + 1
        text2 = text2 + '\n\t\t\t\t\t\t\t\t};'
        self.insert_text('ice_pd_option', text)
        self.insert_text('ice_pd_image', text2)

        self.gstep(1, "AMLR - Correlation by Model")
        self.write_image(aml.model_correlation_heatmap(test),
                         'aml_correlation_models')

        self.gstep(1, "Processing Models Performance")

        i = 0
        dfp = pd.DataFrame({'Algo': []})
        outcome = list(valid[target].as_data_frame()[target])
        for algo in [
                'GLM', 'Random Forest', 'GBM', 'xGBoost', 'Deep Learning'
        ]:
            plt.clf()
            if algo == 'GLM':
                predict = list(
                    amlr_glm.predict(valid).as_data_frame()['predict'])
                cf_table = 'cf_glm'
                cm_glm = ConfusionMatrix(outcome, predict)
                glm_var_imp = amlr_glm._model_json['output'][
                    'variable_importances'].as_data_frame()
                x = glm_var_imp['percentage']
                x.index = glm_var_imp['variable']
                x.sort_values().plot(kind='barh')
                plt.xlabel('Percentage')
                fig = plt.gcf()
                self.write_image(fig, 'fi_glm', width=450, height=450)

            if algo == 'Random Forest':
                predict = list(
                    amlr_rf.predict(valid).as_data_frame()['predict'])
                cf_table = 'cf_rf'
                cm_rf = ConfusionMatrix(outcome, predict)
                rf_var_imp = amlr_rf._model_json['output'][
                    'variable_importances'].as_data_frame()
                x = rf_var_imp['percentage']
                x.index = rf_var_imp['variable']
                x.sort_values().plot(kind='barh')
                plt.xlabel('Percentage')
                fig = plt.gcf()
                self.write_image(fig, 'fi_rf', width=450, height=450)
            if algo == 'GBM':
                predict = list(
                    amlr_gbm.predict(valid).as_data_frame()['predict'])
                cf_table = 'cf_gbm'
                cm_gbm = ConfusionMatrix(outcome, predict)
                gbm_var_imp = amlr_gbm._model_json['output'][
                    'variable_importances'].as_data_frame()
                x = gbm_var_imp['percentage']
                x.index = gbm_var_imp['variable']
                x.sort_values().plot(kind='barh')
                plt.xlabel('Percentage')
                fig = plt.gcf()
                self.write_image(fig, 'fi_gbm', width=450, height=450)
            if algo == 'xGBoost':
                predict = list(
                    amlr_xgb.predict(valid).as_data_frame()['predict'])
                cf_table = 'cf_xgb'
                cm_xgb = ConfusionMatrix(outcome, predict)
                xgb_var_imp = amlr_xgb._model_json['output'][
                    'variable_importances'].as_data_frame()
                x = xgb_var_imp['percentage']
                x.index = xgb_var_imp['variable']
                x.sort_values().plot(kind='barh')
                plt.xlabel('Percentage')
                fig = plt.gcf()
                self.write_image(fig, 'fi_xgb', width=450, height=450)
            if algo == 'Deep Learning':
                predict = list(
                    dl_model.predict(valid).as_data_frame()['predict'])
                cf_table = 'cf_dl'
                cm_dl = ConfusionMatrix(outcome, predict)
                dl_var_imp = dl_model._model_json['output'][
                    'variable_importances'].as_data_frame()
                x = dl_var_imp['percentage']
                x.index = dl_var_imp['variable']
                x.sort_values().plot(kind='barh')
                plt.xlabel('Percentage')
                fig = plt.gcf()
                self.write_image(fig, 'fi_dl', width=450, height=450)
            # Confusion Matrix for all models
            cm = confusion_matrix(predict, outcome)
            cm = pd.DataFrame(cm)
            cr = classification_report(outcome,
                                       predict,
                                       target_names=self.allclasses,
                                       output_dict=True)
            table_cr = pd.DataFrame(cr).transpose().round(4)
            table_cr.reset_index(level=0, inplace=True)
            table_cr = table_cr.rename(columns={'index': 'Description'})
            table_model = self.w_table(data=table_cr,
                                       border=0,
                                       align='left',
                                       collapse='collapse',
                                       color='black',
                                       foot=False)
            self.insert_text(cf_table, str(table_model))

            # Statistics for all metrics
            cm = ConfusionMatrix(outcome, predict)
            dfp = pd.concat([dfp, pd.DataFrame(cm.overall_stat)[1:]],
                            ignore_index=True)
            dfp.loc[i:, ['Algo']] = algo
            i = i + 1
        dfp = dfp.round(4)

        cp = Compare({
            'RF': cm_rf,
            'GLM': cm_glm,
            'GBM': cm_gbm,
            'XGB': cm_xgb,
            'DL': cm_dl
        })
        cp_best_name = cp.best_name
        cp = pd.DataFrame(cp.scores)
        cp.reset_index(level=0, inplace=True)
        cp = cp.rename(columns={'index': 'Description'})
        table_cp = self.w_table(data=cp,
                                border=0,
                                align='left',
                                collapse='collapse',
                                color='black',
                                foot=False)
        if str(cp_best_name) == 'None':
            cp_best_name = 'Confusion matrices are too close and the best one can not be recognized.'
            max_v = cp.loc[0][1:].max()
            i = 0
            list_max = list()
            for column in cp.columns:
                if i > 0:
                    if cp[column][0] >= max_v:
                        list_max.append(column)
                i = i + 1
            self.insert_text(
                "the_best_name",
                "Winners: " + ' - '.join(list_max) + '<br>' + cp_best_name)

        else:
            self.insert_text("the_best_name", str(cp_best_name))

        self.insert_text("best_algorithms", str(table_cp))
        self.insert_text("the_best_name", str(cp_best_name))

        table_model = self.w_table(data=dfp,
                                   border=0,
                                   align='left',
                                   collapse='collapse',
                                   color='black',
                                   foot=False)
        self.insert_text("table_performance", str(table_model))

        self.gstep(1, "Closing!! All works are done!!")
        # write report
        self.write_report(self.index_html)
Example #30
print(f"Train R2 is {lr_log.score(X=X_train_log, y=y_train_log)}")
print(f"Test R2 is {lr_log.score(X=X_test_log, y=y_test_log)}")

# There is a slight improvement (~2%) in the train R2 and test R2 utilizing log transform

# + [markdown] pycharm={"name": "#%% md\n"}
# ## Model Evaluation - Linear Regression
# ### The following section evaluates the linear-model assumptions of random error, constant variance, and normally distributed residuals with mean 0 for the four initial models, using a residual plot from Yellowbrick.
#
# -

# Residual Plot for Huber LR with no log-transform
from yellowbrick.regressor import ResidualsPlot
rpv_hr = ResidualsPlot(hr)
rpv_hr.fit(X=X_train, y=y_train)
rpv_hr.score(X=X_test, y=y_test)
rpv_hr.poof()

rpv_lr = ResidualsPlot(lr)
rpv_lr.fit(X=X_train, y=y_train)
rpv_lr.score(X=X_test, y=y_test)
rpv_lr.poof()

# Residual Plot for LR with log transform
rpv_lr_log = ResidualsPlot(lr_log)
rpv_lr_log.fit(X=X_train_log, y=y_train_log)
rpv_lr_log.score(X=X_test_log, y=y_test_log)
rpv_lr_log.poof()

# + [markdown] pycharm={"name": "#%% md\n"}
# ## Model Evaluation of Ordinary Least Squares -Log Transform
Example #31
st.write(
    eli5.formatters.as_dataframe.explain_weights_df(
        estimator=model_lr, feature_names=feature_names)[['feature',
                                                          'weight']])
'''
The largest coefficient in the model belongs to GrLivArea, at 0.3154, which means the house price is most sensitive to this column. If GrLivArea increases, the house price rises more than it would for an equal increase in any other feature.
Note also that some features have negative coefficients (ExterQual_TA and ExterQual_Fa), meaning that as these features increase, the house price goes down.
'''
'''
#### 2. Residual Plot
'''
st.write('')
visualizer_residual = ResidualsPlot(model_lr)
visualizer_residual.fit(X_train, y_train)
visualizer_residual.score(X_test, y_test)
visualizer_residual.finalize()

st.pyplot()
'''
The residuals are mostly concentrated around 0, but some residuals are still quite large. As a result, the residual distribution is not fully normal and is skewed instead.
'''
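# To inspect the skew more directly, ResidualsPlot can draw a Q-Q plot of the residuals
# instead of the histogram; a minimal sketch, assuming a Yellowbrick version that supports
# the qqplot option (it is used the same way elsewhere in these examples):
visualizer_qq = ResidualsPlot(model_lr, hist=False, qqplot=True)
visualizer_qq.fit(X_train, y_train)
visualizer_qq.score(X_test, y_test)
visualizer_qq.finalize()

st.pyplot()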
'''
#### 3. Prediction Error
'''

st.write('')
visualizer_prediction_error = PredictionError(model_lr)
visualizer_prediction_error.fit(X_train, y_train)
visualizer_prediction_error.score(X_test, y_test)
visualizer_prediction_error.finalize()