Example #1
    def __init__(self, X, y, linear_modelcv, name):
        self.X = X.copy()
        self.X_std = None
        self.y_std = None
        self.y = y.copy()
        self.model_cv = linear_modelcv
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.type = self.model_cv.__class__.__name__
        self.name = self.type
        self.hyperparameters = pd.Series(dtype=float)

        self.scaler = XyScaler()
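`XyScaler` is a project helper that never appears in this listing. A minimal sketch of what it plausibly looks like, assuming it standardizes `X` and `y` together (the `StandardScaler`-based design below is an assumption, not the project's actual code):

import numpy as np
from sklearn.preprocessing import StandardScaler

class XyScaler:
    """Standardize a feature matrix X and a target vector y with one object."""
    def __init__(self):
        self._X_scaler = StandardScaler()
        self._y_scaler = StandardScaler()

    def fit(self, X, y):
        self._X_scaler.fit(X)
        self._y_scaler.fit(np.asarray(y).reshape(-1, 1))
        return self

    def transform(self, X, y):
        # Return standardized copies of X and y.
        return (self._X_scaler.transform(X),
                self._y_scaler.transform(np.asarray(y).reshape(-1, 1)).ravel())

    def inverse_transform(self, X, y):
        # Map standardized X and y back to the original scale.
        return (self._X_scaler.inverse_transform(X),
                self._y_scaler.inverse_transform(np.asarray(y).reshape(-1, 1)).ravel())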
Example #2
    def __init__(self, data, model, predict, name):
        super().__init__(data, predict)
        self.model = model
        self.name = name
        self.alphas = None
        self.log_alphas = None
        self.scaler = XyScaler()
        self.columns = None
Example #3
def lasso_model(X_train, X_hold, y_train, y_hold):
    lasso_alphas = np.logspace(-2, 2, num=40)
    lasso_cv_errors_train, lasso_cv_errors_test = train_at_various_alphas(
        X_train.values, y_train.values, Lasso, lasso_alphas)
    lasso_mean_cv_errors_train = lasso_cv_errors_train.mean(axis=0)
    lasso_mean_cv_errors_test = lasso_cv_errors_test.mean(axis=0)
    lasso_optimal_alpha = get_optimal_alpha(lasso_mean_cv_errors_test)
    #plot_mean_CV_error(lasso_mean_cv_errors_train, lasso_mean_cv_errors_test, lasso_alphas,
    #lasso_optimal_alpha,'Optimal Lasso-Alpha Level')

    standardizer = XyScaler()
    standardizer.fit(X_train.values, y_train.values)
    X_train_std, y_train_std = standardizer.transform(X_train.values,
                                                      y_train.values)
    X_hold_std, y_hold_std = standardizer.transform(X_hold.values,
                                                    y_hold.values)
    final_lasso = Lasso(alpha=lasso_optimal_alpha).fit(X_train_std,
                                                       y_train_std)
    y_hold_pred_std = final_lasso.predict(X_hold_std)
    final_lasso_mse = mse(y_hold_std, y_hold_pred_std)
    r2 = r2_score(y_hold_std, y_hold_pred_std)
    residuals = y_hold_std - y_hold_pred_std
    print("Lasso R2 score: ", r2)
    print("Final Lasso MSE: ", final_lasso_mse)
    print("Optimal Lasso Alpha: ", lasso_optimal_alpha)
    return (final_lasso, y_hold_std, y_hold_pred_std, residuals, final_lasso_mse)
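`get_optimal_alpha` is also not shown in the listing. A minimal sketch consistent with how it is called here, where the mean CV errors arrive as a pandas Series indexed by alpha:

def get_optimal_alpha(mean_cv_errors_test):
    # The Series index holds the candidate alphas; return the one with
    # the smallest mean cross-validation error.
    return mean_cv_errors_test.idxmin()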
Example #4
def cross_valid(X, y, base_estimator, n_folds, random_seed=154):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    test_cv_errors, train_cv_errors = np.empty(n_folds), np.empty(n_folds)
    for idx, (train, test) in enumerate(kf.split(X)):
        # Split into train and test
        X_cv_train, y_cv_train = X[train], y[train]
        X_cv_test, y_cv_test = X[test], y[test]

        # Standardize data.
        standardizer = XyScaler()
        standardizer.fit(X_cv_train, y_cv_train)
        X_cv_train_std, y_cv_train_std = standardizer.transform(
            X_cv_train, y_cv_train)
        X_cv_test_std, y_cv_test_std = standardizer.transform(
            X_cv_test, y_cv_test)

        # Fit estimator
        estimator = clone(base_estimator)
        estimator.fit(X_cv_train_std, y_cv_train_std)

        # Measure performance
        y_hat_train = estimator.predict(X_cv_train_std)
        y_hat_test = estimator.predict(X_cv_test_std)

        # Calculate the error metrics
        train_cv_errors[idx] = mse(y_cv_train_std, y_hat_train)
        test_cv_errors[idx] = mse(y_cv_test_std, y_hat_test)

    return train_cv_errors, test_cv_errors
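For context, a hedged usage sketch of `cross_valid` (here `mse` is presumably `sklearn.metrics.mean_squared_error` imported under an alias, and `X`, `y` are pandas objects):

from sklearn.linear_model import Ridge

train_errs, test_errs = cross_valid(X.values, y.values, Ridge(alpha=1.0), n_folds=10)
print(train_errs.mean(), test_errs.mean())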
Example #5
def ridge_model(X_train, X_hold, y_train, y_hold):
    ridge_alphas = np.logspace(-2, 4, num=40)
    ridge_cv_errors_train, ridge_cv_errors_test = train_at_various_alphas(
        X_train.values, y_train.values, Ridge, ridge_alphas)
    ridge_mean_cv_errors_train = ridge_cv_errors_train.mean(axis=0)
    ridge_mean_cv_errors_test = ridge_cv_errors_test.mean(axis=0)
    ridge_optimal_alpha = get_optimal_alpha(ridge_mean_cv_errors_test)
    #plot_mean_CV_error(ridge_mean_cv_errors_train, ridge_mean_cv_errors_test, ridge_alphas, ridge_optimal_alpha,
    #'Optimal Ridge-Alpha Level')

    standardizer = XyScaler()
    standardizer.fit(X_train.values, y_train.values)
    X_train_std, y_train_std = standardizer.transform(X_train.values,
                                                      y_train.values)
    X_hold_std, y_hold_std = standardizer.transform(X_hold.values,
                                                    y_hold.values)
    final_ridge = Ridge(alpha=ridge_optimal_alpha).fit(X_train_std,
                                                       y_train_std)
    y_hold_pred_std = final_ridge.predict(X_hold_std)
    final_ridge_mse = mse(y_hold_std, y_hold_pred_std)
    r2 = r2_score(y_hold_std, y_hold_pred_std)
    residuals = y_hold_std - y_hold_pred_std
    print("Ridge R2 score: ", r2)
    print("Final Ridge MSE: ", final_ridge_mse)
    print("Optimal Ridge Alpha: ", ridge_optimal_alpha)
    return (final_ridge, y_hold_std, y_hold_pred_std, residuals, final_ridge_mse,
            X_hold_std)
Example #6
def cv(X, y, base_estimator, n_folds, random_seed=None):
    """Estimate the in- and out-of-sample error of a model using cross
    validation.
    
    Parameters
    ----------
    
    X: np.array
      Matrix of predictors.
      
    y: np.array
      Target array.
      
    base_estimator: sklearn model object.
      The estimator to fit.  Must have fit and predict methods.
      
    n_folds: int
      The number of folds in the cross validation.
      
    random_seed: int
      A seed for the random number generator, for repeatability.
    
    Returns
    -------
      
    train_cv_errors, test_cv_errors: tuple of arrays
      The training and testing errors for each fold of cross validation.
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)

    test_cv_errors, train_cv_errors = np.zeros(n_folds), np.zeros(n_folds)

    X = np.array(X)
    y = np.array(y)

    for idx, (train, test) in enumerate(kf.split(X)):
        # Split into train and test
        X_cv_train, y_cv_train = X[train], y[train]
        X_cv_test, y_cv_test = X[test], y[test]

        # Standardize data.
        standardizer = XyScaler()
        standardizer.fit(X_cv_train, y_cv_train)
        X_cv_train_std, y_cv_train_std = standardizer.transform(X_cv_train, y_cv_train)
        X_cv_test_std, y_cv_test_std = standardizer.transform(X_cv_test, y_cv_test)

        # Fit a fresh clone of the estimator so folds do not share state
        model = clone(base_estimator)
        model.fit(X_cv_train_std, y_cv_train_std)

        # Measure performance
        y_hat_train = model.predict(X_cv_train_std)
        y_hat_test = model.predict(X_cv_test_std)

        # Calculate the error metrics
        train_cv_errors[idx] = mean_squared_error(y_cv_train_std, y_hat_train)
        test_cv_errors[idx] = mean_squared_error(y_cv_test_std, y_hat_test)

    return train_cv_errors, test_cv_errors
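Because `cv` seeds its folds, repeated calls with the same `random_seed` reproduce the same split. A sketch, with `Ridge` standing in for any estimator that has `fit` and `predict`:

from sklearn.linear_model import Ridge

train_errs, test_errs = cv(X, y, Ridge(alpha=1.0), n_folds=10, random_seed=154)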
Example #7
def linear_model(X_train, X_hold, y_train, y_hold):
    ln = LinearRegression()
    linear_cv_errors_train, linear_cv_errors_test = cross_valid(
        X_train.values, y_train.values, ln, 10)

    linear_mean_cv_errors_train = linear_cv_errors_train.mean(axis=0)
    linear_mean_cv_errors_test = linear_cv_errors_test.mean(axis=0)

    standardizer = XyScaler()
    standardizer.fit(X_train.values, y_train.values)
    X_train_std, y_train_std = standardizer.transform(X_train.values,
                                                      y_train.values)
    X_hold_std, y_hold_std = standardizer.transform(X_hold.values,
                                                    y_hold.values)

    final_linear = LinearRegression().fit(X_train_std, y_train_std)
    y_hold_pred_std = final_linear.predict(X_hold_std)
    final_linear_mse = mse(y_hold_std, y_hold_pred_std)
    r2 = r2_score(y_hold_std, y_hold_pred_std)
    residuals = y_hold_std - y_hold_pred_std
    print("Linear R2 Score: ", r2)
    print("Final Linear MSE: ", final_linear_mse)
    return (final_linear, y_hold_std, y_hold_pred_std, residuals, final_linear_mse)
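The `lasso_model`, `ridge_model`, and `linear_model` functions all share one calling convention. A hedged sketch of the surrounding driver code (assuming `X` and `y` are pandas objects):

from sklearn.model_selection import train_test_split

X_train, X_hold, y_train, y_hold = train_test_split(X, y, test_size=0.25, random_state=1)
final_linear, y_hold_std, y_hold_pred_std, residuals, final_mse = linear_model(
    X_train, X_hold, y_train, y_hold)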
Example #8
class LinearDataset:
    """Adds functionalities to linear_model"""
    def __init__(self, X, y, linear_modelcv, name):
        self.X = X.copy()
        self.X_std = None
        self.y_std = None
        self.y = y.copy()
        self.model_cv = linear_modelcv
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.type = self.model_cv.__class__.__name__
        self.name = self.type
        self.hyperparameters = pd.Series(dtype=float)

        self.scaler = XyScaler()

    def scale_X(self):
        """Scales X and y"""
        self.scaler.fit(self.X, self.y)
        X_std, y_std = self.scaler.transform(self.X, self.y)
        self.X_std = pd.DataFrame(data=X_std,
                                  columns=self.X.columns,
                                  index=self.X.index)
        self.y_std = pd.Series(data=y_std, index=self.y.index)

    def add_constant(self):
        """Adds an intercept column to the scaled features (a constant column cannot itself be standardized)"""
        self.X_std['constant'] = 1.0

    def goldfeldtquandt(self):
        """Conducts a Goldfeld-Quandt test for heteroscedasticity; the null hypothesis is homoscedasticity (constant error variance)"""
        het_F_stat, het_p_stat, z = sm.stats.diagnostic.het_goldfeldquandt(
            self.y, self.X)
        return {"F": het_F_stat, "p": het_p_stat}

    def vif(self):
        """Returns the variance inflation factor for dataframe of features to test for multicolinearity, scores of 5 and up indicate multicolinearity"""
        vif = pd.DataFrame()
        vif["Features"] = self.X_std.columns
        vif["VIF Factor"] = [
            variance_inflation_factor(self.X_std.values, i)
            for i in range(self.X_std.shape[1])
        ]
        return vif

    def test_split(self, ratio=0.25):
        """Splits the standardized data into training and test sets with the given test ratio"""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X_std, self.y_std, test_size=ratio, random_state=1)

    def log_transform_y(self):
        """Log transforms the target vector"""
        self.y = np.log(self.y)

    def get_coefs(self):
        """Returns model coefficients for sklearn models"""
        df = pd.DataFrame(self.model_cv.coef_, index=self.X_train.columns)
        return df

    def fit_cross_val(self, cv=10, alphas=None, l1_ratio=None):
        """Fits sklearn LinearModelCV and adds hyperparameters to object name"""
        self.model_cv.fit(self.X_train, self.y_train)
        if self.name in ['ElasticNetCV', 'RidgeCV', 'LassoCV']:
            self.hyperparameters['a'] = self.model_cv.alpha_
        if self.name == 'ElasticNetCV':
            self.hyperparameters['l1_ratio'] = self.model_cv.l1_ratio_
        self.name += str(dict(self.hyperparameters.round(3)))

    def plot_MSE(self, ax=None, c_title=''):
        """Plots the mean squared error for various alphas in a cross-validation"""
        if ax is None:
            ax = plt.gca()
        alphas = self.model_cv.alphas_
        m_log_alphas = -np.log10(alphas)
        ymin = self.model_cv.mse_path_.min() * 0.9
        ymax = self.model_cv.mse_path_.max() * 1.1
        ax.plot(m_log_alphas, self.model_cv.mse_path_, ':')
        ax.axvline(-np.log10(self.model_cv.alpha_),
                   linestyle='--',
                   color='k',
                   label='alpha: CV estimate')

        ax.legend(list(range(1, self.model_cv.mse_path_.shape[1] + 1)),
                  title='Fold')

        ax.set_xlabel('-log(alpha)')
        ax.set_ylabel('Mean square error')
        ax.set_title(c_title + ' Mean square error on each fold')
        ax.set_ylim(ymin, ymax)

    def plot_coeff_paths(self, ax=None, c_title=''):
        """Plots coefficient paths for various alphas in a cross-validation"""
        if ax is None:
            ax = plt.gca()
        alphas = self.model_cv.alphas_
        m_log_alphas = np.log10(alphas)
        # Recompute the coefficient path at the CV alphas so the x-axis lines up.
        coeffs = self.model_cv.path(self.X_train, self.y_train, alphas=alphas)[1]
        ymin, ymax = coeffs.min(), coeffs.max()
        ax.plot(m_log_alphas, coeffs.T)
        ax.legend(self.X_train.columns,
                  title='Feature',
                  loc='upper left',
                  bbox_to_anchor=(1, 1))
        ax.set_xlabel('log(alpha)')
        ax.set_title(c_title + ' Coefficient Descent')
        ax.set_ylim(ymin, ymax)

    def plot_actual_predicted(self, ax=None, y_log=True):
        """Plots model predicted values versus actual values"""
        if ax is None:
            ax = plt.gca()
        y_hat_test = self.y_hat_test
        y_test = self.y_test

        if y_log:
            y_test = np.log(self.y_test)
            y_hat_test = np.log(self.y_hat_test)

        ax.scatter(y_test, y_hat_test)

        model_name = self.model_cv.__class__.__name__
        ax.set_title(model_name + ' Actual vs. Predicted')
        ax.set_xlabel('Actual')
        ax.set_ylabel('Predicted')
        plt.subplots_adjust(hspace=.300, wspace=.200)

    def _rss(self, y, y_hat):
        """Returns the mean squared residual (an MSE, despite the name)"""
        return np.mean((y - y_hat)**2)

    def predict(self):
        """Returns standardized and unstandardized predictions"""

        self.y_hat_train = self.model_cv.predict(self.X_train)
        self.y_hat_test = self.model_cv.predict(self.X_test)

        self.X_train_unstandardized, self.y_hat_train_unstandardized = self.scaler.inverse_transform(
            self.X_train, self.y_hat_train)

        self.X_test_unstandardized, self.y_hat_test_unstandardized = self.scaler.inverse_transform(
            self.X_test, self.y_hat_test)

        # Unstandardize the actual targets too, so errors can also be
        # measured on the original scale.
        _, self.y_train_unstandardized = self.scaler.inverse_transform(
            self.X_train, self.y_train)
        _, self.y_test_unstandardized = self.scaler.inverse_transform(
            self.X_test, self.y_test)

    def test_and_train_errs(self):
        """Returns R2 scores and mean squared errors for the training and test sets"""

        rss_train = self._rss(self.y_train, self.y_hat_train)
        rss_test = self._rss(self.y_test, self.y_hat_test)

        # Compare unstandardized actuals with unstandardized predictions
        # (both sides must be on the same scale).
        rss_train_unstandardized = self._rss(self.y_train_unstandardized,
                                             self.y_hat_train_unstandardized)
        rss_test_unstandardized = self._rss(self.y_test_unstandardized,
                                            self.y_hat_test_unstandardized)

        r2_train = self.model_cv.score(self.X_train, self.y_train)
        r2_test = self.model_cv.score(self.X_test, self.y_test)

        return [
            r2_train, r2_test, rss_train, rss_test, rss_train_unstandardized,
            rss_test_unstandardized
        ]

    def find_residuals(self):
        """Returns the difference between actual and predicted targets on the training set"""
        return self.y_train - self.model_cv.predict(self.X_train)

    def plot_qqplot(self):
        """Creates a quantile-quantile plot based on the residuals"""
        qqplot(self.find_residuals())

    def set_up(self, y_log=True, ratio=0.30):
        """Automates processing steps"""
        if y_log:
            self.log_transform_y()
        self.scale_X()
        self.add_constant()
        self.test_split(ratio=ratio)
        self.fit_cross_val()
        self.predict()
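A hedged end-to-end sketch of how `LinearDataset` appears meant to be used, with `LassoCV` standing in for any sklearn `LinearModelCV` and `X`, `y` as a DataFrame and Series:

from sklearn.linear_model import LassoCV

ds = LinearDataset(X, y, LassoCV(cv=10), name='lasso')
ds.set_up(y_log=True, ratio=0.30)  # log-transform, scale, split, fit, predict
r2_train, r2_test, *rss_errors = ds.test_and_train_errs()
print(ds.name, r2_test)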
Example #9
def train_at_various_alphas(X, y, model, alphas, n_folds=10, **kwargs):
    """Train a regularized regression model using cross validation at various
    values of alpha.
    
    Parameters
    ----------
    
    X: np.array
      Matrix of predictors.
      
    y: np.array
      Target array.
      
    model: sklearn model class
      A class in sklearn that can be used to create a regularized regression
      object.  Options are `Ridge` and `Lasso`.
      
    alphas: numpy array
      An array of regularization parameters.
      
    n_folds: int
      Number of cross validation folds.
      
    Returns
    -------
    
    cv_errors_train, cv_errors_test: tuple of DataFrame
      DataFrames containing the training and testing errors for each value of
      alpha and each cross validation fold.  Each row represents a CV fold, and
      each column a value of alpha.
    """
    cv_errors_train = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),
                                   columns=alphas)
    cv_errors_test = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),
                                  columns=alphas)
    X = np.array(X)
    y = np.array(y)
    kf = KFold(n_splits=n_folds)
    for alpha in alphas:
        for idx, (train, test) in enumerate(kf.split(X)):

            X_cv_train, y_cv_train = X[train], y[train]
            X_cv_test, y_cv_test = X[test], y[test]

            standardizer = XyScaler()
            standardizer.fit(X_cv_train, y_cv_train)
            X_cv_train_std, y_cv_train_std = standardizer.transform(X_cv_train, y_cv_train)
            X_cv_test_std, y_cv_test_std = standardizer.transform(X_cv_test, y_cv_test)

            m = model(alpha=alpha, **kwargs)
            m.fit(X_cv_train_std, y_cv_train_std)

            y_hat_train = m.predict(X_cv_train_std)
            y_hat_test = m.predict(X_cv_test_std)

            # .loc assignment avoids pandas chained-indexing warnings
            cv_errors_train.loc[idx, alpha] = mean_squared_error(y_cv_train_std, y_hat_train)
            cv_errors_test.loc[idx, alpha] = mean_squared_error(y_cv_test_std, y_hat_test)

    return cv_errors_train, cv_errors_test
Example #10
avg_errors_train = ridge_cv_errors_train.mean()
avg_errors_test = ridge_cv_errors_test.mean()
avg_errors_lasso_train = lasso_cv_errors_train.mean()
avg_errors_lasso_test = lasso_cv_errors_test.mean()
# Calculate the optimal alphas
ridge_optimal_alpha = get_optimal_alpha(avg_errors_test)  # Optimal alpha: 11.51395399
lasso_optimal_alpha = get_optimal_alpha(avg_errors_lasso_test)  # Optimal alpha: .0001


#### Graphing Lambdas ####

# Placeholder DataFrame for coefficients
features = X.columns
df_coefs = pd.DataFrame(np.empty(shape=(len(ridge_alphas), X.shape[1])),
                        columns=features, index=ridge_alphas)

standardizer = XyScaler()
X_train = np.array(X_train)
y_train = np.array(y_train)
standardizer.fit(X_train, y_train)
X_train_std, y_train_std = standardizer.transform(X_train, y_train)

# Re-run and collect the coefficients at each alpha
for idx, alpha in enumerate(ridge_alphas):
    model = Ridge(alpha=alpha)
    model.fit(X_train_std, y_train_std)
    df_coefs.iloc[idx] = model.coef_

# Plot
fig, ax = plt.subplots(figsize=(30, 30))
for col in df_coefs.columns:
    ax.plot(np.log(df_coefs.index), df_coefs[col], label=str(col))
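The snippet ends before the figure is labeled; a typical finish (a sketch, not part of the original) would be:

ax.set_xlabel('log(alpha)')
ax.set_ylabel('Standardized coefficient')
ax.legend(loc='upper right', fontsize='small')
plt.show()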