Example 1
def fit_ridge_cv(theta_matrix, X_matrix, alpha=[0]):
    # note: RidgeCV expects strictly positive alphas; alpha=0 turns the penalty
    # off entirely and recent scikit-learn versions may reject it
    reg = RidgeCV(alphas=alpha, fit_intercept=False)
    reg.fit(theta_matrix, X_matrix)
    gamma_vector = reg.coef_
    loss = np.mean(np.square(reg.predict(theta_matrix) - X_matrix))
    score = reg.score(theta_matrix, X_matrix)
    return gamma_vector, loss, score, reg.alpha_
Example 2
def Ridge_model(train_linear, test_linear):
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(train_linear_fea, train_linear_tar)
    ridgecv_score = ridgecv.score(train_linear_fea, train_linear_tar)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge = Ridge(normalize=True)  # note: 'normalize' was removed in scikit-learn 1.2; standardize the features beforehand instead
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    #ridge.set_params(alpha=6,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    write_pkl(ridgecv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/ridge_params.pkl')
    return test_prediction_ridge
Example 3
    def fitModel(self, Xindex, Findex, X, Y):
        if not hasattr(self, "model"):
            self.model = []

        if "ridge_alpha" in self.args:
            alpha = self.args['ridge_alpha']
        else:
            ridgecv = RidgeCV(alphas=(10., 50., 100.))
            ridgecv.fit(X, Y[:, 0])
            logging.info("Ridge cv %f", ridgecv.alpha_)
            logging.info("Ridge score %f", ridgecv.score(X, Y[:, 0]))
            alpha = ridgecv.alpha_

        global ridgeX, ridgeY, ridgeAlpha
        ridgeX = X
        ridgeY = Y
        ridgeAlpha = alpha

        for idx in self.divide(list(range(len(self.model), Y.shape[1])), 18):
            with mp.Pool(6) as pool:
                self.model += pool.map(train_ridge, idx)
            logging.info("Ridge group %d", idx[0])
            self.saveModel()

        logging.info("Ridge ok")
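This example and the next one map an undefined helper, train_ridge, over a multiprocessing pool and hand the training data to the workers through module-level globals (ridgeX, ridgeY, ridgeAlpha), so that only the small column index travels through pool.map. A minimal sketch of what such a worker could look like (an assumption, not the original implementation), fitting one Ridge model per output column with the CV-selected alpha:

from sklearn.linear_model import Ridge

def train_ridge(idx):
    # hypothetical worker: reads the globals set by fitModel/fitRidge and fits
    # a single Ridge model for output column `idx`
    model = Ridge(alpha=ridgeAlpha)
    model.fit(ridgeX, ridgeY[:, idx])
    return model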
Example 4
	def fitRidge(self, X, Y, name="ridge"):
		if not hasattr(self, name):
			ridge = []
			setattr(self, name, ridge)
		else:
			ridge = getattr(self, name)

		if "ridge_alpha" in self.args:
			alpha = self.args['ridge_alpha']
		else:
			ridgecv = RidgeCV(alphas=(10., 50., 100.))
			ridgecv.fit(X, Y[:, 24])
			logging.info("Ridge cv %f", ridgecv.alpha_)
			logging.info("Ridge score %f", ridgecv.score(X, Y[:, 24]))
			alpha = ridgecv.alpha_

		global ridgeX, ridgeY, ridgeAlpha
		ridgeX = X
		ridgeY = Y
		ridgeAlpha = alpha

		for idx in self.divide(list(range(len(ridge), Y.shape[1])), 18):
			with mp.Pool(6) as pool:
				ridge += pool.map(train_ridge, idx)
			logging.info("Ridge group %d", idx[0])
			self.saveModule(name, False)

		logging.info("Ridge ok")
Example 5
    def train_rigdeCV(self, data):
        train, validacion = data
        x_tr, y_tr = train
        x_val, y_val = validacion
        #print("The training set has {} rows and {} columns".format(x_tr.shape[0], x_tr.shape[1]))
        #print("The validation set has {} rows and {} columns".format(x_val.shape[0], x_val.shape[1]))

        print('Start training Ridge...')
        start_time = self.timer()

        ridge = RidgeCV(normalize=True, alphas=[0.0000999], cv=10)  # note: 'normalize' was removed in scikit-learn 1.2, and with a single alpha there is nothing for CV to select
        ridge.fit(x_tr, y_tr)
        print("The R2 is: {}".format(ridge.score(x_tr, y_tr)))
        print("The alpha chosen by CV is: {}".format(ridge.alpha_))
        self.timer(start_time)

        print("Making prediction on validation data")
        y_val = np.expm1(y_val)
        y_val_pred = np.expm1(ridge.predict(x_val))
        mae = mean_absolute_error(y_val, y_val_pred)
        print("The mean absolute error is {}".format(mae))

        print('Saving model into a pickle')
        os.makedirs('pickles', exist_ok=True)

        with open('pickles/RidgeCV.pkl', 'wb') as f:
            pickle.dump(ridge, f)

        print('Making prediction and saving into a csv')
        y_test = ridge.predict(self.x_test)

        return y_test
Example 6
def Ridge_Data(context,x,y):
    x,y = data_process(x,y)
    # standardize the independent variables and the response (recommended for ridge, since the L2 penalty is sensitive to feature scale)
    from sklearn.preprocessing import StandardScaler
    x_std = StandardScaler().fit_transform(x)
    y_std = StandardScaler().fit_transform(y.reshape(-1,1))
    
    
    # Ridge Regression RidgeCV() default:
    # alphas=(0.1, 1.0, 10.0)
    # fit_intercept=True
    # cv=None: to use the efficient Leave-One-Out cross-validation
    # cv=int: to specify the number of folds
    from sklearn.linear_model import RidgeCV
    ridgecv = RidgeCV()
    ridgecv.fit(x_std, y_std)
    ridgecv_score = ridgecv.score(x_std, y_std)
    # ridge penalty: alpha
    ridgecv_alpha = ridgecv.alpha_
    
    print('Ridge R square', ridgecv_score)
    print('Ridge Alpha', ridgecv_alpha )
    print('Ridge Coefficients',ridgecv.coef_)

    # Estimated coefficients are the same!!
    import math
    k = len(x_std[0])
    y_ridge = np.append(y_std,np.zeros(k))
    x_ridge = np.append(x_std,np.identity(k)*math.sqrt(ridgecv_alpha),axis=0)
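    # Sketch of the check hinted at by the comment above (not part of the original
    # function): ordinary least squares on the augmented data (x_ridge, y_ridge)
    # reproduces the ridge solution, provided no intercept is fitted, because the
    # augmentation does not penalize an intercept term. The RidgeCV fit above uses
    # an intercept, so its coefficients agree only approximately on standardized data.
    from sklearn.linear_model import LinearRegression, Ridge
    ols_aug = LinearRegression(fit_intercept=False).fit(x_ridge, y_ridge)
    ridge_noint = Ridge(alpha=ridgecv_alpha, fit_intercept=False).fit(x_std, y_std.ravel())
    print('Augmented OLS coefficients:', ols_aug.coef_.round(5))
    print('Ridge coefficients:', ridge_noint.coef_.round(5))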
Example 7
def ridge_regression_cv(problem, **kwargs):
    r"""High level description.

    Parameters
    ----------
    problem : type
        Description
    kwargs : dictionary
        kwargs['ridge_reg_coefs'] must be a list of nonnegative float.  These
        are the multipliers for the penalty term in cross-validation of ridge
        regression
        kwargs['coef_tolerance'] must be a nonnegative float

    Returns
    -------
    output : tuple
        (optimum, maximum)

    """
    data_list = [datum['data']['values'] for datum in problem.data]
    data = numpy.array(data_list)
    ridge = RidgeCV(kwargs['ridge_reg_coefs'])
    ridge.fit(data.T, problem.goal['data']['values'])
    ridge_regression_coefficients = ridge.coef_
    optimum = [problem.data[index] for index,element in
               enumerate(ridge_regression_coefficients)
               if abs(element) > kwargs['coef_tolerance']]
    maximum = ridge.score(data.T, problem.goal['data']['values'])
    output = (optimum, maximum)
    return output
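The docstring above leaves the shape of problem implicit; a hypothetical call (the SimpleNamespace and the literal values below are assumptions inferred from how the function indexes problem, not part of the original API) could look like:

from types import SimpleNamespace

problem = SimpleNamespace(
    data=[{'data': {'values': [0.1, 0.4, 0.35, 0.8]}},   # one entry per candidate feature
          {'data': {'values': [1.0, 0.9, 1.2, 1.1]}}],
    goal={'data': {'values': [0.3, 0.7, 0.75, 1.5]}},    # target, one value per sample
)
optimum, maximum = ridge_regression_cv(problem,
                                       ridge_reg_coefs=[0.1, 1.0, 10.0],
                                       coef_tolerance=0.05)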
Example 9
def regularizedreg(Xtrain,Xtest,ytrain,ytest):
    Rclf = RidgeCV(alphas=[1,2,20,40,50]) # RidgeCV(alphas=[0.1, 1.0, 2.0, 4.0, 20.0], cv=None, fit_intercept=True, scoring=None, normalize=False)
    Rclf.fit(Xtrain, ytrain)
    print("Mean squared error: %.2f"
          % np.mean((Rclf.predict(Xtest) - ytest) ** 2))
    print('Regularization chosen, alpha = %.2f' % Rclf.alpha_)
    print(' Coef values = ', Rclf.coef_)
    print('Variance score: %.2f' % Rclf.score(Xtest, ytest))
Example 10
def forregCV(X, y, regressor):
    kf = RepeatedKFold(n_splits=5, n_repeats=30)

    if regressor == ridge_regression:
        print("ridge")
        regressor = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10], cv=kf).fit(X, y)
        print('Ridge CV: ', regressor.score(X, y))
        return
    if regressor == lasso_regression:
        print("lasso")
        regressor = LassoCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10], cv=kf).fit(X, y)
        print('Lasso CV: ', regressor.score(X, y))
        return
    #REPEATED KFOLD CROSS-VAL
#    midl = []
    scores = cross_val_score(regressor, X, y, cv=kf, n_jobs=-1)
    print("---> ", scores.mean())
Example 11
def RidgeKFold(data=data, city='all', label="label_activity_density"):

    if city == 'all':
        data2 = data.copy()
    else:
        data2 = data[data["city_district"].str.contains(city)].copy()

    target = data2[["city_district", label]]
    features = data2[features_columns]

    X = features.values
    y = target[label].values

    clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)
    print(clf.score(X, y))

    return (clf.score(X, y))
Example 12
def ridge_reg(x, y):
    ridgecv = RidgeCV(alphas=(0.1, 10, 50), cv=20)
    ridgecv.fit(x, y)
    ridgecv_score = ridgecv.score(x, y)
    ridgecv_alpha = ridgecv.alpha_
    print('Ridge R square', ridgecv_score)
    print('Ridge Alpha', ridgecv_alpha)
    return ridgecv.coef_
Example 13
 def feature_ridge(self):
     model = RidgeCV()
     model.fit(self.x, self.y)
     coefficients = pd.Series(model.coef_, index=self.x.columns)
     print("Beta weights/co-efficients (L2 regularisation)")
     print("-----------------------------------------")
     print(coefficients)
     print('\n')
     print('R2 score is {}'.format(model.score(self.x, self.y)))
Example 14
def linear_reg_single_meter(X_train, X_test, y_train, y_test):
    # Fit your model using the training set
    linear = LinearRegression()
    lasso_cv = LassoCV(cv=5, random_state=0)
    ridge_cv = RidgeCV(alphas=(0.1, 1.0, 10.0))
    linear.fit(X_train, y_train)
    lasso_cv.fit(X_train, y_train)
    ridge_cv.fit(X_train, y_train)
    print("Variance Inflation Factors")
    print(vifs(X_test))
    print('\n')
    print('Features')
    print('\n')
    print(list(X_test.columns))
    print(
        'Linear regression score on train set with all parameters: {}'.format(
            linear.score(X_train, y_train)))
    print('Linear regression score on test set with all parameters: {}'.format(
        linear.score(X_test, y_test)))
    # print('Linear regression crossVal score on train set with all parameters: {}'.format(linear.score(X_train, y_train)))
    # print('Linear regression crossVal score on test set with all parameters: {}'.format(linear.score(X_test, y_test)))

    print(
        'LassoCV regression score on train set with all parameters: {}'.format(
            lasso_cv.score(X_train, y_train)))
    print(
        'LassoCV regression score on test set with all parameters: {}'.format(
            lasso_cv.score(X_test, y_test)))
    # print('LassoCV regression crossVal score on train set with all parameters: {}'.format(lasso_cv.score(X_train, y_train)))
    # print('LassoCV regression crossVal score on test set with all parameters: {}'.format(lasso_cv.score(X_test, y_test)))

    print(
        'RidgeCV regression score on train set with all parameters: {}'.format(
            ridge_cv.score(X_train, y_train)))
    print(
        'RidgeCV regression score on test set with all parameters: {}'.format(
            ridge_cv.score(X_test, y_test)))
    # print('RidgeCV regression crossVal score on train set with all parameters: {}'.format(ridge_cv.score(X_train, y_train)))
    # print('RidgeCV regression crossVal score on test set with all parameters: {}'.format(ridge_cv.score(X_test, y_test)))

    return ridge_cv, lasso_cv, linear
Example 15
def calculate_slopes(data, osc_idx, osc_idx_type, N):

    # Initialization of an array where values are saved
    slope = np.zeros((data.shape[0], osc_idx.shape[0]))
    pval = np.zeros((data.shape[0], osc_idx.shape[0]))
    rsquared = np.zeros((data.shape[0], 1))

    # If crop yield data exists, calculate the slope coefficient for each oscillation and FPU
    # using regularized ridge regression.
    for k in range(0, slope.shape[0]):
        y = data[k, :]
        if all(~np.isnan(y)):
            if 'multiv' in osc_idx_type:
                X0 = np.hstack(
                    (osc_idx.T, np.ones((osc_idx[1, :][:, np.newaxis].shape))))
                ridge_model = RidgeCV(
                    alphas=[1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]).fit(
                        X0, y)
                coeffs = ridge_model.coef_
                rsquared_temp = ridge_model.score(X0, y)

                coeffs_sample = np.empty((0, 4))
                for j in range(0, N):
                    rand_idx_data = np.random.choice(data.shape[1],
                                                     data.shape[1],
                                                     replace=True)
                    y_sample = y[rand_idx_data]
                    X0_sample = X0[rand_idx_data, :]
                    ridge_model_sample = RidgeCV(
                        alphas=[1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10
                                ]).fit(X0_sample, y_sample)
                    coeffs_sample = np.vstack(
                        (coeffs_sample,
                         ridge_model_sample.coef_[np.newaxis, :]))


                # Calculate how many of the sampled slope values are
                # smaller or larger than zero.
                bstrp_test = np.zeros((2, coeffs.shape[0]))
                bstrp_test[0, :] = np.sum(coeffs_sample > 0, axis=0)
                bstrp_test[1, :] = np.sum(coeffs_sample < 0, axis=0)
                pvals = (1 - np.amax(bstrp_test, axis=0) / N) * 2.0
                # Select the maximum of the array that stores the number of sampled slopes
                # larger or smaller than zero, and divide that by the total number of samples.
                # Then subtract this value from 1 and multiply by two (two-sided test) to get the
                # p-value; e.g. if 97 of N = 100 bootstrap slopes are positive, p = (1 - 97/100) * 2 = 0.06.
                slope[k, :] = coeffs[0:3]
                pval[k, :] = pvals[0:3]
                rsquared[k, 0] = rsquared_temp

        else:
            slope[k, 0] = np.nan

    return slope, pval, rsquared
Example 16
def ridge_regression(X_train, y_train, X_test, y_test, plot):
    """
    Performing a ridge regression with built-in CV and plotting the feature importance
    """
    # Fit the ridge regression
    reg = RidgeCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)
    print("Best score using built-in RidgeCV: %f" % reg.score(X_train, y_train))
    coef = pd.Series(reg.coef_, index=X_train.columns)
    # note: unlike lasso, ridge shrinks coefficients but rarely sets any exactly
    # to zero, so this count will normally report every feature as "picked"
    print(
        "Ridge picked "
        + str(sum(coef != 0))
        + " variables and eliminated the other "
        + str(sum(coef == 0))
        + " variables"
    )
    # Extract the feature importance
    imp_coef = coef.sort_values()
    # Plot the feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Ridge Model")
        plt.show()
        # Visualizing the regression
        visualizer = ResidualsPlot(reg, size=(1080, 720))
        visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        visualizer.show()                 # Finalize and render the figure
    # Using the test data to calculate a score
    y_pred = reg.predict(X_test)
    # Return metrics
    return {
        "name": "Ridge Regression",
        "R squared": reg.score(X_test, y_test),
        "R squared training": reg.score(X_train, y_train),
        "RMSE": rmse(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
Example 17
def scale_test_and_train_ridge(X, y):
    """
    Run a ridge regression on the model
    """
    X, X_test, y, y_test = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=.25,
                                                      random_state=3)

    X_train_scale = X_train.values
    X_val_scale = X_val.values
    X_test_scale = X_test.values

    scale = StandardScaler()

    X_train_scale = scale.fit_transform(X_train_scale)
    X_test_scale = scale.transform(X_test_scale)
    X_val_scale = scale.transform(X_val_scale)

    ridge = RidgeCV(cv=5)
    ridge.fit(X_train_scale, y_train)

    ridge.score(X_train_scale, y_train)

    y_pred = ridge.predict(X_val_scale)

    print(f'Ridge Regression val R^2: {ridge.score(X_val_scale, y_val):.3f}')
    print(
        f'Ridge Regression val RMSE: {sqrt(mean_squared_error(y_val, y_pred)):.3f}'
    )

    return ridge.coef_
Example 18
def doRidgeRegressionCV(X_train, y_train, X_test, y_test):
    print('Now doing Ridge cross validation')
    alpha_ridge = [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000]
    rf = RidgeCV(alphas=alpha_ridge,
                 store_cv_values=True).fit(X_train, y_train)
    print(rf.score(X_test, y_test))
    cv_mse = np.mean(rf.cv_values_, axis=0)
    print("alphas: %s" % alpha_ridge)
    print("CV MSE: %s" % cv_mse)
    print("Best alpha using built-in RidgeCV: %f" % rf.alpha_)

    plt.plot(np.array(alpha_ridge, dtype="float64"),
             np.array(cv_mse, dtype="float64"))
    plt.xlabel('Value of alpha for Ridge')
    plt.ylabel('Cross-validated MSE')
Example 19
 def build_ts_regression(self, feature_list, target, dt_index, model_list=['ridge']):
     """
     Takes features and target
     Does train test split
     Reports on performance
     Returns: model
     """
     
     # Fix missing values
     print('\n[INFO] Imputing missing values')
     self.fix_missing()
     print('\nMissing Values:')
     print(self.df.isna().sum())
     
     test_size = 0.3
     
     self.df.sort_values(dt_index, ascending=True, inplace=True)
     nrows = self.df.shape[0]
     train_idx = int(nrows*(1-test_size))
     test_idx = nrows - train_idx
             
     X = self.df[feature_list]
     y = self.df[target]
     
     # train on the earliest rows, test on the most recent ones
     self.X_train = X.iloc[0:train_idx]
     self.X_test = X.iloc[train_idx:]
     self.y_train = y.iloc[0:train_idx]
     self.y_test = y.iloc[train_idx:]
     
     print('Xtrain size:', self.X_train.shape[0], 'Xtest size:', self.X_test.shape[0])
     
     for m in model_list:
         if m == 'ridge':
             from sklearn.linear_model import RidgeCV
             # Choosing a CV number
             if self.df.shape[0] > 500:
                 cv = 5
             elif self.df.shape[0] > 100:
                 cv = 3
             else:
                 cv = 2  # RidgeCV needs at least 2 folds
             
             model = RidgeCV(alphas=(0.1, 1.0, 10.0), cv=cv)
             model.fit(self.X_train, self.y_train)
             print('\nRidge Regression R-squared:', model.score(self.X_test, self.y_test))    
             # Add the model to the output list
             self.models.append(model)
Example 20
def run():
    # Data preprocessing
    train = DataPrep.prep_data(headless_run)
    # Scale data: https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use

    target = train.SalePrice
    train = train.drop(columns='SalePrice')

    X_train, X_test, y_train, y_test = train_test_split(
        train, target, test_size=0.25, random_state=0)

    # Trying L2 regularization
    clf = RidgeCV(cv=5).fit(X_train, y_train)
    # print(rmse_cv(clf).mean())

    # Inspect the ridge coefficients (ridge shrinks them toward zero but, unlike
    # lasso, rarely sets any of them exactly to zero)
    coef = pd.Series(clf.coef_, index=X_train.columns)

    # Metrics
    variance_score = clf.score(X_test, y_test)
    MSEscore = mean_squared_error(y_test, clf.predict(X_test))
    MAEscore = median_absolute_error(y_test, clf.predict(X_test))
    R2score = r2_score(y_test, clf.predict(X_test))

    if not headless_run:
        print('Variance score: {}'.format(variance_score))
        # print("CLF best: {}".format(clf.best_score_)) grid search only
        print('MSE score: {}'.format(MSEscore))
        print('MAE score: {}'.format(MAEscore))
        print('R2 score: {}'.format(R2score))

        # Plotting Residuals

        plt.scatter(clf.predict(X_train), clf.predict(X_train) - y_train,
                    color="green", s=10, label='Train data')

        plt.scatter(clf.predict(X_test), clf.predict(X_test) - y_test,
                    color="blue", s=10, label='Test data')

        plt.hlines(y=0, xmin=10, xmax=14, linewidth=2)

        plt.legend(loc='upper right')
        plt.title("Residual errors")
        plt.show()
    else:
        return [variance_score,MSEscore,MAEscore,R2score]
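The scaling comment near the top of run() links to the practical-use tips but the snippet never actually scales the features before RidgeCV; one way to add that step (a sketch reusing the X_train/X_test split from inside run(), not the original author's pipeline) is to chain the scaler and the CV ridge in a single estimator:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV

# scale the features and run RidgeCV as one estimator
scaled_ridge = make_pipeline(StandardScaler(), RidgeCV(cv=5))
scaled_ridge.fit(X_train, y_train)
print('Scaled RidgeCV R^2 on the test set:', scaled_ridge.score(X_test, y_test))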
Example 21
def ridge_regression(y_train, x_train, df_test):
    ridge = RidgeCV(
        alphas=[0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
    ridge.fit(x_train, y_train)
    alpha = ridge.alpha_
    ridge = RidgeCV(alphas=[
        alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8,
        alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1,
        alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4
    ],
                    cv=10)
    ridge.fit(x_train, y_train)
    alpha = ridge.alpha_
    print('ALPHA', alpha)
    acc_log = round(ridge.score(x_train, y_train) * 100, 2)
    print('SCORE', acc_log)
    pred = ridge.predict(df_test)
    return pred
Example 22
def ridge_boston():
    boston = load_boston()  # note: load_boston was removed in scikit-learn 1.2
    x = boston.data
    y = boston.target
    train_x, test_x, train_y, test_y = \
        train_test_split(x, y, test_size=.25)
    std_s = StandardScaler()
    train_x = std_s.fit_transform(train_x)
    test_x = std_s.transform(test_x)  # reuse the scaler fitted on the training data (fit_transform here would refit it on the test set)

    # ridge = Ridge(alpha=1.5)
    ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=4)
    ridge.fit(train_x, train_y)
    score = ridge.score(test_x, test_y)
    predict_y = ridge.predict(test_x)
    print(score)
    print(predict_y[:20])
    print(test_y[:20])
    return None
Example 23
class RidgeWithPost(BaseEstimator, TransformerMixin):
    def __init__(self, weight=1.0):
        # 'weight' is used as the alpha grid of the wrapped RidgeCV
        self.ridge = RidgeCV(alphas=[weight])

    def fit(self, X, y, sample_weight=None):
        self.ridge.fit(X, y)
        return self

    def predict(self, X):
        y = self.ridge.predict(X)
        # floor predictions at 18
        ranged = np.maximum(y, 18)
        return ranged

    def score(self, X, y, sample_weight=None):
        return self.ridge.score(X, y)
Example 24
def ridge_reg():
    from sklearn.linear_model import RidgeCV
    n_alphas = 100
    alpha_vals = np.logspace(-1, 3, n_alphas)
    rr = RidgeCV(alphas=alpha_vals, cv=10)
    rr.fit(X_train_scaled, y_train)
    y_pred_train = rr.predict(X_train_scaled)
    #y_pred_train_round = np.round(y_pred_train)
    y_pred_test = rr.predict(X_test_scaled)
    #y_pred_test_round = np.round(y_pred_test)
    print(rr.alpha_)
    print(rr.score(X_test_scaled, y_test))
    #plot_conf_mat(y_test, _pred_round)
    global metrics_ridge
    metrics_ridge = [
        accuracy_score(y_test, np.round(y_pred_test)),
        mean_squared_error(y_test, y_pred_test),
        r2_score(y_test, y_pred_test)
    ]
    return scores_results(y_train, y_test, y_pred_train, y_pred_test)
Example 25
    def NLP(self):
        """Performs a linear regression on a TF-IDF matrix of property titles vs. revenue potential"""

        # load and vectorize titles
        corpus = self.comps['title']
        target = self.comps['rev_pot']
        vec = TfidfVectorizer(tokenizer=self.spacy_tokenizer,
                              max_features=15,
                              max_df=1.0,
                              ngram_range=(1, 1))
        matrix = vec.fit_transform(corpus)

        # perform ridge regression and return dataframe of coefficients
        ls = RidgeCV()
        ls.fit(matrix, target)
        coefficients_df = pd.DataFrame.from_dict(
            dict(zip(vec.get_feature_names(),  # get_feature_names_out() in scikit-learn >= 1.2
                     ls.coef_)), orient='index').sort_values(by=0)
        score = ls.score(matrix, target)
        print(f"R^2 = {score: .3f} \n Alpha: {ls.alpha_ : .3f}")

        return coefficients_df
Example 26
    def eval_score(self, X, n):
        """ RidgeCV
            Parameters
            -------------
            X: pandas dataframe
            n: number of train_test_split repetitions

            Return
            -------------
            score: average score
        """
        scores = []
        for _ in range(n):
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                self.y,
                                                                test_size=0.4)
            model = RidgeCV()
            model.fit(X_train, y_train)
            scores.append(model.score(X_test, y_test))

        score = np.array(scores).mean()
        return score
Example 27
    def build_regression(self, feature_list, target, model_list=['ridge']):
        """
        Takes features and target
        Does train test split
        Reports on performance
        Returns: model
        """

        # Fix missing values
        print('\n[INFO] Imputing missing values')
        self.fix_missing()
        print('\nMissing Values:')
        print(self.df.isna().sum())

        seed = 4784

        from sklearn.model_selection import train_test_split
        X = self.df[feature_list]
        y = self.df[target]

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, random_state=seed)

        for m in model_list:
            if m == 'ridge':
                from sklearn.linear_model import RidgeCV
                # Choosing a CV number
                if self.df.shape[0] > 500:
                    cv = 5
                elif self.df.shape[0] > 100:
                    cv = 3
                else:
                    cv = 2  # RidgeCV needs at least 2 folds

                model = RidgeCV(alphas=(0.1, 1.0, 10.0), cv=cv)
                model.fit(self.X_train, self.y_train)
                print('\nRidge Regression R-squared:', model.score(self.X_test, self.y_test))
                # Add the model to the output list
                self.models.append(model)
Example 28
def Model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    def evaluate(model, test_features, test_labels,train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f}'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))    
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy
    real_train_tar=np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), )
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    
    
    """
        . Ridge model
    """
    
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    
    
    """
        . Random Forest
    """
    #train=train.drop(columns=['DateSold'])
    #test=test.drop(columns=['DateSold'])
    #X_train=train.drop(columns=['SalePrice'])
    #Y_train=train['SalePrice']
    X_train=train_linear_fea
    Y_train=train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,test_size=0.2, random_state=0)
    
    
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    #
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    
    #Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search,
    # we can explicitly specify every combination of settings to try. 
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110,120,130],
        'max_features': [2, 3],
        'min_samples_leaf': [1,2,3, 4],
        'min_samples_split': [2,4,6,8, 10, 12],
        'n_estimators': [600,700, 800, 900, 1000]
    }
    # Create a based model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    
    best_random = grid_search.best_estimator_
    start=time.time()
    best_random.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(best_random, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_rf_predict=best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features':train_linear_fea.columns, 'imp':best_random.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_rf = importance_rf.iloc[:20,]
    
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_rf=np.expm1(best_random.predict(test_linear))
    
    """
        . Xgboost
    """
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
    # Minimum for sum of weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    
    return(test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
Example 29
X = X[:-predPeriod] #re-sizing the features for training
dataset.dropna(inplace=True)  # get rid of NaN rows in the 'label' column

# create label 
y = np.array(dataset['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)  # from sklearn.model_selection (the old cross_validation module was removed)

# use linear regression as the algorithm
#clf = LinearRegression()
clf = RidgeCV(alphas=[0.1, 0.5, 1, 10])
clf.fit(X_train, y_train)
#start_time = time.time()
y_pred = clf.predict(X_pred)
#print time.time() - start_time
accuracy = clf.score(X_test, y_test)
# visualize Learning Curves
#ML.ModelLearning(X, y)
#ML.ModelComplexity(X_train, y_train)

#Linear slope calculation
#print clf.alpha_
#print clf
#print clf.coef_
#print clf.intercept_
print('predict accuracy is: {:0.2f}'.format(accuracy))


# build a column in data for predict result
data['predict/Adj Close'] = data['Adj Close'] # add column for predict value/Adj Close
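The snippet above relies on predPeriod, dataset['label'], and X_pred being created earlier in the script; a typical construction for this kind of price-forecasting example (an assumption, not the original code) is:

predPeriod = 30                                              # forecast horizon, in rows
dataset['label'] = dataset['Adj Close'].shift(-predPeriod)   # future price as the target
X_pred = X[-predPeriod:]                                      # most recent rows, kept aside for prediction (taken before the trim above)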
# RidgeCV Regression with 10 fold cross-validation along alpha values of 0.1, 1 and 10

Ridge_CV = RidgeCV(alphas=(0.1, 1.0, 10.0), cv=10)
Ridge_CV.fit(x_train, y_train)
predicted_Ridge_CV = Ridge_CV.predict(x_test)

plt.scatter(y_test, predicted_Ridge_CV)
plt.plot([-1, 5], [-1, 5], "g--", lw=1, alpha=0.4)

plt.xlabel("True prices (EUR)")
plt.ylabel("Predicted prices (EUR)")

plt.text(
    -1, 2.5,
    ' R-squared = {}'.format(round(float(Ridge_CV.score(x_test, y_test)), 2)))
plt.text(
    -1, 3, ' MSE = {}'.format(
        round(float(mean_squared_error(y_test, predicted_Ridge_CV)), 2)))
plt.title(
    'Ridge (Alpha = {}) - Predicted prices (EUR) vs. True prices (EUR)'.format(
        Ridge_CV.alpha_))
plt.show()

# 10 folds cross-validation along the previous Ridge regression

ridge = Ridge(alpha=0.1)
shuffle = KFold(n_splits=10, shuffle=True, random_state=0)
cv_scores = cross_val_score(ridge, x, y, cv=shuffle)
print(cv_scores)
print(cv_scores.mean())
model1 = LinearRegression().fit(X_m1,y_m)
print(f"R2 Score: {model1.score(X_m1,y_m)}")

"""## Regularization
1. Lasso
2. Ridge
3. ElasticNet

### Ridge
"""

# higher alpha: more shrinkage of the coefficients;
# lower alpha: less shrinkage, coefficients are barely restricted (closer to plain least squares)
rr = RidgeCV(cv=5,fit_intercept=False) 
rr.fit(X_m, y_m)
rr.score(X_m,y_m)

rr.alpha_

plt.plot(rr.coef_,alpha=0.7,marker='*',markersize=10,color='red',label=r'Ridge; $\alpha = {:.3g}$'.format(rr.alpha_))
plt.grid(True)
plt.xticks(range(0,28,1))
plt.legend()
plt.show()
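To make the comment about alpha concrete, a small sketch (reusing the X_m and y_m from above) comparing the size of the coefficient vector under a weak and a strong penalty:

from sklearn.linear_model import Ridge
import numpy as np

for a in (0.01, 100):
    r = Ridge(alpha=a, fit_intercept=False).fit(X_m, y_m)
    print('alpha =', a, '-> ||coef|| =', round(np.linalg.norm(r.coef_), 4))
# the larger alpha shrinks the coefficient vector toward zero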

"""# Model Accuracy Metrics

You must use the Mean Squared Error & Mean Absolute Error for your model evaluations. You may also include extra metrics for calculating the scores.
"""

def MSE(model_preds, ground_truths):
    return np.mean((np.asarray(model_preds) - np.asarray(ground_truths)) ** 2)
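The cell above asks for both Mean Squared Error and Mean Absolute Error; a matching helper for the latter (a sketch in the same spirit, since the original cell is truncated here) could be:

import numpy as np

def MAE(model_preds, ground_truths):
    # mean absolute error between predictions and ground truth
    return np.mean(np.abs(np.asarray(model_preds) - np.asarray(ground_truths)))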
print("---------------------------------------------------------------------")

# Ridge regression: parameter estimation with a fixed ridge parameter
X = dfx.iloc[:, 0:5]
#print(X)
y = dfy_scaled
reg01 = Ridge(alpha=0.15).fit(X, y)
print('Ridge(alpha=0.15) score:', reg01.score(X, y).round(5))  #0.98513
print('Ridge(alpha=0.15) coefficients:', reg01.coef_.round(5),
      '\n')  #[-0.05087 0.54623 0.39501 -0.12857 -0.03614]
print("---------------------------------------------------------------------")

# Ridge regression: the ridge parameter is selected automatically by cross-validation
alphas = np.linspace(0.0001, 0.5, 1000)
reg02 = RidgeCV(alphas).fit(X, y)
print('RidgeCV score:', reg02.score(X, y).round(5))
print('RidgeCV alpha:', reg02.alpha_.round(5))  #0.33737
print('RidgeCV coefficients:', reg02.coef_.round(5), '\n')
print("---------------------------------------------------------------------")

# Lasso fit
count = 0
lamb = 0.05
lasso_reg = Lasso(alpha=lamb)
lasso_reg.fit(dfx, dfy)
print('Lasso Intercept:', lasso_reg.intercept_)
print('Lasso Coef:', '\n', lasso_reg.coef_)
#print(type(lasso_reg.coef_))
for n in lasso_reg.coef_:
    if ((n > 1e-5) or (n < -1e-5)):
        count = count + 1
##################
# CLASSIFICATION #
##################

cv = 5
clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
outer_results = list()
outer_scores = list()
for train_ix, test_ix in cv_outer.split(X):
    # split data
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]
    outer_results.append(clf.fit(X_train, y_train))
    outer_scores.append(clf.score(X_test, y_test))

#################
# PRINT RESULTS #
#################
for i_split, results in enumerate(outer_results):
    coefs_abs = np.abs(results.coef_)
    IX_coefs = np.argsort(-coefs_abs)
    coef_abs_sorted = coefs_abs[IX_coefs]
    print('Split ', i_split + 1)
    print(results.alpha_)
    print(1 + IX_coefs[:10])
    print(coef_abs_sorted[:10])

print(outer_scores)
print(np.mean(outer_scores))
print(ridge)
print("Percent variance explained: {0}".format(ridge.score(X_aging, y_aging)))
print("Coefficients found: \n{0}\n".format(prettyprint(ridge.coef_, col, sort=True)))
print("ORDINARY LEAST SQUARES")
print(ols)
print("Percent variance explained: {0}".format(ols.score(X_aging, y_aging)))
print("Coefficients found: \n{0}\n".format(prettyprint(ols.coef_, col, sort=True)))
print("WHOLE DATASET //////////////////////////")

print("SUPER AGERS //////////////////////////")
ridge = RidgeCV(alphas=alpha_params, cv=7, scoring=score)
ridge.fit(X_sa, y_sa)
ols = LinearRegression()
ols.fit(X_sa, y_sa)
print("RIDGE REGRESSION")
print("Percent variance explained: {0}".format(ridge.score(X_sa, y_sa)))
print("Coefficients found: \n{0}\n".format(prettyprint(ridge.coef_, col, sort=True)))
print("ORDINARY LEAST SQUARES")
print("Percent variance explained: {0}".format(ols.score(X_sa, y_sa)))
print("Coefficients found: \n{0}\n".format(prettyprint(ols.coef_, col, sort=True)))
print("SUPER AGERS //////////////////////////")

print("MCIS //////////////////////////")
ridge = RidgeCV(alphas=alpha_params, cv=7, scoring=score)
ridge.fit(X_mci, y_mci)
ols = LinearRegression()
ols.fit(X_mci, y_mci)
print("RIDGE REGRESSION")
print("Percent variance explained: {0}".format(ridge.score(X_mci, y_mci)))
print("Coefficients found: \n{0}\n".format(prettyprint(ridge.coef_, col, sort=True)))
print("ORDINARY LEAST SQUARES")