def validate(nPrev, nAfter, aux_temp, aux_sun, aux_prec, get_model=False):
    X_Final = getFeature(nPrev, nAfter, aux_temp, aux_sun, aux_prec, TrainFiles)
    data_train_target = pd.read_csv(TrainTarget, sep='\t', header=None)
    y = data_train_target.loc[:,0].values

    TEST_SIZE = 0.2
    RANDOM_STATE = 0
    X_train, X_val, y_train, y_val = train_test_split(X_Final, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

    # fit the imputer on the training split only, then apply it to both splits
    imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_val = imp.transform(X_val)

    reg = RidgeCV()
    reg.fit(X_train, y_train)
    y_val_pred = reg.predict(X_val)
    print(mean_squared_error(y_val, y_val_pred))
    
    if get_model:
        imp.fit(X_Final)
        X_Final = imp.transform(X_Final)
        reg_submit = RidgeCV()
        reg_submit.fit(X_Final, y)
        return reg_submit
    return mean_squared_error(y_val, y_val_pred)
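Example #1 leans on several module-level names the snippet never defines (the shared imputer imp, the TrainFiles/TrainTarget paths, the sklearn imports; getFeature is the author's feature builder and stays undefined here). A minimal sketch of what that preamble might look like; the paths and imputer strategy are assumptions, not the original author's values:

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

imp = SimpleImputer(strategy='mean')       # hypothetical: any imputer with fit/transform works
TrainFiles = 'data/train_features/'        # placeholder path
TrainTarget = 'data/train_target.tsv'      # placeholder path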
Example #2
def ridge_predict(train_data, train_target, test_data):

	# Prep modeller
	alpha_ranges = [1e-3, 1e-2, 1e-1, 1, 1e2, 1e3,
					2e3, 2.5e3, 3e3, 3.5e3, 4e3, 
					5e3, 6e3, 6.1e3, 6.15e3, 6.25e3, 6.3e3, 6.4e3, 7e3, 
					7.75e3, 7.9e3, 8e3, 8.1e3, 8.2e3, 8.25e3, 8.3e3, 8.4e3, 8.5e3, 8.75e3, 9e3, 9.25e3, 9.4e3, 9.5e3, 9.6e3, 9.75e3,
					1e4, 1.25e4, 1.4e4, 1.5e4, 1.55e4, 1.58e4, 1.6e4, 1.625e4, 1.65e4, 1.7e4, 1.725e4, 1.74e4, 1.75e4, 1.76e4, 1.78e4, 1.85e4, 
					2e4, 2.25e4, 2.5e4, 3e4, 4e4,  
					0.5e5, 0.75e5, 1e5, 1.25e5, 1.5e5, 
					0.8e6, 0.9e6, 1e6, 1.1e6, 1.2e6, 1.25e6, 1.28e6, 1.3e6, 1.32e6, 1.33e6, 1.34e6, 1.4e6, 1.5e6, 2e6,
					1e7, 1e8, 1e9, 5e9, 1e10, 5e10, 1e11, 1e12, 1e13]
	clf = RidgeCV(alphas=alpha_ranges,
	              normalize=True, cv=None, fit_intercept=False, store_cv_values=True)

	# Fit
	clf.fit(train_data, train_target)
	# print("alpha range:", alpha_ranges)
	# print("CV per alpha:",np.mean(clf.cv_values_, axis=0))
	# print("alpha used:", clf.alpha_)
	# print("fit score:", clf.score(train_data, train_target))

	# Prediction
	predictions = clf.predict(test_data)

	return predictions
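With cv=None and store_cv_values=True, RidgeCV uses efficient leave-one-out (generalized) cross-validation and exposes per-sample squared errors for every candidate alpha in clf.cv_values_, which is what the commented-out prints above inspect. A hedged toy call of ridge_predict, assuming an sklearn version that still accepts the normalize argument:

import numpy as np
rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = X @ rng.randn(5) + 0.1 * rng.randn(100)
preds = ridge_predict(X[:80], y[:80], X[80:])  # last 20 rows as 'test'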
Example #3
def regularizedreg(Xtrain, Xtest, ytrain, ytest):
    Rclf = RidgeCV(alphas=[1, 2, 20, 40, 50])  # RidgeCV(alphas=[0.1, 1.0, 2.0, 4.0, 20.0], cv=None, fit_intercept=True, scoring=None, normalize=False)
    Rclf.fit(Xtrain, ytrain)
    print("Residual sum of squares: %.2f"
          % np.mean((Rclf.predict(Xtest) - ytest) ** 2))
    print('Regularization chosen, alpha = %.2f' % Rclf.alpha_)
    print(' Coef values = ', Rclf.coef_)
    print('Variance score: %.2f' % Rclf.score(Xtest, ytest))
Example #4
def fit_Ridge(features_train, labels_train, features_pred, alphas=(0.1, 1.0, 10.0)):
	model = RidgeCV(normalize=True, store_cv_values=True, alphas=alphas)
	model.fit(features_train, labels_train)
	cv_errors = np.mean(model.cv_values_, axis=0)
	print "RIDGE - CV error min: ", np.min(cv_errors)	
	# Test the model
	labels_pred = model.predict(features_pred)
	return labels_pred
Example #5
def create_firststage_preds(train, valid, testing):
    """
    This handles the first stage of a true stacking procedure using
    random forests to create first stage predictions in the train, test,
    and validation. Splits train into two sections, run random forest
    on both and predicts from one half into other (and visa versa). Then
    random forest is run on whole model and predicted into both validation
    and test.
    """
    np.random.seed(42)
    # Get vector of de-dupped values of ids
    id_dat = pd.DataFrame(train["tube_assembly_id"].drop_duplicates())
    # Create random vector to split train val on
    vect_len = len(id_dat.iloc[:, 0])
    id_dat["rand_vals"] = np.array(np.random.rand(vect_len, 1))
    df = pd.merge(train, id_dat, on="tube_assembly_id")
    # Create model for both halves of df
    frst1 = RandomForestRegressor(n_estimators=300, n_jobs=7)
    is_first_half = df.rand_vals > 0.5
    is_scnd_half = df.rand_vals < 0.5
    frst1.fit(df.loc[is_first_half, feats], df.loc[is_first_half, "target"])
    frst2 = RandomForestRegressor(n_estimators=300, n_jobs=7)
    frst2.fit(df.loc[is_scnd_half, feats], df.loc[is_scnd_half, "target"])
    # Predict frst1 onto frst2's half and vice versa
    train["forest"] = 0
    train.loc[is_scnd_half, "forest"] = frst1.predict(df.loc[is_scnd_half, feats])
    train.loc[is_first_half, "forest"] = frst2.predict(df.loc[is_first_half, feats])
    # Create forest in full data for validation and test
    frst = RandomForestRegressor(n_estimators=300, n_jobs=7)
    frst.fit(df[feats], df.target)
    valid["forest"] = frst.predict(valid[feats])
    testing["forest"] = frst.predict(testing[feats])
    # Create model for both halves of df
    rdg1 = RidgeCV(alphas=[0.5, 0.75, 1, 1.25])
    rdg2 = RidgeCV(alphas=[0.5, 0.75, 1, 1.25])
    rdg1.fit(df.loc[is_first_half, feats], df.loc[is_first_half, "target"])
    rdg2.fit(df.loc[is_scnd_half, feats], df.loc[is_scnd_half, "target"])
    # Predict rdg1 onto rdg2's half and vice versa
    train["ridge"] = 0
    train.loc[is_scnd_half, "ridge"] = rdg1.predict(df.loc[is_scnd_half, feats])
    train.loc[is_first_half, "ridge"] = rdg2.predict(df.loc[is_first_half, feats])
    # Fit ridge on the full data for validation and test
    rdg = RidgeCV(alphas=[0.5, 0.75, 1, 1.25])
    rdg.fit(df[feats], df.target)
    valid["ridge"] = rdg.predict(valid[feats])
    testing["ridge"] = rdg.predict(testing[feats])
Example #6
def ensemble(Method, alphas, blend_train, blend_test, Y_dev, Y_test, n_folds):
    if Method == 1:
        bclf = RidgeCV(alphas=alphas, normalize=True, cv=n_folds)
    elif Method == 2:
        bclf = ElasticNetCV(alphas=alphas, normalize=True, cv=n_folds)
    else:
        bclf = LassoCV(alphas=alphas, normalize=True, cv=n_folds)

    bclf.fit(blend_train, Y_dev)
    print("Best alpha = ", bclf.alpha_)
    Y_test_predict = bclf.predict(blend_test)

    score1 = metrics.mean_absolute_error(Y_test, Y_test_predict)
    score = normalized_gini(Y_test, Y_test_predict)

    return score1, score
Example #7
def orth_signal(x, atol=1e-13, rtol=0):
    """
    Returns a signal orthogonal to the input ensemble.
    x -> input signal [n_samples, n_neurons]
    """
    t = np.linspace(0, 1, x.shape[0])[:, None]
    f = np.arange(x.shape[1]) / x.shape[1]
    xt = np.sum(np.sin(2 * np.pi * f * 3 * t) / (f + 1), axis=1)
    w = RidgeCV(alphas=np.logspace(-6, 3, 50))
    w.fit(x, xt)
    xt = xt - w.predict(x)
    # pdb.set_trace()
    return xt
Example #8
def RidgeCVLinear(train,test):
  print('starting RidgeCVLinear ...')
  ridge=RidgeCV(normalize=True,cv=5)
  train = train.reindex(np.random.permutation(train.index))  # reindex returns a copy; assign it back
  tr_X=train.drop('LogSales',axis=1)
  tr_Y=train['LogSales']
  cutoff=math.floor(0.7*tr_Y.size)
  ridge.fit(tr_X[:cutoff],tr_Y[:cutoff])
  predY=ridge.predict(tr_X[cutoff:])
  mspe=rmspe(predY,tr_Y[cutoff:])
  print('rmspe is %9f'% mspe)
  print(train.columns)
  print(ridge.coef_)
  print('starting RidgeCVLinear ... completed')
  return ridge
Example #9
def stacking(estimators):
    # training
    predictions = []
    for estim in estimators:
        estim.fit(X, y)
        predictions.append(estim.predict(X))

    agg = RidgeCV(alphas=alphas, cv=5, normalize=True, fit_intercept=True)         # aggregator
    agg.fit(np.array(predictions).T, y)

    # test
    predictions = []
    for estim in estimators:
        predictions.append(estim.predict(test_data))

    predictions = agg.predict(np.array(predictions).T)
    write_results(predictions)
def ridgeRegression(X,Y):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :return: report best RMSE value for tuned alpha in ridge regression
    """
    tuningAlpha = [0.1,0.01,0.001]

    # can change to model on the entire dataset but by convention splitting the dataset is a better option
    # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size = 0.10, random_state = 5)

    ridge = RidgeCV(normalize=True, scoring='neg_mean_squared_error', alphas=tuningAlpha, cv=10)
    ridge.fit(X, Y)
    prediction = ridge.predict(X)

    print "RIDGE REGRESSION"
    print "Best Alpha value for Ridge Regression : " + str(ridge.alpha_)
    print 'Best RMSE for corresponding Alpha =', np.sqrt(mean_squared_error(Y, prediction))
Example #11
def run():
    # Data preprocessing
    train = DataPrep.prep_data(headless_run)
    # Scale data: https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use

    target = train.SalePrice
    train = train.drop(columns='SalePrice')

    X_train, X_test, y_train, y_test = train_test_split(
        train, target, test_size=0.25, random_state=0)

    # Trying L2 regularization
    clf = RidgeCV(cv=5).fit(X_train, y_train)
    # print(rmse_cv(clf).mean())

    # RidgeCV picks the alpha itself; unlike Lasso it shrinks coefficients
    # toward zero rather than zeroing them out entirely
    coef = pd.Series(clf.coef_, index=X_train.columns)

    # Metrics
    variance_score = clf.score(X_test, y_test)
    MSEscore = mean_squared_error(y_test, clf.predict(X_test))
    MAEscore = median_absolute_error(y_test, clf.predict(X_test))
    R2score = r2_score(y_test, clf.predict(X_test))  # (y_true, y_pred) order matters for R^2

    if not headless_run:
        print('Variance score: {}'.format(variance_score))
        # print("CLF best: {}".format(clf.best_score_)) grid search only
        print('MSE score: {}'.format(MSEscore))
        print('MAE score: {}'.format(MAEscore))
        print('R2 score: {}'.format(R2score))

        # Plotting Residuals

        plt.scatter(clf.predict(X_train), clf.predict(X_train) - y_train,
                    color="green", s=10, label='Train data')

        plt.scatter(clf.predict(X_test), clf.predict(X_test) - y_test,
                    color="blue", s=10, label='Test data')

        plt.hlines(y=0, xmin=10, xmax=14, linewidth=2)

        plt.legend(loc='upper right')
        plt.title("Residual errors")
        plt.show()
    else:
        return [variance_score,MSEscore,MAEscore,R2score]
Example #12
def compute_estimates(zhat, w, y, regularize=True, nuisance=False):
    """Compute tau_dr, tau_ols, tau_ols_ps, tau_resid 
       on given confounders matrix and w and y."""
    tau_hat = dict()
    ps_hat, y0_hat, y1_hat = get_ps_y01_hat(zhat, w, y, regularize)
    tau_hat['tau_ols'] = tau_ols(zhat, w, y, regularize)
    tau_hat['tau_ols_ps'] = tau_ols_ps(zhat, w, y, regularize)
    tau_hat['tau_dr'] = tau_dr(y, w, y0_hat, y1_hat, ps_hat, regularize=regularize)
    if regularize:
        lr = RidgeCV(alphas=(0.1, 1.0, 10.0))
    else:
        lr = LinearRegression()
    lr.fit(zhat, y)
    y_hat = lr.predict(zhat)
    tau_hat['tau_resid'] = tau_residuals(y, w, y_hat, ps_hat, regularize=regularize)

    if nuisance:
        return tau_hat, {'ps_hat': ps_hat, 'y0_hat': y0_hat, 'y1_hat': y1_hat}
    return tau_hat
Example #13
def ridge_boston():
    boston = load_boston()
    x = boston.data
    y = boston.target
    train_x, test_x, train_y, test_y = \
        train_test_split(x, y, test_size=.25)
    std_s = StandardScaler()
    train_x = std_s.fit_transform(train_x)
    test_x = std_s.transform(test_x)  # transform only: reuse the scaler fitted on train

    # ridge = Ridge(alpha=1.5)
    ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=4)
    ridge.fit(train_x, train_y)
    score = ridge.score(test_x, test_y)
    predict_y = ridge.predict(test_x)
    print(score)
    print(predict_y[:20])
    print(test_y[:20])
    return None
Example #14
def model_main_linear():
    ts_code = '399300.SZ'
    x_train, x_test, y_train, y_test = getdata(ts_code, type='linear')
    # return
    # print(x_train.shape)
    # print(x_train)
    # print(x_test.shape)
    # print(y_train.shape)
    # print(y_train)
    # print(y_test.shape)

    # Linear regression
    model = LinearRegression()
    # # Linear support vector machine (LinearSVC)
    # model = LinearSVC(C=0.01)

    model.fit(x_train, y_train)
    y_predictions = model.predict(x_test)
    r2 = r2_score(y_test, y_predictions)
    print('intercept:', model.intercept_)
    print('coef:', model.coef_)
    # print('y_test:\n', y_test)
    # print('y_predictions:\n', y_predictions)
    print('r2:', r2)

    print('ridge regression:')
    model_ridge = RidgeCV()
    model_ridge.fit(x_train, y_train)
    y_predictions_ridge = model_ridge.predict(x_test)
    r2_ridge = r2_score(y_test, y_predictions_ridge)
    print('intercept:', model_ridge.intercept_)
    print('coef:', model_ridge.coef_)
    print('r2:', r2_ridge)

    print('lasso regression:')
    model_lasso = LassoCV()
    model_lasso.fit(x_train, y_train)
    y_predictions_lasso = model_lasso.predict(x_test)
    r2_lasso = r2_score(y_test, y_predictions_lasso)
    print('intercept:', model_lasso.intercept_)
    print('coef:', model_lasso.coef_)
    print('r2:', r2_lasso)
Example #15
    def train(self, site, exclude_dates):
        """
        Fit the regression model for a site during for the specified window
        exclude_dates is a an optional set of datetime.date objects to exclude from training
        cli: pymortar client
        """
        start_train = pd.to_datetime('2016-01-01').tz_localize(
            'US/Pacific').isoformat()
        end_train = pd.to_datetime(
            datetime.datetime.today().date()).tz_localize(
                'US/Pacific').isoformat()
        alphas = [0.0001, .001, 0.01, 0.05, 0.1, 0.5, 1, 10]

        # Get data from pymortar
        data = get_df(site, start_train, end_train)

        # Get weekdays
        data['date'] = data.index.date
        weekdays = get_workdays(start_train, end_train)
        day_filter = [d in weekdays for d in data['date']]
        df = data[day_filter]

        # Exclude dates
        day_filter = [d not in exclude_dates for d in df.index.date]
        df = df[day_filter]

        # Create ridge features
        df = create_ridge_features(df)

        # Remove NA rows
        df = df.dropna()
        df = df[df['power'] != 0]

        # Train model
        X_train, y_train = df.drop(['power', 'weather', 'date'],
                                   axis=1), df['power']
        model = RidgeCV(normalize=True, alphas=alphas)
        model.fit(pd.DataFrame(X_train), y_train)

        # Train Error
        y_pred = model.predict(pd.DataFrame(X_train))
        self.model = model
Example #16
def ridge(X_train, X_test, y_train, y_test, y_scaler, train_num):
    """
    Ridge regression
    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :param y_scaler:
    :return:
    """
    # rig = RidgeCV(alphas=[1, 0.5, 0.1, 0.01, 0.05, 0.001, 0.005])
    rig = RidgeCV(alphas=[5.0, 10.0])
    rig.fit(X_train, y_train)
    y_pred = rig.predict(X_test)
    y_test = data.inverse_to_original_data(y_train.reshape(1, -1), y_test.reshape(1, -1), scaler=y_scaler,
                                           train_num=train_num)
    y_pred = data.inverse_to_original_data(y_train.reshape(1, -1), y_pred.reshape(1, -1), scaler=y_scaler,
                                           train_num=train_num)
    evaluate.all_metrics(y_test, y_pred)
    evaluate.draw_fitting_curve(y_test, y_pred, 0)
def ridgecv(x, y):
    """
    Function for Ridge with Cross Validation
    :param x: Attributes
    :param y: Target
    :return: MSE
    """
    reg = RidgeCV(cv=10).fit(x, y)
    predictions = reg.predict(x)
    mse = metrics.mean_squared_error(y, predictions)
    plt.figure(figsize=(15, 10))
    ft_importances_lm = pd.Series(reg.coef_[0], index=x.columns).sort_values()
    absolute_coefs = pd.Series(reg.coef_[0], index=x.columns)
    print(absolute_coefs.sort_values(ascending=False))
    ft_importances_lm.plot(kind='barh')
    plt.title("Ridge Coefficents \n Mean Squared Error = %f" % mse, fontsize=18)
    plt.xlim(-.6, .6)
    plt.show()
    print(reg.alpha_)
    return mse
Example #18
def linear_model3():
    """
    线性回归:岭回归
    """
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, test_size=0.2)
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)  # transform only, using the scaler fitted on train
    #estimator = Ridge(alpha=1)
    estimator = RidgeCV(alphas=(0.01, 0.1, 1, 10, 100))
    estimator.fit(x_train, y_train)
    y_predict = estimator.predict(x_test)
    print("预测值为: \n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)
    return None
Example #19
class RidgeWithPost(BaseEstimator, TransformerMixin):
    def __init__(self, weight=1.0):
        self.ridge = RidgeCV(alphas=[weight])  # pass the value as an explicit alphas list

    def fit(self, X, y, sample_weight=None):
        self.ridge.fit(X, y)
        return self

    def predict(self, X):
        y = self.ridge.predict(X)
        # floor every prediction at 18
        return np.maximum(y, 18)

    def score(self, X, y, sample_weight=None):
        return self.ridge.score(X, y)
Example #20
class FlowModel(object):
    """Model selection & Xarray compatibility"""
    def __init__(self, kind, model_config):
        self.kind = kind
        if kind == 'neural_net':
            self.m = FlowModel_DNN(**model_config)
        elif kind == 'xgboost':
            self.m = XGBRegressor(**model_config)
        elif kind == 'Ridge':
            self.m = RidgeCV(**model_config)
        else:
            raise NotImplementedError(str(kind) + ' not defined')

    def fit(self, Xda, yda, **kwargs):
        return self.m.fit(Xda, yda, **kwargs)

    def predict(self, Xda, name=None):
        # use with xarray, return xarray
        a = self.m.predict(Xda.values).squeeze()
        return add_time(a, Xda.time, name=name)
Example #21
class UnawarePolicy:
    def __init__(self):
        self.F_reg = None

    def train(self, G, L, F):
        self.F_reg = RidgeCV().fit(np.hstack([G, L]), F)

    def evaluate(self, G, L, nb_seats=None):
        assert G.shape == L.shape
        nb_obs = G.shape[0]
        if nb_seats is None:
            nb_seats = nb_obs
        else:
            assert isinstance(nb_seats, int) and (nb_seats > 0)
            nb_seats = min(nb_obs, nb_seats)
        F_hat = self.F_reg.predict(np.hstack([G, L]))
        ind = F_hat.argsort(axis=0)[-nb_seats:][::-1]
        P = np.zeros([nb_obs, 1]).astype(bool)
        P[ind] = True
        return P
Example #22
    def predict(self, X):
        '''
        Override
        '''
        results = []

        X_ = np.array(X)

        # prediction for each observation
        for i in range(0, X_.shape[0]):
            X_actual = X_[i, :].reshape(1, -1)
            # X_i - X_actual
            X_disc = self._X_train - X_actual
            # ridge
            ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(
                X_disc, self._Y_train)
            # ridge predictions
            results.append(ridge.predict(X_actual)[0])

        return np.array(results).reshape(-1)
Example #23
class EegLinearRegression:
    # Initialise the ridge regression model.
    def __init__(self, folds=5, regularisation_bound=(0.1, 200)):
        # The regularisation parameters to select from, log-spaced between the
        # bounds (np.logspace would treat the bounds as base-10 exponents).
        alphas = np.geomspace(regularisation_bound[0],
                              regularisation_bound[1],
                              num=500)
        # Uses K-Fold CV. Does CV over alphas to get the best one.
        self.model = RidgeCV(cv=folds,
                             alphas=alphas,
                             scoring="neg_mean_squared_error")

    # Fit the LR model on the input data.
    def __train(self, x, y):
        trained_model = self.model.fit(x, y)
        # pickle.dump(trained_model,open('ridge_regression.sav','wb'))
        return trained_model

    # Test the LR model to get the predicted score for the given input data.
    def __test(self, x, y):
        predicted = self.model.predict(x)
        # return mean_squared_error(y, predicted)
        return explained_variance_score(y, predicted)  # (y_true, y_pred) order

    # Calculate the nested cv score.
    def __nested_cv_score(self, x, y, outer_folds=5):
        x = np.array(x)
        outer_cv = KFold(n_splits=outer_folds, shuffle=True)
        scores = []
        for train_data_ind, test_data_ind in outer_cv.split(x):
            x_train, x_test = x[train_data_ind], x[test_data_ind]
            y_train, y_test = y[train_data_ind], y[test_data_ind]
            # Trains RidgeCV with cross validation.
            self.__train(x_train, y_train)
            best_score_for_fold = self.__test(x_test, y_test)
            scores += [best_score_for_fold]
        return np.average(np.array(scores))

    # Evaluate the LR model by giving an accuracy value. Will accept type of evaluation as well.
    def evaluate(self, x, y):
        return self.__nested_cv_score(x, y)
Example #24
    def ridge_cv(self, nsplits: int, lam: float = None):
        """
        runs a cross validation on the data set and returns the cross validation performance
        :param nsplits: number of cv splits
        :param lam: tuning parameter
        :return: the cross-validated mse
        """
        if lam is None:
            model = RidgeCV(cv=nsplits).fit(self.x, self.y)
            lam = model.alpha_
        cv = KFold(n_splits=nsplits)
        mse_result = []
        for train, test in cv.split(self.x):
            x_train = self.x[train, :]
            x_test = self.x[test, :]
            y_train = self.y[train]
            y_test = self.y[test]
            model = Ridge(alpha=lam).fit(x_train, y_train)
            y_predict = model.predict(x_test)
            mse_result.append(mse(y_test, y_predict))
        return np.mean(mse_result)
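The manual KFold loop in ridge_cv is what sklearn's cross_val_score automates; a minimal equivalent sketch, assuming x and y are plain NumPy arrays and lam has already been chosen:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

def ridge_cv_short(x, y, nsplits, lam):
    # sklearn reports negative MSE for this scoring string, so negate it back
    scores = cross_val_score(Ridge(alpha=lam), x, y,
                             scoring='neg_mean_squared_error', cv=nsplits)
    return -scores.mean()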
def ridge_regression(X_train, y_train, X_test, y_test, plot):
    """
    Perfomring a ridge regression with built in CV and plotting the feature importance
    """
    # Fit the ridge regression
    reg = RidgeCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)
    print("Best score using built-in RidgeCV: %f" % reg.score(X_train, y_train))
    coef = pd.Series(reg.coef_, index=X_train.columns)
    print(
        "Ridge picked "
        + str(sum(coef != 0))
        + " variables and eliminated the other "
        + str(sum(coef == 0))
        + " variables"
    )
    # Extract the feature importance
    imp_coef = coef.sort_values()
    # Plot the feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Ridge Model")
        plt.show()
        # Visualizing the regression
        visualizer = ResidualsPlot(reg, size=(1080, 720))
        visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        visualizer.show()                 # Finalize and render the figure
    # Using the test data to calculate a score
    y_pred = reg.predict(X_test)
    # Return metrics
    return {
        "name": "Ridge Regression",
        "R squared": reg.score(X_test, y_test),
        "R squared training": reg.score(X_train, y_train),
        "RMSE": rmse(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
Example #26
def Regression_scikit(N, deg, sigma, lamb, method='ls', stats=True):

    #poly = PolynomialFeatures(degree=deg)
    #XY = poly.fit_transform(xy)

    if method == 'ls':

        linreg = LinearRegression(fit_intercept=False)
        linreg.fit(XY, z)

        print("beta 0: ", linreg.intercept_)
        print("betas : ", linreg.coef_)

        zpredict = linreg.predict(XY)

    elif method == 'ridge':

        ridge = RidgeCV(alphas=[float(lamb)])
        ridge.fit(XY, z)
        zpredict = ridge.predict(XY)
        print("beta 0: ", ridge.intercept_)
        print("betas : ", ridge.coef_)

    elif method == 'lasso':

        lasso = Lasso(alpha=float(lamb))  # Lasso takes a scalar alpha
        lasso.fit(XY, z)
        zpredict = lasso.predict(XY)
        print("beta 0: ", lasso.intercept_)
        print("betas : ", lasso.coef_)

    else:
        print(
            "Error: 'method' must be either 'ls', 'ridge', or 'lasso'. \nExiting..."
        )
        sys.exit(0)

    if stats == True:
        statistics(XY, z, zpredict, deg, lamb, method)
Example #27
def ridgeRegression(X, Y):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :return: report best RMSE value for tuned alpha in ridge regression
    """
    tuningAlpha = [0.1, 0.01, 0.001]

    # can change to model on the entire dataset but by convention splitting the dataset is a better option
    # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size = 0.10, random_state = 5)

    ridge = RidgeCV(normalize=True,
                    scoring='neg_mean_squared_error',
                    alphas=tuningAlpha,
                    cv=10)
    ridge.fit(X, Y)
    prediction = ridge.predict(X)

    print "RIDGE REGRESSION"
    print "Best Alpha value for Ridge Regression : " + str(ridge.alpha_)
    print 'Best RMSE for corresponding Alpha =', np.sqrt(
        mean_squared_error(Y, prediction))
def regularization_m(X_re,y_re,predFeat=False):
    n_alphas=200
    alphas=np.logspace(1, 8, n_alphas)
    coefs=[]
    n=0
    for a in alphas:
        n+=1
        ridge=Ridge(alpha=a, fit_intercept=False)
        ridge.fit(X_re,y_re)
        coefs.append(ridge.coef_)
#    print(n,coefs)
    ax = plt.gca()
    ax.plot(alphas, coefs)
    ax.set_xscale('log')
    ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
    plt.xlabel('alpha')
    plt.ylabel('weights')
    plt.title('Ridge coefficients as a function of the regularization')
    plt.axis('tight')
    plt.show()   
        
    ridge = Ridge(alpha=28.6)  # Ridge with a predetermined alpha value
    ridge.fit(X_re, y_re)
    print(ridge.coef_, ridge.intercept_, ridge.alpha)
    
    ridgecv = RidgeCV(alphas=alphas)  # supply multiple alphas; the model picks the best
    ridgecv.fit(X_re, y_re)
    print(ridgecv.coef_, ridgecv.intercept_, ridgecv.alpha_)
    
    lasso = Lasso(alpha=0.01)
    lasso.fit(X_re, y_re)
    print(lasso.coef_, lasso.intercept_, lasso.alpha)

    elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.5)
    elasticnet.fit(X_re, y_re)
    print(elasticnet.coef_, elasticnet.intercept_, elasticnet.alpha)
    
    if isinstance(predFeat, np.ndarray):
        return ridgecv.predict(predFeat)
Example #29
def train_ridge(x_train, x_valid, y_train, y_valid, classifier):
    # print('linear_model')
    preds = []
    if classifier == 'RidgeCV':
        clf = RidgeCV(
            alphas=[1, 0.1, 0.01,
                    0.001])  #Ridge regression with built-in cross-validation.
    if classifier == 'LassoCV':
        clf = LassoCV(alphas=[1, 0.1, 0.01, 0.001])
    if classifier == 'LR':
        clf = LinearRegression()
    if classifier == 'BAY':
        clf = BayesianRidge()
    if classifier == 'ElaNet':
        clf = ElasticNetCV(cv=5, random_state=0)
    if classifier == 'SVM':  # support vector classification, no better than chance
        clf = SVC(gamma='scale', tol=1e-5)
    if classifier == 'LinearSVC':
        clf = LinearSVC(tol=1e-10)
    if classifier == 'SGD':  # no better than chance
        clf = SGDClassifier(loss='log', max_iter=1000000, tol=1e-3)
    if classifier == 'RF':  # no better than chance
        clf = RandomForestClassifier(n_estimators=1000,
                                     max_depth=5,
                                     random_state=0)
    if classifier == 'LogRegCV':  # no better than chance (renamed: 'LR' duplicated the LinearRegression key above)
        clf = LogisticRegressionCV(cv=5,
                                   random_state=0,
                                   multi_class='multinomial')

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_valid)
    preds.append(y_pred.reshape(-1, 1))

    # preds = np.hstack(preds)
    # print(roc_auc_score(y_valid, preds.mean(1)))
    # print('roc_auc_score:',roc_auc_score(y_valid, y_pred))

    return clf
Example #30
class myStackingFeaturesRegressor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.estimator = None
        self.lgb = GradientBoostingRegressor(loss='ls',
                                             alpha=0.9,
                                             n_estimators=100,
                                             learning_rate=0.01,
                                             max_depth=8,
                                             subsample=0.8,
                                             min_samples_split=9,
                                             max_leaf_nodes=10)
        self.grd_enc = OneHotEncoder()
        self.lr = RidgeCV()
        self.classes_ = [-1, 1]

    def fit(self, X, y=None, **fit_params):
        self.lgb.fit(X, y)
        self.grd_enc.fit(self.lgb.apply(X))
        self.lr.fit(self.grd_enc.transform(self.lgb.apply(X)), y)
        return self  # sklearn convention: fit returns self

    def predict(self, X):
        return self.lr.predict(self.grd_enc.transform(self.lgb.apply(X)))
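The class above is the classic gradient-boosting-leaves trick: encode each sample by the leaf it lands in per tree, one-hot those indices, and fit a linear model (here RidgeCV) on top. A hedged toy usage, assuming the fit-returns-self convention noted above:

import numpy as np
rng = np.random.RandomState(0)
X = rng.randn(200, 4)
y = 2.0 * X[:, 0] + 0.1 * rng.randn(200)
stacker = myStackingFeaturesRegressor()
stacker.fit(X, y)
preds = stacker.predict(X)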
Example #31
def iris_model():
    # import data
    df = pd.read_csv(r"/home/airflow/gcs/dags/data/Iris.csv")

    # dummify
    df = pd.get_dummies(df, drop_first=True)

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        df[df.columns[df.columns != "PetalWidthCm"]],
        df["PetalWidthCm"],
        test_size=0.2)

    # cross validataion
    model = RidgeCV(alphas=np.logspace(-6, 6, 13), cv=3)
    model.fit(X_train, y_train)

    # prediction
    y_pred = model.predict(X_test)

    # appending results back
    X_test["Predicted"] = y_pred.round(1)
    X_test["Actual"] = y_test
    #rmse addition
    # rmse = sqrt(mean_squared_error(y_test, y_pred))
    # X_test["rmse"] = rmse

    # save output
    print('*_*_*_*_*_*_*__*_*_*')
    with open('/home/airflow/gcs/dags/data/iris_output_' +
              str(datetime.now().strftime("%d%m%Y%H%M%S")) + '.csv',
              'w',
              newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=X_test.columns)
        writer.writeheader()
        for row in X_test.to_dict('records'):
            writer.writerow(row)
    print(X_test)
class EncodingModel:
    def __init__(self, method='ridgecv', cv=None, alphas=None):
        self.method = method
        self.cv = cv  # default uses efficient gcv
        self.clf = None
        if alphas is not None:
            self.alphas = alphas
        else:
            self.alphas = [0.001, 0.01, 0.1]
            self.alphas.extend(np.linspace(1, 10, 10))

    def fit(self, X, y):
        if self.method == 'lr':
            self.clf = LinearRegression()
        elif self.method == 'ridgecv':
            self.clf = RidgeCV(cv=self.cv, alphas=self.alphas)
        else:
            raise Exception(f'method {self.method} not implemented')

        self.clf.fit(X, y)

    def predict(self, X):
        return (self.clf.predict(X))
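A hedged usage sketch for EncodingModel; the toy arrays are placeholders:

import numpy as np
rng = np.random.RandomState(0)
X_tr, y_tr = rng.randn(50, 3), rng.randn(50)
enc = EncodingModel(method='ridgecv')
enc.fit(X_tr, y_tr)
y_hat = enc.predict(rng.randn(10, 3))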
Example #33
def impute_age(traintest):
    agetrain = traintest[pd.notnull(traintest['Age'])]
    agetest = traintest[pd.isnull(traintest['Age'])]
    columns = agetrain.columns.difference(['Age'])
    
    et = ExtraTreesRegressor(n_estimators=50)
    et.fit(agetrain[columns], agetrain.Age)    
    modelselector = SelectFromModel(et, prefit=True, threshold = 0.01)    
    Xtrainage = modelselector.transform(agetrain[columns])
    Xtestage = modelselector.transform(agetest[columns])
    
    knn = KNeighborsRegressor()
    ridge = RidgeCV(cv = 5)
    forest = ExtraTreesRegressor(n_estimators=50)
    
    ridge.fit(Xtrainage, agetrain.Age)
    forest.fit(Xtrainage, agetrain.Age)
    knn.fit(Xtrainage, agetrain.Age)
    
    missingAge1 = ridge.predict(Xtestage)
    missingAge2 = forest.predict(Xtestage)
    missingAge3 = knn.predict(Xtestage)
    missingAge = (missingAge1 + missingAge2 + missingAge3)/3
    return missingAge
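Hand-averaging the three regressors, as impute_age does, matches what sklearn's VotingRegressor provides; a minimal sketch of the same ensemble under that API:

from sklearn.ensemble import ExtraTreesRegressor, VotingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor

age_ensemble = VotingRegressor([
    ('ridge', RidgeCV(cv=5)),
    ('forest', ExtraTreesRegressor(n_estimators=50)),
    ('knn', KNeighborsRegressor()),
])
# age_ensemble.fit(Xtrainage, agetrain.Age)
# missingAge = age_ensemble.predict(Xtestage)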
Example #34
    def fitLakeLevels( self, flowData, lakeData, **kwargs ):
        # model lake levels from stream flows
        
        xTrain = self.setDelay( flowData, kwargs[ 'nDays' ] )

        flowScaler = preprocessing.StandardScaler().fit( xTrain )
        xTrain = flowScaler.transform( xTrain )
        self.flowScaler = flowScaler

        # fit to daily changes in elevation
        yTrain = lakeData - np.roll( lakeData, 1 )
        yTrain[ 0 ] = 0.


        if kwargs[ 'simpleModel' ]:
            model = RidgeCV( alphas = np.logspace( -2., 2. ) )
        else:
            model = ExtraTreesRegressor( n_estimators = 50, n_jobs = 4,
                                         random_state = 42 )
        

        model.fit( xTrain, yTrain )

        self.lakeModel = model

        ypreds = model.predict( xTrain )
        lakePreds = lakeData[ 0 ] + np.cumsum( ypreds )

        plt.clf()
        plt.plot( self.dates, yTrain + lakeData, label = 'Actual' )
        plt.plot( self.dates, lakePreds, label = 'Predicted' )

        plt.xlabel( 'Date' )
        plt.ylabel( 'Lake Travis Elevation (ft)' )
        plt.legend()
        plt.savefig( 'lakelevels.png' )
def scale_test_and_train_ridge(X, y):
    """
    Run a ridge regression on the model
    """
    X, X_test, y, y_test = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=.25,
                                                      random_state=3)

    X_train_scale = X_train.values
    X_val_scale = X_val.values
    X_test_scale = X_test.values

    scale = StandardScaler()

    X_train_scale = scale.fit_transform(X_train_scale)
    X_test_scale = scale.transform(X_test_scale)
    X_val_scale = scale.transform(X_val_scale)

    ridge = RidgeCV(cv=5)
    ridge.fit(X_train_scale, y_train)

    ridge.score(X_train_scale, y_train)  # train R^2 (computed but not stored)

    y_pred = ridge.predict(X_val_scale)

    print(f'Ridge Regression val R^2: {ridge.score(X_val_scale, y_val):.3f}')
    print(
        f'Ridge Regression val RMSE: {sqrt(mean_squared_error(y_val, y_pred)):.3f}'
    )

    return ridge.coef_
Example #36
    def predict(self, X):
        '''
        Override
        '''
        results = []

        X_ = np.array(X)

        # prediction for each observation
        for i in range(0, X_.shape[0]):
            X_actual = X_[i, :].reshape(1, -1)
            # we can calulate the coefficients for one row at a time
            actual_leaf_ids = self._extract_leaf_nodes_ids(X_actual)
            # calculate coefficients weights alpha_i(X_actual)
            alphas = self._get_forest_coefficients(actual_leaf_ids)
            # X_i - X_actual
            X_disc = self._X_train - X_actual
            # weighted ridge: the forest coefficients act as sample weights
            ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(
                X_disc, self._Y_train, sample_weight=alphas)
            # ridge predictions
            results.append(ridge.predict(X_actual)[0])

        return np.array(results).reshape(-1)
Example #37
def ridgecv():
    data = init_data()
    X = data[:, 1:]
    # X is a 2-D array with one row per sample; for ridge training X must be
    # 2-D and Y a 1-D array
    Y = data[:, 0]  # 1-D array
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.3,
                                                        random_state=42)
    # train_test_split randomly splits X, Y into train and test sets
    alphas = [0.03, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 1, 1.5, 2]
    ridgecv = RidgeCV(alphas, store_cv_values=True)
    ridgecv.fit(X_train, y_train)
    # # pick the alpha manually:
    # smallest_idx = ridgecv.cv_values_.mean(axis=0).argmin()
    # using cv_values_ requires store_cv_values=True when constructing RidgeCV
    # # averaging over axis 0 gives the mean error across all samples for each
    # # candidate alpha, i.e. one entry per alpha in the list
    # print("manually selected", alphas[smallest_idx])
    smallest_idx = ridgecv.cv_values_.mean(axis=0).argmin()
    f, ax = plt.subplots(figsize=(7, 5))
    ax.set_title(r"various values of a")
    xy = (alphas[smallest_idx], ridgecv.cv_values_.mean(axis=0)[smallest_idx])
    xytext = (xy[0] + .01, xy[1] + .1)
    ax.annotate(r'choose this a',
                xy=xy,
                xytext=xytext,
                arrowprops=dict(facecolor='black', shrink=0, width=0))
    #https://blog.csdn.net/qq_30638831/article/details/79938967
    ax.plot(alphas, ridgecv.cv_values_.mean(axis=0))
    plt.show()

    print("sklearn指定最优alpha值:", ridgecv.alpha_)
    print(ridgecv.coef_)
    print(ridgecv.intercept_)
    test_Y_pred = ridgecv.predict(X_test)
    print("Test-set MSE:", mean_squared_error(y_test, test_Y_pred))
     #else:
         #model = GPy.models.GPRegression(X_train, Y_train, kernel=k)
     icmk = GPy.util.multioutput.ICM(input_dim=X.shape[1], num_outputs=6, 
                                     kernel=k, W_rank=args.rank)
     model = GPy.models.GPCoregionalizedRegression(X_train_list,
                                                   Y_train_list,
                                                   kernel=icmk)
     model.optimize(messages=True, max_iters=100)
     print(model)
 
     # Get predictions
     info_dict = {}
     preds_list = []
     vars_list = []
     if args.model == 'ridge' or args.model == 'svr':
         preds = model.predict(X_test)
         if args.label_preproc == 'scale':
             preds = Y_scaler.inverse_transform(preds)
         elif args.label_preproc == 'warp':
             preds += 50
         info_dict['mae'] = MAE(preds, Y_test.flatten())
         info_dict['rmse'] = np.sqrt(MSE(preds, Y_test.flatten()))
         info_dict['pearsonr'] = pearsonr(preds, Y_test.flatten())
     else:
         # TODO: check if this makes sense
         #preds, vars = model.predict(X_test)
         #X_test_pred, Y_test_pred, index = GPy.util.multioutput.build_XY(X_test_list, Y_test_list)
         #noise_dict = {'output_index': X_test_pred[:,-1:].astype(int)}
         #preds, vars = model.predict_noiseless(X_test, Y_metadata=noise_dict)
         for emo_id, emo in enumerate(EMOS):
             # TODO: preprocessing
Example #39
trainingFeatures['network'] = predictions
predictions = net.activateOnDataset(testDs)
testingFeatures['network'] = predictions

#%%
trainingFeaturesPca, testingFeaturesPca = getPca(trainingFeatures, trainingTarget, testingFeatures, 3)

for col in trainingFeaturesPca.columns:
    trainingFeatures[col] = trainingFeaturesPca[col]
    testingFeatures[col] = testingFeaturesPca[col]

#%%
model = RidgeCV(alphas=[0.01, 1.0, 10.0])
model.fit(trainingFeatures, trainingTarget)

predictions = model.predict(trainingFeatures)
trainingFeatures['RidgeCV'] = predictions

predictions = model.predict(testingFeatures)
testingFeatures['RidgeCV'] = predictions

#%%
model = SGDRegressor()
model.fit(trainingFeatures, trainingTarget)

predictions = model.predict(trainingFeatures)
trainingFeatures['SGDRegressor'] = predictions
predictions = model.predict(testingFeatures)
testingFeatures['SGDRegressor'] = predictions

#%%
                            scoring='mean_absolute_error', cv=10)
ozone_ridgecv_reg = ozone_ridgecv_reg.fit(ozone_train.drop('ozone', axis=1),
                                          ozone_train['ozone'])

## Compare regularization models
print("Linear Coef: " + str(ozone_ln_reg.coef_)
      + "\nRidge Coef: " + str(ozone_ridge_reg.coef_) 
      + "\nLasso Coef: " + str(ozone_lasso_reg.coef_)
      + "\nCV Coef: " + str(ozone_ridgecv_reg.coef_)
      + "\nCV alpha: " + str(ozone_ridgecv_reg.alpha_))

# Predict using models and evaluate
ozone_ln_pred = ozone_ln_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_ridge_pred = ozone_ridge_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_lasso_pred = ozone_lasso_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_ridgecv_pred = ozone_ridgecv_reg.predict(ozone_test.drop('ozone', axis=1))

## Calculate MAE, RMSE, and R-squared for all models
ozone_ln_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_ln_pred)
ozone_ln_rmse = sqrt(metrics.mean_squared_error(ozone_test['ozone'], ozone_ln_pred))
ozone_ln_r2 = metrics.r2_score(ozone_test['ozone'], ozone_ln_pred)

ozone_ridge_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_ridge_pred)
ozone_ridge_rmse = sqrt(metrics.mean_squared_error(ozone_test['ozone'], ozone_ridge_pred))
ozone_ridge_r2 = metrics.r2_score(ozone_test['ozone'], ozone_ridge_pred)

ozone_lasso_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_lasso_pred)
ozone_lasso_rmse = sqrt(metrics.mean_squared_error(ozone_test['ozone'], ozone_lasso_pred))
ozone_lasso_r2 = metrics.r2_score(ozone_test['ozone'], ozone_lasso_pred)

ozone_ridgecv_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_ridgecv_pred)
Example #41
# temp_train are the intermediate training data, ie, outputs of the 3 level-0 learners, also inputs of the level-1 learner
temp_train = np.zeros((  len(Y2)   ,len(clfs)    ))
temp_test=np.zeros((  Xtest.shape[0]   ,len(clfs)    ))
for i, clf in enumerate(clfs):
    clf.fit(X1,Y1)                             # train each level-0 learner
    temp_train[:,i] = clf.predict(X2)          # intermediate data for level-1 learner given data X2 are generated
    temp_test[:,i] = clf.predict(Xtest)        # intermediate data for level-1 learner given data Xtest are also generated
    

# ====================== Training the level-1 learner  ===================
# level-1 learner
# cv = 5: 5 folds cross validation    
alphas = [0.0001, 0.005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0]
stk = RidgeCV(alphas=alphas, normalize=True, cv=5).fit(temp_train, Y2)



# ====================== Prediction  ===================
# predict the test data and write output Y_hat to .csv file 
Y_hat = stk.predict(temp_test)
fh = open('n_50_predictions.csv','w')    # open file for upload
fh.write('ID,Prediction\n')         # output header line
for i,yi in enumerate(Y_hat):
  fh.write('{},{}\n'.format(i+1,yi)) # output each prediction
fh.close()  

print('Writing finished!')
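The manual file loop above can also be written with pandas; a sketch producing the same two-column CSV:

import pandas as pd
pd.DataFrame({'ID': np.arange(1, len(Y_hat) + 1),
              'Prediction': Y_hat}).to_csv('n_50_predictions.csv', index=False)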



    model_elastic, train_x, y, scoring='neg_mean_squared_error',
    cv=kfold).mean()) + "\nTime of " + str(t_elastic))
pred_elastic = pd.DataFrame(
    data=np.expm1(model_elastic.predict(test_x)),  # values
    index=range(TRAIN_ROWS, TRAIN_COLS),  #Set Index
    columns=['SalePrice'])  # 1st column as index

#Use Ridge to Cross Validate and Model
t3 = time.time()
model_ridge = RidgeCV(alphas=alphas).fit(train_x, y)
t_ridge = time.time() - t3  # t3, not t1, marks the ridge start
print("\nRidge Score with CV: " + str(-1 * cross_val_score(
    model_ridge, train_x, y, scoring='neg_mean_squared_error',
    cv=kfold).mean()) + "\nTime of " + str(t_ridge))
pred_ridge = pd.DataFrame(
    data=np.expm1(model_ridge.predict(test_x)),  # values
    index=range(TRAIN_ROWS, TRAIN_COLS),  #Set Index
    columns=['SalePrice'])  # 1st column as index

#Use Random Forest to make estimator
from sklearn.ensemble import RandomForestRegressor
t4 = time.time()
clf = RandomForestRegressor().fit(train_x, y)  # regression target must stay numeric, not byte strings
t_rf = time.time() - t4
print("\nRandom Forest Score with CV: " +
      str(np.mean(cross_val_score(clf, train_x, y, cv=10))) + "\nTime of " +
      str(t_rf))
pred_rf = pd.DataFrame(
    data=np.expm1(clf.predict(test_x)),  # values
    index=range(TRAIN_ROWS, TRAIN_COLS),  #Set Index
    columns=['SalePrice'])  # 1st column as index
Example #43
ridge_model = Ridge(
    solver='auto', fit_intercept=True, alpha=1.0,
    max_iter=100, normalize=False, tol=0.05, random_state = 1,
)
ridge_modelCV = RidgeCV(
    fit_intercept=True, alphas=[5.0],
    normalize=False, cv = 2, scoring='neg_mean_squared_error',
)
ridge_model.fit(X_train, Y_train)
ridge_modelCV.fit(X_train, Y_train)

Y_dev_preds_ridge = ridge_model.predict(X_dev)
Y_dev_preds_ridge = Y_dev_preds_ridge.reshape(-1, 1)
print("RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridge))

Y_dev_preds_ridgeCV = ridge_modelCV.predict(X_dev)
Y_dev_preds_ridgeCV = Y_dev_preds_ridgeCV.reshape(-1, 1)
print("CV RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridgeCV))

ridge_preds = ridge_model.predict(X_test)
ridge_preds = np.expm1(ridge_preds)
ridgeCV_preds = ridge_modelCV.predict(X_test)
ridgeCV_preds = np.expm1(ridgeCV_preds)

def aggregate_predicts3(Y1, Y2, Y3, ratio1, ratio2):
    assert Y1.shape == Y2.shape
    return Y1 * ratio1 + Y2 * ratio2 + Y3 * (1.0 - ratio1-ratio2)

# Y_dev_preds = aggregate_predicts3(Y_dev_preds_rnn, Y_dev_preds_ridgeCV, Y_dev_preds_ridge, 0.4, 0.3)
# print("RMSL error for RNN + Ridge + RidgeCV on dev set:", rmsle(Y_dev, Y_dev_preds))
rows = []
ys = []
while rowindex < N:
  rowindex = rowindex + 1
  data = input().split()
  feature = [float(data[0]), float(data[1])]
  #print(np.vander(feature, 5).flatten())
  rows.append(np.vander(feature, 5).flatten())
  ys.append(float(data[-1]))

#print(rows)
ridge = RidgeCV(alphas=[0.1, 1.0, 10.0])
ridge.fit(rows, ys)

print(ridge.alpha_)
print(ridge.coef_)
print(ridge.intercept_)


predictNum = int(input())
rowindex = 0
rows = []
while rowindex < predictNum:
  rowindex = rowindex + 1
  data = input().split()
  feature = [float(data[0]), float(data[1])]
  rows.append(np.vander(feature, 5).flatten())

for value in ridge.predict(rows):
    print(value)
Example #45
	for l in range(len(bvar)):
		ind=int(uni(0, 1)*len(bvar))
		ar.append(bvar[ind][1])
		ar1.append(bvar[ind][2])
		y.append(bvar[ind][0])
	#write as arrays, stack them 
	ar=np.array(ar); ar1=np.array(ar1); y=np.array(y)
	A=np.vstack([ar, ar1, np.ones(len(bvar))]).T
	
	#cross-validate the ridge regression 
	cl=RidgeCV(alphas=[0.5, 1.0, 50.0, 500.0])
	#cl=Ridge(alpha=1.0)
	cl.fit(A, y)
	#if cl.coef_[0]>=0:
	i+=1

	#arrays for predicted values and for the a, b, c coefficients	
	val_arr.append(cl.predict([[32.21, 31.01, 1.]]))  # predict expects a 2-D array
	coef_arr.append([cl.coef_[0], cl.coef_[1], cl.intercept_])

print('The mean and standard deviation for this object is ')
print(np.std(val_arr), np.mean(val_arr))
coef_arr = np.array(coef_arr)
print("Coefficients of the ridge and their standard deviations ")
print(np.mean(coef_arr[:,0]), np.std(coef_arr[:,0]), np.mean(coef_arr[:,1]), np.std(coef_arr[:,1]), np.mean(coef_arr[:,2]), np.std(coef_arr[:,2]))

#plot the coefficient arrays
plt.hist(coef_arr[:,1], alpha=0.3)
plt.hist(coef_arr[:,0], alpha=0.3)
plt.show()
Example #46
    time_taken = end_time - start_time
    print ("Time taken for pre-blending calculations: ", time_taken)

    print ("CV-Results", cv_results)
    
    # Start blending!    
    print ("Blending models.")

    alphas = [0.0001, 0.005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0]
    
    bclf = RidgeCV(alphas=alphas, normalize=True, cv=5)
    bclf.fit(blend_train, Y_dev)       
    print ("Ridge Best alpha = ", bclf.alpha_)
   
    # Predict now
    Y_test_predict = bclf.predict(blend_test)
    
    if (DEVELOP):
        score1 = metrics.mean_absolute_error(Y_test, Y_test_predict)
        score = normalized_gini(Y_test, Y_test_predict)
        print('Ridge MAE = %s normalized Gini = %s' % (score1, score))
    else: # Submit! and generate solution
        score = cv_results.mean()      
        print ('Avg. CV-Score = %s' % (score))
        #generate solution
        submission = pd.DataFrame({"Id": testidx, "cost": Y_test_predict})
        submission = submission.set_index('Id')
        submission.to_csv("bench_gen_stacking.csv") 


Example #47
# - **alphas:** array of alpha values to try

# create an array of alpha values
alpha_range = 10.**np.arange(-2, 3)
alpha_range


# select the best alpha with RidgeCV
from sklearn.linear_model import RidgeCV
ridgeregcv = RidgeCV(alphas=alpha_range, normalize=True, scoring='neg_mean_squared_error')
ridgeregcv.fit(X_train, y_train)
ridgeregcv.alpha_


# predict method uses the best alpha value
y_pred = ridgeregcv.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


# ### Lasso regression
# 
# - [Lasso](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) documentation
# - **alpha:** must be positive, increase for more regularization
# - **normalize:** scales the features (without using StandardScaler)

# try alpha=0.001 and examine coefficients
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=0.001, normalize=True)
lassoreg.fit(X_train, y_train)
print(lassoreg.coef_)
Example #48
embs = util.load_embs(EMBS)
X = []

with open(INPUTS) as f:
    for line in f:
        X.append(util.preprocess_sent(line.split('_')[1]))

Y = np.loadtxt(LABELS)[:, 1]

###################
# PREPROCESS X
X  = np.array([util.average_sent(sent, embs) for sent in X])
#print X
#print X.shape

####################
# RIDGE
m = RidgeCV()
#m = KernelRidge(kernel='rbf')
#m = Ridge()
m.fit(X[:SPLIT], Y[:SPLIT])
preds = m.predict(X[SPLIT:])
Y_test = Y[SPLIT:]

for tup in list(zip(preds, Y_test))[:20]:
    print(tup)
print(MAE(preds, Y_test))
print(np.sqrt(MSE(preds, Y_test)))
print(pearsonr(preds, Y_test))

Example #49
# Save regressors
pickle_file = 'regressor.pickle'

try:
  f = open(pickle_file, 'wb')
  save = {
    'random_forest_regressor': rfr,
    'ridge': ridge,
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise

# Load regressor
pickle_file = 'regressor.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  rfr = save['random_forest_regressor']
  ridge = save['ridge']
  del save

# Predict test_data
y_pred_rfr = rfr.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
pd.DataFrame({'X1_Random_Forest': y_pred_rfr, 'X1_Ridge': y_pred_ridge}).to_csv('Results from Hang Yao.csv')
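For scikit-learn estimators specifically, the project docs generally recommend joblib over raw pickle for large NumPy-backed objects; a hedged equivalent of the save/load above:

import joblib
joblib.dump({'random_forest_regressor': rfr, 'ridge': ridge}, 'regressor.joblib')
loaded = joblib.load('regressor.joblib')
rfr, ridge = loaded['random_forest_regressor'], loaded['ridge']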

Example #50
train, test, train_ret, test_ret, train_stock, test_stock = \
    train_test_split(inst, ret, stock, test_size=0.4, random_state=1)

# SVR modeling
from sklearn.svm import SVR
from sklearn.linear_model import RidgeCV
from sklearn.feature_selection import RFE

rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
poly = SVR(kernel='poly', C=1e3, degree=2)
rig=RidgeCV()

rig.fit(train, train_ret)
rig.coef_
test_predict=rig.predict(test)
hits= ((test_ret>0) & (test_predict>0)) | ((test_ret<0) & (test_predict<0))
hit_ratio=1.0*sum(hits)/len(test_ret)


plt.figure(2)
plt.subplot(1,2,1)
plt.plot(test_ret, 'ko')
plt.plot(test_predict, 'ro')
plt.ylim([-1,1])
plt.xlim([0,len(test_ret)])
plt.plot([0,100],[0,0],'g--')
plt.xticks(range(1,len(test_ret)), test_stock, rotation='vertical')
plt.title('Actual and Predicted Returns')
plt.tick_params(axis='x', labelsize=5)
Example #51
    #     cv(model, X[:m], Y[:m])

    # cv(model, X, Y)
    print(OKGREEN)
    print("Done building models")
    print(ENDC)

    # -----------------------------------------------------------
    # Predictions

    print(OKBLUE)
    print("Making predictions")
    print(ENDC)

    clfP = clf.predict(clfTestX)
    linP = linreg.predict(newTestX)

    print(OKGREEN)
    print("Done making predictions")
    print(ENDC)

    # -----------------------------------------------------------
    # Analyze residuals
    print(OKGREEN)
    print("Analyzing residuals: ")
    print("The following variables shall be defined.")
    print("Indices in various arrays correspond to one another")
    print("------------------------------------------------------------------------------")
    print("msp_LinearRegression_i   : indices of mispredictions > 5")
    print("msp_LinearRegression_y   : mispredicted labels")
    print("msp_LinearRegression     : mispredictions")
f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

###############################################################################
# At first, a linear model will be applied on the original targets. Due to the
# non-linearity, the model trained will not be precise during the
# prediction. Subsequently, a logarithmic function is used to linearize the
# targets, allowing better prediction even with a similar linear model as
# reported by the median absolute error (MAE).

f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

regr = RidgeCV()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

ax0.scatter(y_test, y_pred)
ax0.plot([0, 2000], [0, 2000], '--k')
ax0.set_ylabel('Target predicted')
ax0.set_xlabel('True Target')
ax0.set_title('Ridge regression \n without target transformation')
ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax0.set_xlim([0, 2000])
ax0.set_ylim([0, 2000])

regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        func=np.log1p,
                                        inverse_func=np.expm1)
regr_trans.fit(X_train, y_train)
Example #53
X_pred = X[-predPeriod:]

X = X[:-predPeriod] #re-sizing the features for training
dataset.dropna(inplace=True) # get rid of naN for 'label' column

# create label 
y = np.array(dataset['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)  # from sklearn.model_selection in modern versions

# use linearRegression as algrithm
#clf = LinearRegression()
clf = RidgeCV(alphas=[0.1, 0.5, 1, 10])
clf.fit(X_train, y_train)
#start_time = time.time()
y_pred = clf.predict(X_pred)
#print time.time() - start_time
accuracy = clf.score(X_test, y_test)
# visualize Learning Curves
#ML.ModelLearning(X, y)
#ML.ModelComplexity(X_train, y_train)

#Linear slope calculation
#print clf.alpha_
#print clf
#print clf.coef_
#print clf.intercept_
print('predict accuracy is: {:0.2f}'.format(accuracy))


# build a column in data for predict result
Example #54
def rigRegModel(X, T, y):
    ridge = RidgeCV(alphas=[0.1, 2.0, 5.0])
    ridge.fit(X, y)
    pre_y = ridge.predict(T)
    # print(pre_y)
    return pre_y
def analyze():
    global test
    global train
    global in_cols
    global target_col
    global ss
    global features
    global fields

    #ADD DATA FROM INPUT FIELDS TO TEST FILE
    addFeatures('C:/Users/Kwabena-Kobiri/Desktop/test.csv', features, fields)
    #ID_output['text'] = features[0]
    ID_output['text'] = features['ID']
    #Modules imported for Analysis
    import pandas as pd
    from sklearn.linear_model import RidgeCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score

    test = pd.read_csv('C:/Users/Kwabena-Kobiri/Desktop/test.csv',
                       parse_dates=['date'])
    train = pd.read_csv('C:/Users/Kwabena-Kobiri/Desktop/train.csv',
                        parse_dates=['date'])

    #ADD FEATURES DATAFRAME OBJECT TO THE TEST DATA SET
    #new_features.to_csv('test.csv', mode='a', header=False)

    #Data split for validation
    train_all = train.copy().dropna()
    train = train_all.loc[train_all.date < '2011-01-01']
    valid = train_all.loc[train_all.date > '2011-01-01']
    #print(train.shape, valid.shape)

    # Define input and output columns
    in_cols = train.columns[6:]
    target_col = 'burn_area'
    #in_cols

    #clear()

    # Get our X and y training and validation sets ready
    X_train, y_train = train[in_cols], train[target_col]
    X_valid, y_valid = valid[in_cols], valid[target_col]

    # Create and fit the model
    model = RidgeCV()
    model.fit(X_train, y_train)

    # Make predictions
    preds = model.predict(X_valid)
    #preds = model.predict(features[2:])

    # Score
    #mean_squared_error(y_valid, preds)**0.5 # RMSE - should match Zindi score. Lower is better

    #VISUALIZE SUBMISSION FILE
    ss = pd.read_csv('C:/Users/Kwabena-Kobiri/Desktop/SampleSubmission.csv')
    #ss.head()

    # So we need to predict the burn area for each row in test.

    # Add the same features to test as we did to train:
    test['month'] = test.date.dt.month
    test['year'] = test.date.dt.year

    #donar_train['project_submitted_datetime'] = pd.to_datetime(donar_train.project_submitted_datetime, format='%d-%m-%Y %H:%M')

    # Get predictions
    preds = model.predict(
        test[in_cols].fillna(0)
    )  # fillna(0) here could be improved by examining the missing data and filling more appropriately.

    # Add to submission dataframe
    ss['Prediction'] = preds

    # View
    #ss.head()

    # Save ready for submission:
    ss.to_csv('C:/Users/Kwabena-Kobiri/Desktop/SampleSubmissionOG.csv',
              index=False)

    new = pd.read_csv('C:/Users/Kwabena-Kobiri/Desktop/SampleSubmissionOG.csv')
    #prediction_output['text'] = round(new.Prediction[1], 5)
    prediction_output['text'] = round(new.Prediction[1], 5)
Example #56
ds = list(range(20150701, 20150732)) + list(range(20150801, 20150831))
X3 = np.reshape(np.array(range(122, 184)), (-1, 1)).astype(int)
 
for song_id in song_ids:
    # Model training
    sql = "select ds, play_times from music_datas " + \
        "where song_id=='%s' and ds<'20150701'" % song_id
    cu2 = conn.cursor()
    cu2.execute(sql)
    ret = cu2.fetchall()
    X1, Y1 = generate_np_data(ret)
    # print X1, Y1
    clf.fit(X1, Y1)

    # Predict
    Y3 = clf.predict(X3).tolist()
    # break
    predicts = []
    for (x, y) in zip(ds, Y3):
        if y < 0:
            y = 0
        predicts.append((song_id[0], x, round(y)))
    cu2.executemany('insert into music_prediction values (?, ?, ?)', predicts)

    # process
    pro.ins().show()

print "alpha: %f" % clf.alpha_
conn.commit()
conn.close()
Example #57
mask = np.logical_or(conditions == 'face', conditions == 'house')
fmri = fmri[mask]
session_id_fmri = session_id_fmri[mask]
conditions = conditions[mask]
"""

train_index = np.where(session_id_fmri != 6)
test_index = np.where(session_id_fmri == 6)
# Split into train and test sets
fmri_train, fmri_test = (fmri[train_index], fmri[test_index])
design_train, design_test = design[train_index], design[test_index]
stimuli_train, stimuli_test = stimuli[train_index], stimuli[test_index]

ridge = RidgeCV()
ridge.fit(fmri_train, design_train)
prediction = ridge.predict(fmri_test)
# ridge_coef = ridge.coef_[4]  # 'face' vs. 'house'
ridge_coef = -ridge.coef_[3] + ridge.coef_[4]  # 'face' vs. 'house'
coef_img = masker.inverse_transform(ridge_coef)
coef_map = coef_img.get_data()
threshold = np.percentile(np.abs(coef_map), 98)

# Plot stat map
plot_stat_map(coef_img, bg_img=haxby_dataset.anat[0],
              display_mode='z', cut_coords=[-5],
              title=model+" weights")
"""
# Plot time-series
onset = int(onsets[6][3]/tr)
time_series = prediction[onset + delay: onset + delay + tr, 1]
plt.plot(time_series)
Example #59
regr_svm = svm.SVR()
regr_rfr = RandomForestRegressor(n_estimators=10, random_state=None)
regr_knn = KNeighborsRegressor(n_neighbors=5)
regr_ridge = RidgeCV()

regr.fit(day_target_train, price_data_train)
regr_svm.fit(day_target_train, price_data_train)
regr_rfr.fit(day_target_train, price_data_train)
regr_knn.fit(day_target_train, price_data_train)
regr_ridge.fit(day_target_train, price_data_train)

prediction = regr.predict(day_target_test)
prediction_svm = regr_svm.predict(day_target_test)
prediction_rfr = regr_rfr.predict(day_target_test)
prediction_knn = regr_knn.predict(day_target_test)
prediction_ridge = regr_ridge.predict(day_target_test)

print(prediction)
print(prediction_svm)
print(prediction_rfr)
print(prediction_knn)
print(prediction_ridge)
print(price_data_test)

plt.plot(day_target_train, price_data_train, 'r')
plt.plot(day_target_test, price_data_test, 'g')
plt.plot(day_target_test, prediction_knn, 'c')
plt.plot(day_target_test, prediction_svm, 'b')
plt.plot(day_target_test, prediction, 'k')
plt.plot(day_target_test, prediction_ridge, 'y')
plt.plot(day_target_test, prediction_rfr, 'm')  # distinct color; the original reused 'y' for both ridge and rfr
names = ["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS",\
    "RAD","TAX","PTRATIO","B","LSTAT"]

features = get_features_matrix(np.loadtxt('../housing_train.dat'))

labels = np.loadtxt('../housing_prices_train.dat')
alphas = 10**np.linspace(-5, -1, 100)

model = RidgeCV(normalize=True, store_cv_values=True, alphas=alphas)
model.fit(features, labels)
print(model.coef_)
print(model.alpha_)
cv_errors = np.mean(model.cv_values_, axis=0)

# Test the model
features_test = get_features_matrix(np.loadtxt('../housing_test.dat'))
prices_test = np.loadtxt('../housing_prices_test.dat')
prices_pred = model.predict(features_test)
score = np.mean((prices_test-prices_pred)**2)
print('Score', score)

pl.semilogx(alphas, cv_errors)

pl.figure()
ax = pl.subplot(111)
pl.plot(range(len(model.coef_)), model.coef_, 'o')
#ax.set_xticklabels(names)
#ax.set_xticks(range(len(model.coef_)))
pl.show()