def ridge_predict(train_data, train_target, test_data):

	# Prepare the model
	alpha_ranges = [1e-3, 1e-2, 1e-1, 1, 1e2, 1e3,
					2e3, 2.5e3, 3e3, 3.5e3, 4e3, 
					5e3, 6e3, 6.1e3, 6.15e3, 6.25e3, 6.3e3, 6.4e3, 7e3, 
					7.75e3, 7.9e3, 8e3, 8.1e3, 8.2e3, 8.25e3, 8.3e3, 8.4e3, 8.5e3, 8.75e3, 9e3, 9.25e3, 9.4e3, 9.5e3, 9.6e3, 9.75e3,
					1e4, 1.25e4, 1.4e4, 1.5e4, 1.55e4, 1.58e4, 1.6e4, 1.625e4, 1.65e4, 1.7e4, 1.725e4, 1.74e4, 1.75e4, 1.76e4, 1.78e4, 1.85e4, 
					2e4, 2.25e4, 2.5e4, 3e4, 4e4,  
					0.5e5, 0.75e5, 1e5, 1.25e5, 1.5e5, 
					0.8e6, 0.9e6, 1e6, 1.1e6, 1.2e6, 1.25e6, 1.28e6, 1.3e6, 1.32e6, 1.33e6, 1.34e6, 1.4e6, 1.5e6, 2e6,
					1e7, 1e8, 1e9, 5e9, 1e10, 5e10, 1e11, 1e12, 1e13]
	clf = RidgeCV(alphas=alpha_ranges, 
              normalize=True, cv=None, fit_intercept=False, store_cv_values=True)

	# Fit
	clf.fit(train_data, train_target)
	# print("alpha range:", alpha_ranges)
	# print("CV per alpha:",np.mean(clf.cv_values_, axis=0))
	# print("alpha used:", clf.alpha_)
	# print("fit score:", clf.score(train_data, train_target))

	# Prediction
	predictions = clf.predict(test_data)

	return predictions
def RR_cv_estimate_alpha(sspacing, tspacing, alphas):
    """
    Estimate the optimal regularization parameter using grid search from a list
    and via k-fold cross validation

    Parameters
    ----------
    sspacing : 2D subsampling ratio in space (in one direction)

    tspacing : 1D subsampling ratio in time

    alphas : list of regularization parameters to do grid search
    
    """
    #Load all training data
    (Xl_tr, mea_l, sig_l, Xh_tr,mea_h,sig_h) =  data_preprocess(sspacing, tspacing)  
    
    # RidgeCV
    from sklearn.linear_model import RidgeCV    
    ridge = RidgeCV(alphas = alphas, cv = 10, fit_intercept=False, normalize=False)
    ridge.fit(Xl_tr, Xh_tr)
    
    RR_alpha_opt = ridge.alpha_
    
    print('\n Optimal lambda:', RR_alpha_opt)
    
    # save to .mat file
    import scipy.io as io
    filename = "".join(['/data/PhDworks/isotropic/regerssion/RR_cv_alpha_sspacing',
                        str(sspacing),'_tspacing',str(tspacing),'.mat'])
    io.savemat(filename, dict(alphas=alphas, RR_alpha_opt=RR_alpha_opt))
    
    # return
    return RR_alpha_opt
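# A self-contained sketch of the same idea (alpha grid search with 10-fold CV) on
# synthetic data, since data_preprocess() is not available here; every name below
# is illustrative only.
import numpy as np
from sklearn.linear_model import RidgeCV

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 10)
y_demo = X_demo @ rng.randn(10) + 0.1 * rng.randn(200)
alpha_grid = np.logspace(-3, 3, 13)
print(RidgeCV(alphas=alpha_grid, cv=10, fit_intercept=False).fit(X_demo, y_demo).alpha_)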
Example #3
def Ridge_model(train_linear, test_linear):
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(train_linear_fea, train_linear_tar)
    ridgecv_score = ridgecv.score(train_linear_fea, train_linear_tar)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    #ridge.set_params(alpha=6,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    write_pkl(ridgecv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/ridge_params.pkl')
    return test_prediction_ridge
    
    
 def fit(self, X, y):
     """Fit the shape function of each features with the backfitting algorithm.
     Please note that the shape functions are centered (not reduced).
     
     Parameters
     ----------
     X : array-like, shape=(n_samples, n_features)
         The input samples. 
         
     Returns
     -------
     self : object
         The Generalized Additive Model with the fitted shape functions
     """
     
     n_samples, n_features = X.shape
     
     if not isinstance(self.smoothers, list):
         self.smoothers_ = [clone(self.smoothers) for i in range(n_features) ]
         self.ridge = RidgeCV(alphas = [self.ridge_alphas]*len(self.smoothers_), fit_intercept=False)
     else:
         self.smoothers_ = [clone(self.smoothers[j]) for j in range(n_features) ]
         self.ridge = RidgeCV(alphas = [self.ridge_alphas]*len(self.smoothers_), fit_intercept=False)
         
     self.y_mean_ = np.mean(y)
     self.rmse_ = []  # list to store the training error over the iterations
     y -= y.mean()
     temp = np.zeros(shape=(n_samples, n_features))  # array to store the shape functions for re-use in the next iteration
     shape_functions = np.zeros(shape=(n_samples, n_features))
     for i in range(self.max_iter):
         for j in range(n_features):
             # select all the columns except the j-th one
             idx = list(set(np.arange(0, n_features, 1)) - set([j])) 
             
             #Compute the residuals of the previous iteration          
             residuals = y.reshape((n_samples,1)) - temp[:, idx].sum(axis=1, keepdims=True).reshape((n_samples, 1)) 
             residuals -=residuals.mean()
             #print(np.amin(residuals), np.amax(residuals), 'iteration number %s'%(i+1))
            
             self.smoothers_[j].fit(X[:, j:j+1], residuals.reshape((n_samples,))) #reshape cause deprecation warning
             shape_functions[:, j]= self.smoothers_[j].predict(X[:, j:j+1])
             shape_functions[:, j] -= shape_functions[:, j].mean()
         
         # RidgeRegression on top of the shape function in order to 're-scale' each shape functions
         self.ridge.fit(shape_functions, y)
         coef = self.ridge.coef_
         shape_functions *= coef
         
         y_pred = shape_functions.sum(axis=1)
         y_pred -= y_pred.mean()
         self.rmse_.append(met.mean_squared_error(y_pred, y))
         
         temp=shape_functions.copy()
         #plt.scatter(1, np.abs(residuals.min()), c='g', label='iteration = %s'%i)
         #plt.scatter(2, np.abs(residuals.max()), c='r')
         #plt.legend()
         #plt.show()
     return self
Example #5
def regularizedreg(Xtrain,Xtest,ytrain,ytest):
    Rclf = RidgeCV(alphas=[1,2,20,40,50]) # RidgeCV(alphas=[0.1, 1.0, 2.0, 4.0, 20.0], cv=None, fit_intercept=True, scoring=None, normalize=False)
    Rclf.fit(Xtrain,ytrain);
    print("Residual sum of squares: %.2f"
         % np.mean((Rclf.predict(Xtest) - ytest) ** 2))
    print('Regularization chosen, alpha = %.2f' % Rclf.alpha_);
    print(' Coef values = ', Rclf.coef_);                                      
    print('Variance score: %.2f' % Rclf.score(Xtest, ytest))
def ridgeCV(data, targets):
    """
    Returns a RidgeCV linear model for predictions with alphas [1, 10, 50, 100, 1000]
    Takes the data and the associated targets as arguments.
    """
    model = RidgeCV(alphas=[1, 10, 50, 100, 1000])
    model.fit(data, targets)
    return model
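# Hypothetical usage of the helper above (illustration only; `data` and `targets`
# are any array-likes of matching length):
# model = ridgeCV(data, targets)
# print(model.alpha_)               # chosen from [1, 10, 50, 100, 1000]
# predictions = model.predict(data)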
def fit_Ridge(features_train, labels_train, features_pred, alphas=(0.1, 1.0, 10.0)):
	model = RidgeCV(normalize=True, store_cv_values=True, alphas=alphas)
	model.fit(features_train, labels_train)
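	# cv_values_ has shape (n_samples, n_alphas): per-sample leave-one-out squared errors for each alpha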
	cv_errors = np.mean(model.cv_values_, axis=0)
	print "RIDGE - CV error min: ", np.min(cv_errors)	
	# Test the model
	labels_pred = model.predict(features_pred)
	return labels_pred
Example #8
def orth_signal(x, atol=1e-13, rtol=0):
    """
    Returns signal orthogonal to input ensemble.
    x -> input signal [n_samples, n_neurons]
    """
    t = np.linspace(0, 1, x.shape[0])[:, None]
    f = np.arange(x.shape[1]) / x.shape[1]
    xt = np.sum(np.sin(2 * np.pi * f * 3 * t) / (f + 1), axis=1)
    w = RidgeCV(np.logspace(-6, 3, 50))
    w.fit(x, xt)
    xt = xt - w.predict(x)
    # pdb.set_trace()
    return xt
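# A hypothetical sanity check (synthetic ensemble, illustration only): the returned
# signal should be nearly orthogonal to every input column.
# x_demo = np.random.randn(500, 8)
# resid = orth_signal(x_demo)
# print(np.abs(x_demo.T @ resid).max())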
Example #9
def RidgeCVLinear(train,test):
  print('starting RidgeCVLinear ...')
  ridge=RidgeCV(normalize=True,cv=5)
  train = train.reindex(np.random.permutation(train.index))  # assign back so the shuffle takes effect
  tr_X=train.drop('LogSales',axis=1)
  tr_Y=train['LogSales']
  cutoff=math.floor(0.7*tr_Y.size)
  ridge.fit(tr_X[:cutoff],tr_Y[:cutoff])
  predY=ridge.predict(tr_X[cutoff:])
  mspe=rmspe(predY,tr_Y[cutoff:])
  print('rmspe is %9f'% mspe)
  print(train.columns)
  print(ridge.coef_)
  print('starting RidgeCVLinear ... completed')
  return ridge
    def __init__(self, num_dists=2, sigma=0.1, base_learner=None, **kwargs):
        self.num_dists = num_dists
        self.sigma = sigma
        
        if base_learner is None:
            base_learner = RidgeCV(fit_intercept=False, \
                    alphas=[0.001, 0.01, 0.1, 100, 1000], cv=None,
                    store_cv_values=True)
        
        if 'fit_intercept' not in kwargs:
            kwargs['fit_intercept'] = False

        self.base_learner = base_learner.set_params(**kwargs)
        self.R = None
        self.model = None
Example #11
def stacking(estimators):
    # training
    predictions = []
    for estim in estimators:
        estim.fit(X, y)
        predictions.append(estim.predict(X))

    agg = RidgeCV(alphas=alphas, cv=5, normalize=True, fit_intercept=True)         # aggregator
    agg.fit(np.array(predictions).T, y)

    # test
    predictions = []
    for estim in estimators:
        predictions.append(estim.predict(test_data))

    predictions = agg.predict(np.array(predictions).T)
    write_results(predictions)
def validate(nPrev, nAfter, aux_temp, aux_sun, aux_prec, get_model=False):
    X_Final = getFeature(nPrev, nAfter, aux_temp, aux_sun, aux_prec, TrainFiles)
    data_train_target = pd.read_csv(TrainTarget, sep='\t', header=None)
    y = data_train_target.loc[:,0].values

    TEST_SIZE = 0.2
    RANDOM_STATE = 0
    X_train, X_val, y_train, y_val = train_test_split(X_Final, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

    imp.fit(X_train)
    X_train = imp.transform(X_train)
    imp.fit(X_val)
    X_val = imp.transform(X_val)

    reg = RidgeCV()
    reg.fit(X_train, y_train)
    y_val_pred = reg.predict(X_val)
    print mean_squared_error(y_val, y_val_pred)
    
    if get_model:
        imp.fit(X_Final)
        X_Final = imp.transform(X_Final)
        reg_submit = RidgeCV()
        reg_submit.fit(X_Final, y)
        return reg_submit
    return mean_squared_error(y_val, y_val_pred)
def build(path):
    """
    Computes a linear regression using Ridge regularization.
    """
    print "Building the linear model using Ridge regression"
    start = time.time()

    # Load the data, the target is the last column.
    data  = np.loadtxt(path, delimiter=',')
    y = data[:,-1]
    X = data[:,0:-1]

    # Instantiate and fit the model.
    model = RidgeCV()
    model.fit(X, y)

    print "Finished training the linear model in {:0.3f} seconds".format(time.time() - start)
    return model
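# Hypothetical usage (a CSV whose last column is the target, as loaded above):
# model = build('training_data.csv')
# predictions = model.predict(new_features)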
def ridgeRegression(X,Y):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :return: report best RMSE value for tuned alpha in ridge regression
    """
    tuningAlpha = [0.1,0.01,0.001]

   # can change to model on the entire dataset but by convention splitting the dataset is a better option
   # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size = 0.10, random_state = 5)

    ridge = RidgeCV(normalize=True,scoring='mean_squared_error', alphas=tuningAlpha, cv=10)
    ridge.fit(X, Y)
    prediction = ridge.predict(X)

    print "RIDGE REGRESSION"
    print "Best Alpha value for Ridge Regression : " + str(ridge.alpha_)
    print 'Best RMSE for corresponding Alpha =', np.sqrt(mean_squared_error(Y, prediction))
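# Hypothetical call (X: feature matrix, Y: target vector):
# ridgeRegression(X, Y)   # prints the best alpha from [0.1, 0.01, 0.001] and its RMSE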
 def fit_mapping(self):
     """
     Fits the mappings from one distributions to the other
     """
     X1 = self.X1
     n1, p1 = X1.shape
     X2 = self.X2
     n2, p2 = X2.shape
     P = self.P
     c = self.c
     r = self.r
     reg_mapping = self.reg_mapping
     # mapping from X1 to X2
     self.model1to2 = RidgeCV(alphas=np.logspace(-3, 3, 7))
     self.model1to2.fit(X1, (P * c.reshape((-1, 1))) @ X2)
     # mapping from X2 to X1
     self.model2to1 = RidgeCV(alphas=np.logspace(-3, 3, 7))
     self.model2to1.fit(X2, (P.T * r.reshape((-1, 1))) @ X1)
    def map_vector_spaces(self):
        """
        Perform linear regression upon the semantic embeddings.

        - Semantic embeddings obtained from vector space of corresponding
            bilingual words of the same language.
        """
        self.logger.info('Learning transformation between Vector Spaces.')
        self.lt = RidgeCV()
        self.lt.fit(self.vector_1_list, self.vector_2_list)
def regression(x, y):
  #enet = MultiTaskElasticNetCV(l1_ratio=0.2)
  enet = RidgeCV()
  y_pred_enet = enet.fit(x, y)

  word_vals = pd.DataFrame(columns = ['coeff'])
  counter = 0
  for i in y_pred_enet.coef_[0]:
    word_vals.loc[x.columns.values[counter]] = i
    counter += 1

  predicted_vals = y_pred_enet.predict(x)
  predicted_df = pd.DataFrame(columns = ['comment','predicted'])
  predicted_df.set_index(['comment'], inplace = True)
  counter = 0
  for i in y.index.values:
    predicted_df.loc[i, 'predicted'] = predicted_vals[counter][0]
    counter += 1

  return word_vals, predicted_df
Example #18
    def fitFlowRates( self, rainData, flowData, **kwargs ):
        # model stream flows from rainfall rates

        xTrain = self.setDelay( rainData, kwargs[ 'nDays' ] )
        yTrain = flowData

        # perform feature scaling
        weatherScaler = preprocessing.StandardScaler().fit( xTrain )
        xTrain = weatherScaler.transform( xTrain )
        self.weatherScaler = weatherScaler

        if kwargs[ 'simpleModel' ]:
            model = RidgeCV( alphas = np.logspace( -2., 2. ) )
        else:
            model = ExtraTreesRegressor( n_estimators = 50, n_jobs = 4,
                                         random_state = 42 )
            
        model.fit( xTrain, yTrain )

        self.flowModel = model
Example #19
    def transform(self, X):

        if len(X.shape) == 1:
            X = np.atleast_2d(X).T

        H = self.H[self.n_washout:,:]
        yy = self.X[self.n_washout:,:]

        ## if regularization parameter is None, then determine by cross validation
        if self.lamb is None:
            ## proposals for regularization parameters
            lamb_all = [0.1, 1., 10.]
            ## initialize Ridge Regression classifier
            rr_clf = RidgeCV(alphas=lamb_all)
            ## fit the data with the linear model
            rr_clf.fit(H, yy)
            ## regularization parameter determined by cross validation
            self.lamb = rr_clf.alpha_

        else:
            rr_clf = Ridge(alpha=self.lamb)
            rr_clf.fit(H,yy)

        ## best-fit output weights
        self.ww = rr_clf.coef_

        ## store activations for future use

        return self.ww
Example #20
    def fitLakeLevels( self, flowData, lakeData, **kwargs ):
        # model lake levels from stream flows
        
        xTrain = self.setDelay( flowData, kwargs[ 'nDays' ] )

        flowScaler = preprocessing.StandardScaler().fit( xTrain )
        xTrain = flowScaler.transform( xTrain )
        self.flowScaler = flowScaler

        # fit to daily changes in elevation
        yTrain = lakeData - np.roll( lakeData, 1 )
        yTrain[ 0 ] = 0.


        if kwargs[ 'simpleModel' ]:
            model = RidgeCV( alphas = np.logspace( -2., 2. ) )
        else:
            model = ExtraTreesRegressor( n_estimators = 50, n_jobs = 4,
                                         random_state = 42 )
        

        model.fit( xTrain, yTrain )

        self.lakeModel = model

        ypreds = model.predict( xTrain )
        lakePreds = lakeData[ 0 ] + np.cumsum( ypreds )

        plt.clf()
        plt.plot( self.dates, yTrain + lakeData, label = 'Actual' )
        plt.plot( self.dates, lakePreds, label = 'Predicted' )

        plt.xlabel( 'Date' )
        plt.ylabel( 'Lake Travis Elevation (ft)' )
        plt.legend()
        plt.savefig( 'lakelevels.png' )
    def transform(self, X):

        ## make sure data is in correct form (N_samples, N_dimensions)
        if len(X.shape) == 1:
            X = np.atleast_2d(X).T

        ## store data in attribute
        self.X = X

        ## number of data points
        self.K = int(self.X.shape[0])

        ## number of dimensions
        self.D = int(self.X.shape[1])


        ## filter windows
        H = np.zeros((self.K-self.k, self.k))

        for i in xrange(self.k,self.K-1,1):
            H[i-self.k,:] = X[i-self.k:i,0]


        self.H = H

        #print(self.k)
        if len(X.shape) == 1:
            X = np.atleast_2d(X).T

        H = self.H
        yy = X[self.k:]


        if self.lamb is None:
            ## proposals for regularization parameters
            lamb_all = [0.1, 1., 10.]
            ## initialize Ridge Regression classifier
            rr_clf = RidgeCV(alphas=lamb_all)
            ## fit the data with the linear model
            #print(H.shape)
            #print(yy.shape)
            rr_clf.fit(H, yy)
            ## regularization parameter determined by cross validation
            self.lamb = rr_clf.alpha_

        else:
            rr_clf = Ridge(alpha=self.lamb)
            rr_clf.fit(H,yy)

        ## best-fit output weights
        self.ww = rr_clf.coef_

        ## store activations for future use

        return self.ww
def learn_models(df, features, label_in, label_out):
    model_in = RidgeCV(scoring="r2")
    model_in.fit(df[features], df[label_in])

    model_out = RidgeCV(scoring="r2")
    model_out.fit(df[features], df[label_out])

    with open('model_in.pkl', 'wb') as fid:
        cPickle.dump(model_in, fid)

    with open('model_out.pkl', 'wb') as fid:
        cPickle.dump(model_out, fid)
Y = data['lpsa']


def Standard_error(sample):
    std = np.std(sample, ddof=0)
    standard_error = std / math.sqrt(len(sample))
    return standard_error
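# Worked example with a hypothetical sample: np.std([2, 4, 4, 4, 5, 5, 7, 9], ddof=0) = 2.0,
# so the standard error is 2.0 / sqrt(8) ≈ 0.7071.
# print(Standard_error([2, 4, 4, 4, 5, 5, 7, 9]))  # ≈ 0.7071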


X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1)  #,test_size=0.2, random_state=1)

# set the final alpha by using RidgeCV
Lambdas = np.logspace(-5, 2, 200)
ridge_cv = RidgeCV(alphas=Lambdas,
                   normalize=True,
                   scoring='neg_mean_squared_error',
                   cv=10)
ridge_cv.fit(X_train, y_train)
print('Alpha is:' + str(round(ridge_cv.alpha_, 4)))
ridge = Ridge(alpha=ridge_cv.alpha_)

# predict
ridge.fit(X_train, y_train)
y_predict = ridge.predict(X)
y_test_predict = ridge.predict(X_test)

# model evaluation (MSE,MAE,std_error)
mse_predict = round(mean_squared_error(y_test, y_test_predict), 4)
mae_predict = round(mean_absolute_error(y_test, y_test_predict), 4)
std_error = round(Standard_error(y_test_predict), 4)
names = [
    "fixed acidity", "volatile acidity", "citric acid", "residual sugar",
    "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
    "pH", "sulphates", "alcohol", "type"
]
x = datas[names]
y = datas['quality']
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.01,
                                                    random_state=0)

models = [
    Pipeline([('Poly', PolynomialFeatures()), ('Linear', LinearRegression())]),
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear', RidgeCV(alphas=np.logspace(-4, 2, 20)))]),
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear', LassoCV(alphas=np.logspace(-4, 2, 20)))]),
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear',
               ElasticNetCV(alphas=np.logspace(-4, 2, 20),
                            l1_ratio=np.linspace(0, 1, 5)))])
]

l = np.arange(len(x_test))
pool = np.arange(1, 4, 1)
colors = []
for c in np.linspace(5570560, 255, len(pool)):
    colors.append('#%06x' % int(c))

plt.figure(figsize=(16, 8), facecolor='w')
Example #25
        [   ## Linear regression
            ('Poly',PolynomialFeatures()),
            ('Linear',LinearRegression())
        ]
    ),
    Pipeline(
        [   ## Lasso regression
            ('Poly',PolynomialFeatures()),
            ('Linear',LassoCV(alphas=np.logspace(-3,1,20)))
        ]
    ),
    Pipeline(
        [
            ## Ridge regression
            ('Poly', PolynomialFeatures()),
            ('Linear', RidgeCV(alphas=np.logspace(-3,1,10)))
        ]
    ),
    Pipeline(
        [
            ## ElasticNet regression: l1_ratio -> proportion of the L1 norm; alphas are the hyperparameters
            ('Poly', PolynomialFeatures()),
            ('Linear', ElasticNetCV(l1_ratio=np.logspace(-3,1,10),alphas=np.logspace(-3,1,10)))
        ]
    )
]
## Set up the model parameters
paramters = {
    "Poly__degree":[3,2,1,0],
    "Poly__interaction_only":[False,False,False,False],
    "Poly__include_bias":[True,False,True,True],
Example #26
def cv_rmse(model, X=X):
    rmse = np.sqrt(-cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
    return (rmse)


# setup models
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [
    5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008
]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

ridge = make_pipeline(RobustScaler(), RidgeCV(
    alphas=alphas_alt,
    cv=kfolds,
))

lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))

elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=1e7,
                 alphas=e_alphas,
                 cv=kfolds,
                 random_state=42,
                 l1_ratio=e_l1ratio))

svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))


# (assumed decorator: the bracketed regressors below parametrize the test that follows)
@pytest.mark.parametrize(['reg'], [
    [HuberRegressor()],
    [Lars()],
    [LarsCV(max_n_alphas=10)],
    [Lasso(random_state=42)],
    [LassoCV(n_alphas=10)],
    [LassoLars(alpha=0.1)],
    [LassoLarsCV(max_n_alphas=10)],
    [LassoLarsIC()],
    [LinearRegression()],
    [LinearRegression(fit_intercept=False)],
    [LinearSVR(random_state=42)],
    [OrthogonalMatchingPursuit(n_nonzero_coefs=10)],
    [OrthogonalMatchingPursuitCV()],
    [PassiveAggressiveRegressor(C=0.1)],
    [Ridge(random_state=42)],
    [RidgeCV()],
    [SGDRegressor(**SGD_KWARGS)],
    [TheilSenRegressor()],
    [SVR(kernel='linear')],
    [NuSVR(kernel='linear')],
])
def test_explain_linear_regression(boston_train, reg):
    assert_linear_regression_explained(boston_train, reg, explain_prediction)


@pytest.mark.parametrize(['reg'], [
    [SVR()],
    [NuSVR()],
])
def test_explain_libsvm_linear_regressors_unsupported_kernels(reg, boston_train):
    X, y, feature_names = boston_train
Example #28
model3 = RandomForestRegressor(n_estimators=1000,
                               criterion='mse',
                               max_depth=3,
                               n_jobs=4,
                               random_state=0,
                               verbose=0)
model4 = LinearRegression(fit_intercept=True, n_jobs=4)
model5 = LassoCV(eps=0.001,
                 n_alphas=200,
                 cv=5,
                 fit_intercept=True,
                 max_iter=1000,
                 tol=1e-4,
                 random_state=0)  # pick the single best alpha from the 200 candidates
model6 = RidgeCV(alphas=tuple(np.linspace(0, 1, 200)),
                 fit_intercept=True,
                 cv=5)

models = [model00, model0, model1, model2, model3, model4, model5, model6]
names = [
    'CatBoost',
    'LightGBM',
    'XGBOOST',
    'GBDT',
    'RF',
    'LR',
    'Lasso',
    'Ridge',
]

for name, model in zip(names, models):
Example #29
ytest = y[istest]
ntest = ytest.size

#X = load_mat('prostate')
# Hack to use the correct dataset.
#X['Xtest'][8][1] = 3.804438
# Rescale all data at once.
#Xscaled = _scale(np.append(X['Xtrain'], X['Xtest'], axis=0))
#Xtrain = Xscaled[0:67,:]
#Xtest = Xscaled[67:,:]
#ytrain = X['ytrain']
#ytest = X['ytest']

### Process data

methods = [LinearRegression(), RidgeCV(cv=3), LassoCV()]
method_names = ["LS", "Ridge", "Lasso"]

# Hash table to store parameters and performance, indexed by method name
coefHt = {}
mseHt = {}
stderrHt = {}

for i, method in enumerate(methods):
    name = method_names[i]
    clf = method
    model = clf.fit(Xtrain, ytrain.ravel())
    coef = np.append(model.intercept_, model.coef_)
    coefHt[name] = coef
    yhat = model.predict(Xtest)
    #mse = mean_squared_error(yhat, ytest)
Example #30
X_train = X[:n_trains]
Y_train = train_df.target.values.reshape(-1, 1)

X_dev = X[n_trains:n_trains+n_devs]
Y_dev = dev_df.target.values.reshape(-1, 1)

X_test = X[n_trains+n_devs:]
print(X.shape, X_train.shape, X_dev.shape, X_test.shape)

print("Fitting Ridge model on training examples...")
ridge_model = Ridge(
    solver='auto', fit_intercept=True, alpha=1.0,
    max_iter=100, normalize=False, tol=0.05, random_state = 1,
)
ridge_modelCV = RidgeCV(
    fit_intercept=True, alphas=[5.0],
    normalize=False, cv = 2, scoring='neg_mean_squared_error',
)
ridge_model.fit(X_train, Y_train)
ridge_modelCV.fit(X_train, Y_train)

Y_dev_preds_ridge = ridge_model.predict(X_dev)
Y_dev_preds_ridge = Y_dev_preds_ridge.reshape(-1, 1)
print("RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridge))

Y_dev_preds_ridgeCV = ridge_modelCV.predict(X_dev)
Y_dev_preds_ridgeCV = Y_dev_preds_ridgeCV.reshape(-1, 1)
print("CV RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridgeCV))

ridge_preds = ridge_model.predict(X_test)
ridge_preds = np.expm1(ridge_preds)
ridgeCV_preds = ridge_modelCV.predict(X_test)
Example #31
 for yi in yIdx:
     print(yDescr[cnt])
     cnt += 1
     
     
     y = yAll.iloc[:,yi]
     # log transform
     y = np.log(y+1)
     
     
     
             
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
     
     # ridge
     regRidge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X_train, y_train)
     print('ridge: ', regRidge.score(X_test, y_test))
     
     # lasso
     regLasso = LassoCV(cv=5, random_state=0).fit(X_train, y_train.ravel())
     print('lasso: ', regLasso.score(X_test, y_test))
     
     # forest
     regForest = RandomForestRegressor(random_state=0, max_features='sqrt', oob_score=True, min_samples_leaf=0.05).fit(X_train, y_train.ravel())
     print('random forest: ', regForest.score(X_test, y_test))
     
     # elastic net
     regElastic = ElasticNetCV(cv=5, random_state=0, selection='random', l1_ratio=[.1, .5, .7, .9, .95, .99, 1]).fit(X_train, y_train.ravel())
     print('elastic net: ', regElastic.score(X_test, y_test))
     
     # decision tree regression
    train_path = p / "numpy_cnn/regression/hdf_files/train" / fname_train

    fname_test = conf.Run_name + "/combined_test_2019-12-19_" + conf.Run_name + ".h5"
    test_path = p / "numpy_cnn/regression/hdf_files/test" / fname_test
    X_test, y_test = create_data_label(test_path)
    X_train, y_train = create_data_label(train_path)

    estimators = {
        "K-nn":
        KNeighborsRegressor(n_neighbors=1, algorithm="kd_tree", leaf_size=30),
        "RandomForest":
        RandomForestRegressor(n_estimators=1000, max_depth=1000),
        "Linear":
        LinearRegression(),
        "Ridge":
        RidgeCV(alphas=[1, 10, 100]),
        "Lasso":
        LassoCV()
    }

    mses = []
    r2s = []
    expl_vars = []
    maes = []
    maxers = []
    #modelnames=[]
    for est in estimators.keys():
        print("keys ", est)
        #print("values ", estimators[est])
        #CalcTimes(estimators[est], X_train, X_test, y_train, y_test,model_name=est)
ridge2 = Ridge(alpha=0, normalize=True)
ridge2.fit(X_train, y_train)  # Fit a ridge regression on the training data
pred = ridge2.predict(X_test)  # Use this model to predict the test data
print(pd.Series(ridge2.coef_, index=X.columns))  # Print coefficients
print(mean_squared_error(y_test, pred))  # Calculate the test MSE

# It looks like we are indeed improving over regular least-squares!
#
# Instead of arbitrarily choosing alpha $ = 4$, it would be better to
# use cross-validation to choose the tuning parameter alpha. We can do this using
# the cross-validated ridge regression function, `RidgeCV()`. By default, the function
# performs generalized cross-validation (an efficient form of LOOCV), though this can be changed using the
# argument `cv`.
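# A quick sketch of that `cv` argument (illustrative, not part of the original lab):
# passing an explicit cv makes RidgeCV run k-fold cross-validation instead of the
# default generalized cross-validation.
ridgecv_kfold = RidgeCV(alphas=alphas, cv=10, normalize=True).fit(X_train, y_train)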

ridgecv = RidgeCV(alphas=alphas,
                  scoring='neg_mean_squared_error',
                  normalize=True)
ridgecv.fit(X_train, y_train)
ridgecv.alpha_

# Therefore, we see that the value of alpha that results in the smallest cross-validation
# error is 0.57. What is the test MSE associated with this value of
# alpha?

ridge4 = Ridge(alpha=ridgecv.alpha_, normalize=True)
ridge4.fit(X_train, y_train)
mean_squared_error(y_test, ridge4.predict(X_test))

# This represents a further improvement over the test MSE that we got using
# alpha $ = 4$. Finally, we refit our ridge regression model on the full data set,
# using the value of alpha chosen by cross-validation, and examine the coefficient estimates.
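# A minimal sketch of that final refit (illustrative; `X` and `y` are assumed to be
# the full data set defined earlier in the lab):
# ridge_full = Ridge(alpha=ridgecv.alpha_, normalize=True).fit(X, y)
# print(pd.Series(ridge_full.coef_, index=X.columns))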
Example #34
    def create_model(self, pred_year, games):
        """
        Generates spread predictions based on model input in given year range,
        year of prediction, and games to be predicted.
        """
        # Check for proper ranges of years given.
        if self.year1 < 2000:
            return 'Data not available before 2000.'
        elif self.year2 <= self.year1:
            return 'Year 2 must be greater than year 1.'
        elif self.year2 >= pred_year:
            return 'Year 2 must be less than prediction year.'
        elif pred_year > datetime.now().year:
            return 'Prediction year must be less than or equal to current year.'

        # Determine if all games are in DI-A, and are in proper format.
        # Refer to current_conf for teams available.
        for game in games:
            for team in game:
                if team not in [
                        x for k, v in CFB.current_conf.items() for x in v
                ]:
                    return '{} either not D1-A team or not in proper format.'.format(
                        team)

        # Generate input values for model. Set X as everything excluding spread, and y as spread.
        input_values = CFB(self.year1, self.year2).data_input()
        X = input_values.iloc[:, 1:]
        y = input_values['Home Spread']

        # Generate models, with 5 folds, set max_iter to 10000 for lasso, and fit to data.
        lasso_mod = LassoCV(cv=5, max_iter=10000).fit(X, y)
        ridge_mod = RidgeCV(cv=5).fit(X, y)

        # Generate values for generating predictions, and create predictions.
        pred_values = CFB(self.year1, self.year2).pred_input(pred_year, games)
        lasso_pred = lasso_mod.predict(pred_values)
        ridge_pred = ridge_mod.predict(pred_values)

        # Create result dictionary, indicating home and away teams, predicted winners, and spread.
        results = {
            'Away': [x[0] for x in games],
            'Home': [x[1] for x in games],
            'Lasso Predicted Winner': [
                games[i][0] if lasso_pred[i] > 0 else games[i][1]
                for i in range(len(games))
            ],
            'Ridge Predicted Winner': [
                games[i][0] if ridge_pred[i] > 0 else games[i][1]
                for i in range(len(games))
            ],
            'Lasso Spread': [-abs(round(x, 1)) for x in lasso_pred],
            'Ridge Spread': [-abs(round(x, 1)) for x in ridge_pred]
        }

        # Create dataframe based on dictionary, create index, and save as csv.
        results = pd.DataFrame(results)
        index = pd.Index(
            ['Game {}'.format(num) for num in range(1,
                                                    len(games) + 1)])
        results.index = index
        results.to_csv('CFB_games_results.csv')
        return results
# features used for predictions
features = data[list(data.columns)[5:126]]
# value to be predicted (number of violent crimes)
goal = data[list(data.columns)[127]]

# plenty of values are missing in the end of features vector (at indices around 115)
# therefore we will eliminate columns where at least one sample has missing data
features = features.dropna(axis=1)

alpha_values = []
for a in range(1, 10001):
    alpha_values.append(a / 100.0)  # float division so the alpha grid runs from 0.01 to 100

print "Started at " + str(datetime.now())

estimator_ridge = RidgeCV(alphas=alpha_values, cv=3)
estimator_ridge.fit(features, goal)
scores = cross_val_score(Ridge(alpha=estimator_ridge.alpha_), features, goal, cv=5)
print "Ridge alpha " + str(estimator_ridge.alpha_)
print str(np.mean(scores))
print scores

estimator_lasso = LassoCV(alphas=alpha_values, cv=3)
estimator_lasso.fit(features, goal)
scores = cross_val_score(Lasso(alpha=estimator_lasso.alpha_), features, goal, cv=5)
print "Lasso alpha " + str(estimator_lasso.alpha_)
print str(np.mean(scores))
print scores


estimator_elastic_net = ElasticNetCV(alphas=alpha_values, cv=3, n_jobs=-1)
Example #36
        with warnings.catch_warnings():  # for clean output
            warnings.simplefilter('ignore', ConvergenceWarning)
            warnings.simplefilter('ignore', RuntimeWarning)
            os.environ[
                "PYTHONWARNINGS"] = "ignore"  # Also affect subprocesses (n_jobs > 1)

            X, Y, B, Sigma, Sigma_X = generate_data(rho=rho,
                                                    **simulation_params)
            # MrRCE
            mrrce = MrRCE(glasso_max_iter=200, max_iter=150, tol_glasso=1e-3)
            mrrce.fit(X, Y)
            # OLS
            lm = LinearRegression(fit_intercept=False).fit(X, Y)
            B_ols = np.matrix(lm.coef_.transpose())
            # Ridge
            ridge = RidgeCV(fit_intercept=False).fit(X, Y)
            B_ridge = np.matrix(ridge.coef_.transpose())
            # Group Lasso
            gl = MultiTaskLassoCV(fit_intercept=False, cv=3).fit(X, Y)
            B_gl = np.matrix(gl.coef_.T)
            # Results
            results.append(
                dict(rho=rho,
                     rho_hat=mrrce.rho,
                     sigma_hat=mrrce.sigma,
                     MrRCE=model_error(B, mrrce.Gamma, Sigma_X),
                     OLS=model_error(B, B_ols, Sigma_X),
                     Ridge=model_error(B, B_ridge, Sigma_X),
                     GroupLasso=model_error(B, B_gl, Sigma_X)))
            convergence_results.append(
                dict(
Example #37
X = dataset.data[:, np.logical_not(target)]
y = dataset.data[:, target].squeeze()  # extract y and squeeze it from 2-D down to 1-D

# output_distribution='normal' maps the transformed data onto a normal distribution
y_trans = quantile_transform(dataset.data[:, target],
                             n_quantiles=300,
                             output_distribution='normal',
                             copy=True).squeeze()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Ridge regression
# Ridge/RidgeCV: structural risk minimization = loss function (squared loss) + regularization (L2 norm)
# Ridge: alpha is fixed and the best w is solved for; alpha is inversely related to the norm of w
# RidgeCV: several alphas, a best w for each, then the overall best w and its corresponding alpha are kept
regr = RidgeCV()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Plotting
f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

# First plot
ax0.scatter(y_test, y_pred)
ax0.set_xlabel('True Target')
ax0.set_ylabel('Target predicted')
ax0.plot([0, 10], [0, 10], '--k')
ax0.text(
    1, 9, r'$R^2$=%.2f, MAE=%.2f' %
    (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax0.set_xlim([0, 10])
# Utilities
#-----------------------------------------------------
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

# Training
#-----------------------------------------------------
print(datetime.now(), "START MODELING")
kfolds = KFold(n_splits=10, shuffle=True, random_state=0)
# SETUP 

alphas_ridge = [ 15.5,]
alphas_lasso = [0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]

RIDGE = make_pipeline(RobustScaler(),
                      RidgeCV(alphas=alphas_ridge, cv=kfolds))

BRIDGE = make_pipeline(RobustScaler(),
                      BayesianRidge(alpha_2=88.0, lambda_1=6.5, lambda_2=0.4, n_iter=1000000))

LASSO = make_pipeline(RobustScaler(),
                      LassoCV(max_iter=1e7, alphas=alphas_lasso,
                              random_state=0, cv=kfolds))

GBMR = GradientBoostingRegressor(
                                n_estimators=3000,
                                learning_rate=0.05,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10)

# (the opening of this definition is missing in the snippet; cv_rmse(xgboost) below
#  expects a model named `xgboost`, so an XGBRegressor is assumed here)
xgboost = XGBRegressor(min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

ridge_alphas = [
    1e-15, 1e-10, 1e-8, 9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1,
    0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100
]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas, cv=kf))

svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))

# In[35]:

scores = {}

# In[36]:

score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['xgb'] = (score.mean(), score.std())

# In[37]:
Example #40
def regression_estimator_RidgeCV(stages=[]):
    stages.extend(
        [regression_standardisation(), ('estimator', RidgeCV(cv=10))])

    return Pipeline(stages)
Example #41
# With this few samples, SGDRegressor may not do as well as LinearRegression; sklearn suggests SGDRegressor only once the sample count exceeds 100,000.


# ### 3.2 Regularized linear regression (L2 regularization --> ridge regression)

# In[16]:


# Ridge regression / L2 regularization
#class sklearn.linear_model.RidgeCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True, 
#                                  normalize=False, scoring=None, cv=None, gcv_mode=None, 
#                                  store_cv_values=False)
from sklearn.linear_model import  RidgeCV

alphas = [0.01, 0.1, 1, 10,20, 40, 80,100]
reg = RidgeCV(alphas=alphas, store_cv_values=True)   
reg.fit(X_train, y_train)       


# In[17]:

print('reg.cv_values_ type=',type(reg.cv_values_));
cv_values=reg.cv_values_;
mse_mean = np.mean(reg.cv_values_, axis = 0)
print(mse_mean)
plt.plot(np.log10(alphas), mse_mean.reshape(len(alphas),1)) 
plt.plot(np.log10(reg.alpha_)*np.ones(3), [0.28, 0.29, 0.30])
plt.xlabel('log(alpha)')
plt.ylabel('mse')
plt.show()
        0.001,
        0.005,
        0.01,
        0.015,
    ]
}
optimal3 = GridSearchCV(model3, param_grid=tune_params3, cv=5,
                        n_jobs=4).fit(xtr, ytr)
print(optimal3.best_params_)  # {'alpha': 0.001}
optimal_model3 = optimal3.best_estimator_
ypred3 = optimal_model3.predict(xte)
res3 = mean_squared_error(yte, ypred3)
print('Optimal Ridge MSE', res3)

# The best model is selected by cross-validation
model4 = RidgeCV(alphas=(0.0, 1.0, 300.0), cv=5)  # akin to a grid search for the best alpha
model4.fit(xtr, ytr)
print(model4.alpha_)  # the amount of penalization chosen by cross-validation (0.0 here)
ypred4 = model4.predict(xte)
res4 = mean_squared_error(yte, ypred4)
print('RidgeCV MSE', res4)

print()
model5 = ElasticNet()
tune_params5 = {
    'alpha': [0.11, 0.13, 0.15, 0.17, 0.19],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}
optimal5 = GridSearchCV(model5, param_grid=tune_params5, cv=5,
                        n_jobs=4).fit(xtr, ytr)
# import the plotting library for visualization
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

alphas = np.linspace(0.01, 0.5)
f, ax = plt.subplots()
x = np.linspace(0, 2 * np.pi)
y = np.sin(x)
# add some noise
xr = x + np.random.normal(scale=0.1, size=x.shape)
yr = y + np.random.normal(scale=0.2, size=y.shape)
ax.plot(x, np.sin(x), 'r', label='noise-free')
ax.scatter(xr, yr, label='noisy')
# reshape our array into a column vector
X = xr[:, np.newaxis]
# use a loop to try polynomials of different degrees
for degree in [3, 4, 5]:
    # use Pipeline to build a sequence of steps
    model = make_pipeline(PolynomialFeatures(degree), RidgeCV(alphas=alphas))
    model.fit(X, y)
    # predict into a fresh name so the target y is not overwritten between degrees
    y_pred = model.predict(x[:, np.newaxis])
    ax.plot(x, y_pred, '--', lw=2, label="degree %d" % degree)
ax.legend()
plt.show()
Example #44
X_train = train[:, :int(np.ceil(0.5 * n_pixels))]  # Upper half of the faces
y_train = train[:, int(np.floor(0.5 * n_pixels)):]  # Lower half of the faces
X_test = test[:, :int(np.ceil(0.5 * n_pixels))]
y_test = test[:, int(np.floor(0.5 * n_pixels)):]

# Fit estimators
ESTIMATORS = {
    "Extra trees":
    ExtraTreesRegressor(n_estimators=10, max_features=32, random_state=0),
    "K-nn":
    KNeighborsRegressor(),
    "Linear regression":
    LinearRegression(),
    "Ridge":
    RidgeCV(),
}

y_test_predict = dict()
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict[name] = estimator.predict(X_test)

# Plot the completed faces
image_shape = (64, 64)
n_cols = 1 + len(ESTIMATORS)
plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
plt.suptitle("Face completion with multi-output estimators", size=16)
for i in range(n_faces):
    true_face = np.hstack((X_test[i], y_test[i]))
    if i:
Example #45
#print(np.random.randn(N))
print(x)
y=1.8*x**3+x**2-14*x-7+np.random.randn(N)
# reshape x and y into column matrices
x.shape=-1,1
y.shape=-1,1

# The difference between RidgeCV and Ridge: the former performs cross-validation
models=[
    Pipeline([
        ('Poly',PolynomialFeatures(include_bias=False)),
        ('Linear',LinearRegression(fit_intercept=False))
    ]),
    Pipeline([
        ('Poly',PolynomialFeatures(include_bias=False)),
        ('Linear',RidgeCV(alphas=np.logspace(-3,2,50),fit_intercept=False))
    ]),
    # alpha is the weight of the L2 penalty term in Ridge, i.e. the lambda in the slides
    # alphas is the range of candidate alpha values searched during cross-validation
    Pipeline([
        ('Poly',PolynomialFeatures(include_bias=False)),
        ('Linear',LassoCV(alphas=np.logspace(0,1,10),fit_intercept=False))
    ]),
    Pipeline([
        ('Poly',PolynomialFeatures(include_bias=False)),
        # l1_ratio: the proportion of the L1 penalty within the whole ElasticNet penalty; a list of candidates is given here
        # l1_ratio: i.e. the p in the slides, with p in the range [0, 1]
        # alphas: the lambda in the slides
        # alphas: the range of candidate penalty weights searched during cross-validation
        ('Linear',ElasticNetCV(alphas=np.logspace(0,1,10),l1_ratio=[0.1,0.5,.7,.9,.95,1.0],fit_intercept=False))
    ])
Example #46
def QuickML_Ensembling(X_train,
                       y_train,
                       X_test,
                       y_test='',
                       modeltype='Regression',
                       Boosting_Flag=False,
                       scoring='',
                       verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set(only numerics).
    """
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Voting models
    estimators = []
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging1', model5, metrics1))
        else:
            model5 = LassoLarsCV(cv=scv)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('LassoLarsCV', model5, metrics1))
        model6 = LassoCV(alphas=np.logspace(-10, -1, 50),
                         cv=scv,
                         random_state=seed)
        results2 = model6.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = rmse(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('LassoCV', model6, metrics2))
        model7 = RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv)
        results3 = model7.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = rmse(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('RidgeCV', model7, metrics3))
        ## Create an ensemble model ####
        if Boosting_Flag:
            model8 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging2', model8, metrics4))
        else:
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                min_samples_leaf=2, max_depth=1, random_state=seed),
                                       n_estimators=NUMS,
                                       random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if verbose >= 2:
            print('QuickML_Ensembling Model results:')
            print(
                '    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                % (estimator_names[0], metrics1, estimator_names[1], metrics2,
                   estimator_names[2], metrics3, estimator_names[3], metrics4))
    else:
        if scoring == '':
            scoring = 'accuracy'
        scv = StratifiedKFold(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            model5 = ExtraTreesClassifier(n_estimators=NUMS,
                                          min_samples_leaf=2,
                                          random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging', model5, metrics1))
        else:
            model5 = LogisticRegressionCV(Cs=np.linspace(0.01, 100, 20),
                                          cv=scv,
                                          scoring=scoring,
                                          random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Logistic Regression', model5, metrics1))
        model6 = LinearDiscriminantAnalysis()
        results2 = model6.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = accu(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('Linear Discriminant', model6, metrics2))
        if modeltype == 'Binary_Classification':
            float_cols = X_train.columns[(
                X_train.dtypes == float).values].tolist()
            int_cols = X_train.columns[(X_train.dtypes == int).values].tolist()
            if (X_train[float_cols + int_cols] <
                    0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = GaussianNB()
        else:
            float_cols = X_train.columns[(
                X_train.dtypes == float).values].tolist()
            int_cols = X_train.columns[(X_train.dtypes == int).values].tolist()
            if (X_train[float_cols + int_cols] <
                    0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = MultinomialNB()
        results3 = model7.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = accu(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('Naive Bayes', model7, metrics3))
        if Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present. So choose a Bagging here.
            model8 = ExtraTreesClassifier(n_estimators=NUMS,
                                          min_samples_leaf=2,
                                          random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging', model8, metrics4))
        else:
            ## Create an ensemble model ####
            model8 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                        n_estimators=NUMS,
                                        random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if not isinstance(y_test, str):
            if verbose >= 2:
                print('QuickML_Ensembling Model results:')
                print(
                    '    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                    % (estimator_names[0], metrics1, estimator_names[1],
                       metrics2, estimator_names[2], metrics3,
                       estimator_names[3], metrics4))
        else:
            if verbose >= 1:
                print('QuickML_Ensembling completed.')
    stacks = np.c_[results1, results2, results3, results4]
    if verbose == 1:
        print('    Time taken for Ensembling: %0.1f seconds' %
              (time.time() - start_time))
    return estimator_names, stacks
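# Hypothetical call following the signature above (the arrays/DataFrames are
# assumed to exist; illustration only):
# names, stacked_preds = QuickML_Ensembling(X_train, y_train, X_test, y_test,
#                                           modeltype='Regression',
#                                           Boosting_Flag=False, verbose=2)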


#########################################################
Example #47
    dataset_level_res = defaultdict(list)
    cell_level_res = defaultdict(list)
    models = []
    np.random.seed(42)

    print("computing predictions for gb + rf + svm")
    for model_type in ['gb', 'rf', 'ridge', 'svm']:

        if model_type == 'rf':
            m = RandomForestRegressor(n_estimators=100, random_state=1)
        elif model_type == 'dt':
            m = DecisionTreeRegressor()
        elif model_type == 'linear':
            m = LinearRegression()
        elif model_type == 'ridge':
            m = RidgeCV()
        elif model_type == 'svm':
            m = SVR(gamma='scale')
        elif model_type == 'gb':
            m = GradientBoostingRegressor(random_state=1)

        for feat_set in ['basic', 'dasc']:
            models.append(f'{model_type}_{feat_set}')
            if feat_set == 'basic':
                feat_set = feat_names[1:]
            elif feat_set == 'dasc':
                feat_set = ['X_d1', 'X_d2', 'X_d3']

            m.fit(df_full[feat_set], df_full['Y_sig_mean_normalized'].values)

            for i, (k, v) in enumerate(ds.keys()):
Example #48
 def setUp(self):
     super(TestRidgeCV, self).setUp()
     # Provide three candidates for alpha.
     self.var = VAR(10, RidgeCV(alphas=[10, 100, 1000]))
Example #49
def loo_sklearn(X,y, regparam):
    learner = RidgeCV(alphas = [regparam], store_cv_values = True, fit_intercept=False)
    learner.fit(X,y)
    e = np.mean(learner.cv_values_[:,:,0])
    return e
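# Hypothetical sweep using the helper above to pick a regularization value by
# leave-one-out error (X and y are assumed to be numpy arrays):
# loo_errors = {lam: loo_sklearn(X, y, lam) for lam in [0.1, 1.0, 10.0]}
# best_lam = min(loo_errors, key=loo_errors.get)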
Example #50
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d - 1] == np.arange(d - 1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d - 1]
        ordered_idx_round_2 = ordered_idx[d - 1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == max_iter * (d - 1)


@pytest.mark.parametrize(
    "estimator",
    [None, DummyRegressor(),
     BayesianRidge(),
     ARDRegression(),
     RidgeCV()])
def test_iterative_imputer_estimators(estimator):
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               estimator=estimator,
                               random_state=rng)
    imputer.fit_transform(X)

    # check that types are correct for estimators
    hashes = []
Example #51
	ar1=[]
	y=[]
	"""
	bootstrap sample the x and y arrays
	"""
	for l in range(len(bvar)):
		ind=int(uni(0, 1)*len(bvar))
		ar.append(bvar[ind][1])
		ar1.append(bvar[ind][2])
		y.append(bvar[ind][0])
	#write as arrays, stack them 
	ar=np.array(ar); ar1=np.array(ar1); y=np.array(y)
	A=np.vstack([ar, ar1, np.ones(len(bvar))]).T
	
	#cross-validate the ridge regression 
	cl=RidgeCV(alphas=[0.5, 1.0, 50.0, 500.0])
	#cl=Ridge(alpha=1.0)
	cl.fit(A, y)
	#if cl.coef_[0]>=0:
	i+=1

	#arrays for predicted values and for the a, b, c coefficients	
	val_arr.append(cl.predict([[32.21, 31.01, 1.]]))  # predict expects a 2-D array
	coef_arr.append([cl.coef_[0], cl.coef_[1], cl.intercept_])

print 'The mean and standard deviation for this object is '
print np.std(val_arr), np.mean(val_arr)
coef_arr=np.array(coef_arr)
print "Coefficients of the ridge and their standard deviations "
print np.mean(coef_arr[:,0]), np.std(coef_arr[:,0]), np.mean(coef_arr[:,1]), np.std(coef_arr[:,1]), np.mean(coef_arr[:,2]), np.std(coef_arr[:,2])
Example #52
 if (name == 'LinearRegression'):
     model.fit(X_train, y_train)
     result = np.sqrt(-cross_val_score(
         model,  # RMSE-based cross-validation, here scored on the test set
         X_test,
         y_test,
         cv=10,
         scoring="neg_mean_squared_error")).mean()
     results.append(result)
     names.append(name)
     msg = "%s: %f" % (name, result)
     print(msg)
 if (name == 'Ridge'):
     lambdalar = 10**np.linspace(10, -2, 100) * 0.5
     ridge_cv = RidgeCV(alphas=lambdalar,
                        scoring="neg_mean_squared_error",
                        normalize=True)
     ridge_cv.fit(X_train, y_train)
     ridge_tuned = Ridge(alpha=ridge_cv.alpha_,
                         normalize=True).fit(X_train, y_train)
     result = np.sqrt(
         mean_squared_error(y_test, ridge_tuned.predict(X_test)))
     results.append(result)
     names.append(name)
     msg = "%s: %f" % (name, result)
     print(msg)
 if (name == 'Lasso'):
     lasso_cv_model = LassoCV(alphas=None,
                              cv=10,
                              max_iter=10000,
                              normalize=True)
Example #53
                pred1_valid, pred1_holdout, pred1=xgb_train(ind_valid,ind_holdout,log_ind,train_X,train_Y,valid_X,valid_Y,holdout_X,holdout_Y,
                    param,num_round,y_pow)

                X_mat_test[:,iind]=np.log(pred1.ravel())
                X_mat_valid[:,iind]=np.log(pred1_valid.ravel())
                X_mat_holdout[:,iind]=np.log(pred1_holdout.ravel())

                rmse_valid_mat[i,iind+1]=rmse_log(valid_Y,pred1_valid)
                rmse_holdout_mat[i,iind+1]=rmse_log(holdout_Y,pred1_holdout)
                iind+=1
        ####################################################################################
        ####################################################################################
        alphas = [0.0001, 0.005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0]
    
        RidgeModel = RidgeCV(alphas=alphas, normalize=True, cv=5)

        Ridge_fit=RidgeModel.fit(X_mat_valid,np.log(valid_Y))
        preds_test_Ridge=np.exp(Ridge_fit.predict(X_mat_test))
        preds_test_mat_Ridge[:,i]=preds_test_Ridge.ravel()
        preds_valid_Ridge=np.exp(Ridge_fit.predict(X_mat_valid))
        preds_holdout_Ridge=np.exp(Ridge_fit.predict(X_mat_holdout))
        preds_holdout_mat_Ridge[:,i]=preds_holdout_Ridge.ravel()

        rmse_valid_blend[i,0]=i
        rmse_valid_blend[i,1]=rmse_log(valid_Y,preds_valid_Ridge)
        rmse_holdout_blend[i,0]=i
        rmse_holdout_blend[i,1]=rmse_log(holdout_Y,preds_holdout_Ridge)
        ####################################################################################
        ####################################################################################
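        # rmse_log is used in this blend but not defined in the excerpt; a
        # hedged, hypothetical sketch of what such a helper usually computes
        # (RMSE on the log1p scale), presumably defined once at module level:
        #   def rmse_log(y_true, y_pred):
        #       return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2))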
        LRmodel=LinearRegression(
#from yellowbrick.regressor import PredictionError
#ridger5 = Ridge(alpha=5)
#visualizer = PredictionError(ridger5)

#visualizer.fit(m2_pch_train_strs_broken, y_pch_train)  # Fit the training data to the visualizer
#visualizer.score(m2_pch_test_strs_broken, y_pch_test)  # Evaluate the model on the test data
#visualizer.show()                 # Finalize and render the figure

# =============================================================================
# Ridge-CrossValidation (Leave One-Out/Generalized)
# =============================================================================
from sklearn.linear_model import RidgeCV

ridgerCV = RidgeCV(
    alphas=np.array([0.25, 0.5, 0.75, 1, 1.25, 1.5, 2, 3, 4, 5]),
    store_cv_values=True
)  #RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=False,
#gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)
ridgerCV.fit(m2_pch_train_strs_broken, y_pch_train)
y_pch_pred_ridgeCV_m2 = ridgerCV.predict(m2_pch_test_strs_broken)

r2_ridgeregCV = r2_score(y_pch_test, y_pch_pred_ridgeCV_m2)
adjR2_ridgeregCV = adjusted_r2(r2_ridgeregCV, 2048, len(y_pch_test))
mse_ridgeregCV = mean_squared_error(y_pch_test, y_pch_pred_ridgeCV_m2)

print('R^2 score: ', r2_ridgeregCV)
print('Adjusted R^2 score: ', adjR2_ridgeregCV)
print('Mean squared error: ', mse_ridgeregCV)
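
# Because store_cv_values=True was passed above, the per-sample leave-one-out
# errors for every candidate alpha are available after fitting. A quick sketch
# of turning them into a mean LOO error per alpha (attribute names as in older
# scikit-learn; newer releases use store_cv_results / cv_results_ instead):
mean_loo_error = ridgerCV.cv_values_.mean(axis=0)
for a, err in zip(ridgerCV.alphas, mean_loo_error):
    print('alpha =', a, ' mean LOO error =', err)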

# With cv=5 (5-fold cross-validation) instead of the default leave-one-out
ridgerCV5 = RidgeCV(
F= int(MN[0])
N = int(MN[1])

rowindex=0
rows=[]
ys=[]
while rowindex < N:
  rowindex = rowindex + 1
  data = input().split()
  feature = [float(data[0]), float(data[1])]
  #print(np.vander(feature, 5).flatten())
  rows.append(np.vander(feature, 5).flatten())
  ys.append(float(data[-1]))

#print(rows)
ridge = RidgeCV(alphas=[0.1, 1.0, 10.0])
ridge.fit(rows, ys)

print(ridge.alpha_)
print(ridge.coef_)
print(ridge.intercept_)


predictNum = int(input())
rowindex = 0
rows = []
while rowindex < predictNum:
  rowindex = rowindex + 1
  data = input().split()
  feature = [float(data[0]), float(data[1])]
  rows.append(np.vander(feature, 5).flatten())
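
# a hedged completion (the example is cut off here): print one prediction per
# query row using the RidgeCV model fitted above
for p in ridge.predict(np.array(rows)):
  print(p)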
                                                    random_state=42)
training = df.copy()

ridge_model = Ridge(alpha=0.1).fit(X_train, y_train)
lambdalar = 10**np.linspace(10, -2, 100) * 0.5  # "lambdalar" = lambdas, the candidate alpha grid
ridge_model = Ridge()
katsayilar = []  # "katsayilar" = coefficients, one entry per alpha
for i in lambdalar:
    ridge_model.set_params(alpha=i)
    ridge_model.fit(X_train, y_train)
    katsayilar.append(ridge_model.coef_)
"""ax=plt.gca()
ax=plot(lambdalar,katsayilar)
ax.set_xscale('log')
plt.xlabel('lambda(alpha) Değerleri')
plt.ylabel('Katsayılar/Ağırlıklar')
plt.title('Düzemleştirmenin bir fonksiyonu olarak Ridge Katsayıları');
plt.show()"""

y_pred = ridge_model.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))
ridge_cv_model = RidgeCV(alphas=lambdalar,
                         scoring='neg_mean_squared_error',
                         normalize=True)
ridge_cv_model.fit(X_train, y_train)
print(ridge_cv_model.alpha_)
ridge_tuned = Ridge(alpha=ridge_cv_model.alpha_, normalize=True)
ridge_tuned.fit(X_train, y_train)
y_pred = ridge_tuned.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))
X = preprocessing.scale(X)

# create features for predict
X_pred = X[-predPeriod:]

X = X[:-predPeriod] #re-sizing the features for training
dataset.dropna(inplace=True)  # drop NaN rows created for the 'label' column

# create label 
y = np.array(dataset['label'])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=1)

# use RidgeCV as the regression algorithm (plain LinearRegression kept as an alternative)
#clf = LinearRegression()
clf = RidgeCV(alphas=[0.1, 0.5, 1, 10])
clf.fit(X_train, y_train)
#start_time = time.time()
y_pred = clf.predict(X_pred)
#print time.time() - start_time
accuracy = clf.score(X_test, y_test)
# visualize Learning Curves
#ML.ModelLearning(X, y)
#ML.ModelComplexity(X_train, y_train)

#Linear slope calculation
#print clf.alpha_
#print clf
#print clf.coef_
#print clf.intercept_
print('prediction score (R^2 on the test set) is: {:0.2f}'.format(accuracy))
Exemple #58
0
y_pred_lasso_test = model_lasso.predict(X_test)

coef = pd.Series(model_lasso.coef_, index=X_train.columns)

# Results
lasso_r2_train, lasso_r2_test, lasso_mae_train, lasso_mae_test, lasso_rmse_train, lasso_rmse_test = results_func(
    'Lasso:', y_pred_lasso_train, y_pred_lasso_test)
print("Lasso best alpha :", alpha_l)
print("\nLasso picked " + str(sum(coef != 0)) +
      " variables and eliminated the other " + str(sum(coef == 0)) +
      " variables")
'''RIDGE REGRESSION'''
# Compute the cross-validation score with default hyper-parameters
# Create instance
ridgeCV = RidgeCV(alphas=[
    1e-15, 1e-10, 1e-8, 9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1,
    0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100
])
# Fit the model on the training set
model_ridge = ridgeCV.fit(X_train, y_train)
alpha = model_ridge.alpha_
# Predict
y_pred_ridge_train = model_ridge.predict(X_train)
# Test
y_pred_ridge_test = model_ridge.predict(X_test)
# Results
ridge_r2_train, ridge_r2_test, ridge_mae_train, ridge_mae_test, ridge_rmse_train, ridge_rmse_test = results_func(
    'Ridge:', y_pred_ridge_train, y_pred_ridge_test)
print("Ridge best alpha :", alpha)
'''RANDOM FOREST REGRESSOR (tuned using RandomizedSearchCV and GridSearchCV)'''
# Create instance
rf = RandomForestRegressor(bootstrap=True,
Exemple #59
0
stock=data.loc[:,'Stock'].values[idx]

## create train set and test set
from sklearn.cross_validation import train_test_split

train, test, train_ret, test_ret, train_stock, test_stock = \
    train_test_split(inst, ret, stock, test_size=0.4, random_state=1)

# SVR modeling
from sklearn.svm import SVR
from sklearn.linear_model import RidgeCV
from sklearn.feature_selection import RFE

rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
poly = SVR(kernel='poly', C=1e3, degree=2)
rig=RidgeCV()

rig.fit(train, train_ret)
print(rig.coef_)  # inspect the fitted ridge coefficients
test_predict=rig.predict(test)
hits= ((test_ret>0) & (test_predict>0)) | ((test_ret<0) & (test_predict<0))
hit_ratio=1.0*sum(hits)/len(test_ret)


plt.figure(2)
plt.subplot(1,2,1)
plt.plot(test_ret, 'ko')
plt.plot(test_predict, 'ro')
plt.ylim([-1,1])
plt.xlim([0,len(test_ret)])
plt.plot([0,100],[0,0],'g--')
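
# RFE is imported above but unused in this excerpt; a minimal sketch of how it
# could wrap the ridge model for feature selection (n_features_to_select is an
# assumed illustrative value, not taken from the original script):
rfe = RFE(estimator=RidgeCV(), n_features_to_select=5)
rfe.fit(train, train_ret)
print(rfe.support_)   # boolean mask of retained features
print(rfe.ranking_)   # feature ranking, 1 = selected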
Exemple #60
0
    warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
    np.random.seed(0)
    np.set_printoptions(linewidth=300)
    N = 9
    x = np.linspace(0, 6, N) + np.random.randn(N)
    x = np.sort(x)
    y = x**2 - 4 * x - 3 + np.random.randn(N)
    x.shape = -1, 1
    y.shape = -1, 1

    models = [
        Pipeline([('poly', PolynomialFeatures()),
                  ('linear', LinearRegression(fit_intercept=False))]),
        Pipeline([('poly', PolynomialFeatures()),
                  ('linear',
                   RidgeCV(alphas=np.logspace(-3, 2, 10),
                           fit_intercept=False))]),
        Pipeline([('poly', PolynomialFeatures()),
                  ('linear',
                   LassoCV(alphas=np.logspace(-3, 2, 10),
                           fit_intercept=False))]),
        Pipeline([('poly', PolynomialFeatures()),
                  ('linear',
                   ElasticNetCV(alphas=np.logspace(-3, 2, 10),
                                l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                                fit_intercept=False))])
    ]
    mpl.rcParams['font.sans-serif'] = ['simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    np.set_printoptions(suppress=True)

    plt.figure(figsize=(18, 12), facecolor='w')