Example #1
def test_ard_regression_predict_normalize_true():
    """Check that we can predict with `normalize=True` and `return_std=True`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/18605
    """
    clf = ARDRegression(normalize=True)
    clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
    clf.predict([[1, 1]], return_std=True)
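Example #2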
    def runARDRegressor(self):
        lm = ARDRegression(fit_intercept=True, normalize=True)

        print("runARDRegressor\n")
        lm.fit(self.m_X_train, self.m_y_train)
        predictY = lm.predict(self.m_X_test)
        score = lm.score(self.m_X_test, self.m_y_test)
        predictTrainY = lm.predict(self.m_X_train)

        self.displayPredictPlot(predictY)
        self.displayResidualPlot(predictY, predictTrainY)
        self.dispalyModelResult(lm, predictY, score)
Example #3
def test_update_of_sigma_in_ard():
    # Checks that `sigma_` is updated correctly after the last iteration
    # of the ARDRegression algorithm. See issue #10128.
    X = np.array([[1, 0], [0, 0]])
    y = np.array([0, 0])
    clf = ARDRegression(n_iter=1)
    clf.fit(X, y)
    # With the inputs above, ARDRegression prunes both of the two coefficients
    # in the first iteration. Hence, the expected shape of `sigma_` is (0, 0).
    assert clf.sigma_.shape == (0, 0)
    # Ensure that no error is thrown at prediction stage
    clf.predict(X, return_std=True)
Example #4
def _ard(*,
         train,
         test,
         x_predict=None,
         metrics,
         n_iter=300,
         tol=0.001,
         alpha_1=1e-06,
         alpha_2=1e-06,
         lambda_1=1e-06,
         lambda_2=1e-06,
         compute_score=False,
         threshold_lambda=10000.0,
         fit_intercept=True,
         normalize=False,
         copy_X=True,
         verbose=False):
    """For more info visit : 
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html#sklearn.linear_model.ARDRegression
    """

    model = ARDRegression(n_iter=n_iter,
                          tol=tol,
                          alpha_1=alpha_1,
                          alpha_2=alpha_2,
                          lambda_1=lambda_1,
                          lambda_2=lambda_2,
                          compute_score=compute_score,
                          threshold_lambda=threshold_lambda,
                          fit_intercept=fit_intercept,
                          normalize=normalize,
                          copy_X=copy_X,
                          verbose=verbose)
    model.fit(train[0], train[1])
    model_name = 'ARDRegression'
    y_hat = model.predict(test[0])

    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    elif metrics == 'mae':
        accuracy = _mae(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
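Example #5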
class ARDRegressionPrim(primitive):
    def __init__(self, random_state=0):
        super(ARDRegressionPrim, self).__init__(name='ARDRegression')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "Bayesian ARD regression. Fit the weights of a regression model, using an ARD prior. The weights of the regression model are assumed to be in Gaussian distributions. Also estimate the parameters lambda (precisions of the distributions of the weights) and alpha (precision of the distribution of the noise). The estimation is done by an iterative procedures (Evidence Maximization)"
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = ARDRegression()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'],
                                   columns=[self.name + "Pred"])
        final_output = {0: output}
        return final_output
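Example #6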
def ard_regression(train, test):
    train = train.copy()
    test = test.copy()

    X = train.to_numpy()
    X_train = np.delete(X, [train.columns.get_loc('views')], axis=1)
    y_train = train['views']

    X = test.to_numpy()
    X_test = np.delete(X, [test.columns.get_loc('views')], axis=1)
    y_test = test['views']

    reg = ARDRegression(compute_score=True)
    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)

    # The mean squared error
    print('Mean squared error: %.2f' %
          mean_squared_error(y_test, y_pred, squared=True))
    # The median absolute error
    print('Median absolute error: %.2f' %
          median_absolute_error(y_test, y_pred))

    return None
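Example #7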
def ARDRegression_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    '''
    '''
    clf = ARDRegression()
    clf.fit(X[train], y[train][:, 0])
    y_pred = clf.predict(X[test])[:, None]
    return y_pred, clf
Example #8
def ARDRegression_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    '''
    '''
    clf = ARDRegression()
    clf.fit(X[train], y[train][:, 0])
    y_pred = clf.predict(X[test])[:, None]
    return y_pred, clf
Example #9
def test_return_std():
    # Test return_std option for both Bayesian regressors
    def f(X):
        return np.dot(X, w) + b

    def f_noise(X, noise_mult):
        return f(X) + np.random.randn(X.shape[0]) * noise_mult

    d = 5
    n_train = 50
    n_test = 10

    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    b = 1.0

    X = np.random.random((n_train, d))
    X_test = np.random.random((n_test, d))

    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
        y = f_noise(X, noise_mult)

        m1 = BayesianRidge()
        m1.fit(X, y)
        y_mean1, y_std1 = m1.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std1, noise_mult, decimal=decimal)

        m2 = ARDRegression()
        m2.fit(X, y)
        y_mean2, y_std2 = m2.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std2, noise_mult, decimal=decimal)
Example #10
class ARDR():
    """docstring for ClassName"""
    def __init__(self, ARDRegression, N):
        self.cores_number = int(np.ceil(multiprocessing.cpu_count()/N))
        self.selected_columns = []
        self.model = ARDRegression(
                        alpha_1=1e-06, 
                        alpha_2=1e-06, 
                        compute_score=False, 
                        copy_X=True,
                        fit_intercept=True, 
                        lambda_1=1e-06, 
                        lambda_2=1e-06, 
                        n_iter=300,
                        normalize=False, 
                        threshold_lambda=10000.0, 
                        tol=0.001, verbose=False)


        print("ARDRegression Cores: ", np.nan)

    def fit(self, X_train, y_train, X_test, y_test, error_type="MAE"):
        # Randomly sample 100 columns; if sampling fails (e.g. fewer than
        # 100 columns are available), keep X_train unchanged
        try:
            self.selected_columns = np.random.choice(X_train.columns, 100, replace=False)
            X_train = X_train[self.selected_columns]
        except Exception:
            pass

        error_dict = {"MSE": "rmse", "R2": {"l1", "l2"}, "MAE": "mae", "LOGLOSS": "multi_logloss"}
        error_metric = error_dict[error_type]  # currently unused
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        prediction = self.model.predict(X_test[self.selected_columns])
        return prediction
Example #11
    def autorelevancedetermination(self):
        # Fit the ARD Regression
        clf = ARDRegression(compute_score=True)
        clf.fit(self.x_train, self.y_train)
        z = clf.predict(self.x_test)
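        # Fraction of exact matches; for continuous predictions this is almost always 0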
        print(np.mean(self.y_test == z))

        return z
Example #12
def test_toy_ard_object():
    # Test the Bayesian ARD regressor
    X = np.array([[1], [2], [3]])
    Y = np.array([1, 2, 3])
    clf = ARDRegression(compute_score=True)
    clf.fit(X, Y)

    # Check that the model could approximately learn the identity function
    test = [[1], [3], [4]]
    assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)
Example #13
def ARD(X_train, y_train, X_test, y_test):
    '''
        Purpose: Use ARD to calculate accuracy
        Input: X_train, y_train, X_test, y_test
        Output: accuracy_score
    '''
    clf = ARDRegression(compute_score=True)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred = y_pred.round()
    #ols = LinearRegression()
    #ols.fit(X, y)
    return metrics.accuracy_score(y_test, y_pred)
Example #14
    def fit_model_16(self, toWrite=False):
        model = ARDRegression()

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train, Y_train)
            pred = model.predict(X_test)
            print("Model 16 score %f" % (logloss(Y_test, pred),))

        if toWrite:
            # pickle requires a binary-mode file handle
            f2 = open('model16/model.pkl', 'wb')
            pickle.dump(model, f2)
            f2.close()
Example #15
class _ARDRegressionImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
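        # Op is presumably bound to sklearn's ARDRegression in the enclosing module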
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
Example #16
    def predict(self):
        """
        Trains the scikit-learn machine learning algorithm
        (https://scikit-learn.org), passes the trained algorithm the
        feature set, and obtains the predicted y_test values from it.

        The predicted y_test values are then compared with the y_test
        values passed in, and the accuracy is returned.
        """

        algorithm = ARDRegression(threshold_lambda=1e5)
        algorithm.fit(self.X_train, self.y_train)
        y_pred = list(algorithm.predict(self.X_test))
        self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
        return self.acc
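Example #17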
 BRModel = BayesianRidge()
 BRModel.fit(X_train_pca, y_train)
 y = BRModel.predict(X_test_pca)
 [result_row] = y.shape
 sumsum = 0
 #print y
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['BR_pca'] = sumsum / float(result_row)
 rs_score['BR_pca'] = r2_score(y_test, y)
 BRModel = BayesianRidge()
 BRModel.fit(X_train_std, y_train)
 y = BRModel.predict(X_test_std)
 [result_row] = y.shape
 sumsum = 0
 #print y
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['BR_std'] = sumsum / float(result_row)
 rs_score['BR_std'] = r2_score(y_test, y)
 ARDModel = ARDRegression()
 ARDModel.fit(X_train_pca, y_train)
 y = ARDModel.predict(X_test_pca)
 [result_row] = y.shape
 sumsum = 0
 #print y
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['ARD_pca'] = sumsum / float(result_row)
 rs_score['ARD_pca'] = r2_score(y_test, y)
 ARDModel = ARDRegression()
 ARDModel.fit(X_train_std, y_train)
 y = ARDModel.predict(X_test_std)
 [result_row] = y.shape
 sumsum = 0
 #print y
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['ARD_std'] = sumsum / float(result_row)
 rs_score['ARD_std'] = r2_score(y_test, y)
Example #18
class Bayesian_Linear_Model:
    """Bayesian linear regression object compatible with the BO framework.
    
    Model implemented using scikit-learn: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html#sklearn.linear_model.ARDRegression
    """
    
    def __init__(self, X, y, **kwargs):
        """
        Parameters
        ----------
        X : list, numpy.array, pandas.DataFrame
            Domain points to be used for model training.
        y : list, numpy.array, pandas.DataFrame
            Response values to be used for model training.
        """
        
        # Gamma prior parameters chosen by CV; no grid search over other hyperparameters for now
        self.alphas = np.logspace(-6, 0.5, 7)
        
        # Initialize model
        self.model = ARDRegression(n_iter=50)
        
        # Make sure X and y are numpy arrays
        self.X = np.array(X)
        self.y = np.array(y)
        
    # Fit    
    def fit(self):
        """Train the model using grid search CV.""" 
        
        parameters = [{'alpha_1': self.alphas, 'alpha_2': self.alphas}]
        
        # Set the number of folds
        if len(self.X) < 5:
            n_folds = len(self.X)
        else:
            n_folds = 5
        
        # Run grid search
        if n_folds > 1:
        
            # Select gamma prior parameters via grid search
            self.grid_search = GridSearchCV(self.model, 
                                       parameters, 
                                       cv=n_folds, 
                                       refit=True,
                                       n_jobs=-1)
        
            self.grid_search.fit(self.X, self.y)
        
            # Set model to trained model
            self.model = self.grid_search.best_estimator_
        
        # Just fit model
        else:
            self.model.fit(self.X, self.y)
            
    def get_scores(self):
        """Get grid search cross validation results.
        
        
        Returns
        ----------
        (numpy.array, numpy.array)
            Average scores and standard deviation of scores for grid.
        """ 
        
        # Extract grid search CV results
        scores = self.grid_search.cv_results_['mean_test_score']
        scores_std = self.grid_search.cv_results_['std_test_score']
        
        return scores, scores_std
        
    # Predict   
    def predict(self, points):
        """Model predictions.
        
        Parameters
        ----------
        points : list, numpy.array, pandas.DataFrame
            Domain points to be evaluated.
        
        Returns
        ----------
        numpy.array
            Predicted response values at points.
        """ 
        
        # Make sure points are in a numpy array
        points = np.array(points)
        
        # Make predictions
        pred = self.model.predict(points)
        
        return pred
        
    # Regression   
    def regression(self, return_data=False, export_path=None, return_scores=False):
        """Helper method for visualizing the models regression performance.
        
        Generates a predicted vs observed plot using the models training data.
        
        Parameters
        ----------
        return_data : bool
            Return predicted responses.
        export_path : None, str
            Export SVG image of predicted vs observed plot to export_path.
                   
        Returns
        ----------
        matplotlib.pyplot 
            Scatter plot with computed RMSE and R^2.
        """

        pred = self.predict(self.X)
        obs = self.y        
        return pred_obs(pred, 
                        obs, 
                        return_data=return_data, 
                        export_path=export_path,
                        return_scores=return_scores) 
    
    # Estimate variance
    def variance(self, points):
        """Estimated variance of Bayesian linear model.
        
        Parameters
        ----------
        points : numpy.array
            Domain points to be evaluated.
        
        Returns
        ----------
        numpy.array
            Model variance at points.
        """
        
        # Make sure points are in a numpy array
        points = np.array(points)
        
        # Make predictions
        pred, std = self.model.predict(points, return_std=True)
        
        return std**2
Example #19
def ard_regression(data_x, data_y):
    clf = ARDRegression(compute_score=True)
    clf.fit(data_x, data_y)
    predict_x = np.arange(data_x.shape[0]).reshape(data_x.shape[0], 1)
    return predict_x, clf.predict(predict_x)
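Example #20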
plt.title("Action unit weights")
plt.bar(au_coeff.index, au_coeff.values)
plt.xlabel("Action units")
plt.ylabel("Values of the weights")


#smooth function to make plotted data more human readable
def smooth(y, box_pts):
    box = np.ones(box_pts) / box_pts
    y_smooth = np.convolve(y, box, mode='same')
    return y_smooth


plt.figure(figsize=(6, 5))
plt.title("Predictions")
y_predict, y_std = clf.predict(x_valid, return_std=True)
axis = np.arange(0, n_samples_valid)
plt.plot(axis,
         y_predict,
         color='lightsteelblue',
         linewidth=0.5,
         linestyle='dotted',
         markersize=0.8,
         label="ARD",
         marker='.')
plt.plot(axis, smooth(y_predict, 100), color='navy', label="ARD smoothed")
plt.plot(axis, y_valid, color='gold', linewidth=2, label="Ground Truth")
plt.xlabel("Samples")
plt.ylabel("Valence")
plt.legend(loc='upper left', fontsize=8)
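Example #21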
print("MAE")
gbr.fit(X, y)
br = BayesianRidge(compute_score=True)
br.fit(X, y)
ardr = ARDRegression(compute_score=True)
ardr.fit(X, y)
knn = neighbors.KNeighborsRegressor(5, weights='distance')
knn.fit(X, y)

# Predict using kernel ridge
X_plot = np.linspace(0, num + 10, 10000)[:, None]
y_kr = kr.predict(X_plot)
y_svr = svr.predict(X_plot)
y_abr = abr.predict(X_plot)
y_gbr = gbr.predict(X_plot)
y_br = br.predict(X_plot)
y_ardr = ardr.predict(X_plot)
y_knn = knn.predict(X_plot)

# Plot results
fig = plt.figure(figsize=(10, 5))
lw = 2
plt.scatter(X, y, c='k', s=5, label='data')
plt.plot(X_plot,
         y_kr,
         color='turquoise',
         lw=lw,
         label='KRR (%s)' % kr.best_params_)
# plt.plot(X_plot, y_svr, color='r', lw=lw, label='SVR (%s)' % svr.best_params_)
plt.plot(X_plot, y_abr, color='g', lw=lw, label='AdaBoostRegressor')
# plt.plot(X_plot, y_gbr, color='k', lw=lw, label='GradientBoostingRegressor')
# plt.plot(X_plot, y_br, color='k', lw=lw, label='BayesianRidge')
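Example #22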
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv("datasets/studentscores.csv")
print(dataset.shape)
X = dataset.iloc[:, :1].values
Y = dataset.iloc[:, 1].values

from sklearn.model_selection import train_test_split
# random_state seeds the pseudo-random split so it is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

from sklearn.linear_model import ARDRegression
regressor = ARDRegression()
regressor = regressor.fit(X_train, Y_train)

Y_pred = regressor.predict(X_test)

plt.scatter(X_train, Y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.show()

plt.scatter(X_test, Y_test, color='red')
plt.plot(X_test, regressor.predict(X_test), color='blue')
plt.show()
Example #23
def build_ARDRegression(X_train, Y_train, X_target, Y_target, target_str):
    clf = ARDRegression()
    clf.fit(X_train, Y_train)
    Y_p = clf.predict(X_target)
    Y_e = np.array(Y_target)
    return Y_e, Y_p
Example #24
def task2(data):

    df = data

    dfreg = df.loc[:, ['Adj Close', 'Volume']]
    dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Fill missing values with a sentinel
    dfreg.fillna(value=-99999, inplace=True)
    # We want to set aside 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))
    # Separate out the label column; we want to predict the Adj Close
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(['label'], axis=1))
    # Scale X so that all features have the same distribution for linear regression
    X = preprocessing.scale(X)
    # Finally, split X into the late rows (to forecast) and the early rows (for training and evaluation)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    # Separate label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    #Split data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    ##################
    ##################
    ##################

    # Linear regression
    clfreg = LinearRegression(n_jobs=-1)
    clfreg.fit(X_train, y_train)
    # Quadratic Regression 2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    clfpoly2.fit(X_train, y_train)

    # Quadratic Regression 3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    clfpoly3.fit(X_train, y_train)

    # KNN Regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    clfknn.fit(X_train, y_train)

    # Lasso Regression
    clflas = Lasso()
    clflas.fit(X_train, y_train)

    # Multitask Lasso Regression
    # clfmtl = MultiTaskLasso(alpha=1.)
    # clfmtl.fit(X_train, y_train).coef_

    # Bayesian Ridge Regression
    clfbyr = BayesianRidge()
    clfbyr.fit(X_train, y_train)

    # Lasso LARS Regression
    clflar = LassoLars(alpha=.1)
    clflar.fit(X_train, y_train)

    # Orthogonal Matching Pursuit Regression
    clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2)
    clfomp.fit(X_train, y_train)

    # Automatic Relevance Determination Regression
    clfard = ARDRegression(compute_score=True)
    clfard.fit(X_train, y_train)

    # Logistic Regression
    # clflgr = linear_model.LogisticRegression(penalty='l1', solver='saga', tol=1e-6, max_iter=int(1e6), warm_start=True)
    # coefs_ = []
    # for c in cs:
    #   clflgr.set_params(C=c)
    #   clflgr.fit(X_train, y_train)
    #   coefs_.append(clflgr.coef_.ravel().copy())

    clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)
    clfsgd.fit(X_train, y_train)

    ##################
    ##################
    ##################

    #Create confidence scores
    confidencereg = clfreg.score(X_test, y_test)
    confidencepoly2 = clfpoly2.score(X_test, y_test)
    confidencepoly3 = clfpoly3.score(X_test, y_test)
    confidenceknn = clfknn.score(X_test, y_test)
    confidencelas = clflas.score(X_test, y_test)
    # confidencemtl = clfmtl.score(X_test, y_test)
    confidencebyr = clfbyr.score(X_test, y_test)
    confidencelar = clflar.score(X_test, y_test)
    confidenceomp = clfomp.score(X_test, y_test)
    confidenceard = clfard.score(X_test, y_test)
    confidencesgd = clfsgd.score(X_test, y_test)

    # results
    print('The linear regression confidence is:', confidencereg * 100)
    print('The quadratic regression 2 confidence is:', confidencepoly2 * 100)
    print('The quadratic regression 3 confidence is:', confidencepoly3 * 100)
    print('The knn regression confidence is:', confidenceknn * 100)
    print('The lasso regression confidence is:', confidencelas * 100)
    # print('The lasso regression confidence is:',confidencemtl*100)
    print('The Bayesian Ridge regression confidence is:', confidencebyr * 100)
    print('The Lasso LARS regression confidence is:', confidencelar * 100)
    print('The OMP regression confidence is:', confidenceomp * 100)
    print('The ARD regression confidence is:', confidenceard * 100)
    print('The SGD regression confidence is:', confidencesgd * 100)

    #Create new columns
    forecast_reg = clfreg.predict(X_lately)
    forecast_pol2 = clfpoly2.predict(X_lately)
    forecast_pol3 = clfpoly3.predict(X_lately)
    forecast_knn = clfknn.predict(X_lately)
    forecast_las = clflas.predict(X_lately)
    forecast_byr = clfbyr.predict(X_lately)
    forecast_lar = clflar.predict(X_lately)
    forecast_omp = clfomp.predict(X_lately)
    forecast_ard = clfard.predict(X_lately)
    forecast_sgd = clfsgd.predict(X_lately)

    #Process all new forecast columns
    dfreg['Forecast_reg'] = np.nan

    # Append one new dated row per forecast point and store the first forecast;
    # .loc[row, col] avoids the chained assignment of the original blocks
    last_date = dfreg.iloc[-1].name
    next_unix = last_date + datetime.timedelta(days=1)

    for i in forecast_reg:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
        dfreg.loc[next_date, 'Forecast_reg'] = i

    # Fill the remaining forecast columns over the same appended dates;
    # iloc[-26] is the original hard-coded offset back to the last observed row
    other_forecasts = [
        ('Forecast_pol2', forecast_pol2),
        ('Forecast_pol3', forecast_pol3),
        ('Forecast_knn', forecast_knn),
        ('Forecast_las', forecast_las),
        ('Forecast_byr', forecast_byr),
        ('Forecast_lar', forecast_lar),
        ('Forecast_omp', forecast_omp),
        ('Forecast_ard', forecast_ard),
        ('Forecast_sgd', forecast_sgd),
    ]
    for col, forecast in other_forecasts:
        dfreg[col] = np.nan
        next_unix = dfreg.iloc[-26].name + datetime.timedelta(days=1)
        for i in forecast:
            next_date = next_unix
            next_unix += datetime.timedelta(days=1)
            dfreg.loc[next_date, col] = i

    return dfreg.index.format(formatter=lambda x: x.strftime(
        '%Y-%m-%d')), dfreg['Adj Close'].to_list(
        ), dfreg['Forecast_reg'].to_list(), dfreg['Forecast_pol2'].to_list(
        ), dfreg['Forecast_pol3'].to_list(), dfreg['Forecast_knn'].to_list(
        ), dfreg['Forecast_las'].to_list(), dfreg['Forecast_byr'].to_list(
        ), dfreg['Forecast_lar'].to_list(), dfreg['Forecast_omp'].to_list(
        ), dfreg['Forecast_ard'].to_list(), dfreg['Forecast_sgd'].to_list()
Example #25
    os.chdir(folder)
    name_folder = folder.split("/")[6]
    train_data = np.array(pd.read_csv('train_data.csv', sep=';'))
    test_data = np.array(pd.read_csv('test_data.csv', sep=';'))
    train_labels = np.array(pd.read_csv('train_labels.csv', sep=';'))
    test_labels = np.array(pd.read_csv('test_labels.csv', sep=';'))

    inicio = time.time()

    from sklearn.linear_model import ARDRegression

    # train the model on the dataset
    regression = ARDRegression().fit(train_data, train_labels)

    # predict
    predictions_labels = regression.predict(test_data)

    fim = time.time()
    df_time = pd.DataFrame({'Execution Time:': [fim - inicio]})

    output_path = os.path.join(
        '/home/isadorasalles/Documents/Regressao/bayesian_ARD',
        'time_' + name_folder)
    df_time.to_csv(output_path, sep=';')

    from sklearn import metrics

    df_metrics = pd.DataFrame({
        'Mean Absolute Error':
        [metrics.mean_absolute_error(test_labels, predictions_labels)],
        'Mean Squared Error':
Example #26
class ARDClass:
    """
    Name      : ARDRegression
    Attribute : None
    Method    : predict, predict_by_cv, save_model
    """
    def __init__(self):
        # Algorithm name
        self._name = 'ard'

        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))

        # Suppress warning messages
        warnings.filterwarnings('ignore')

        # Load the raw data
        data = pd.read_csv(self._f_path +
                           "/regression/resource/regression_sample.csv",
                           sep=",",
                           encoding="utf-8")

        # Split the data into training and test periods
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)

        # Prepare the training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Prepare the test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])

        # Declare the model
        self._model = ARDRegression(normalize=True)

        # Train the model
        self._model.fit(self._x_train, self._y_train)

    # Data preprocessing
    def preprocessing(self, data):
        # Features
        x = []
        # Labels
        y = []
        # Window size (7 days)
        base_interval = 7
        # Temperatures
        temps = list(data["temperature"])

        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])

            xa = []

            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Standard prediction
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)

        # Score
        score = r2_score(self._y_test, y_pred)

        # Report the fitted coefficients
        if hasattr(self._model, 'coef_') and hasattr(self._model,
                                                     'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')

        print(f'Score = {score}')

        # Whether to save the image
        if save_img:
            self.save_chart_image(y_pred, show_chart)

        # Predicted values & score
        return [list(y_pred), score]

    #  CV prediction (cross-validation)
    def predict_by_cv(self):
        # Implement cross-validation as appropriate for the actual project
        return False

    #  GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model
        if not renew:
            # First save
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}_rg.pkl',
                    self._f_path +
                    f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        # Figure size
        plt.figure(figsize=(15, 10), dpi=100)

        # Actual values
        plt.plot(self._y_test, c='r')

        # Predicted values
        plt.plot(data, c='b')

        # Save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')

        # Show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
Example #27
print('Variance score: %.2f' % r2_score(y_test, y_pred))

plt.scatter([i for i in range(1, len(y_test) + 1)], y_test)
plt.plot(y_pred)
plt.show()
'''
Automatic Relevance Determination Regression (ARD)
'''

print("\n\nAutomatic Relevance Determination Regression (ARD)\n\n")

from sklearn.linear_model import ARDRegression

regr = ARDRegression(compute_score=True)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.5f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

plt.scatter([i for i in range(1, len(y_test) + 1)], y_test)
plt.plot(y_pred)
plt.show()
'''
Passive Aggressive Regressor
'''

from sklearn.linear_model import PassiveAggressiveRegressor
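Example #28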
plt.plot(clf.scores_, color='navy', linewidth=2)
plt.ylabel("Score")
plt.xlabel("Iterations")


# Plotting some predictions for polynomial regression
def f(x, noise_amount):
    y = np.sqrt(x) * np.sin(x)
    noise = np.random.normal(0, 1, len(x))
    return y + noise_amount * noise


degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=1)
clf_poly = ARDRegression(threshold_lambda=1e5)
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color='navy',
             label="Polynomial ARD", linewidth=2)
plt.plot(X_plot, y_plot, color='gold', linewidth=2,
         label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
Example #29
    # number of PCs to be used, at most n_valid_abd - 2
    n_components = n_valid_abd - 2
    X = pca.components_[0:n_components,:]
    X = np.transpose(X)

    # selection of X training from X
    X_training = pca.components_[0:n_components,0:n_training]  
    X_training = X_training[:,index]
    X_training = np.reshape(X_training, (n_components, n_valid_abd))
    X_training = np.transpose(X_training)

    # the actual regression happens here
    clf = ARDRegression(compute_score=True)
    clf.fit(X_training,y_training)

    pca_fitted_metal[:,jj]  = clf.predict(X)

print('# ++++++++++++++++++++++++++++++++++++++',file=f)
print('# Derived abundances for the TRAINING stars',file=f)
print('#', code[:], file=f)
for ii in range(0, n_training):
    y = []
    arr = np.array(pca_fitted_metal[ii,:])
    for jj in arr: 
        x = '{:9.2f}'.format(jj)
        y.append(x)
    print (y,file=f)


print('# ++++++++++++++++++++++++++++++++++++++',file=f)
print('# Derived abundances for the PROBLEM stars',file=f)
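Example #30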
    feature_list.index('COVER_80P_NB_ART'),
    feature_list.index('ranking'),
    #feature_list.index('lag30_np'),
    #feature_list.index('lag3_np'),
    feature_list.index('lag3_pv'),
    feature_list.index('week'),
]

train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]
# Train
from sklearn.linear_model import ARDRegression, LinearRegression
clf_train = ARDRegression(compute_score=True)
clf_train.fit(train_important, train_labels)

predictions = clf_train.predict(test_important)

errors = abs(predictions - test_labels)
# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2))
# Mean absolute percentage error
MAPE = np.mean(100 * (errors / test_labels))
MAPE

###################" use the same variables for the new algorithm containing all (train and test = present !!)"###################
present_important = final_data_train_test.copy()
present_important = present_important.drop('reel_ap', axis=1)
present_important = present_important.drop('date', axis=1)
present_important = present_important.drop('nb_ajout_panier', axis=1)

futur_important = final_data_forecast.copy()
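Example #31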
        rescaledExpressionClinical = L2Normalizer.transform(np.log10(rescaledExpressionClinical+1))
#    else:
#        prunedRnaSeqExpressionNormalized, L2Normalizer = standardizeExpression(prunedRnaSeqExpression.ix[cellExpression.shape[0],;], L2Normalizer, log10Normalize)
#        prunedArrayExpressionNormalized = L2Normalizer.transform(np.log10(prunedRescaledExpressionClinical+1))

    #Load Docetaxel IC50 Data
    docetaxelData = getDrugIC50('Docetaxel', inputFolder)
    
    #Assemble training data with both IC50 and expression data    
    docetaxelData = pd.merge(docetaxelData, rnaSeqExpressionNormalized, how='inner', left_index=True, right_index=True).drop('cell_line', axis=1)
        
    #Train Docetaxel model    
    clf.fit(docetaxelData.drop(['IC50'], axis=1), docetaxelData['IC50'])    
    
    #Validate on Clinical Data
    resistance_predictions = clf.predict(rescaledExpressionClinical)
    
    #Calculates ROC, first 11 samples correspond to sensitive patients, last 13 are resistant            
    roc_auc_score(np.hstack((np.repeat(0,11), np.repeat(1,13))), resistance_predictions)

    roc_data = pd.DataFrame()
    roc_data['fpr'], roc_data['tpr'],roc_data['thresholds'] = roc_curve(np.hstack((np.repeat(0,11), np.repeat(1,13))), resistance_predictions)


    #Plot Results
    from bokeh.charts import show, output_file
    from bokeh.plotting import figure

    output_file(outputFolder + 'Docetaxel_ROC_Curve_rankIC50.html')
        
Example #32
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    tss, rss, ess, r2 = xss(Y, bayesianRidge.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试ARDRegression类**********"
    ardRegression = ARDRegression()
    # 拟合训练集
    ardRegression.fit(train_X, train_Y.values.ravel())
    # 打印模型的系数
    print "系数:", ardRegression.coef_
    print "截距:", ardRegression.intercept_
    print '训练集R2: ', r2_score(train_Y, ardRegression.predict(train_X))

    # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者
    # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏.
    test_Y_pred = ardRegression.predict(test_X)
    print "测试集得分:", ardRegression.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    tss, rss, ess, r2 = xss(Y, ardRegression.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

# Plotting some predictions for polynomial regression
def f(x, noise_amount):
    y = np.sqrt(x) * np.sin(x)
    noise = np.random.normal(0, 1, len(x))
    return y + noise_amount * noise


degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=1)
clf_poly = ARDRegression(threshold_lambda=1e5)
# np.vander builds a Vandermonde matrix whose columns are decreasing powers of X
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot,
             y_mean,
             y_std,
             color='navy',
             label="Polynomial ARD",
             linewidth=2)
plt.plot(X_plot, y_plot, color='gold', linewidth=2, label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
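Example #34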
from sklearn.datasets import load_boston
from sklearn.linear_model import ARDRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import explained_variance_score, mean_squared_error
import numpy as np
import pylab as pl
#Loading boston dataset
boston = load_boston()
# Creating regression design matrix
x = boston.data
# Creating target dataset
y = boston.target
# Create ARDRegression object
ARD = ARDRegression(alpha_1=0.01, alpha_2=0.01, lambda_1=1e-06, lambda_2=1e-06)
# Fitting a linear model using the dataset
ARD.fit(x,y)
# Y predicted values
yp = ARD.predict(x)
#Calculating 10-fold CV predictions
yp_cv = cross_val_predict(ARD, x, y, cv=10)
#Printing RMSE and explained variance
Evariance = explained_variance_score(y, yp)
Evariance_cv = explained_variance_score(y, yp_cv)
RMSE = np.sqrt(mean_squared_error(y, yp))
RMSECV = np.sqrt(mean_squared_error(y, yp_cv))
print('Method: ARDRegression')
print('RMSE on the dataset: %.4f' % RMSE)
print('RMSE on 10-fold CV: %.4f' % RMSECV)
print('Explained variance score on the dataset: %.4f' % Evariance)
print('Explained variance score on 10-fold CV: %.4f' % Evariance_cv)
#Plotting real vs predicted data
pl.figure(1)
pl.plot(yp, y,'ro')
Example #35
            alpha=0.5,
            color='blue')
plt.title('grid search')
plt.xlim(-10, 50)
plt.ylim(-10, 50)
plt.show()

# ## ARD regression

# In[356]:

ard = ARDRegression()
ard.fit(X_train, y_train)
np.random.seed(0)
print("mean predicted response on the training set: %f" %
      np.mean(ard.predict(X_train)))
print("root mean squared error on the training set: %f" %
      np.sqrt(mean_squared_error(ard.predict(X_train), y_train)))
print("root mean squared error on the test set: %f" %
      np.sqrt(mean_squared_error(ard.predict(X_test), y_test)))
print('coefficient of determination: %f' % ard.score(X_test, y_test))
print('mean absolute error: %f' %
      mean_absolute_error(y_test, ard.predict(X_test)))

# In[357]:

print(list(np.array(y_test)[:10]))
print(list(map(lambda x: int(round(x)), (ard.predict(X_test))[:10])))