def test_ard_regression_predict_normalize_true():
    """Check that we can predict with `normalize=True` and `return_std=True`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/18605
    """
    clf = ARDRegression(normalize=True)
    clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
    clf.predict([[1, 1]], return_std=True)
def runARDRegressor(self):
    lm = ARDRegression(fit_intercept=True, normalize=True)
    print("runARDRegressor\n")
    lm.fit(self.m_X_train, self.m_y_train)
    predictY = lm.predict(self.m_X_test)
    score = lm.score(self.m_X_test, self.m_y_test)
    predictTrainY = lm.predict(self.m_X_train)
    self.displayPredictPlot(predictY)
    self.displayResidualPlot(predictY, predictTrainY)
    self.dispalyModelResult(lm, predictY, score)
def test_update_of_sigma_in_ard():
    # Checks that `sigma_` is updated correctly after the last iteration
    # of the ARDRegression algorithm. See issue #10128.
    X = np.array([[1, 0], [0, 0]])
    y = np.array([0, 0])
    clf = ARDRegression(n_iter=1)
    clf.fit(X, y)
    # With the inputs above, ARDRegression prunes both of the two coefficients
    # in the first iteration. Hence, the expected shape of `sigma_` is (0, 0).
    assert clf.sigma_.shape == (0, 0)
    # Ensure that no error is thrown at prediction stage
    clf.predict(X, return_std=True)
def _ard(*, train, test, x_predict=None, metrics, n_iter=300, tol=0.001,
         alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06,
         compute_score=False, threshold_lambda=10000.0, fit_intercept=True,
         normalize=False, copy_X=True, verbose=False):
    """For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html#sklearn.linear_model.ARDRegression
    """
    model = ARDRegression(n_iter=n_iter, tol=tol, alpha_1=alpha_1,
                          alpha_2=alpha_2, lambda_1=lambda_1,
                          lambda_2=lambda_2, compute_score=compute_score,
                          threshold_lambda=threshold_lambda,
                          fit_intercept=fit_intercept, normalize=normalize,
                          copy_X=copy_X, verbose=verbose)
    model.fit(train[0], train[1])
    model_name = 'ARDRegression'
    y_hat = model.predict(test[0])

    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    if metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    if metrics == 'mae':
        accuracy = _mae(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
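# A minimal usage sketch for `_ard` above. Hedged: `_mse` below is a
# hypothetical stand-in for the module's own error helper (not shown in this
# snippet), and the call assumes a scikit-learn version that still accepts
# ARDRegression's `n_iter`/`normalize` parameters, as used above.
import numpy as np
from sklearn.linear_model import ARDRegression

def _mse(y_true, y_pred):
    # hypothetical helper; the real module defines its own _mse/_rmse/_mae
    return float(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))

rng = np.random.RandomState(0)
X = rng.rand(40, 3)
y = X @ np.array([1.0, 0.0, -2.0]) + 0.01 * rng.randn(40)
name, err, preds = _ard(train=(X[:30], y[:30]), test=(X[30:], y[30:]),
                        x_predict=X[30:], metrics='mse')
print(name, round(err, 6), preds.shape)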
class ARDRegressionPrim(primitive):
    def __init__(self, random_state=0):
        super(ARDRegressionPrim, self).__init__(name='ARDRegression')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = (
            "Bayesian ARD regression. Fit the weights of a regression model, "
            "using an ARD prior. The weights of the regression model are "
            "assumed to be in Gaussian distributions. Also estimate the "
            "parameters lambda (precisions of the distributions of the "
            "weights) and alpha (precision of the distribution of the noise). "
            "The estimation is done by an iterative procedure "
            "(Evidence Maximization).")
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = ARDRegression()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'],
                                   columns=[self.name + "Pred"])
        final_output = {0: output}
        return final_output
def ard_regression(train, test):
    train = train.copy()
    test = test.copy()

    X = train.to_numpy()
    X_train = np.delete(X, [train.columns.get_loc('views')], axis=1)
    y_train = train['views']

    X = test.to_numpy()
    X_test = np.delete(X, [test.columns.get_loc('views')], axis=1)
    y_test = test['views']

    reg = ARDRegression(compute_score=True)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # The mean squared error
    print('Mean squared error: %.2f'
          % mean_squared_error(y_test, y_pred, squared=True))
    # The median absolute error
    print('Median absolute error: %.2f'
          % median_absolute_error(y_test, y_pred))
    return None
def ARDRegression_on_fold(feature_sets, train, test, y, y_all, X, dim,
                          dimsum, learn_options):
    '''Fit ARDRegression on the training fold and predict the test fold.'''
    clf = ARDRegression()
    clf.fit(X[train], y[train][:, 0])
    y_pred = clf.predict(X[test])[:, None]
    return y_pred, clf
def test_return_std():
    # Test return_std option for both Bayesian regressors
    def f(X):
        return np.dot(X, w) + b

    def f_noise(X, noise_mult):
        return f(X) + np.random.randn(X.shape[0]) * noise_mult

    d = 5
    n_train = 50
    n_test = 10

    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    b = 1.0

    X = np.random.random((n_train, d))
    X_test = np.random.random((n_test, d))

    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
        y = f_noise(X, noise_mult)

        m1 = BayesianRidge()
        m1.fit(X, y)
        y_mean1, y_std1 = m1.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std1, noise_mult, decimal=decimal)

        m2 = ARDRegression()
        m2.fit(X, y)
        y_mean2, y_std2 = m2.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std2, noise_mult, decimal=decimal)
class ARDR():
    """Thin wrapper around scikit-learn's ARDRegression for this framework."""

    def __init__(self, ARDRegression, N):
        self.cores_number = int(np.ceil(multiprocessing.cpu_count() / N))
        self.selected_columns = []
        self.model = ARDRegression(
            alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
            fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
            normalize=False, threshold_lambda=10000.0, tol=0.001,
            verbose=False)
        print("ARDRegression Cores: ", self.cores_number)

    def fit(self, X_train, y_train, X_test, y_test, error_type="MAE"):
        # Subsample 100 random columns when possible; otherwise keep them all.
        try:
            self.selected_columns = np.random.choice(X_train.columns, 100,
                                                     replace=False)
            X_train = X_train[self.selected_columns]
        except Exception:
            pass
        # Note: error_metric is looked up but not used by ARDRegression.fit.
        error_dict = {"MSE": "rmse", "R2": {"l1", "l2"}, "MAE": "mae",
                      "LOGLOSS": "multi_logloss"}
        error_metric = error_dict[error_type]
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        prediction = self.model.predict(X_test[self.selected_columns])
        return prediction
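# A hedged usage sketch for ARDR above: it assumes a DataFrame with at least
# 100 columns so the random column subsample in fit() succeeds (with fewer
# columns, selected_columns stays empty and predict() would fail), and a
# scikit-learn version that still accepts the `n_iter`/`normalize` parameters.
import multiprocessing
import numpy as np
import pandas as pd
from sklearn.linear_model import ARDRegression

rng = np.random.RandomState(0)
frame = pd.DataFrame(rng.rand(50, 120), columns=[f'f{i}' for i in range(120)])
target = 2 * frame['f0'] + 0.1 * rng.randn(50)

m = ARDR(ARDRegression, N=1)
m.fit(frame, target, None, None)
print(m.predict(frame).shape)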
def autorelevancedetermination(self):
    # Fit the ARD regression
    clf = ARDRegression(compute_score=True)
    clf.fit(self.x_train, self.y_train)
    z = clf.predict(self.x_test)
    # Fraction of exact matches between predictions and targets
    print(np.mean(self.y_test == z))
    return z
def test_toy_ard_object():
    # Test an ARDRegression model on a toy dataset
    X = np.array([[1], [2], [3]])
    Y = np.array([1, 2, 3])
    clf = ARDRegression(compute_score=True)
    clf.fit(X, Y)

    # Check that the model could approximately learn the identity function
    test = [[1], [3], [4]]
    assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)
def ARD(X_train, y_train, X_test, y_test):
    '''
    Purpose: Use ARD to calculate accuracy
    Input: X_train, y_train, X_test, y_test
    Output: accuracy_score
    '''
    clf = ARDRegression(compute_score=True)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred = y_pred.round()
    # ols = LinearRegression()
    # ols.fit(X, y)
    return metrics.accuracy_score(y_test, y_pred)
def fit_model_16(self, toWrite=False):
    model = ARDRegression()

    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 16 score %f" % (logloss(Y_test, pred),))

    if toWrite:
        # Pickle requires a binary file handle.
        with open('model16/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
class _ARDRegressionImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
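# Usage sketch for the wrapper above. Hedged: it assumes `Op` is bound to
# sklearn's ARDRegression, an alias this snippet itself does not show.
from sklearn.linear_model import ARDRegression as Op

impl = _ARDRegressionImpl(compute_score=True)
impl.fit([[0], [1], [2]], [0, 1, 2])
print(impl.predict([[1.5]]))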
def predict(self):
    """Train scikit-learn's ARDRegression (https://scikit-learn.org) on the
    training set, predict on the held-out feature set, and return the
    accuracy of the predictions against the held-out y_test values.
    """
    algorithm = ARDRegression(threshold_lambda=1e5)
    algorithm.fit(self.X_train, self.y_train)
    y_pred = list(algorithm.predict(self.X_test))
    self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
    return self.acc
    sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
rank_result['BR_pca'] = sumsum / float(result_row)
rs_score['BR_pca'] = r2_score(y_test, y)

BRModel = BayesianRidge()
BRModel.fit(X_train_std, y_train)
y = BRModel.predict(X_test_std)
[result_row] = y.shape
sumsum = 0
for i in range(result_row):
    sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
rank_result['BR_std'] = sumsum / float(result_row)
rs_score['BR_std'] = r2_score(y_test, y)

ARDModel = ARDRegression()
ARDModel.fit(X_train_pca, y_train)
y = ARDModel.predict(X_test_pca)
[result_row] = y.shape
sumsum = 0
for i in range(result_row):
    sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
rank_result['ARD_pca'] = sumsum / float(result_row)
rs_score['ARD_pca'] = r2_score(y_test, y)

ARDModel = ARDRegression()
ARDModel.fit(X_train_std, y_train)
y = ARDModel.predict(X_test_std)
[result_row] = y.shape
sumsum = 0
for i in range(result_row):
    sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
class Bayesian_Linear_Model:
    """Bayesian linear regression object compatible with the BO framework.

    Model implemented using scikit-learn:
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html#sklearn.linear_model.ARDRegression
    """

    def __init__(self, X, y, **kwargs):
        """
        Parameters
        ----------
        X : list, numpy.array, pandas.DataFrame
            Domain points to be used for model training.
        y : list, numpy.array, pandas.DataFrame
            Response values to be used for model training.
        """
        # CV set gamma prior parameters - no GS for now
        self.alphas = np.logspace(-6, 0.5, 7)

        # Initialize model
        self.model = ARDRegression(n_iter=50)

        # Make sure X and y are numpy arrays
        self.X = np.array(X)
        self.y = np.array(y)

    # Fit
    def fit(self):
        """Train the model using grid search CV."""
        parameters = [{'alpha_1': self.alphas, 'alpha_2': self.alphas}]

        # Set the number of folds
        if len(self.X) < 5:
            n_folds = len(self.X)
        else:
            n_folds = 5

        # Run grid search
        if n_folds > 1:
            # Select the gamma prior parameters via grid search
            self.grid_search = GridSearchCV(self.model, parameters,
                                            cv=n_folds, refit=True, n_jobs=-1)
            self.grid_search.fit(self.X, self.y)

            # Set model to trained model
            self.model = self.grid_search.best_estimator_

        # Just fit model
        else:
            self.model.fit(self.X, self.y)

    def get_scores(self):
        """Get grid search cross validation results.

        Returns
        ----------
        (numpy.array, numpy.array)
            Average scores and standard deviation of scores for grid.
        """
        scores = self.grid_search.cv_results_['mean_test_score']
        scores_std = self.grid_search.cv_results_['std_test_score']
        return scores, scores_std

    # Predict
    def predict(self, points):
        """Model predictions.

        Parameters
        ----------
        points : list, numpy.array, pandas.DataFrame
            Domain points to be evaluated.

        Returns
        ----------
        numpy.array
            Predicted response values at points.
        """
        # Make sure points are in a numpy array
        points = np.array(points)

        # Make predictions
        pred = self.model.predict(points)
        return pred

    # Regression
    def regression(self, return_data=False, export_path=None,
                   return_scores=False):
        """Helper method for visualizing the model's regression performance.

        Generates a predicted vs observed plot using the model's training
        data.

        Parameters
        ----------
        return_data : bool
            Return predicted responses.
        export_path : None, str
            Export SVG image of predicted vs observed plot to export_path.

        Returns
        ----------
        matplotlib.pyplot
            Scatter plot with computed RMSE and R^2.
        """
        pred = self.predict(self.X)
        obs = self.y
        return pred_obs(pred, obs, return_data=return_data,
                        export_path=export_path, return_scores=return_scores)

    # Estimate variance
    def variance(self, points):
        """Estimated variance of Bayesian linear model.

        Parameters
        ----------
        points : numpy.array
            Domain points to be evaluated.

        Returns
        ----------
        numpy.array
            Model variance at points.
        """
        # Make sure points are in a numpy array
        points = np.array(points)

        # Make predictions
        pred, std = self.model.predict(points, return_std=True)
        return std**2
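# A hedged usage sketch for Bayesian_Linear_Model above, assuming numpy (np),
# GridSearchCV, and an ARDRegression accepting `n_iter` are importable in the
# class's module, which this snippet does not show.
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ARDRegression

rng = np.random.RandomState(1)
X = rng.rand(20, 2)
y = 3 * X[:, 0] - X[:, 1] + 0.05 * rng.randn(20)

blm = Bayesian_Linear_Model(X, y)
blm.fit()                    # grid-searches alpha_1/alpha_2 with 5-fold CV
print(blm.predict(X[:3]))    # posterior mean predictions
print(blm.variance(X[:3]))   # predictive variance via return_std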
def ard_regression(data_x, data_y):
    clf = ARDRegression(compute_score=True)
    clf.fit(data_x, data_y)
    predict_x = np.array(range(0, data_x.shape[0]))
    predict_x = np.reshape(predict_x, (data_x.shape[0], 1))
    return predict_x, clf.predict(predict_x)
plt.title("Action unit weights") plt.bar(au_coeff.index, au_coeff.values) plt.xlabel("Action units") plt.ylabel("Values of the weights") #smooth function to make plotted data more human readable def smooth(y, box_pts): box = np.ones(box_pts) / box_pts y_smooth = np.convolve(y, box, mode='same') return y_smooth plt.figure(figsize=(6, 5)) plt.title("Predictions") y_predict, y_std = clf.predict(x_valid, return_std=True) axis = np.arange(0, n_samples_valid) plt.plot(axis, y_predict, color='lightsteelblue', linewidth=0.5, linestyle='dotted', markersize=0.8, label="ARD", marker='.') plt.plot(axis, smooth(y_predict, 100), color='navy', label="ARD smoothed") plt.plot(axis, y_valid, color='gold', linewidth=2, label="Ground Truth") plt.xlabel("Samples") plt.ylabel("Valence") plt.legend(loc='upper left', fontsize=8) print("MAE")
gbr.fit(X, y)
br = BayesianRidge(compute_score=True)
br.fit(X, y)
ardr = ARDRegression(compute_score=True)
ardr.fit(X, y)
knn = neighbors.KNeighborsRegressor(5, weights='distance')
knn.fit(X, y)

# Predict with each fitted model over a dense grid
X_plot = np.linspace(0, num + 10, 10000)[:, None]
y_kr = kr.predict(X_plot)
y_svr = svr.predict(X_plot)
y_abr = abr.predict(X_plot)
y_gbr = gbr.predict(X_plot)
y_br = br.predict(X_plot)
y_ardr = ardr.predict(X_plot)
y_knn = knn.predict(X_plot)

# Plot results
fig = plt.figure(figsize=(10, 5))
lw = 2
plt.scatter(X, y, c='k', s=5, label='data')
plt.plot(X_plot, y_kr, color='turquoise', lw=lw,
         label='KRR (%s)' % kr.best_params_)
# plt.plot(X_plot, y_svr, color='r', lw=lw, label='SVR (%s)' % svr.best_params_)
plt.plot(X_plot, y_abr, color='g', lw=lw, label='AdaBoostRegressor')
# plt.plot(X_plot, y_gbr, color='k', lw=lw, label='GradientBoostingRegressor')
# plt.plot(X_plot, y_br, color='k', lw=lw, label='BayesianRidge')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv("datasets/studentscores.csv")
print(dataset.shape)
X = dataset.iloc[:, :1].values
Y = dataset.iloc[:, 1].values

# sklearn.cross_validation was removed; train_test_split now lives in
# sklearn.model_selection.
from sklearn.model_selection import train_test_split
# random_state is a pseudo-random seed parameter
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)

from sklearn.linear_model import ARDRegression
regressor = ARDRegression()
regressor = regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

plt.scatter(X_train, Y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.show()

plt.scatter(X_test, Y_test, color='red')
plt.plot(X_test, regressor.predict(X_test), color='blue')
plt.show()
def build_ARDRegression(X_train, Y_train, X_target, Y_target, target_str):
    clf = ARDRegression()
    clf.fit(X_train, Y_train)
    Y_p = clf.predict(X_target)
    Y_e = np.array(Y_target)
    return Y_e, Y_p
def task2(data):
    df = data
    dfreg = df.loc[:, ['Adj Close', 'Volume']]
    dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Fill missing values with a sentinel
    dfreg.fillna(value=-99999, inplace=True)

    # We want to separate 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))

    # Separating the label here: we want to predict the Adj Close
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(['label'], axis=1))

    # Scale X so that all features share the same distribution for
    # linear regression
    X = preprocessing.scale(X)

    # Find the data series of late X (forecast) and early X (train) for
    # model generation and evaluation
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate the label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=0)

    # Linear regression
    clfreg = LinearRegression(n_jobs=-1)
    clfreg.fit(X_train, y_train)

    # Quadratic regression 2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    clfpoly2.fit(X_train, y_train)

    # Quadratic regression 3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    clfpoly3.fit(X_train, y_train)

    # KNN regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    clfknn.fit(X_train, y_train)

    # Lasso regression
    clflas = Lasso()
    clflas.fit(X_train, y_train)

    # Multitask Lasso regression
    # clfmtl = MultiTaskLasso(alpha=1.)
    # clfmtl.fit(X_train, y_train).coef_

    # Bayesian Ridge regression
    clfbyr = BayesianRidge()
    clfbyr.fit(X_train, y_train)

    # Lasso LARS regression
    clflar = LassoLars(alpha=.1)
    clflar.fit(X_train, y_train)

    # Orthogonal Matching Pursuit regression
    clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2)
    clfomp.fit(X_train, y_train)

    # Automatic Relevance Determination regression
    clfard = ARDRegression(compute_score=True)
    clfard.fit(X_train, y_train)

    # Logistic regression
    # clflgr = linear_model.LogisticRegression(penalty='l1', solver='saga',
    #                                          tol=1e-6, max_iter=int(1e6),
    #                                          warm_start=True)
    # coefs_ = []
    # for c in cs:
    #     clflgr.set_params(C=c)
    #     clflgr.fit(X_train, y_train)
    #     coefs_.append(clflgr.coef_.ravel().copy())

    clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)
    clfsgd.fit(X_train, y_train)

    # Create confidence scores
    confidencereg = clfreg.score(X_test, y_test)
    confidencepoly2 = clfpoly2.score(X_test, y_test)
    confidencepoly3 = clfpoly3.score(X_test, y_test)
    confidenceknn = clfknn.score(X_test, y_test)
    confidencelas = clflas.score(X_test, y_test)
    # confidencemtl = clfmtl.score(X_test, y_test)
    confidencebyr = clfbyr.score(X_test, y_test)
    confidencelar = clflar.score(X_test, y_test)
    confidenceomp = clfomp.score(X_test, y_test)
    confidenceard = clfard.score(X_test, y_test)
    confidencesgd = clfsgd.score(X_test, y_test)

    # Results
    print('The linear regression confidence is:', confidencereg * 100)
    print('The quadratic regression 2 confidence is:', confidencepoly2 * 100)
    print('The quadratic regression 3 confidence is:', confidencepoly3 * 100)
    print('The knn regression confidence is:', confidenceknn * 100)
    print('The lasso regression confidence is:', confidencelas * 100)
    # print('The multitask lasso regression confidence is:', confidencemtl * 100)
    print('The Bayesian Ridge regression confidence is:', confidencebyr * 100)
    print('The Lasso LARS regression confidence is:', confidencelar * 100)
    print('The OMP regression confidence is:', confidenceomp * 100)
    print('The ARD regression confidence is:', confidenceard * 100)
    print('The SGD regression confidence is:', confidencesgd * 100)

    # Create new columns
    forecast_reg = clfreg.predict(X_lately)
    forecast_pol2 = clfpoly2.predict(X_lately)
    forecast_pol3 = clfpoly3.predict(X_lately)
    forecast_knn = clfknn.predict(X_lately)
    forecast_las = clflas.predict(X_lately)
    forecast_byr = clfbyr.predict(X_lately)
    forecast_lar = clflar.predict(X_lately)
    forecast_omp = clfomp.predict(X_lately)
    forecast_ard = clfard.predict(X_lately)
    forecast_sgd = clfsgd.predict(X_lately)

    # Process all new columns data; the first forecast column also appends
    # the new forecast rows to the frame.
    dfreg['Forecast_reg'] = np.nan

    last_date = dfreg.iloc[-1].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_reg:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
        dfreg['Forecast_reg'].loc[next_date] = i

    # The remaining forecast columns follow an identical pattern, anchored 26
    # rows from the end (the rows just appended above); the original repeated
    # this block verbatim per model.
    other_forecasts = {
        'Forecast_pol2': forecast_pol2,
        'Forecast_pol3': forecast_pol3,
        'Forecast_knn': forecast_knn,
        'Forecast_las': forecast_las,
        'Forecast_byr': forecast_byr,
        'Forecast_lar': forecast_lar,
        'Forecast_omp': forecast_omp,
        'Forecast_ard': forecast_ard,
        'Forecast_sgd': forecast_sgd,
    }
    for col, forecast in other_forecasts.items():
        dfreg[col] = np.nan

        last_date = dfreg.iloc[-26].name
        last_unix = last_date
        next_unix = last_unix + datetime.timedelta(days=1)

        for i in forecast:
            next_date = next_unix
            next_unix += datetime.timedelta(days=1)
            dfreg[col].loc[next_date] = i

    return (dfreg.index.format(formatter=lambda x: x.strftime('%Y-%m-%d')),
            dfreg['Adj Close'].to_list(),
            dfreg['Forecast_reg'].to_list(),
            dfreg['Forecast_pol2'].to_list(),
            dfreg['Forecast_pol3'].to_list(),
            dfreg['Forecast_knn'].to_list(),
            dfreg['Forecast_las'].to_list(),
            dfreg['Forecast_byr'].to_list(),
            dfreg['Forecast_lar'].to_list(),
            dfreg['Forecast_omp'].to_list(),
            dfreg['Forecast_ard'].to_list(),
            dfreg['Forecast_sgd'].to_list())
os.chdir(folder)
name_folder = folder.split("/")[6]

train_data = np.array(pd.read_csv('train_data.csv', sep=';'))
test_data = np.array(pd.read_csv('test_data.csv', sep=';'))
train_labels = np.array(pd.read_csv('train_labels.csv', sep=';'))
test_labels = np.array(pd.read_csv('test_labels.csv', sep=';'))

inicio = time.time()

from sklearn.linear_model import ARDRegression

# Train the model on the dataset
regression = ARDRegression().fit(train_data, train_labels)

# Predict
predictions_labels = regression.predict(test_data)

fim = time.time()

df_time = pd.DataFrame({'Execution Time:': [fim - inicio]})
output_path = os.path.join(
    '/home/isadorasalles/Documents/Regressao/bayesian_ARD',
    'time_' + name_folder)
df_time.to_csv(output_path, sep=';')

from sklearn import metrics

df_metrics = pd.DataFrame({
    'Mean Absolute Error':
    [metrics.mean_absolute_error(test_labels, predictions_labels)],
    'Mean Squared Error':
class ARDClass:
    """
    Name : ARDRegression
    Attribute : None
    Method : predict, predict_by_cv, save_model
    """

    def __init__(self):
        # Algorithm name
        self._name = 'ard'

        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))

        # Suppress warning messages
        warnings.filterwarnings('ignore')

        # Load the raw data
        data = pd.read_csv(
            self._f_path + "/regression/resource/regression_sample.csv",
            sep=",", encoding="utf-8")

        # Split rows into training and test sets by year
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)

        # Training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])

        # Test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])

        # Declare the model
        self._model = ARDRegression(normalize=True)

        # Train the model
        self._model.fit(self._x_train, self._y_train)

    # Data preprocessing
    def preprocessing(self, data):
        # Features
        x = []
        # Labels
        y = []
        # Window size (7 days)
        base_interval = 7
        # Temperatures
        temps = list(data["temperature"])

        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])

            xa = []
            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Plain prediction
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)

        # Score
        score = r2_score(self._y_test, y_pred)

        # Report
        if hasattr(self._model, 'coef_') and hasattr(self._model,
                                                     'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')

        print(f'Score = {score}')

        # Optionally save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)

        # Predictions & score
        return [list(y_pred), score]

    # CV prediction (cross validation)
    def predict_by_cv(self):
        # For regression algorithms, implement cross validation to fit the
        # needs of the actual project.
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model
        if not renew:
            # First save
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}_rg.pkl',
                    self._f_path +
                    f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        # Figure size
        plt.figure(figsize=(15, 10), dpi=100)

        # Ground truth
        plt.plot(self._y_test, c='r')

        # Predictions
        plt.plot(data, c='b')

        # Save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')

        # Show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, \
            self._x, self._y, self._model
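# A small standalone illustration of the sliding-window preprocessing in
# ARDClass above: each target temperature is predicted from the previous
# `base_interval` (7) observations.
temps = list(range(10))  # toy "temperature" series
base_interval = 7
x, y = [], []
for i in range(base_interval, len(temps)):
    y.append(temps[i])
    x.append(temps[i - base_interval:i])
print(x[0], '->', y[0])  # [0, 1, 2, 3, 4, 5, 6] -> 7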
print('Variance score: %.2f' % r2_score(y_test, y_pred))

plt.scatter([i for i in range(1, len(y_test) + 1)], y_test)
plt.plot(y_pred)
plt.show()

'''
Automatic Relevance Determination Regression (ARD)
'''
print("\n\nAutomatic Relevance Determination Regression (ARD)\n\n")

from sklearn.linear_model import ARDRegression

regr = ARDRegression(compute_score=True)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.5f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

plt.scatter([i for i in range(1, len(y_test) + 1)], y_test)
plt.plot(y_pred)
plt.show()

'''
Passive Aggressive Regressor
'''
from sklearn.linear_model import PassiveAggressiveRegressor
plt.plot(clf.scores_, color='navy', linewidth=2)
plt.ylabel("Score")
plt.xlabel("Iterations")

# Plotting some predictions for polynomial regression
def f(x, noise_amount):
    y = np.sqrt(x) * np.sin(x)
    noise = np.random.normal(0, 1, len(x))
    return y + noise_amount * noise

degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=1)
clf_poly = ARDRegression(threshold_lambda=1e5)
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color='navy',
             label="Polynomial ARD", linewidth=2)
plt.plot(X_plot, y_plot, color='gold', linewidth=2, label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
# Number of principal components to use, at most n_good_training - 2
n_components = n_valid_abd - 2
X = pca.components_[0:n_components, :]
X = np.transpose(X)

# Selection of X training from X
X_training = pca.components_[0:n_components, 0:n_training]
X_training = X_training[:, index]
X_training = np.reshape(X_training, (n_components, n_valid_abd))
X_training = np.transpose(X_training)

# The regression itself happens here
clf = ARDRegression(compute_score=True)
clf.fit(X_training, y_training)
pca_fitted_metal[:, jj] = clf.predict(X)

print('# ++++++++++++++++++++++++++++++++++++++', file=f)
print('# Derived abundances for the TRAINING stars', file=f)
print('#', code[:], file=f)
for ii in range(0, n_training):
    y = []
    arr = np.array(pca_fitted_metal[ii, :])
    for jj in arr:
        x = '{:9.2f}'.format(jj)
        y.append(x)
    print(y, file=f)
print('# ++++++++++++++++++++++++++++++++++++++', file=f)
print('# Derived abundances for the PROBLEM stars', file=f)
    feature_list.index('COVER_80P_NB_ART'),
    feature_list.index('ranking'),
    # feature_list.index('lag30_np'),
    # feature_list.index('lag3_np'),
    feature_list.index('lag3_pv'),
    feature_list.index('week'),
]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]

# Train
from sklearn.linear_model import ARDRegression, LinearRegression

clf_train = ARDRegression(compute_score=True)
clf_train.fit(train_important, train_labels)
predictions = clf_train.predict(test_important)
errors = abs(predictions - test_labels)

# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
mape = np.mean(100 * (errors / test_labels))
MAPE = np.mean(mape)
MAPE

# ##### use the same variables for the new algorithm containing all
# ##### (train and test = present!)
present_important = final_data_train_test.copy()
present_important = present_important.drop('reel_ap', axis=1)
present_important = present_important.drop('date', axis=1)
present_important = present_important.drop('nb_ajout_panier', axis=1)

futur_important = final_data_forecast.copy()
rescaledExpressionClinical = L2Normalizer.transform(
    np.log10(rescaledExpressionClinical + 1))
# else:
#     prunedRnaSeqExpressionNormalized, L2Normalizer = standardizeExpression(prunedRnaSeqExpression.ix[cellExpression.shape[0],:], L2Normalizer, log10Normalize)
#     prunedArrayExpressionNormalized = L2Normalizer.transform(np.log10(prunedRescaledExpressionClinical+1))

# Load Docetaxel IC50 data
docetaxelData = getDrugIC50('Docetaxel', inputFolder)

# Assemble training data with both IC50 and expression data
docetaxelData = pd.merge(docetaxelData, rnaSeqExpressionNormalized,
                         how='inner', left_index=True,
                         right_index=True).drop('cell_line', axis=1)

# Train Docetaxel model
clf.fit(docetaxelData.drop(['IC50'], axis=1), docetaxelData['IC50'])

# Validate on clinical data
resistance_predictions = clf.predict(rescaledExpressionClinical)

# Calculate ROC; the first 11 samples correspond to sensitive patients,
# the last 13 are resistant
roc_auc_score(np.hstack((np.repeat(0, 11), np.repeat(1, 13))),
              resistance_predictions)
roc_data = pd.DataFrame()
roc_data['fpr'], roc_data['tpr'], roc_data['thresholds'] = roc_curve(
    np.hstack((np.repeat(0, 11), np.repeat(1, 13))), resistance_predictions)

# Plot results
from bokeh.charts import show, output_file
from bokeh.plotting import figure

output_file(outputFolder + 'Docetaxel_ROC_Curve_rankIC50.html')
print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, bayesianRidge.predict(X)) print "TSS(Total Sum of Squares): ", tss print "RSS(Residual Sum of Squares): ", rss print "ESS(Explained Sum of Squares): ", ess print "R^2: ", r2 print "\n**********测试ARDRegression类**********" ardRegression = ARDRegression() # 拟合训练集 ardRegression.fit(train_X, train_Y.values.ravel()) # 打印模型的系数 print "系数:", ardRegression.coef_ print "截距:", ardRegression.intercept_ print '训练集R2: ', r2_score(train_Y, ardRegression.predict(train_X)) # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者 # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏. test_Y_pred = ardRegression.predict(test_X) print "测试集得分:", ardRegression.score(test_X, test_Y) print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred) print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)) print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, ardRegression.predict(X)) print "TSS(Total Sum of Squares): ", tss print "RSS(Residual Sum of Squares): ", rss print "ESS(Explained Sum of Squares): ", ess print "R^2: ", r2
# Plotting some predictions for polynomial regression
def f(x, noise_amount):
    y = np.sqrt(x) * np.sin(x)
    noise = np.random.normal(0, 1, len(x))
    return y + noise_amount * noise

degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=1)
clf_poly = ARDRegression(threshold_lambda=1e5)
# np.vander builds a Vandermonde matrix: each row holds decreasing powers
# (degree-1 down to 0) of the corresponding input value.
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color='navy',
             label="Polynomial ARD", linewidth=2)
plt.plot(X_plot, y_plot, color='gold', linewidth=2, label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
from sklearn.datasets import load_boston
from sklearn.linear_model import ARDRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import explained_variance_score, mean_squared_error
import numpy as np
import pylab as pl

# Loading the boston dataset
boston = load_boston()
# Creating the regression design matrix
x = boston.data
# Creating the target dataset
y = boston.target

# Create an ARDRegression object
ARD = ARDRegression(alpha_1=0.01, alpha_2=0.01, lambda_1=1e-06,
                    lambda_2=1e-06)
# Fitting a linear model using the dataset
ARD.fit(x, y)
# Predicted Y values
yp = ARD.predict(x)
# Calculate 10-fold CV predictions
yp_cv = cross_val_predict(ARD, x, y, cv=10)

# Printing RMSE and explained variance
Evariance = explained_variance_score(y, yp)
Evariance_cv = explained_variance_score(y, yp_cv)
RMSE = np.sqrt(mean_squared_error(y, yp))
RMSECV = np.sqrt(mean_squared_error(y, yp_cv))

print('Method: ARDRegression Regression')
print('RMSE on the dataset: %.4f' % RMSE)
print('RMSE on 10-fold CV: %.4f' % RMSECV)
print('Explained Variance Regression Score on the dataset: %.4f' % Evariance)
print('Explained Variance Regression 10-fold CV: %.4f' % Evariance_cv)

# Plotting real vs predicted data
pl.figure(1)
pl.plot(yp, y, 'ro')
            alpha=0.5, color='blue')
plt.title('grid search')
plt.xlim(-10, 50)
plt.ylim(-10, 50)
plt.show()

# ## ARD regression

# In[356]:

ard = ARDRegression()
ard.fit(X_train, y_train)

np.random.seed(0)
print("mean response on the training set: %f"
      % np.mean(ard.predict(X_train)))
print("root mean squared error, training set: %f"
      % np.sqrt(mean_squared_error(ard.predict(X_train), y_train)))
print("root mean squared error, test set: %f"
      % np.sqrt(mean_squared_error(ard.predict(X_test), y_test)))
print('coefficient of determination: %f' % ard.score(X_test, y_test))
print('mean absolute error: %f'
      % mean_absolute_error(y_test, ard.predict(X_test)))

# In[357]:

print(list(np.array(y_test)[:10]))
print(list(map(lambda x: int(round(x)), (ard.predict(X_test))[:10])))