def train(self, epoch_count=40, learning_rate=0.06):
    self.regressor = SGDRegressor(max_iter=epoch_count, tol=1e-5,
                                  learning_rate='constant', eta0=learning_rate)
    # Run the training by calling the library method
    self.regressor.fit(self.X_train, self.Y_train)
def __init__(self, timeseries, dataname,
             # parameters for SGD regression
             loss="squared_loss", penalty="l2", alpha=0.0001,
             fit_intercept=True, tol=1e-3, learning_rate="invscaling",
             l1_ratio=0.15, epsilon=0.1, max_iter=1000, eta0=0.01,
             power_t=0.5, average=False,
             # feature extraction parameters
             Window_size=20, Difference=False, time_feature=True,
             tsfresh_feature=True, forecasting_steps=25, n_splits=5,
             max_train_size=None, NAN_threshold=0.05):
    self.loss = loss
    self.penalty = penalty
    self.alpha = float(alpha)
    self.fit_intercept = fit_intercept
    self.tol = float(tol)
    self.learning_rate = learning_rate
    self.l1_ratio = float(l1_ratio) if l1_ratio is not None else 0.15
    self.epsilon = float(epsilon) if epsilon is not None else 0.1
    self.max_iter = int(max_iter)
    self.eta0 = float(eta0)
    self.power_t = float(power_t) if power_t is not None else 0.25
    self.average = average
    self.estimator = SGDRegressor(loss=self.loss,
                                  penalty=self.penalty,
                                  alpha=self.alpha,
                                  fit_intercept=self.fit_intercept,
                                  max_iter=self.max_iter,
                                  tol=self.tol,
                                  learning_rate=self.learning_rate,
                                  l1_ratio=self.l1_ratio,
                                  epsilon=self.epsilon,
                                  eta0=self.eta0,
                                  power_t=self.power_t,
                                  average=self.average,
                                  warm_start=True)
    super().__init__(timeseries, dataname, Window_size, time_feature,
                     Difference, tsfresh_feature, forecasting_steps,
                     n_splits, max_train_size, NAN_threshold)
def test_partial_fit(self):
    # define an online pipeline
    piple = OnlinePipeline([
        ('scale', StandardScaler()),
        ('clf', SGDRegressor(random_state=5, shuffle=False,
                             verbose=True, max_iter=10)),
    ])
    # define an offline pipeline
    pipl = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SGDRegressor(random_state=5, shuffle=False,
                             verbose=True, max_iter=20)),
    ])
    # generate some data
    X, y = make_regression(100, 100, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.33, random_state=42)
    # fit and predict in an online manner
    for i in range(100):
        piple.partial_fit(X_train[0:30], y_train[0:30])
        piple.partial_fit(X_train[30:], y_train[30:])
    ye = piple.predict(X_test)
    # fit and predict in an offline manner
    pipl.fit(X_train, y_train)
    yh = pipl.predict(X_test)
    # compare results
    r2_offline = r2_score(y_test, yh)
    r2_online = r2_score(y_test, ye)
    # use a relatively high tolerance because online and offline
    # training do not produce identical results
    np.testing.assert_allclose(r2_offline, r2_online, atol=0.1)
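# The OnlinePipeline used in the test above is not part of scikit-learn; the
# following is a minimal sketch of what the test assumes (the name and the
# exact behavior are assumptions): forward partial_fit through each step,
# transforming X along the way so the final estimator sees scaled data.
from sklearn.pipeline import Pipeline

class OnlinePipeline(Pipeline):
    def partial_fit(self, X, y=None):
        for i, (name, est) in enumerate(self.steps):
            # incrementally update this step (e.g. StandardScaler's running
            # mean/std, or SGDRegressor's coefficients)
            est.partial_fit(X, y)
            if i < len(self.steps) - 1:
                # pass the transformed data on to the next step
                X = est.transform(X)
        return self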
def test_partial_fit(self):
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df[['x']][0:2]
    Y = df[['y']][0:2]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(df[['x']], 'datax-full')
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, store (unfitted) in Omega
    # -- ignore warnings on y shape
    import warnings
    warnings.filterwarnings("ignore", category=DataConversionWarning)
    lr = SGDRegressor()
    om.models.put(lr, 'mymodel2')
    # have Omega fit the model to get a starting point, then predict
    result = om.runtime.model('mymodel2').fit('datax', 'datay')
    result.get()
    # check the new model version metadata includes the datax/y references
    result = om.runtime.model('mymodel2').predict('datax-full')
    pred1 = result.get()
    mse = mean_squared_error(df.y, pred1)
    self.assertGreater(mse, 90)
    # fit mini-batches of better training data, updating the model
    batch_size = 2
    for i, start in enumerate(range(0, len(df))):
        previous_mse = mse
        X = df[['x']][start:start + batch_size]
        Y = df[['y']][start:start + batch_size]
        om.datasets.put(X, 'datax-update', append=False)
        om.datasets.put(Y, 'datay-update', append=False)
        result = om.runtime.model('mymodel2').partial_fit(
            'datax-update', 'datay-update')
        result.get()
        # check the new model version metadata includes the datax/y
        # references
        result = om.runtime.model('mymodel2').predict('datax-full')
        pred1 = result.get()
        mse = mean_squared_error(df.y, pred1)
        self.assertLess(mse, previous_mse)
    # mse == 0 would be a perfect fit
    self.assertLess(mse, 1.0)
def iterative_fit(self, X, y, n_iter=1, refit=False):
    from sklearn.linear_model.stochastic_gradient import SGDRegressor
    import sklearn.preprocessing

    if refit:
        self.estimator = None
        self.scaler = None

    if self.estimator is None:
        self.alpha = float(self.alpha)
        self.fit_intercept = self.fit_intercept == 'True'
        self.n_iter = int(self.n_iter)
        self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15
        self.epsilon = float(self.epsilon) if self.epsilon is not None else 0.1
        self.eta0 = float(self.eta0)
        self.power_t = float(self.power_t) if self.power_t is not None else 0.25
        self.average = self.average == 'True'

        self.estimator = SGDRegressor(loss=self.loss,
                                      penalty=self.penalty,
                                      alpha=self.alpha,
                                      fit_intercept=self.fit_intercept,
                                      n_iter=n_iter,
                                      learning_rate=self.learning_rate,
                                      l1_ratio=self.l1_ratio,
                                      epsilon=self.epsilon,
                                      eta0=self.eta0,
                                      power_t=self.power_t,
                                      shuffle=True,
                                      average=self.average,
                                      random_state=self.random_state)
        self.scaler = sklearn.preprocessing.StandardScaler(copy=True)
        self.scaler.fit(y.reshape((-1, 1)))
    else:
        self.estimator.n_iter += n_iter

    Y_scaled = self.scaler.transform(y.reshape((-1, 1))).ravel()
    self.estimator.partial_fit(X, Y_scaled)

    if self.estimator.n_iter >= self.n_iter:
        self.fully_fit_ = True

    return self
def demo(output_file=None, instances=40000):
    """ _test_regression

    This demo shows how to evaluate a regressor. The data stream used is an
    instance of the RegressionGenerator, whose samples are fed to sklearn's
    SGDRegressor.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's maximum number of instances
    """
    # Setup the File Stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    # stream = FileStream(opt, -1, 1)
    # stream = WaveformGenerator()
    # stream.prepare_for_use()
    stream = RegressionGenerator(n_samples=40000)

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = PassiveAggressiveClassifier()
    classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(pretrain_size=1, max_instances=instances,
                               batch_size=1, n_wait=1, max_time=1000,
                               output_file=output_file,
                               task_type='regression', show_plot=True,
                               plot_options=['true_vs_predicts'])

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)
def demo(output_file=None, instances=40000):
    """ _test_regression

    This demo shows how to evaluate a regressor. The data stream used is an
    instance of the RegressionGenerator, whose samples are fed to sklearn's
    SGDRegressor.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's maximum number of instances
    """
    # Setup the File Stream
    # stream = FileStream("../data/datasets/covtype.csv", -1, 1)
    # stream = WaveformGenerator()
    # stream.prepare_for_use()
    stream = RegressionGenerator(n_samples=40000)

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = PassiveAggressiveClassifier()
    classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=1, max_samples=instances,
                                    batch_size=1, n_wait=200, max_time=1000,
                                    output_file=output_file, show_plot=True,
                                    metrics=['mean_square_error'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
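# What EvaluatePrequential does under the hood is prequential
# ("test-then-train") evaluation: every sample is first used to score the
# model, then to update it. A minimal hand-rolled sketch of the idea (the
# helper name is illustrative, not skmultiflow API; stream.next_sample()
# is assumed to follow the stream API used above):
import numpy as np

def prequential_mse(stream, model, n_samples):
    errors = []
    X, y = stream.next_sample()
    model.partial_fit(X, y)  # pretrain on one sample, as pretrain_size=1 does
    for _ in range(n_samples - 1):
        X, y = stream.next_sample()
        errors.append((model.predict(X)[0] - y[0]) ** 2)  # test first ...
        model.partial_fit(X, y)                           # ... then train
    return np.mean(errors)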
def iterative_fit(self, X, y, n_iter=1, refit=False):
    if refit:
        self.estimator = None
        self.scaler = None

    if self.estimator is None:
        self.alpha = float(self.alpha)
        self.fit_intercept = self.fit_intercept == 'True'
        self.n_iter = int(self.n_iter)
        self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15
        self.epsilon = float(self.epsilon) if self.epsilon is not None else 0.1
        self.eta0 = float(self.eta0)
        self.power_t = float(self.power_t) if self.power_t is not None else 0.25
        self.average = self.average == 'True'

        self.estimator = SGDRegressor(loss=self.loss,
                                      penalty=self.penalty,
                                      alpha=self.alpha,
                                      fit_intercept=self.fit_intercept,
                                      n_iter=self.n_iter,
                                      learning_rate=self.learning_rate,
                                      l1_ratio=self.l1_ratio,
                                      epsilon=self.epsilon,
                                      eta0=self.eta0,
                                      power_t=self.power_t,
                                      shuffle=True,
                                      average=self.average,
                                      random_state=self.random_state)
        self.scaler = sklearn.preprocessing.StandardScaler(copy=True)
        self.scaler.fit(y)

    Y_scaled = self.scaler.transform(y)
    self.estimator.n_iter += n_iter
    self.estimator.fit(X, Y_scaled)
    return self
from sklearn.linear_model.stochastic_gradient import SGDRegressor

x_train = [[1, 0., 3], [1, 1., 3], [1, 2., 3], [1, 3., 2], [1, 4., 4]]
y_train = [95.364, 97.217205, 75.195834, 60.105519, 49.342380]

model = SGDRegressor(max_iter=5000000, alpha=0.00001)
# expected coefficients: [ 45.71878249 -13.02758034   1.14608487]
model.fit(x_train, y_train)
print(model.coef_)
print(model.intercept_)
"neighbor_" + str(x + 1) for x in range(regressor.n_neighbors) ]) medv = pandas.concat((medv, medv_ids), axis=1) store_csv(medv, name + ".csv") build_housing(KNeighborsRegressor(), "KNNHousing", with_kneighbors=True) build_housing( MLPRegressor(activation="tanh", hidden_layer_sizes=(26, ), algorithm="l-bfgs", random_state=13, tol=0.001, max_iter=1000), "MLPHousing") build_housing(SGDRegressor(random_state=13), "SGDHousing") build_housing(SVR(), "SVRHousing", to_sparse=True) build_housing(LinearSVR(random_state=13), "LinearSVRHousing", to_sparse=True) build_housing(NuSVR(), "NuSVRHousing", to_sparse=True) housing_df = housing_df.drop("MEDV", axis=1) housing_anomaly_columns = housing_df.columns.tolist() housing_anomaly_mapper = DataFrameMapper([ (housing_anomaly_columns, [ContinuousDomain(), MaxAbsScaler()]) ]) housing_anomaly_X = housing_anomaly_mapper.fit_transform(housing_df)
np.random.shuffle(inds)
coef[inds[n_features // 2:]] = 0  # sparsify coef
print("true coef sparsity: %f" % sparsity_ratio(coef))
y = np.dot(X, coef)

# add noise
y += 0.01 * np.random.normal(size=(n_samples,))

# Split data into train and test sets
n_samples = X.shape[0]
X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
print("test data sparsity: %f" % sparsity_ratio(X_test))

###############################################################################
clf = SGDRegressor(penalty='l1', alpha=.2, fit_intercept=True, n_iter=2000)
clf.fit(X_train, y_train)
print("model sparsity: %f" % sparsity_ratio(clf.coef_))

@profile
def benchmark_dense_predict():
    for _ in range(300):
        clf.predict(X_test)

@profile
def benchmark_sparse_predict():
    X_test_sparse = csr_matrix(X_test)
    for _ in range(300):
        clf.predict(X_test_sparse)
# #############################################################################
# Benchmark bulk/atomic prediction speed for various regressors

configuration = {
    'n_train': int(1e3),
    'n_test': int(1e2),
    'n_features': int(1e2),
    'estimators': [
        {'name': 'Linear Model',
         'instance': SGDRegressor(penalty='elasticnet', alpha=0.01,
                                  l1_ratio=0.25, tol=1e-4),
         'complexity_label': 'non-zero coefficients',
         'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)},
        {'name': 'RandomForest',
         'instance': RandomForestRegressor(),
         'complexity_label': 'estimators',
         'complexity_computer': lambda clf: clf.n_estimators},
        {'name': 'SVR',
         'instance': SVR(kernel='rbf'),
start_time = time.time()

configuration = {
    'n_train': int(1e3),
    'n_test': int(1e2),
    'n_features': int(1e2),
    'estimators': [
        {'name': "Linear Model",
         'instance': SGDRegressor(penalty='l2', alpha=0.1, tol=1e-4),
         'complexity_label': 'non-zero coefficients',
         'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)},
        {'name': 'RandomForest',
         'instance': RandomForestRegressor(),
         'complexity_label': 'estimator',
         'complexity_computer': lambda clf: clf.n_estimators},
        {'name': 'SVR',
         'instance': SVR(kernel='rbf'),
         'complexity_label': 'support vectors',
         'complexity_computer': lambda clf: len(clf.support_vectors_)}]
# #############################################################################
# Main code

start_time = time.time()

# #############################################################################
# Benchmark bulk/atomic prediction speed for various regressors

configuration = {
    'n_train': int(1e3),
    'n_test': int(1e2),
    'n_features': int(1e2),
    'estimators': [
        {'name': 'Linear Model',
         'instance': SGDRegressor(penalty='elasticnet', alpha=0.01,
                                  l1_ratio=0.25, fit_intercept=True,
                                  tol=1e-4),
         'complexity_label': 'non-zero coefficients',
         'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)},
        {'name': 'RandomForest',
         'instance': RandomForestRegressor(n_estimators=100),
         'complexity_label': 'estimators',
         'complexity_computer': lambda clf: clf.n_estimators},
        {'name': 'SVR',
         'instance': SVR(kernel='rbf'),
         'complexity_label': 'support vectors',
         'complexity_computer': lambda clf: len(clf.support_vectors_)},
    ]
}
benchmark(configuration)
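# What "bulk" vs "atomic" means in the benchmark above, as a minimal sketch
# (the original helper functions are not shown in this excerpt; these are
# illustrative stand-ins, not the benchmark's exact code):
import time

def atomic_latency(estimator, X_test):
    # predict one sample at a time and record each call's duration
    runtimes = []
    for i in range(X_test.shape[0]):
        start = time.time()
        estimator.predict(X_test[[i], :])
        runtimes.append(time.time() - start)
    return runtimes

def bulk_latency(estimator, X_test):
    # predict the whole test set in a single call
    start = time.time()
    estimator.predict(X_test)
    return time.time() - start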
def iterative_fit(self, X, y, n_iter=2, refit=False):
    from sklearn.linear_model.stochastic_gradient import SGDRegressor
    import sklearn.preprocessing

    # Need to fit at least two iterations, otherwise early stopping will not
    # work because we cannot determine whether the algorithm actually
    # converged. The only way of finding this out is if the sgd spends fewer
    # iterations than max_iter. If max_iter == 1, it has to spend at least
    # one iteration and will always spend at least one iteration, so we
    # cannot know about convergence.
    n_iter = max(n_iter, 2)

    if refit:
        self.estimator = None
        self.scaler = None

    if self.estimator is None:
        self.alpha = float(self.alpha)
        self.fit_intercept = self.fit_intercept == 'True'
        self.tol = float(self.tol)
        self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15
        self.epsilon = float(self.epsilon) if self.epsilon is not None else 0.1
        self.eta0 = float(self.eta0)
        self.power_t = float(self.power_t) if self.power_t is not None else 0.25
        self.average = self.average == 'True'

        self.estimator = SGDRegressor(loss=self.loss,
                                      penalty=self.penalty,
                                      alpha=self.alpha,
                                      fit_intercept=self.fit_intercept,
                                      max_iter=n_iter,
                                      tol=self.tol,
                                      learning_rate=self.learning_rate,
                                      l1_ratio=self.l1_ratio,
                                      epsilon=self.epsilon,
                                      eta0=self.eta0,
                                      power_t=self.power_t,
                                      shuffle=True,
                                      average=self.average,
                                      random_state=self.random_state,
                                      warm_start=True)
        self.scaler = sklearn.preprocessing.StandardScaler(copy=True)
        self.scaler.fit(y.reshape((-1, 1)))
        Y_scaled = self.scaler.transform(y.reshape((-1, 1))).ravel()
        self.estimator.fit(X, Y_scaled)
    else:
        self.estimator.max_iter += n_iter
        self.estimator.max_iter = min(self.estimator.max_iter, 1000)
        Y_scaled = self.scaler.transform(y.reshape((-1, 1))).ravel()
        self.estimator._validate_params()
        self.estimator._partial_fit(X, Y_scaled,
                                    alpha=self.estimator.alpha,
                                    C=1.0,
                                    loss=self.estimator.loss,
                                    learning_rate=self.estimator.learning_rate,
                                    max_iter=n_iter,
                                    sample_weight=None,
                                    coef_init=None,
                                    intercept_init=None)

    if self.estimator.max_iter >= 1000 or n_iter > self.estimator.n_iter_:
        self.fully_fit_ = True

    return self
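# The budget-growing scheme above, reduced to the public sklearn API alone
# (a minimal sketch, not the auto-sklearn code): with warm_start=True each
# fit() call resumes from the previous coefficients, and n_iter_ < max_iter
# signals that the tol-based stopping criterion fired before the budget ran out.
import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = X @ rng.randn(5)

est = SGDRegressor(max_iter=2, tol=1e-3, warm_start=True, random_state=0)
while True:
    est.fit(X, y)
    if est.n_iter_ < est.max_iter or est.max_iter >= 64:
        break  # converged, or iteration budget exhausted
    est.max_iter *= 2  # grow the budget and resume training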
#%% Inspect the data
# Define a plotting helper function
def plt_helper(label, title, xlabel='x axis', ylabel='y axis'):
    fig = plt.figure()
    ax = fig.add_subplot(111, label=label)
    ax.set_title(title, fontproperties=myfont)
    ax.set_xlabel(xlabel, fontproperties=myfont)
    ax.set_ylabel(ylabel, fontproperties=myfont)
    ax.grid(True)
    return ax

ax1 = plt_helper('ax1', 'Distribution of the simulated data')
ax1.plot(X[:, 0], y, 'r*')

#%%
linear_SGD = SGDRegressor(loss='squared_loss', max_iter=100)
linear_SGD.fit(train_x, train_y)
y_SGD = linear_SGD.predict(test_x)

linear_rg = LinearRegression(fit_intercept=True,  # compute the intercept
                             normalize=False,     # do not normalize the data before regression
                             copy_X=True,         # copy X so the original values are untouched
                             n_jobs=-1)           # use all CPUs
linear_rg.fit(train_x, train_y)
y_rg = linear_rg.predict(test_x)

print('True parameters of the simulated data:', coef)
print('SGDRegressor model parameters:', linear_SGD.coef_)
print('LinearRegression model parameters:', linear_rg.coef_)
    ]) + 1  # check how far that index is in the dropdown list and return that value

def average_lowest_correct(list_of_trues, list_of_preds):
    length = len(list_of_trues)  # number of data points
    return np.mean([
        lowest_correct(list(list_of_trues.iloc[i]), list(list_of_preds[i]))
        for i in range(length)
    ])

# Top four models, each wrapped in a pipeline to be used for grid search
model_1 = Pipeline([('md1', MultiOutputRegressor(Ridge()))])
model_2 = Pipeline([('md2', MultiOutputRegressor(KernelRidge()))])
model_3 = Pipeline([('md3', MultiOutputRegressor(LinearSVR()))])
model_4 = Pipeline([('md4', MultiOutputRegressor(SGDRegressor()))])

# Dictionary of all the variable hyperparameters for all four models.
# Except for the SGD regressor, the hyperparameter list is complete.
model_params = {
    'Multi_Ridge': {
        'model': model_1,
        'params': {
            'md1__estimator__normalize': [True, False],
            'md1__estimator__fit_intercept': [True, False],
            'md1__estimator__solver': ['svd', 'cholesky', 'lsqr',
                                       'sparse_cg', 'sag', 'saga'],
            'md1__estimator__alpha': [i for i in range(10, 110, 10)],
            'md1__estimator__max_iter': [1000, 2000, 3000]
        }
    },
    'Multi_KernelRidge': {
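# A sketch of how a model_params dict like the one above is typically
# consumed (the loop and the scores list are illustrative; cv=5 and the
# X_train/y_train names are assumptions, not shown in this excerpt):
from sklearn.model_selection import GridSearchCV

scores = []
for model_name, cfg in model_params.items():
    search = GridSearchCV(cfg['model'], cfg['params'], cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    scores.append({'model': model_name,
                   'best_score': search.best_score_,
                   'best_params': search.best_params_})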
print("true coef sparsity: %f" % sparsity_ratio(coef)) y = np.dot(X, coef) # add noise y += 0.01 * np.random.normal((n_samples, )) # Split data in train set and test set n_samples = X.shape[0] X_train, y_train = X[:n_samples // 2], y[:n_samples // 2] X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] print("test data sparsity: %f" % sparsity_ratio(X_test)) ############################################################################### clf = SGDRegressor(penalty='l1', alpha=.2, fit_intercept=True, max_iter=2000, tol=None) clf.fit(X_train, y_train) print("model sparsity: %f" % sparsity_ratio(clf.coef_)) def benchmark_dense_predict(): for _ in range(300): clf.predict(X_test) def benchmark_sparse_predict(): X_test_sparse = csr_matrix(X_test) for _ in range(300): clf.predict(X_test_sparse)
def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
    from sklearn.linear_model.stochastic_gradient import SGDRegressor

    # Need to fit at least two iterations, otherwise early stopping will not
    # work because we cannot determine whether the algorithm actually
    # converged. The only way of finding this out is if the sgd spends fewer
    # iterations than max_iter. If max_iter == 1, it has to spend at least
    # one iteration and will always spend at least one iteration, so we
    # cannot know about convergence.

    if refit:
        self.estimator = None

    if self.estimator is None:
        self.fully_fit_ = False

        self.alpha = float(self.alpha)
        if not check_none(self.epsilon_insensitive):
            self.epsilon_insensitive = float(self.epsilon_insensitive)
        self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15
        self.epsilon_huber = float(self.epsilon_huber) if self.epsilon_huber is not None else 0.1
        self.eta0 = float(self.eta0) if self.eta0 is not None else 0.01
        self.power_t = float(self.power_t) if self.power_t is not None else 0.5
        self.average = check_for_bool(self.average)
        self.fit_intercept = check_for_bool(self.fit_intercept)
        self.tol = float(self.tol)

        if self.loss == "huber":
            epsilon = self.epsilon_huber
        elif self.loss in ["epsilon_insensitive", "squared_epsilon_insensitive"]:
            epsilon = self.epsilon_insensitive
        else:
            epsilon = None

        self.estimator = SGDRegressor(loss=self.loss,
                                      penalty=self.penalty,
                                      alpha=self.alpha,
                                      fit_intercept=self.fit_intercept,
                                      max_iter=n_iter,
                                      tol=self.tol,
                                      learning_rate=self.learning_rate,
                                      l1_ratio=self.l1_ratio,
                                      epsilon=epsilon,
                                      eta0=self.eta0,
                                      power_t=self.power_t,
                                      shuffle=True,
                                      average=self.average,
                                      random_state=self.random_state,
                                      warm_start=True)
        self.estimator.fit(X, y, sample_weight=sample_weight)
    else:
        self.estimator.max_iter += n_iter
        self.estimator.max_iter = min(self.estimator.max_iter, 512)
        self.estimator._validate_params()
        self.estimator._partial_fit(X, y,
                                    alpha=self.estimator.alpha,
                                    C=1.0,
                                    loss=self.estimator.loss,
                                    learning_rate=self.estimator.learning_rate,
                                    max_iter=n_iter,
                                    sample_weight=sample_weight,
                                    coef_init=None,
                                    intercept_init=None)

    if self.estimator.max_iter >= 512 or n_iter > self.estimator.n_iter_:
        self.fully_fit_ = True

    return self
# Apply scaler on training and test data
standardized_X_train = X_scaler.transform(X_train)
standardized_y_train = y_scaler.transform(y_train.values.reshape(-1, 1)).ravel()
standardized_X_test = X_scaler.transform(X_test)
standardized_y_test = y_scaler.transform(y_test.values.reshape(-1, 1)).ravel()

# Check
print("mean:", np.mean(standardized_X_train, axis=0),
      np.mean(standardized_y_train, axis=0))  # mean should be ~0
print("std:", np.std(standardized_X_train, axis=0),
      np.std(standardized_y_train, axis=0))   # std should be ~1

# Initialize the model
lm = SGDRegressor(loss="squared_loss", penalty="none", max_iter=args.num_epochs)

# Train
lm.fit(X=standardized_X_train, y=standardized_y_train)

# Predictions (unstandardize them)
pred_train = (lm.predict(standardized_X_train) * np.sqrt(y_scaler.var_)) + y_scaler.mean_
pred_test = (lm.predict(standardized_X_test) * np.sqrt(y_scaler.var_)) + y_scaler.mean_

# Train and test MSE
train_mse = np.mean((y_train - pred_train) ** 2)
test_mse = np.mean((y_test - pred_test) ** 2)
print("train_MSE: {0:.2f}, test_MSE: {1:.2f}".format(train_mse, test_mse))
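# Assuming the same fitted y_scaler, the manual unstandardization above
# (pred * sqrt(var_) + mean_) is equivalent to StandardScaler's built-in
# inverse_transform, which expects a 2-D array:
pred_train_alt = y_scaler.inverse_transform(
    lm.predict(standardized_X_train).reshape(-1, 1)).ravel()
# np.allclose(pred_train, pred_train_alt) -> True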
y = scaler.fit_transform(Y)

'''
The regularized training error is given by:

    E(w, b) = 1/n * sum_i L(y_i, f(x_i)) + alpha * R(w)

where L is the loss function and R(w) is the regularization term (penalty).
For Elastic Net, R(w) is:

    R(w) = p/2 * sum_i w_i^2 + (1 - p) * sum_i |w_i|

where p is given by 1 - l1_ratio.
For the inverse scaling learning rate:

    lr = eta0 / t^power_t
'''
regr = SGDRegressor(penalty='elasticnet', alpha=0.0001, l1_ratio=0.25,
                    learning_rate='invscaling', eta0=0.01, power_t=0.25,
                    loss='epsilon_insensitive', epsilon=0.1, shuffle=True,
                    fit_intercept=True, n_iter=1000000, average=False,
                    verbose=0)
regr.fit(x, y)
data_pred = regr.predict(x)
y_pred = scaler.inverse_transform(data_pred)

print('coefficients: \n', regr.coef_)
# if the data is expected to be already centered, then intercept_ is not needed
print('intercept: \n', regr.intercept_)

# Calculate mean squared error
print('Mean Squared Error: %.4f' % mean_squared_error(y, data_pred))
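# The two formulas quoted in the docstring above, written out as plain Python
# for a quick numeric check (w is a stand-in weight vector; the parameter
# names mirror the ones passed to SGDRegressor):
import numpy as np

def invscaling_lr(eta0, t, power_t):
    # lr = eta0 / t^power_t
    return eta0 / t ** power_t

def elastic_net_penalty(w, l1_ratio):
    # R(w) = (1 - l1_ratio)/2 * sum(wi^2) + l1_ratio * sum(|wi|)
    return ((1 - l1_ratio) / 2) * np.sum(w ** 2) + l1_ratio * np.sum(np.abs(w))

print(invscaling_lr(eta0=0.01, t=100, power_t=0.25))          # ~0.00316
print(elastic_net_penalty(np.array([1.0, -2.0]), l1_ratio=0.25))  # 2.625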
from sklearn.preprocessing import StandardScaler

#%% PART I
# load ex1data1.txt - linear regression with one parameter
data1 = pd.read_csv("data/ex1data1.txt", names=["X", "y"])
x = data1.X.values[:, None]
y = data1.y.values
poly = PolynomialFeatures(1)
X = poly.fit_transform(x)

#%% use sklearn
# pick models
regr_gd = SGDRegressor(fit_intercept=False, alpha=0.00001, max_iter=10000)
regr_lr = LinearRegression(fit_intercept=False)
# feed data
regr_gd.fit(X, y)
regr_lr.fit(X, y)

#%% plot the solution found via gradient descent
ind = x.argsort(axis=0).flatten()
fig, ax = plt.subplots()  # create an empty figure
plt.plot(x, y, 'rx', label='Training data')
plt.plot(x[ind], X[ind, :].dot(regr_lr.coef_), '-k', label='lin. reg. (sklearn)')
plt.plot(x[ind], X[ind, :].dot(regr_gd.coef_), '-b', label='stoch. grad. descent (sklearn)')
ax.set_xlabel("Population of City in 10,000s")
# benchmark bulk/atomic prediction speed for various regressors
configuration = {
    'n_train': int(1e3),
    'n_test': int(1e2),
    'n_features': int(1e2),
    'estimators': [
        {'name': 'Linear Model',
         'instance': SGDRegressor(penalty='elasticnet', alpha=0.01,
                                  l1_ratio=0.25, fit_intercept=True),
         'complexity_label': 'non-zero coefficients',
         'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)},
        {'name': 'RandomForest',
         'instance': RandomForestRegressor(),
         'complexity_label': 'estimators',
         'complexity_computer': lambda clf: clf.n_estimators},
        {'name': 'SVR',
         'instance': SVR(kernel='rbf'),
def __init__(self, data):
    self.df = pd.read_csv(data)
    self.regressor = SGDRegressor(max_iter=40, tol=1e-5,
                                  learning_rate='constant', eta0=0.06)
x_scaler = StandardScaler().fit(x_train)
y_scaler = StandardScaler().fit(y_train.values.reshape(-1, 1))

# transform: apply the standardization
# Test and inference data must be standardized the same way as the training
# data, i.e. transformed with the same fitted scaler.
# Standardization formula: each value becomes (x - mean) / std
standardized_x_train = x_scaler.transform(x_train)
standardized_y_train = y_scaler.transform(y_train.values.reshape(-1, 1)).ravel()
standardized_x_test = x_scaler.transform(x_test)
standardized_y_test = y_scaler.transform(y_test.values.reshape(-1, 1)).ravel()

# loss: the loss function to use; the default is squared_loss (least-squares fit)
# penalty: the regularization penalty to use
lm = SGDRegressor(loss='squared_loss', penalty='none',
                  max_iter=args.num_epochs)  # gradient descent model
lm.fit(X=standardized_x_train, y=standardized_y_train)

# predict: run the predictions
# scaler.var_ is the variance, so np.sqrt(y_scaler.var_) is the standard
# deviation; y_scaler.scale_ is the same standard deviation.
# The final output is (standardized prediction * std) + mean, which is
# exactly the inverse of the standardization above.
pred_train = (lm.predict(standardized_x_train) * y_scaler.scale_) + y_scaler.mean_
pred_test = (lm.predict(standardized_x_test) * np.sqrt(y_scaler.var_)) + y_scaler.mean_

# Try our own data
X_infer = np.array((0, 1, 2), dtype=np.float32)
standardized_X_infer = x_scaler.transform(X_infer.reshape(-1, 1))
pred_infer = (lm.predict(standardized_X_infer) *
np.random.shuffle(inds)
coef[inds[n_features // 2:]] = 0  # sparsify coef
print("true coef sparsity: %f" % sparsity_ratio(coef))
y = np.dot(X, coef)

# add noise
y += 0.01 * np.random.normal(size=(n_samples,))

# Split data into train and test sets
n_samples = X.shape[0]
X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
print("test data sparsity: %f" % sparsity_ratio(X_test))

###############################################################################
clf = SGDRegressor(penalty='l1', alpha=.2, max_iter=2000, tol=None)
clf.fit(X_train, y_train)
print("model sparsity: %f" % sparsity_ratio(clf.coef_))

def benchmark_dense_predict():
    for _ in range(300):
        clf.predict(X_test)

def benchmark_sparse_predict():
    X_test_sparse = csr_matrix(X_test)
    for _ in range(300):
        clf.predict(X_test_sparse)
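# sparsity_ratio is defined above this excerpt in the original benchmark; a
# stand-in with the same apparent intent (fraction of non-zero entries) could be:
import numpy as np

def sparsity_ratio(X):
    return np.count_nonzero(X) / float(np.prod(X.shape))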
    'RANSACRegressor': RANSACRegressor(),
    'RBFSampler': RBFSampler(),
    'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
    'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
    'RandomForestClassifier': RandomForestClassifier(),
    'RandomForestRegressor': RandomForestRegressor(),
    'RandomizedLasso': RandomizedLasso(),
    'RandomizedLogisticRegression': RandomizedLogisticRegression(),
    'RandomizedPCA': RandomizedPCA(),
    'Ridge': Ridge(),
    'RidgeCV': RidgeCV(),
    'RidgeClassifier': RidgeClassifier(),
    'RidgeClassifierCV': RidgeClassifierCV(),
    'RobustScaler': RobustScaler(),
    'SGDClassifier': SGDClassifier(),
    'SGDRegressor': SGDRegressor(),
    'SVC': SVC(),
    'SVR': SVR(),
    'SelectFdr': SelectFdr(),
    'SelectFpr': SelectFpr(),
    'SelectFwe': SelectFwe(),
    'SelectKBest': SelectKBest(),
    'SelectPercentile': SelectPercentile(),
    'ShrunkCovariance': ShrunkCovariance(),
    'SkewedChi2Sampler': SkewedChi2Sampler(),
    'SparsePCA': SparsePCA(),
    'SparseRandomProjection': SparseRandomProjection(),
    'SpectralBiclustering': SpectralBiclustering(),
    'SpectralClustering': SpectralClustering(),
    'SpectralCoclustering': SpectralCoclustering(),
    'SpectralEmbedding': SpectralEmbedding(),
]

classifiers = [
    RandomForestRegressor(n_estimators=200, n_jobs=5, random_state=randomstate),
    ExtraTreesRegressor(n_estimators=200, n_jobs=5, random_state=randomstate),
    # GradientBoostingRegressor(random_state=randomstate),  # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    # HistGradientBoostingClassifier(random_state=randomstate),  # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    AdaBoostRegressor(n_estimators=200, random_state=randomstate),
    GaussianProcessRegressor(normalize_y=True),
    ARDRegression(),
    # HuberRegressor(),  # epsilon: greater than 1.0, default 1.35
    LinearRegression(n_jobs=5),
    PassiveAggressiveRegressor(random_state=randomstate),  # C: 0.25, 0.5, 1, 5, 10
    SGDRegressor(random_state=randomstate),
    TheilSenRegressor(n_jobs=5, random_state=randomstate),
    RANSACRegressor(random_state=randomstate),
    KNeighborsRegressor(weights='distance'),  # n_neighbors: 3, 6, 9, 12, 15, 20
    RadiusNeighborsRegressor(weights='distance'),  # radius: 1, 2, 5, 10, 15
    MLPRegressor(max_iter=10000000, random_state=randomstate),
    DecisionTreeRegressor(random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    ExtraTreeRegressor(random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    SVR()  # C: 0.25, 0.5, 1, 5, 10
]

selectors = [
    reliefF.reliefF,
    fisher_score.fisher_score,
    (GradientBoostingClassifier(max_depth=10, n_estimators=10),
     ['predict_proba', 'predict'], create_weird_classification_problem_1()),
    (LogisticRegression(),
     ['predict_proba', 'predict'], create_weird_classification_problem_1()),
    (IsotonicRegression(out_of_bounds='clip'),
     ['predict'], create_isotonic_regression_problem_1()),
    (Earth(), ['predict', 'transform'], create_regression_problem_1()),
    (Earth(allow_missing=True),
     ['predict', 'transform'], create_regression_problem_with_missingness_1()),
    (ElasticNet(), ['predict'], create_regression_problem_1()),
    (ElasticNetCV(), ['predict'], create_regression_problem_1()),
    (LassoCV(), ['predict'], create_regression_problem_1()),
    (Ridge(), ['predict'], create_regression_problem_1()),
    (RidgeCV(), ['predict'], create_regression_problem_1()),
    (SGDRegressor(), ['predict'], create_regression_problem_1()),
    (Lasso(), ['predict'], create_regression_problem_1()),
    (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]),
     ['predict', 'predict_proba'], create_weird_classification_problem_1()),
    (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))],
                  transformer_weights={'earth': 1, 'earth2': 2}),
     ['transform'], create_weird_classification_problem_1()),
    (RandomForestRegressor(), ['predict'], create_regression_problem_1()),
    (CalibratedClassifierCV(LogisticRegression(), 'isotonic'),
     ['predict_proba'], create_weird_classification_problem_1()),
    (AdaBoostRegressor(), ['predict'], create_regression_problem_1()),
    (BaggingRegressor(), ['predict'], create_regression_problem_1()),
    (BaggingClassifier(), ['predict_proba'],