Example #1
 def train(self, epoch_count=40, learning_rate=0.06):
     self.regressor = SGDRegressor(max_iter=epoch_count,
                                   tol=1e-5,
                                   learning_rate='constant',
                                   eta0=learning_rate)
     # Run the training by calling the library's fit method
     self.regressor.fit(self.X_train, self.Y_train)
Example #2
    def __init__(
            self,
            timeseries,
            dataname,
            # parameters for SGD regression
            loss="squared_loss",
            penalty="l2",
            alpha=0.0001,
            fit_intercept=True,
            tol=1e-3,
            learning_rate="invscaling",
            l1_ratio=0.15,
            epsilon=0.1,
            max_iter=1000,
            eta0=0.01,
            power_t=0.5,
            average=False,
            # feature extraction parameters
            Window_size=20,
            Difference=False,
            time_feature=True,
            tsfresh_feature=True,
            forecasting_steps=25,
            n_splits=5,
            max_train_size=None,
            NAN_threshold=0.05):

        self.loss = loss
        self.penalty = penalty
        self.alpha = float(alpha)
        self.fit_intercept = fit_intercept
        self.tol = float(tol)
        self.learning_rate = learning_rate
        self.l1_ratio = float(l1_ratio) if l1_ratio is not None else 0.15
        self.epsilon = float(epsilon) if epsilon is not None else 0.1
        self.max_iter = int(max_iter)
        self.eta0 = float(eta0)
        self.power_t = float(power_t) if power_t is not None else 0.25
        self.average = average

        self.estimator = SGDRegressor(loss=self.loss,
                                      penalty=self.penalty,
                                      alpha=self.alpha,
                                      fit_intercept=self.fit_intercept,
                                      max_iter=self.max_iter,
                                      tol=self.tol,
                                      learning_rate=self.learning_rate,
                                      l1_ratio=self.l1_ratio,
                                      epsilon=self.epsilon,
                                      eta0=self.eta0,
                                      power_t=self.power_t,
                                      average=self.average,
                                      warm_start=True)

        super().__init__(timeseries, dataname, Window_size, time_feature,
                         Difference, tsfresh_feature, forecasting_steps,
                         n_splits, max_train_size, NAN_threshold)
Example #3
 def test_partial_fit(self):
     # define an online pipeline
     piple = OnlinePipeline([
         ('scale', StandardScaler()),
         ('clf',
          SGDRegressor(random_state=5,
                       shuffle=False,
                       verbose=True,
                       max_iter=10)),
     ])
     # define an offline pipeline
     pipl = Pipeline([
         ('scale', StandardScaler()),
         ('clf',
          SGDRegressor(random_state=5,
                       shuffle=False,
                       verbose=True,
                       max_iter=20)),
     ])
     # generate some data
     X, y = make_regression(100, 100, random_state=42)
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         test_size=.33,
                                                         random_state=42)
     # fit, predict in an online manner
     for i in range(100):
         piple.partial_fit(X_train[0:30], y_train[0:30])
         piple.partial_fit(X_train[30:], y_train[30:])
     ye = piple.predict(X_test)
     # fit, predict in offline manner
     pipl.fit(X_train, y_train)
     yh = pipl.predict(X_test)
     # compare results
     r2_offline = r2_score(y_test, yh)
     r2_online = r2_score(y_test, ye)
     # use a relatively high tolerance due to differences between the offline and online results
     np.testing.assert_allclose(r2_offline, r2_online, atol=0.1)
Example #4
 def test_partial_fit(self):
     # create some data
     x = np.array(list(range(0, 10)))
     y = x * 2
     df = pd.DataFrame({'x': x,
                        'y': y})
     X = df[['x']][0:2]
     Y = df[['y']][0:2]
     # put into Omega
     os.environ['DJANGO_SETTINGS_MODULE'] = ''
     om = Omega()
     om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
     om.datasets.put(df[['x']], 'datax-full')
     om.datasets.put(X, 'datax')
     om.datasets.put(Y, 'datay')
     om.datasets.get('datax')
     om.datasets.get('datay')
     # create a model locally, store (unfitted) in Omega
     # -- ignore warnings on y shape
     import warnings
     warnings.filterwarnings("ignore", category=DataConversionWarning)
     lr = SGDRegressor()
     om.models.put(lr, 'mymodel2')
     # have Omega fit the model to get a start, then predict
     result = om.runtime.model('mymodel2').fit('datax', 'datay')
     result.get()
     # check the new model version metadata includes the datax/y references
     result = om.runtime.model('mymodel2').predict('datax-full')
     pred1 = result.get()
     mse = mean_squared_error(df.y, pred1)
     self.assertGreater(mse, 90)
     # fit mini-batches to add more training data and update the model
     batch_size = 2
     for start in range(0, len(df)):
         previous_mse = mse
         X = df[['x']][start:start + batch_size]
         Y = df[['y']][start:start + batch_size]
         om.datasets.put(X, 'datax-update', append=False)
         om.datasets.put(Y, 'datay-update', append=False)
         result = om.runtime.model('mymodel2').partial_fit(
             'datax-update', 'datay-update')
         result.get()
         # check the new model version metadata includes the datax/y
         # references
         result = om.runtime.model('mymodel2').predict('datax-full')
         pred1 = result.get()
         mse = mean_squared_error(df.y, pred1)
         self.assertLess(mse, previous_mse)
     # mse == 0 would be a perfect fit
     self.assertLess(mse, 1.0)
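The same mini-batch pattern can be reproduced without the Omega runtime; a minimal sketch using plain scikit-learn partial_fit on the toy data from above (all names here are illustrative, not part of the example):

import numpy as np
from sklearn.linear_model import SGDRegressor

x = np.arange(10, dtype=float).reshape(-1, 1)
y = 2 * x.ravel()

reg = SGDRegressor()
for _ in range(100):  # several passes so the toy model actually converges
    for start in range(0, len(x), 2):
        # each call updates the model with one mini-batch of two rows
        reg.partial_fit(x[start:start + 2], y[start:start + 2])
print(reg.predict(x[:3]))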
Example #5
    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.linear_model.stochastic_gradient import SGDRegressor
        import sklearn.preprocessing

        if refit:
            self.estimator = None
            self.scaler = None

        if self.estimator is None:

            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDRegressor(loss=self.loss,
                                          penalty=self.penalty,
                                          alpha=self.alpha,
                                          fit_intercept=self.fit_intercept,
                                          n_iter=n_iter,
                                          learning_rate=self.learning_rate,
                                          l1_ratio=self.l1_ratio,
                                          epsilon=self.epsilon,
                                          eta0=self.eta0,
                                          power_t=self.power_t,
                                          shuffle=True,
                                          average=self.average,
                                          random_state=self.random_state)

            self.scaler = sklearn.preprocessing.StandardScaler(copy=True)
            self.scaler.fit(y.reshape((-1, 1)))
        else:
            self.estimator.n_iter += n_iter

        Y_scaled = self.scaler.transform(y.reshape((-1, 1))).ravel()
        self.estimator.partial_fit(X, Y_scaled)

        if self.estimator.n_iter >= self.n_iter:
            self.fully_fit_ = True

        return self
Example #6
def demo(output_file=None, instances=40000):
    """ _test_regression

    This demo demonstrates how to evaluate a regressor. The data stream used 
    is an instance of the RegressionGenerator, which feeds an instance from 
    sklearn's SGDRegressor.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    #opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    #stream = FileStream(opt, -1, 1)
    #stream = WaveformGenerator()
    #stream.prepare_for_use()
    stream = RegressionGenerator(n_samples=40000)
    # Setup the classifier
    #classifier = SGDClassifier()
    #classifier = PassiveAggressiveClassifier()
    classifier = SGDRegressor()
    #classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=1,
                                    max_instances=instances,
                                    batch_size=1,
                                    n_wait=1,
                                    max_time=1000,
                                    output_file=output_file,
                                    task_type='regression',
                                    show_plot=True,
                                    plot_options=['true_vs_predicts'])

    # Evaluate
    evaluator.eval(stream=stream, classifier=pipe)
Example #7
def demo(output_file=None, instances=40000):
    """ _test_regression

    This demo demonstrates how to evaluate a regressor. The data stream used 
    is an instance of the RegressionGenerator, which feeds an instance from 
    sklearn's SGDRegressor.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    # stream = FileStream("../data/datasets/covtype.csv", -1, 1)
    # stream = WaveformGenerator()
    # stream.prepare_for_use()
    stream = RegressionGenerator(n_samples=40000)
    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = PassiveAggressiveClassifier()
    classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=1,
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    output_file=output_file,
                                    show_plot=True,
                                    metrics=['mean_square_error'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
Example #8
    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None
            self.scaler = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDRegressor(loss=self.loss,
                                          penalty=self.penalty,
                                          alpha=self.alpha,
                                          fit_intercept=self.fit_intercept,
                                          n_iter=self.n_iter,
                                          learning_rate=self.learning_rate,
                                          l1_ratio=self.l1_ratio,
                                          epsilon=self.epsilon,
                                          eta0=self.eta0,
                                          power_t=self.power_t,
                                          shuffle=True,
                                          average=self.average,
                                          random_state=self.random_state)

            self.scaler = sklearn.preprocessing.StandardScaler(copy=True)
            self.scaler.fit(y)

        Y_scaled = self.scaler.transform(y)

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, Y_scaled)
        return self
Example #9
from sklearn.linear_model.stochastic_gradient import SGDRegressor
x_train = [[1, 0., 3], [1, 1., 3], [1, 2., 3], [1, 3., 2], [1, 4., 4]]
y_train = [95.364, 97.217205, 75.195834, 60.105519, 49.342380]
model = SGDRegressor(max_iter=5000000,
                     alpha=0.00001)  # expected coef_: [ 45.71878249 -13.02758034   1.14608487]
model.fit(x_train, y_train)
print(model.coef_)
print(model.intercept_)
Example #10
                                 "neighbor_" + str(x + 1)
                                 for x in range(regressor.n_neighbors)
                             ])
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")


build_housing(KNeighborsRegressor(), "KNNHousing", with_kneighbors=True)
build_housing(
    MLPRegressor(activation="tanh",
                 hidden_layer_sizes=(26, ),
                 algorithm="l-bfgs",
                 random_state=13,
                 tol=0.001,
                 max_iter=1000), "MLPHousing")
build_housing(SGDRegressor(random_state=13), "SGDHousing")
build_housing(SVR(), "SVRHousing", to_sparse=True)
build_housing(LinearSVR(random_state=13), "LinearSVRHousing", to_sparse=True)
build_housing(NuSVR(), "NuSVRHousing", to_sparse=True)

housing_df = housing_df.drop("MEDV", axis=1)

housing_anomaly_columns = housing_df.columns.tolist()

housing_anomaly_mapper = DataFrameMapper([
    (housing_anomaly_columns, [ContinuousDomain(),
                               MaxAbsScaler()])
])

housing_anomaly_X = housing_anomaly_mapper.fit_transform(housing_df)
Example #11
np.random.shuffle(inds)
coef[inds[n_features // 2:]] = 0  # sparsify coef
print("true coef sparsity: %f" % sparsity_ratio(coef))
y = np.dot(X, coef)

# add noise
y += 0.01 * np.random.normal(size=(n_samples, ))

# Split data in train set and test set
n_samples = X.shape[0]
X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
print("test data sparsity: %f" % sparsity_ratio(X_test))

###############################################################################
clf = SGDRegressor(penalty='l1', alpha=.2, fit_intercept=True, n_iter=2000)
clf.fit(X_train, y_train)
print("model sparsity: %f" % sparsity_ratio(clf.coef_))


@profile
def benchmark_dense_predict():
    for _ in range(300):
        clf.predict(X_test)


@profile
def benchmark_sparse_predict():
    X_test_sparse = csr_matrix(X_test)
    for _ in range(300):
        clf.predict(X_test_sparse)
Example #12
# #############################################################################
# Benchmark bulk/atomic prediction speed for various regressors
configuration = {
    'n_train': int(1e3),
    'n_test': int(1e2),
    'n_features': int(1e2),
    'estimators': [
        {
            'name': 'Linear Model',
            'instance': SGDRegressor(penalty='elasticnet',
                                     alpha=0.01,
                                     l1_ratio=0.25,
                                     tol=1e-4),
            'complexity_label': 'non-zero coefficients',
            'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)
        },
        {
            'name': 'RandomForest',
            'instance': RandomForestRegressor(),
            'complexity_label': 'estimators',
            'complexity_computer': lambda clf: clf.n_estimators
        },
        {
            'name': 'SVR',
            'instance': SVR(kernel='rbf'),
Example #13

start_time = time.time()

configuration = {
    'n_train': int(1e3),
    'n_test': int(1e2),
    'n_features': int(1e2),
    'estimators': [{
        'name': "Linear Model",
        'instance': SGDRegressor(penalty='l2', alpha=0.1, tol=1e-4),
        'complexity_label': 'non-zero coefficients',
        'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)
    }, {
        'name': 'RandomForest',
        'instance': RandomForestRegressor(),
        'complexity_label': 'estimator',
        'complexity_computer': lambda clf: clf.n_estimators
    }, {
        'name': 'SVR',
        'instance': SVR(kernel='rbf'),
        'complexity_label': 'support vectors',
        'complexity_computer': lambda clf: len(clf.support_vectors_)
    }]
Example #14
# #############################################################################
# Main code

start_time = time.time()

# #############################################################################
# Benchmark bulk/atomic prediction speed for various regressors
configuration = {
    'n_train': int(1e3),
    'n_test': int(1e2),
    'n_features': int(1e2),
    'estimators': [
        {'name': 'Linear Model',
         'instance': SGDRegressor(penalty='elasticnet', alpha=0.01,
                                  l1_ratio=0.25, fit_intercept=True,
                                  tol=1e-4),
         'complexity_label': 'non-zero coefficients',
         'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)},
        {'name': 'RandomForest',
         'instance': RandomForestRegressor(n_estimators=100),
         'complexity_label': 'estimators',
         'complexity_computer': lambda clf: clf.n_estimators},
        {'name': 'SVR',
         'instance': SVR(kernel='rbf'),
         'complexity_label': 'support vectors',
         'complexity_computer': lambda clf: len(clf.support_vectors_)},
    ]
}
benchmark(configuration)
Example #15
    def iterative_fit(self, X, y, n_iter=2, refit=False):
        from sklearn.linear_model.stochastic_gradient import SGDRegressor
        import sklearn.preprocessing

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.
        n_iter = max(n_iter, 2)

        if refit:
            self.estimator = None
            self.scaler = None

        if self.estimator is None:

            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.tol = float(self.tol)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDRegressor(loss=self.loss,
                                          penalty=self.penalty,
                                          alpha=self.alpha,
                                          fit_intercept=self.fit_intercept,
                                          max_iter=n_iter,
                                          tol=self.tol,
                                          learning_rate=self.learning_rate,
                                          l1_ratio=self.l1_ratio,
                                          epsilon=self.epsilon,
                                          eta0=self.eta0,
                                          power_t=self.power_t,
                                          shuffle=True,
                                          average=self.average,
                                          random_state=self.random_state,
                                          warm_start=True)

            self.scaler = sklearn.preprocessing.StandardScaler(copy=True)
            self.scaler.fit(y.reshape((-1, 1)))
            Y_scaled = self.scaler.transform(y.reshape((-1, 1))).ravel()
            self.estimator.fit(X, Y_scaled)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 1000)
            Y_scaled = self.scaler.transform(y.reshape((-1, 1))).ravel()
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, Y_scaled,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator.max_iter >= 1000 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self
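The comment above motivates the convergence check at the end of iterative_fit: with warm_start and incremental max_iter, SGD has converged when it spends fewer iterations than it was allowed. A minimal standalone sketch of that check (assuming a recent scikit-learn, where n_iter_ is set after fit; the data here is made up for illustration):

import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = X.dot(rng.randn(5))

est = SGDRegressor(max_iter=100, tol=1e-3, random_state=0)
est.fit(X, y)
# n_iter_ < max_iter means tol-based early stopping kicked in, i.e. convergence
print(est.n_iter_, est.n_iter_ < est.max_iter)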
Example #16
#%% Inspect the data
# define a plotting helper
def plt_helper(label, title, xlabel='x axis', ylabel='y axis'):
    fig = plt.figure()
    ax = fig.add_subplot(111, label=label)
    ax.set_title(title, fontproperties=myfont)
    ax.set_xlabel(xlabel, fontproperties=myfont)
    ax.set_ylabel(ylabel, fontproperties=myfont)
    ax.grid(True)
    return ax


ax1 = plt_helper('ax1', 'Distribution of the simulated data')
ax1.plot(X[:, 0], y, 'r*')
#%%
linear_SGD = SGDRegressor(loss='squared_loss', max_iter=100)
linear_SGD.fit(train_x, train_y)
y_SGD = linear_SGD.predict(test_x)

linear_rg = LinearRegression(
    fit_intercept=True,  # compute the intercept
    normalize=False,  # do not normalize the data before regression
    copy_X=True,  # copy X so the original values are untouched
    n_jobs=-1)  # use all CPUs
linear_rg.fit(train_x, train_y)
y_rg = linear_rg.predict(test_x)

print('True coefficients', coef)
print('SGDRegressor coefficients', linear_SGD.coef_)
print('LinearRegression coefficients', linear_rg.coef_)
Example #17
    ]) + 1  # check how far that index is in the dropdown list and return that value


def average_lowest_correct(list_of_trues, list_of_preds):
    length = len(list_of_trues)  # number of data points
    return np.mean([
        lowest_correct(list(list_of_trues.iloc[i]), list(list_of_preds[i]))
        for i in range(length)
    ])


# Top four models, each wrapped in a pipeline to be used for grid search
model_1 = Pipeline([('md1', MultiOutputRegressor(Ridge()))])
model_2 = Pipeline([('md2', MultiOutputRegressor(KernelRidge()))])
model_3 = Pipeline([('md3', MultiOutputRegressor(LinearSVR()))])
model_4 = Pipeline([('md4', MultiOutputRegressor(SGDRegressor()))])

# Dictionary of the tunable hyperparameters for all four models. Except for the SGD regressor, the hyperparameter list is complete.
model_params = {
    'Multi_Ridge': {
        'model': model_1,
        'params': {
            'md1__estimator__normalize': [True, False],
            'md1__estimator__fit_intercept': [True, False],
            'md1__estimator__solver':
            ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
            'md1__estimator__alpha': [i for i in range(10, 110, 10)],
            'md1__estimator__max_iter': [1000, 2000, 3000]
        }
    },
    'Multi_KernelRidge': {
Example #18
print("true coef sparsity: %f" % sparsity_ratio(coef))
y = np.dot(X, coef)

# add noise
y += 0.01 * np.random.normal(size=(n_samples, ))

# Split data in train set and test set
n_samples = X.shape[0]
X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
print("test data sparsity: %f" % sparsity_ratio(X_test))

###############################################################################
clf = SGDRegressor(penalty='l1',
                   alpha=.2,
                   fit_intercept=True,
                   max_iter=2000,
                   tol=None)
clf.fit(X_train, y_train)
print("model sparsity: %f" % sparsity_ratio(clf.coef_))


def benchmark_dense_predict():
    for _ in range(300):
        clf.predict(X_test)


def benchmark_sparse_predict():
    X_test_sparse = csr_matrix(X_test)
    for _ in range(300):
        clf.predict(X_test_sparse)
Example #19
    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDRegressor

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            if not check_none(self.epsilon_insensitive):
                self.epsilon_insensitive = float(self.epsilon_insensitive)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon_huber = float(self.epsilon_huber) if self.epsilon_huber is not None \
                else 0.1
            self.eta0 = float(self.eta0) if self.eta0 is not None else 0.01
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            if self.loss == "huber":
                epsilon = self.epsilon_huber
            elif self.loss in [
                    "epsilon_insensitive", "squared_epsilon_insensitive"
            ]:
                epsilon = self.epsilon_insensitive
            else:
                epsilon = None
            self.estimator = SGDRegressor(loss=self.loss,
                                          penalty=self.penalty,
                                          alpha=self.alpha,
                                          fit_intercept=self.fit_intercept,
                                          max_iter=n_iter,
                                          tol=self.tol,
                                          learning_rate=self.learning_rate,
                                          l1_ratio=self.l1_ratio,
                                          epsilon=epsilon,
                                          eta0=self.eta0,
                                          power_t=self.power_t,
                                          shuffle=True,
                                          average=self.average,
                                          random_state=self.random_state,
                                          warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X,
                y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                coef_init=None,
                intercept_init=None)

        if self.estimator.max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self
Example #20
# Apply the scaler to the training and test data
standardized_X_train = X_scaler.transform(X_train)
standardized_y_train = y_scaler.transform(y_train.values.reshape(-1,
                                                                 1)).ravel()
standardized_X_test = X_scaler.transform(X_test)
standardized_y_test = y_scaler.transform(y_test.values.reshape(-1, 1)).ravel()

# Check
print("mean:", np.mean(standardized_X_train, axis=0),
      np.mean(standardized_y_train, axis=0))  # mean should be ~0
print("std:", np.std(standardized_X_train, axis=0),
      np.std(standardized_y_train, axis=0))  # std should be 1

# Initialize the model
lm = SGDRegressor(loss="squared_loss",
                  penalty="none",
                  max_iter=args.num_epochs)

# Train
lm.fit(X=standardized_X_train, y=standardized_y_train)

# Predictions (unstandardize them)
pred_train = (lm.predict(standardized_X_train) *
              np.sqrt(y_scaler.var_)) + y_scaler.mean_
pred_test = (lm.predict(standardized_X_test) *
             np.sqrt(y_scaler.var_)) + y_scaler.mean_

# Train and test MSE
train_mse = np.mean((y_train - pred_train)**2)
test_mse = np.mean((y_test - pred_test)**2)
print("train_MSE: {0:.2f}, test_MSE: {1:.2f}".format(train_mse, test_mse))
Example #21
y = scaler.fit_transform(Y)

'''
regularized training error:
E(w, b) = 1/n * sum(L(yi, f(xi))) + alpha * R(w)
Note: L is the loss function, R(w) is the regularization term (penalty)

For Elastic Net:
R(w) = p/2 * sum(wi^2) + (1 - p) * sum(|wi|) where p is given by 1 - l1_ratio

For the invscaling learning_rate:
lr = eta0 / t^power_t

'''
regr = SGDRegressor(penalty='elasticnet', alpha=0.0001, l1_ratio=0.25,
                    learning_rate='invscaling', eta0=0.01, power_t=0.25,
                    loss='epsilon_insensitive', epsilon=0.1, shuffle=True,
                    fit_intercept=True, n_iter=1000000, average=False, verbose=0)

regr.fit(x, y)
data_pred = regr.predict(x)
y_pred = scaler.inverse_transform(data_pred)

print('coefficients: \n', regr.coef_)

# if the data is expected to be already centered, intercept_ is not needed
print('intercept: \n', regr.intercept_)

#Calculate mean squared error
print('Mean Squared Error: %.4f' 
      % mean_squared_error(y, data_pred))
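The inverse-scaling schedule quoted in the docstring can be checked numerically; a small sketch using the same eta0 and power_t as the call above (the values printed are purely illustrative):

eta0, power_t = 0.01, 0.25
for t in (1, 10, 100, 1000):
    # lr = eta0 / t^power_t decays slowly for small power_t
    print(t, eta0 / t ** power_t)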
Example #22
from sklearn.preprocessing import StandardScaler

#%% PART I
# load ex1data1.txt - linear regression with one parameter

data1 = pd.read_csv("data/ex1data1.txt", names=["X", "y"])
x = data1.X.values[:, None]
y = data1.y.values

poly = PolynomialFeatures(1)
X = poly.fit_transform(x)

#%% use sklearn

# pick models
regr_gd = SGDRegressor(fit_intercept=False, alpha=0.00001, max_iter=10000)
regr_lr = LinearRegression(fit_intercept=False)

# feed data
regr_gd.fit(X, y)
regr_lr.fit(X, y)

#%% plot the solution via Gradient Descent

ind = x.argsort(axis=0).flatten()

fig, ax = plt.subplots()  # create an empty figure
plt.plot(x, y, 'rx', label='Training data')
plt.plot(x[ind], X[ind, :].dot(regr_lr.coef_), '-k', label='lin. reg. (sklearn)')
plt.plot(x[ind], X[ind, :].dot(regr_gd.coef_), '-b', label='stoch. grad. descent (sklearn)')
ax.set_xlabel("Population of City in 10,000s")
Example #23
# benchmark bulk/atomic prediction speed for various regressors
configuration = {
    'n_train': int(1e3),
    'n_test': int(1e2),
    'n_features': int(1e2),
    'estimators': [
        {
            'name': 'Linear Model',
            'instance': SGDRegressor(penalty='elasticnet',
                                     alpha=0.01,
                                     l1_ratio=0.25,
                                     fit_intercept=True),
            'complexity_label': 'non-zero coefficients',
            'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)
        },
        {
            'name': 'RandomForest',
            'instance': RandomForestRegressor(),
            'complexity_label': 'estimators',
            'complexity_computer': lambda clf: clf.n_estimators
        },
        {
            'name': 'SVR',
            'instance': SVR(kernel='rbf'),
Example #24
 def __init__(self, data):
     self.df = pd.read_csv(data)
     self.regressor = SGDRegressor(max_iter=40,
                                   tol=1e-5,
                                   learning_rate='constant',
                                   eta0=0.06)
Example #25
x_scaler = StandardScaler().fit(x_train)
y_scaler = StandardScaler().fit(y_train.values.reshape(-1, 1))

# transform: apply the standardization
# test and inference data must be standardized exactly like the training data, i.e. transformed with the same scaler
# standardization formula: each value becomes (x - mean) / std
standardized_x_train = x_scaler.transform(x_train)
standardized_y_train = y_scaler.transform(y_train.values.reshape(-1,
                                                                 1)).ravel()
standardized_x_test = x_scaler.transform(x_test)
standardized_y_test = y_scaler.transform(y_test.values.reshape(-1, 1)).ravel()

# loss: the loss function to use; the default is squared_loss (least-squares fit)
# penalty: the regularization term to use
lm = SGDRegressor(loss='squared_loss',
                  penalty='none',
                  max_iter=args.num_epochs)
# fit the gradient-descent model
lm.fit(X=standardized_x_train, y=standardized_y_train)
# predict: run predictions
# scaler.var_ is the variance, so np.sqrt(y_scaler.var_) is the standard deviation; y_scaler.scale_ is the same standard deviation
# the actual output is (standardized prediction * std) + mean, the exact inverse of the standardization
pred_train = (lm.predict(standardized_x_train) *
              y_scaler.scale_) + y_scaler.mean_
pred_test = (lm.predict(standardized_x_test) *
             np.sqrt(y_scaler.var_)) + y_scaler.mean_

# try our own data
X_infer = np.array((0, 1, 2), dtype=np.float32)
standardized_X_infer = x_scaler.transform(X_infer.reshape(-1, 1))
pred_infer = (lm.predict(standardized_X_infer) *
Example #26
np.random.shuffle(inds)
coef[inds[n_features // 2:]] = 0  # sparsify coef
print("true coef sparsity: %f" % sparsity_ratio(coef))
y = np.dot(X, coef)

# add noise
y += 0.01 * np.random.normal(size=(n_samples, ))

# Split data in train set and test set
n_samples = X.shape[0]
X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
print("test data sparsity: %f" % sparsity_ratio(X_test))

###############################################################################
clf = SGDRegressor(penalty='l1', alpha=.2, max_iter=2000, tol=None)
clf.fit(X_train, y_train)
print("model sparsity: %f" % sparsity_ratio(clf.coef_))


def benchmark_dense_predict():
    for _ in range(300):
        clf.predict(X_test)


def benchmark_sparse_predict():
    X_test_sparse = csr_matrix(X_test)
    for _ in range(300):
        clf.predict(X_test_sparse)

Example #27
			'RANSACRegressor':RANSACRegressor(),
			'RBFSampler':RBFSampler(),
			'RadiusNeighborsClassifier':RadiusNeighborsClassifier(),
			'RadiusNeighborsRegressor':RadiusNeighborsRegressor(),
			'RandomForestClassifier':RandomForestClassifier(),
			'RandomForestRegressor':RandomForestRegressor(),
			'RandomizedLasso':RandomizedLasso(),
			'RandomizedLogisticRegression':RandomizedLogisticRegression(),
			'RandomizedPCA':RandomizedPCA(),
			'Ridge':Ridge(),
			'RidgeCV':RidgeCV(),
			'RidgeClassifier':RidgeClassifier(),
			'RidgeClassifierCV':RidgeClassifierCV(),
			'RobustScaler':RobustScaler(),
			'SGDClassifier':SGDClassifier(),
			'SGDRegressor':SGDRegressor(),
			'SVC':SVC(),
			'SVR':SVR(),
			'SelectFdr':SelectFdr(),
			'SelectFpr':SelectFpr(),
			'SelectFwe':SelectFwe(),
			'SelectKBest':SelectKBest(),
			'SelectPercentile':SelectPercentile(),
			'ShrunkCovariance':ShrunkCovariance(),
			'SkewedChi2Sampler':SkewedChi2Sampler(),
			'SparsePCA':SparsePCA(),
			'SparseRandomProjection':SparseRandomProjection(),
			'SpectralBiclustering':SpectralBiclustering(),
			'SpectralClustering':SpectralClustering(),
			'SpectralCoclustering':SpectralCoclustering(),
			'SpectralEmbedding':SpectralEmbedding(),
Example #28
]

classifiers = [
    RandomForestRegressor(n_estimators=200, n_jobs=5,
                          random_state=randomstate),
    ExtraTreesRegressor(n_estimators=200, n_jobs=5, random_state=randomstate),
    # GradientBoostingRegressor(random_state=randomstate),    # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    # HistGradientBoostingClassifier(random_state=randomstate),    # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    AdaBoostRegressor(n_estimators=200, random_state=randomstate),
    GaussianProcessRegressor(normalize_y=True),
    ARDRegression(),
    # HuberRegressor(),   # epsilon:  greater than 1.0, default 1.35
    LinearRegression(n_jobs=5),
    PassiveAggressiveRegressor(
        random_state=randomstate),  # C: 0.25, 0.5, 1, 5, 10
    SGDRegressor(random_state=randomstate),
    TheilSenRegressor(n_jobs=5, random_state=randomstate),
    RANSACRegressor(random_state=randomstate),
    KNeighborsRegressor(
        weights='distance'),  # n_neighbors: 3, 6, 9, 12, 15, 20
    RadiusNeighborsRegressor(weights='distance'),  # radius: 1, 2, 5, 10, 15
    MLPRegressor(max_iter=10000000, random_state=randomstate),
    DecisionTreeRegressor(
        random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    ExtraTreeRegressor(random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    SVR()  # C: 0.25, 0.5, 1, 5, 10
]

selectors = [
    reliefF.reliefF,
    fisher_score.fisher_score,
Example #29
 (GradientBoostingClassifier(max_depth=10,
                             n_estimators=10), ['predict_proba', 'predict'],
  create_weird_classification_problem_1()),
 (LogisticRegression(), ['predict_proba', 'predict'],
  create_weird_classification_problem_1()),
 (IsotonicRegression(out_of_bounds='clip'), ['predict'],
  create_isotonic_regression_problem_1()),
 (Earth(), ['predict', 'transform'], create_regression_problem_1()),
 (Earth(allow_missing=True), ['predict', 'transform'],
  create_regression_problem_with_missingness_1()),
 (ElasticNet(), ['predict'], create_regression_problem_1()),
 (ElasticNetCV(), ['predict'], create_regression_problem_1()),
 (LassoCV(), ['predict'], create_regression_problem_1()),
 (Ridge(), ['predict'], create_regression_problem_1()),
 (RidgeCV(), ['predict'], create_regression_problem_1()),
 (SGDRegressor(), ['predict'], create_regression_problem_1()),
 (Lasso(), ['predict'], create_regression_problem_1()),
 (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]),
  ['predict', 'predict_proba'], create_weird_classification_problem_1()),
 (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))],
               transformer_weights={
                   'earth': 1,
                   'earth2': 2
               }), ['transform'], create_weird_classification_problem_1()),
 (RandomForestRegressor(), ['predict'], create_regression_problem_1()),
 (CalibratedClassifierCV(LogisticRegression(),
                         'isotonic'), ['predict_proba'],
  create_weird_classification_problem_1()),
 (AdaBoostRegressor(), ['predict'], create_regression_problem_1()),
 (BaggingRegressor(), ['predict'], create_regression_problem_1()),
 (BaggingClassifier(), ['predict_proba'],