Example #1
# NOTE: several constructor arguments below (n_iter, normalize, presort,
# min_impurity_split, loss='ls', criterion='mse') follow the older
# scikit-learn (<0.22) API.
import warnings

from sklearn import ensemble, gaussian_process, kernel_ridge, neighbors
from sklearn.ensemble import AdaBoostRegressor
from sklearn.gaussian_process.kernels import RBF, Matern
from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


def dict_method_reg():
    warnings.filterwarnings("ignore")
    kernel = 1.0 * RBF(1.0)
    kernel2 = Matern(nu=1.5)
    kernel3 = Matern(nu=0.5)
    kernel4 = Matern(nu=2)
    kernel5 = Matern(length_scale=2, nu=1.5)
    kernel6 = Matern(length_scale=2, nu=0.5)
    kernel7 = Matern(length_scale=2, nu=2)
    kernel8 = Matern(length_scale=2, nu=1)
    dict_method = {}
    # 1st part
    """1SVR"""

    me1 = SVR(kernel='rbf',
              gamma='auto',
              degree=4,
              tol=1e-3,
              epsilon=0.1,
              shrinking=True,
              max_iter=2000)
    cv1 = 5
    scoring1 = "r2"
    param_grid1 = [{
        'C': [1, 0.5, 0.1, 0.01],
        'gamma': [0.5, 0.1, 0.001, 0.01, 0.0001],
        "epsilon": [1, 0.1, 0.01, 0.001],
        "kernel": [kernel, kernel2, kernel3, kernel4],
    }]
    dict_method.update({"SVR-set": [me1, cv1, scoring1, param_grid1]})
    """2BayesianRidge"""
    me2 = BayesianRidge(alpha_1=1e-06,
                        alpha_2=1e-06,
                        compute_score=False,
                        copy_X=True,
                        fit_intercept=True,
                        lambda_1=1e-06,
                        lambda_2=1e-06,
                        n_iter=300,
                        normalize=False,
                        tol=0.01,
                        verbose=False)
    cv2 = 5
    scoring2 = "r2"
    param_grid2 = [{
        'alpha_1': [1e-07, 1e-06, 1e-05, 1e-04, 1e-03],
        'alpha_2': [1e-07, 1e-06, 1e-05, 1e-04, 1e-03],
        'lambda_1': [1e-06, 1e-05, 1e-07],
        'lambda_2': [1e-06, 1e-05, 1e-07],
    }]
    dict_method.update({'BayR-set': [me2, cv2, scoring2, param_grid2]})
    """3SGDRL2"""
    me3 = SGDRegressor(alpha=0.0001,
                       average=False,
                       epsilon=0.1,
                       eta0=0.01,
                       fit_intercept=True,
                       l1_ratio=0.15,
                       learning_rate='invscaling',
                       loss='squared_loss',
                       max_iter=1000,
                       penalty='l2',
                       power_t=0.25,
                       random_state=0,
                       shuffle=True,
                       tol=0.01,
                       verbose=0,
                       warm_start=False)
    cv3 = 5
    scoring3 = "r2"
    param_grid3 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05]}]
    dict_method.update({'SGDRL2-set': [me3, cv3, scoring3, param_grid3]})
    """4KNR"""
    me4 = neighbors.KNeighborsRegressor(n_neighbors=5,
                                        weights='uniform',
                                        algorithm='auto',
                                        leaf_size=30,
                                        p=2,
                                        metric='minkowski')
    cv4 = 5
    scoring4 = "r2"
    param_grid4 = [{'n_neighbors': [3, 4, 5, 6], 'leaf_size': [10, 20, 30]}]
    dict_method.update({"KNR-set": [me4, cv4, scoring4, param_grid4]})
    """5kernelridge"""

    me5 = kernel_ridge.KernelRidge(alpha=1,
                                   kernel=kernel,
                                   gamma="scale",
                                   degree=3,
                                   coef0=1,
                                   kernel_params=None)
    cv5 = 5
    scoring5 = "r2"
    param_grid5 = [{
        'alpha': [10, 7, 5, 3, 2, 1, 0.5, 0.1],
        "kernel": [
            kernel, kernel2, kernel3, kernel4, kernel5, kernel6, kernel7,
            kernel8
        ]
    }]
    dict_method.update({'KR-set': [me5, cv5, scoring5, param_grid5]})
    """6GPR"""
    kernel = 1.0 * RBF(1.0)
    kernel2 = Matern(nu=1.5)
    kernel3 = Matern(nu=0.5)
    kernel4 = Matern(nu=2)

    me6 = gaussian_process.GaussianProcessRegressor(kernel=kernel,
                                                    alpha=1e-10,
                                                    normalize_y=True,
                                                    copy_X_train=True,
                                                    random_state=0)
    cv6 = 5
    scoring6 = "r2"
    param_grid6 = [{
        'alpha': [1e-10, 1e-8, 1e-6, 0.0001, 0.01, 1],
        "kernel": [kernel, kernel2, kernel3, kernel4],
        "random_state": [0, 1, 2]
    }]
    dict_method.update({"GPR-set": [me6, cv6, scoring6, param_grid6]})

    # 2nd part
    """6RFR"""
    me7 = ensemble.RandomForestRegressor(n_estimators=200,
                                         max_depth=None,
                                         min_samples_split=2,
                                         min_samples_leaf=1,
                                         min_weight_fraction_leaf=0.0,
                                         max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         bootstrap=True,
                                         oob_score=False,
                                         random_state=None,
                                         verbose=0,
                                         warm_start=False)
    cv7 = 5
    scoring7 = "r2"
    param_grid7 = [{
        'max_depth': [3, 5, 8, 10],
        'min_samples_split': [2, 3, 4],
        'random_state': [0, 1, 2],
        'n_estimators': [500, 200]
    }]
    dict_method.update({"RFR-em": [me7, cv7, scoring7, param_grid7]})
    """7GBR"""

    me8 = ensemble.GradientBoostingRegressor(loss='ls',
                                             learning_rate=0.05,
                                             n_estimators=200,
                                             subsample=1.0,
                                             criterion='mse',
                                             min_samples_split=2,
                                             min_samples_leaf=1,
                                             min_weight_fraction_leaf=0.,
                                             max_depth=3,
                                             min_impurity_decrease=0.,
                                             min_impurity_split=None,
                                             init=None,
                                             random_state=None,
                                             max_features=None,
                                             alpha=0.9,
                                             verbose=0,
                                             max_leaf_nodes=None,
                                             warm_start=False,
                                             presort='auto')
    cv8 = 5
    scoring8 = "r2"
    param_grid8 = [{
        'max_depth': [3, 5, 8, 10, 12, 14],
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [2, 3],
        'random_state': [0, 1, 2],
        'n_estimators': [50, 100, 200, 300]
    }]
    dict_method.update({'GBR-em': [me8, cv8, scoring8, param_grid8]})

    "AdaBR"
    dt2 = DecisionTreeRegressor(criterion="mse",
                                splitter="best",
                                max_features=None,
                                max_depth=12,
                                min_samples_split=4)
    dt3 = DecisionTreeRegressor(criterion="mse",
                                splitter="best",
                                max_features=None,
                                max_depth=14,
                                min_samples_split=4)
    dt4 = DecisionTreeRegressor(criterion="mse",
                                splitter="best",
                                max_features=None,
                                max_depth=16,
                                min_samples_split=4)
    dt = [dt4, dt2, dt3]
    # base_estimator must be a single estimator; the grid below searches over dt
    me9 = AdaBoostRegressor(dt2,
                            n_estimators=200,
                            learning_rate=0.5,
                            loss='linear',
                            random_state=0)
    cv9 = 5
    scoring9 = "r2"
    param_grid9 = [{
        'n_estimators': [50, 100, 200],
        "base_estimator": dt,
        "learning_rate": [0.05, 0.5, 1],
        'random_state': [0, 1, 2]
    }]
    dict_method.update({"AdaBR-em": [me9, cv9, scoring9, param_grid9]})
    '''TreeR'''
    me10 = DecisionTreeRegressor(criterion='mse',
                                 splitter='best',
                                 max_depth=None,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0,
                                 max_features=None,
                                 random_state=0,
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.0,
                                 min_impurity_split=None,
                                 presort=False)
    cv10 = 5
    scoring10 = "r2"
    param_grid10 = [{
        'max_depth': [4, 5, 6],
        'min_samples_split': [3, 4],
        'random_state': [0, 1, 2]
    }]
    dict_method.update({'TreeC-em': [me10, cv10, scoring10, param_grid10]})

    'ElasticNet'
    me11 = ElasticNet(alpha=1.0,
                      l1_ratio=0.7,
                      fit_intercept=True,
                      normalize=False,
                      precompute=False,
                      max_iter=1000,
                      copy_X=True,
                      tol=0.0001,
                      warm_start=False,
                      positive=False,
                      random_state=None)

    cv11 = 5
    scoring11 = "r2"
    param_grid11 = [{
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.3, 0.5, 0.8]
    }]
    dict_method.update(
        {"ElasticNet-L1": [me11, cv11, scoring11, param_grid11]})

    'Lasso'
    me12 = Lasso(
        alpha=1.0,
        fit_intercept=True,
        normalize=False,
        precompute=False,
        copy_X=True,
        max_iter=1000,
        tol=0.001,
        warm_start=False,
        positive=False,
        random_state=None,
    )

    cv12 = 5
    scoring12 = "r2"
    param_grid12 = [
        {
            'alpha': [
                0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100,
                1000
            ]
        },
    ]
    dict_method.update({"Lasso-L1": [me12, cv12, scoring12, param_grid12]})
    """SGDRL1"""
    me13 = SGDRegressor(alpha=0.0001,
                        average=False,
                        epsilon=0.1,
                        eta0=0.01,
                        fit_intercept=True,
                        l1_ratio=0.15,
                        learning_rate='invscaling',
                        loss='squared_loss',
                        max_iter=1000,
                        penalty='l1',
                        power_t=0.25,
                        random_state=0,
                        shuffle=True,
                        tol=0.01,
                        verbose=0,
                        warm_start=False)
    cv13 = 5
    scoring13 = "r2"
    param_grid13 = [{
        'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-5, 1e-6, 1e-7]
    }]
    dict_method.update({'SGDR-L1': [me13, cv13, scoring13, param_grid13]})

    return dict_method
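
# A minimal consumption sketch (not part of the original snippet): each value
# in dict_method is [estimator, cv, scoring, param_grid], which maps directly
# onto GridSearchCV. `grid_search_one` is a hypothetical helper name, and X, y
# are assumed to be the caller's training data.
from sklearn.model_selection import GridSearchCV


def grid_search_one(name, X, y):
    me, cv, scoring, param_grid = dict_method_reg()[name]
    gs = GridSearchCV(me, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    gs.fit(X, y)
    return gs.best_estimator_, gs.best_score_
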
eta0 = 0.01
max_iter = 100

import copy

import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


X_train_dataset, X_test, y_train_dataset, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42)

sgd_regressor = SGDRegressor(
    eta0=eta0, max_iter=max_iter, warm_start=True, learning_rate="constant")

rmse_val_score = []
rmse_train_score = []
model_list = []

X_train, X_val, y_train, y_val = train_test_split(
    X_train_dataset, y_train_dataset, test_size=0.2, random_state=42)
sgd_regressor.fit(X_train, y_train)

# kf = KFold(n_splits=100, shuffle=True)
# for train_index, test_index in kf.split(X_train_dataset):

for i in range(300):
    # assumed body: warm_start=True lets each fit() continue from the last coefficients
    sgd_regressor.fit(X_train, y_train)
    y_pred = sgd_regressor.predict(X_train)
    rmse_train_score.append(np.sqrt(mean_squared_error(y_train, y_pred)))
    rmse_val_score.append(np.sqrt(mean_squared_error(y_val, sgd_regressor.predict(X_val))))
    model_list.append(copy.deepcopy(sgd_regressor))


## prediction
import numpy as np


def fillna_and_log(x):
    x = x.copy()
    x[np.isnan(x)] = 0
    return np.log(1 + x)
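
# Quick illustrative check (not in the original): NaNs become 0 before the
# log(1 + x) transform, so missing prices map to log(1 + 0) = 0.
# >>> fillna_and_log(np.array([np.nan, 0.0, 9.0]))
# array([0.        , 0.        , 2.30258509])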


from sklearn.linear_model import SGDRegressor, SGDClassifier

title_word_1_2gram_dtm_0_predict_log_price = PredictionFeature('title_word_1_2gram_dtm_0_predict_log_price',
                                                               title_word_1_2gram_dtm_0,
                                                               SGDRegressor(penalty='elasticnet', l1_ratio=0.7,
                                                                            random_state=132, n_iter=20), price,
                                                               y_transformer=fillna_and_log, keep_true=True,
                                                               true_name='log_price')
title_word_1_2gram_dtm_0_predict_is_test = PredictionFeature('title_word_1_2gram_dtm_0_predict_is_test',
                                                             title_word_1_2gram_dtm_0, \
                                                             SGDClassifier(penalty='elasticnet', l1_ratio=0.7,
                                                                           random_state=132, n_iter=20), is_test,
                                                             y_transformer=None, keep_true=False,
                                                             only_predict=True, predict_binary_probability=True,
                                                             true_name='')

title_description_dtm_0_predict_log_price = PredictionFeature('title_description_dtm_0_predict_log_price',
                                                              title_description_dtm_0,
                                                              SGDRegressor(penalty='elasticnet', l1_ratio=0.7,
                                                                           random_state=133, n_iter=30), price,
                                                              y_transformer=fillna_and_log)
Example #4
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import RidgeCV, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBRegressor

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=89),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=50,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
                                             power_t=0.0)),
    StackingEstimator(estimator=LinearSVR(
        C=25.0, dual=True, epsilon=0.1, loss="epsilon_insensitive",
        tol=0.0001)), FeatureAgglomeration(affinity="l2", linkage="average"),
    SelectPercentile(score_func=f_regression, percentile=6),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False,
                                                    max_features=0.8,
                                                    min_samples_leaf=19,
                                                    min_samples_split=10,
                                                    n_estimators=400)),
    ZeroCount(), FeatureAgglomeration(affinity="l2", linkage="complete"),
    StackingEstimator(estimator=RidgeCV()), RidgeCV())
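
# Sketch of the typical use of a TPOT-exported pipeline; training_features,
# training_target, and testing_features are assumed to come from the user's
# own train/test split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)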


    preds = []

    from sklearn.linear_model import (LinearRegression, SGDRegressor)
    import lightgbm as lgb

    sgdr = SGDRegressor(
        penalty='l2',
        random_state=SEED)

    lgb_params = {
        'feature_fraction': 0.75,
        'metric': 'rmse',
        'nthread': 1,
        'min_data_in_leaf': 2**7,
        'bagging_fraction': 0.75,
Example #6
# SGD (stochastic gradient descent) in scikit-learn
# only fits linear models
from time import time
from sklearn.linear_model import SGDRegressor
from GradientDescent.own_SGD import X_train_standard, X_test_standard, y_train, y_test

# initialize
sgd_reg = SGDRegressor()
# train
start_time1 = time()
sgd_reg.fit(X_train_standard, y_train)
end_time1 = time()
print(sgd_reg.score(X_test_standard, y_test))

if __name__ == "__main__":
    print(end_time1 - start_time1)
Example #7
def run(config, train=True):
    """
    Trains our pipeline according to the configuration provided.
    """

    train_dir = config["train_dir"]
    val_dir = config["val_dir"]

    print("Reading in data...")

    train_data = NucleiDataset(train_dir).load()
    val_data = NucleiDataset(val_dir).load()

    x_train = train_data.images_
    y_train = train_data.masks_  # value in 0, 1, ..., n
    y_train_bin = (y_train > 0).astype(y_train.dtype)  # value in {0, 1}
    x_val = val_data.images_
    y_val = val_data.masks_
    y_val_bin = (y_val > 0).astype(y_val.dtype)

    print("Preprocessing data...")

    preprocesser = Preprocesser()

    x_train_pre = preprocesser.fit_transform(x_train)
    x_val_pre = preprocesser.transform(x_val)

    bilateral_d = 2
    bilateral_sigma_color = 75
    bilateral_sigma_space = 75
    equalize_hist_clip_limit = 0.03
    dialation_kernel = disk(radius=3)
    dialation_iters = 1

    print("Transforming data...")

    print(x_train_pre.min())
    print(x_train_pre.max())
    print(x_val_pre.min())
    print(x_val_pre.max())

    transformer = BasisTransformer(
        bilateral_d=bilateral_d,
        bilateral_sigma_color=bilateral_sigma_color,
        bilateral_sigma_space=bilateral_sigma_space,
        equalize_hist_clip_limit=equalize_hist_clip_limit,
        dialation_kernel=dialation_kernel,
        dialation_iters=dialation_iters)

    x_train_feat = transformer.fit_transform(x_train_pre)
    x_val_feat = transformer.transform(x_val_pre)  # transform only; avoid refitting on validation data

    sgd_params = {
        "regressor":
        SGDRegressor(penalty='elasticnet', l1_ratio=0.11, max_iter=5,
                     tol=None),
        "batch_size":
        1000,
        "num_iters":
        25000,
    }
    pa_params = {
        "regressor": PassiveAggressiveRegressor(C=.2, max_iter=5, tol=None),
        "batch_size": 1000,
        "num_iters": 25000,
    }

    sgd = MiniBatchRegressor(**sgd_params)
    pa = MiniBatchRegressor(**pa_params)

    print("Fitting linear models...")

    sgd.fit(x_train_feat, y_train_bin)
    pa.fit(x_train_feat, y_train_bin)

    x_train_extended = extend_features(x_train_feat, sgd, pa)
    x_val_extended = extend_features(x_val_feat, sgd, pa)

    #   Now we train UNet
    numchannels = x_train_extended.shape[-1]
    unet_config = {
        "numchannels": numchannels,
        "epochs": 50,
        "callbacks": [],
        "weights": none
    }
    unet = UNet(**unet_config)

    if unet_config["weights"] is not None:
        unet.load_weights(unet_config["weights"])

    print("Fitting UNet...")

    unet.fit(x_train_extended, y_train_bin, x_val_extended, y_val_bin)

    #   begin inference and print out test scores
    x_train_pred = unet.predict(x_train_extended)
    x_val_pred = unet.predict(x_val_extended)

    segmenter_params = {"nms_min_distance": 3, "watershed_line": True}
    segmenter = NucleiSegmenter(**segmenter_params)

    print("Segmenting nuclei...")

    train_components = segmenter.fit_transform(x_train_pred, x_train_pre)
    val_components = segmenter.fit_transform(x_val_pred, x_val_pre)
Example #8
    def __init__(self, protein=None, n_jobs=-1, version='linear',
                 depth_protein=5, depth_ligand=1, size=65536):
        """PLECscore - a novel scoring function based on PLEC fingerprints. The
        underlying model can be one of:
            * linear regression
            * neural network (dense, 200x200x200)
            * random forest (100 trees)
        The scoring function is trained on the PDBbind v2016 database and, even
        with the linear model, outperforms other machine-learning scoring
        functions in terms of the Pearson correlation coefficient on the "core
        set". For details see the PLEC publication. PLECscore predicts binding
        affinity (pKi/d).

        .. versionadded:: 0.6

        Parameters
        ----------
        protein : oddt.toolkit.Molecule object
            Receptor for the scored ligands

        n_jobs: int (default=-1)
            Number of cores to use for scoring and training. By default (-1)
            all cores are allocated.

        version: str (default='linear')
            A version of scoring function ('linear', 'nn' or 'rf') - which
            model should be used for the scoring function.

        depth_protein: int (default=5)
            The depth of ECFP environments generated on the protein side of
            interaction. By default 6 (0 to 5) environments are generated.

        depth_ligand: int (default=1)
            The depth of ECFP environments generated on the ligand side of
            interaction. By default 2 (0 to 1) environments are generated.

        size: int (default=65536)
            The final size of a folded PLEC fingerprint. This setting does not
            limit the data encoded in the PLEC fingerprint (for that, tune the
            depths), only its final length. Setting it too low will lead to
            many collisions.

        """

        self.protein = protein
        self.n_jobs = n_jobs
        self.version = version
        self.depth_protein = depth_protein
        self.depth_ligand = depth_ligand
        self.size = size

        plec_func = partial(PLEC,
                            depth_ligand=depth_ligand,
                            depth_protein=depth_protein,
                            size=size,
                            count_bits=True,
                            sparse=True,
                            ignore_hoh=True)
        descriptors = universal_descriptor(plec_func, protein=protein,
                                           shape=size, sparse=True)

        if version == 'linear':
            # avoid deprecation warnings
            kwargs = {'fit_intercept': False,
                      'loss': 'huber',
                      'penalty': 'elasticnet',
                      'random_state': 0,
                      'verbose': 0,
                      'alpha': 1e-4,
                      'epsilon': 1e-1,
                      }
            if sklearn_version >= '0.19':
                kwargs['max_iter'] = 100
            else:
                kwargs['n_iter'] = 100
            model = SGDRegressor(**kwargs)
        elif version == 'nn':
            model = MLPRegressor((200, 200, 200),
                                 batch_size=10,
                                 random_state=0,
                                 verbose=0,
                                 solver='lbfgs')
        elif version == 'rf':
            model = RandomForestRegressor(n_estimators=100,
                                          n_jobs=n_jobs,
                                          verbose=0,
                                          oob_score=True,
                                          random_state=0)
        else:
            raise ValueError('The version "%s" is not supported by PLECscore'
                             % version)

        super(PLECscore, self).__init__(model, descriptors,
                                        score_title='PLEC%s_p%i_l%i_s%i' %
                                        (version, depth_protein, depth_ligand,
                                         size))
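
# Hedged usage sketch (not part of the original snippet): the file name and
# toolkit call below are assumptions for illustration only.
# import oddt
# receptor = next(oddt.toolkit.readfile('pdb', 'receptor.pdb'))
# scorer = PLECscore(protein=receptor, version='linear',
#                    depth_protein=5, depth_ligand=1, size=65536)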
import collections
from multiprocessing import Pool, Process, cpu_count

# Libraries providing estimators
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor
from sklearn.svm import SVR
from .polynomial_regression import PolynomialRegression, OddDegPolynomialRegression
from sklearn.neural_network import MLPRegressor

# Clerical
from .utilities import *
from .encoder import Encoder

dict_estimators = {}
dict_estimators['LinearRegression'] = LinearRegression()
dict_estimators['SGDRegressor'] = SGDRegressor(loss='squared_loss')
dict_estimators['SVR'] = SVR()
dict_estimators['PolynomialRegression'] = PolynomialRegression(max_degree=3)
dict_estimators['Perceptron'] = MLPRegressor(max_iter=150,
                                             hidden_layer_sizes=(10, 5))
dict_estimators['CESAMORegression'] = OddDegPolynomialRegression(max_degree=11)


class SimplePPEncoder(Encoder):
    """ Samples randomly 600 sets of codes (can be changed with self.sampling_size), 
        encodes with best found 
    """
    def __init__(self,
                 estimator_name='PolynomialRegression',
                 num_predictors=2,
                 sample_size=600,
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# File: ridge_regression.py

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

__author__ = 'yasaka'

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)
"""
ridge_reg = Ridge(alpha=1, solver='sag')
ridge_reg.fit(X, y)
print(ridge_reg.predict([[1.5]]))
print(ridge_reg.intercept_)
print(ridge_reg.coef_)
"""
sgd_reg = SGDRegressor(penalty='l2', n_iter=1000)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.predict([[1.5]]))
print("W0=", sgd_reg.intercept_)
print("W1=", sgd_reg.coef_)

    def sgd(self):
        from sklearn.linear_model import SGDRegressor
        sgd = SGDRegressor()
        return sgd.fit(self.X, self.y)

    def peakmem_fit(self, *args):
        sgd_reg = SGDRegressor(max_iter=2000, tol=1e-16)
        sgd_reg.fit(self.X, self.y)
Example #13
    def stacklearning(self):
        class sparseNorm(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                from sklearn import preprocessing
                Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
                return Y
        fm = sgd.FMRegression(
            n_iter=4743,
            init_stdev=0.1,
            rank=100,
            l2_reg_w=0,
            l2_reg_V=0,
            step_size=0.1,
        )
        fm = sgd.FMRegression(
            n_iter=9943,
            init_stdev=0.1,
            rank=219,
            l2_reg_w=0,
            l2_reg_V=0.06454,
            step_size=0.1,
        )
        pipe = make_pipeline(sparseNorm(), fm)
        calcACC(pipe, X=X2)

        xgb = xgboost.XGBRegressor(
                    n_estimators=100,
                    max_depth=7,
                    gamma=0,
                    colsample_bytree=0.1
                )
        lgbm = LGBMRegressor(
            boosting_type='gbdt', num_leaves=367,
            learning_rate=0.06,feature_fraction=0.14,
            max_depth=28, min_data_in_leaf=8
        )
        rgf = RGFRegressor(
            max_leaf=1211, algorithm="RGF", test_interval=100,
            loss="LS", verbose=False, l2=0.93,
            min_samples_leaf=2
        )
        rf = RandomForestRegressor(
            max_depth=20, random_state=0,
            n_estimators=56,min_samples_split=2,
            max_features=0.21
        )
        rf = RandomForestRegressor()
        ext = ExtraTreesRegressor(
            n_estimators=384,max_features= 2228,
            min_samples_split= 0.01,max_depth= 856,
            min_samples_leaf= 1
        )
        svr = SVR(
            gamma=9.5367431640625e-07,
            epsilon=0.0009765625,
            C= 2048.0
        )

        #test combination
        desNew = make_pipeline(extdescriptorNew(),rf)
        morNew = make_pipeline(extMorganNew(),rf)
        kotNew = make_pipeline(extklekotaTothNew(),rf)
        macNew = make_pipeline(extMACCSNew(),rf)

        desMac = make_pipeline(extDescriptorMACCS(),rf)
        morMac = make_pipeline(extMorganMACCS(),rf)
        kotMac = make_pipeline(extKlekotaTothMACCS(),rf)

        morKotNew = make_pipeline(extMorganKlekotaTothNew(),rf)
        des = make_pipeline(extOnlyDescriptor(),rf)
        mor = make_pipeline(extOnlyMorgan(),rf)
        kot = make_pipeline(extOnlyklekotaToth(),rf)
        mac = make_pipeline(extOnlyMACCS(),rf)
        all = make_pipeline(extAll(),rf)
        allwithoutNew = make_pipeline(extAllwithoutNew(),rf)
        allwithoutMaccs = make_pipeline(extAllwithoutMaccs(),rf)
        allwithoutDes = make_pipeline(extAllwithoutDescriptor(),rf)

        testDic = {"Desc+New": desNew, "Mor+New": morNew, "kot+New": kotNew,
                   "MACCS+New": macNew, "Des+MAC": desMac,
                   "Morgan+Maccs": morMac, "Kot+MACCS": kotMac,
                   "mor+kot+New": morKotNew, "descriptor": des, "morgan": mor,
                   "kot": kot, "MACCS": mac, "All": all,
                   "All without new": allwithoutNew,
                   "All without MACCS": allwithoutMaccs,
                   "All without Des": allwithoutDes}

        #10fold
        cv = KFold(n_splits=10, shuffle=True, random_state=0)

        #Fingerprinttest
        resultDic={}
        resultDic2={}
        for name,model in testDic.items():
            #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
            #calcACC(model,X=X,y=y2,name=name)

            Scores = cross_validate(model, X2, y2, cv=cv,scoring=myScoreFunc)
            RMSETmp = Scores['test_RMSE'].mean()
            CORRTmP = Scores['test_Correlation coefficient'].mean()
            resultDic.update({name:[RMSETmp,CORRTmP]})
            print(name,RMSETmp,CORRTmP)

        #stacking
        alldata = make_pipeline(extAll())
        # random forest
        #1.1546 0.70905
        stack = StackingRegressor(regressors=[alldata], meta_regressor=rf,verbose=1)

        # Light Gradient boosting
        # 1.160732 0.703776
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm,verbose=1)

        # XGboost
        # 1.1839805 0.689571
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb,verbose=1)

        # Regularized greedily forest
        # 1.17050 0.6992
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf,verbose=1)

        #pls 22.808047774809697 0.6410026452910016 i=4
        for i in np.arange(3,11,1):
            pls = PLSRegression(n_components=i)
            testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls,verbose=0)
            calcACC(testmodel)
        pls = PLSRegression(n_components=4)

        #SVR
        svr = SVR(gamma=9.5367431640625/10000000,C=1559.4918100725592,
                  epsilon=0.0009765625,)
        svr = SVR(kernel='rbf',gamma=9.5367431640625e-07,epsilon=0.0009765625,C=2048.0)

        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1)
        calcACC(svr)

        #Extratree  1.157420824123527 0.7061010221224269
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1)
        calcACC(testmodel)

        #k-NN
        nbrs = KNeighborsRegressor(3)

        ##Linear regressions
        #Stochastic Gradient Descent
        sgd = SGDRegressor(max_iter=1000)
        # Ridge
        for i in [1,10,100,1000]:
            ridge = Ridge(alpha=i)
            calcACC(ridge)
        ridge = Ridge(alpha=45.50940042350705)
        calcACC(ridge)
        # multiple linear
        lin = make_pipeline(forlinear(),LinearRegression(n_jobs=-1))
        calcACC(lin)



        #stacking
        #0.69
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all], meta_regressor=rf,verbose=1)
        #1.1532 0.70926
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,
                              verbose=1)
        #1.16420 0.7041
        testmodel = StackingRegressor(regressors=[alldata,alldata,all], meta_regressor=rf,verbose=1)
        #1.16379 0.7044
        stack1 = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,verbose=1)
        testmodel  = StackingRegressor(regressors=[alldata,stack1,stack1], meta_regressor=rf,verbose=1)
        #1.1535496740699531 0.7108839199109559
        pcaFeature = make_pipeline(extPCA())
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=rf,verbose=1)
        #1.181801005432221 0.6889745579620922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=lgbm,verbose=1)
        #0.70613
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=xgb,verbose=1)
        #0.71641717
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)
        #0.7146922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,ridge,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)

        #new features
        pcaFeature = make_pipeline(extPCA())

        #old
        pipe1 = make_pipeline(extMACCS(), rf)
        pipe2 = make_pipeline(extMorgan(), rf)
        pipe3 = make_pipeline(extDescriptor(), rf)

        pipe4 = make_pipeline(extPCA(), rgf)
        pipe7 =make_pipeline(extDescriptor(), rgf)
        pipe8 =make_pipeline(extDescriptor(), rgf)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto',kernel='linear')

        pls = PLSRegression(n_components=4)

        extMACCSdata = make_pipeline(extMACCS())

        nbrsPipe = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())
        ave = extAverage()
        withoutdesc =  make_pipeline(extMACCS())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
        #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)

        #0.70
        stack = StackingRegressor(regressors=[pipe1,pipe2,pipe3,xgb,lgbm,rgf,rf], meta_regressor=ave, verbose=1)

        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)

        #0.69######################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        #0.70
        stack2 = StackingRegressor(regressors=[stack1,alldata,rgf,lgbm,xgb], meta_regressor=rf,verbose=1)

        #0.71
        stack3 = StackingRegressor(regressors=[stack2,pipe1], meta_regressor=ave, verbose=1)
        ###########################
        ###########################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,withoutdesc,lgbm,rgf], meta_regressor=rf,verbose=1)
        stack3 = StackingRegressor(regressors=[stack2,pipe1,xgb], meta_regressor=ave, verbose=1)
        ###########################

        #stackingwithknn
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,nbrs,pipe1], meta_regressor=rf, verbose=1)


        #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)

        cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        cv = KFold(n_splits=10, shuffle=True, random_state=0)
        St1Scores = cross_validate(stack1,X,y,cv=cv)
        St1Scores['test_score'].mean()**(1/2)

        St2Scores = cross_validate(stack2,X,y,cv=cv)
        St2Scores['test_score'].mean()**(1/2)

        St3Scores = cross_validate(stack3,X,y,cv=cv)
        St3Scores['test_score'].mean()**(1/2)

        stackScore = cross_validate(stack, X, y, cv=cv)
        stackScore['test_score'].mean()**(1/2)

        lgbmScores =cross_validate(lgbm,X,y,cv=cv)
        lgbmScores['test_score'].mean()**(1/2)

        rgfScores = cross_validate(rgf,X,y,cv=cv)
        rgfScores['test_score'].mean()**(1/2)

        RFScores = cross_validate(rf,X,y,cv=cv)
        RFScores['test_score'].mean()**(1/2)

        scores = cross_validate(stack2,X,y,cv=cv)
        scores['test_score'].mean()**(1/2)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores['test_score'].mean(), scores['test_score'].std(), 'stacking'))

        stack3.fit(X, y)
        y_pred = stack3.predict(X_train)
        y_val = stack3.predict(X_test)
        #stack3.score(X_train, y_train)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(stack3.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        stack1.fit(X, y)
        valy =  (10 **(stack1.predict(exX))).tolist()

        sgd.fit(X,y)
        valy =  (10 **(sgd.predict(exX))).tolist()

        rgfpipe = make_pipeline(extMACCS(), rf)
        rgf.fit(X,y)
        valy =  (10 **(rgf.predict(exX))).tolist()

        nbrs.fit(X,y)
        valy =  (10 **(nbrs.predict(exX))).tolist()

        pipe = make_pipeline(extMACCS(), rf)
        pipe.fit(X,y)
        valy =  (10 **(pipe.predict(exX))).tolist()


        rf.fit(X, y)
        y_pred = rf.predict(X_train)
        y_val = rf.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(rf.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        lgbm.fit(X, y)
        #y_pred = pipe1.predict(X_train)
        #y_val = pipe1.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(lgbm.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
Example #14
    datetime.date(2018, 10, 8),
    datetime.date(2019, 10, 14),
    datetime.date(2020, 10, 12),
    datetime.date(2018, 11, 11),
    datetime.date(2019, 11, 11),
    datetime.date(2020, 11, 11),
    datetime.date(2018, 11, 22),
    datetime.date(2019, 11, 28),
    datetime.date(2020, 11, 26),
    datetime.date(2018, 12, 25),
    datetime.date(2019, 12, 25),
    datetime.date(2020, 12, 25)
]

# Our prediction model, saved in a file
predictionModel = SGDRegressor()

encoder = OneHotEncoder()
scaler = StandardScaler()

error_info = ''


def init(model_file='data/flights_delays_model.pkl',
         trip_distance_file='data/tripDistance.pkl',
         airport_code_file='data/airportCodesDF.pkl',
         encoder_file='data/categ_featuresEncoder.pkl',
         scaler_file='data/numfeaturesScaler.pkl'):
    global predictionModel, tripDistances, airport_codes, encoder, scaler
    predictionModel = joblib.load(model_file)
    pkl_file = open(trip_distance_file, 'rb')
Example #15
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# File: elastic_net.py

import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

__author__ = 'yasaka'

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

elastic_net = ElasticNet(alpha=0.0001, l1_ratio=0.15)
elastic_net.fit(X, y)
print(elastic_net.predict([[1.5]]))

sgd_reg = SGDRegressor(penalty='elasticnet', n_iter=1000)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.predict([[1.5]]))
Example #16
    #data = data["Returns"][0:-1]
    return data 


googData = readFile(googFile)
nasdaqData = readFile(nasdaqFile)
xomData = readFile(xomFile)


googData.head()
nasdaqData.head()


from sklearn.linear_model import SGDRegressor,LinearRegression

regressor = SGDRegressor(eta0=0.1,n_iter=100000,fit_intercept=False)

xData = nasdaqData["Returns"][0:-1].values.reshape(-1, 1)
yData = googData["Returns"][0:-1]
goodGoogModel = regressor.fit(xData, yData)

goodGoogModel.score(xData,yData)


regressor.coef_

goodGoogModel.predict(np.array([-0.1,0.05]).reshape(-1,1))
Example #17
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta_path_sgd.append(theta)

plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.title('SGD')
plt.axis([0, 2, 0, 15])
plt.show()

#%% SGD Scikit-Learn

params = []
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(n_iter=50, penalty=None, eta0=0.1)
sgd_reg.fit(X, y)
params.append(sgd_reg.intercept_)
params.append(sgd_reg.coef_)

#%% Mini Batch Gradient Descent

theta_path_mgd = []
n_iterations = 50
minibatch_size = 20
theta = np.random.randn(2, 1)  # random initialization

t0, t1 = 10, 1000


def learning_schedule(t):
    # decaying schedule used by the SGD loop above (t0, t1 defined earlier)
    return t0 / (t + t1)


def main():
    start_time = time.time()

    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    nrow_train = train.shape[0]
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
    zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name = cv.fit_transform(merge['name'])
    print('[{}] Count vectorize `name` completed.'.format(time.time() -
                                                          start_time))

    cv = CountVectorizer()
    X_category1 = cv.fit_transform(merge['general_cat'])
    X_category2 = cv.fit_transform(merge['subcat_1'])
    X_category3 = cv.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                         ngram_range=(1, 3),
                         stop_words='english')
    X_description = tv.fit_transform(merge['item_description'])
    print('[{}] TFIDF vectorize `item_description` completed.'.format(
        time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]

    model = Ridge(solver="sag", fit_intercept=True, random_state=205)
    model.fit(X, y)
    print('[{}] Train ridge completed'.format(time.time() - start_time))
    predsR = model.predict(X=X_test)
    print('[{}] Predict ridge completed'.format(time.time() - start_time))

    model = SGDRegressor(alpha=0.000001,
                         penalty='l2',
                         l1_ratio=0.15,
                         learning_rate='invscaling',
                         loss='squared_loss',
                         power_t=0.25,
                         random_state=None,
                         shuffle=True,
                         tol=None,
                         verbose=0,
                         warm_start=False,
                         average=False,
                         epsilon=0.1,
                         eta0=0.01,
                         fit_intercept=True)
    model.fit(X, y).sparsify()
    print('[{}] Train sgd completed'.format(time.time() - start_time))
    predsS = model.predict(X=X_test)
    print('[{}] Predict sgd completed.'.format(time.time() - start_time))

    train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                          y,
                                                          test_size=0.05,
                                                          random_state=144)
    d_train = lgb.Dataset(train_X, label=train_y, max_bin=8192)
    d_valid = lgb.Dataset(valid_X, label=valid_y, max_bin=8192)
    watchlist = [d_train, d_valid]

    params = {
        'learning_rate': 0.75,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 100,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }

    params2 = {
        'learning_rate': 0.85,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 50,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }

    model = lgb.train(params, train_set=d_train, num_boost_round=10000, valid_sets=watchlist, \
    early_stopping_rounds=1000, verbose_eval=1000)
    predsL = model.predict(X_test)

    print('[{}] Predict lgb 1 completed.'.format(time.time() - start_time))

    train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X,
                                                              y,
                                                              test_size=0.1,
                                                              random_state=101)
    d_train2 = lgb.Dataset(train_X2, label=train_y2, max_bin=8192)
    d_valid2 = lgb.Dataset(valid_X2, label=valid_y2, max_bin=8192)
    watchlist2 = [d_train2, d_valid2]

    model = lgb.train(params2, train_set=d_train2, num_boost_round=5000, valid_sets=watchlist2, \
    early_stopping_rounds=50, verbose_eval=500)
    predsL2 = model.predict(X_test)

    print('[{}] Predict lgb 2 completed.'.format(time.time() - start_time))

    preds = predsR * 0.35 + predsS * 0.1 + predsL * 0.45 + predsL2 * 0.1

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_ridge_sgd_2xlgbm.csv", index=False)
Example #19
def train_model(features, targets, *params):
    model = SGDRegressor()
    model.fit(features, targets)
    return model
Example #20
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1))
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test.reshape(-1, 1))
regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, X_train, y_train.ravel(), cv=5)
print('Cross validation r-squared scores: %s' % scores)
print('Average cross validation r-squared score: %s' % np.mean(scores))
regressor.fit(X_train, y_train.ravel())
print('Test set r-squared score %s' % regressor.score(X_test, y_test))
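
# Follow-up sketch (an assumption, not in the original): y was standardized, so
# predictions come back in scaled units; the fitted y_scaler recovers prices.
y_pred_scaled = regressor.predict(X_test)
y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))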
Example #21
def train(features, labels):
    lr = SGDRegressor()
    lr.fit(features, labels)
    weights = lr.coef_
    return weights
Example #22
y_test = ss_y.transform(y_test.reshape(-1, 1))

# Import LinearRegression from sklearn.linear_model.
from sklearn.linear_model import LinearRegression

# Initialize a LinearRegression model with the default configuration
lr = LinearRegression()
# Estimate parameters from the training data.
lr.fit(X_train, y_train)
# Predict on the test data.
lr_y_predict = lr.predict(X_test)

# Import SGDRegressor from sklearn.linear_model
from sklearn.linear_model import SGDRegressor

sgdr = SGDRegressor()
sgdr.fit(X_train, y_train)
sgdr_y_predict = sgdr.predict(X_test)
"""
使用三种回归评价机制以及两种调用R-squared评价模块的方法,对本节模型的回归性能做出评价。
"""
# Use LinearRegression's built-in score method and print the result.
print('The value of default measurement of LinearRegression is',
      lr.score(X_test, y_test))

# Import r2_score, mean_squared_error, and mean_absolute_error from sklearn.metrics for evaluating regression performance
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Use r2_score and print the result
print('The value of R-squared of LinearRegression is',
      r2_score(y_test, lr_y_predict))
user_avg = np.zeros(nusers)
user_std = np.zeros(nusers)
for i in range(0, nusers, batch_size):
    users_current = R_u[i:min(i + batch_size, nusers), :]
    batch_avg = ((users_current.sum(axis=1).flatten()) /
                 users_current.getnnz(axis=1))
    user_avg[i:min(i + batch_size, nusers)] = batch_avg
    user_std[i:min(i + batch_size, nusers)] = np.sqrt(
        abs(
            users_current.power(2).sum(axis=1).flatten() /
            users_current.getnnz(axis=1) - batch_avg**2))  # var = E[x^2] - mean^2
print('done avging', movie_avg, user_avg)

# sgd fitter
lin_model = SGDRegressor()

rat_num = len(probe_ratings)

for i in range(0, rat_num, batch_size):

    given = probe_ratings[i:min(i + batch_size, rat_num)]
    u_mean = user_avg[probe_users[i:min(i + batch_size, rat_num)]]
    m_mean = movie_avg[probe_movies[i:min(i + batch_size, rat_num)]]
    u_mean = np.array([u_mean]).T
    m_mean = np.array([m_mean]).T

    preding = np.concatenate((u_mean, m_mean), axis=1)

    lin_model.partial_fit(preding, given)
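
# Illustrative follow-up (not in the original): after the incremental passes,
# the model predicts a rating from the same two-column layout
# [user_mean, movie_mean]; the feature values here are hypothetical.
example_features = np.array([[3.6, 3.9]])
print(lin_model.predict(example_features))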
# Model configuration

base = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=LinearSVR(C=0.01, dual=True, epsilon=0.001, loss="epsilon_insensitive", tol=0.1)),
    MaxAbsScaler(),
    StackingEstimator(estimator=RidgeCV()),
    Normalizer(norm="l2"),
    StackingEstimator(estimator=LinearSVR(C=0.5, dual=False, epsilon=0.1, loss="squared_epsilon_insensitive", tol=0.1)),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.4, min_samples_leaf=2, min_samples_split=4, n_estimators=100)),
    MinMaxScaler(),    
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=LinearSVR(C=5.0, dual=True, epsilon=0.1, loss="epsilon_insensitive", tol=0.0001)),
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=SGDRegressor()),
    RobustScaler(),
    StackingEstimator(estimator=LinearSVR(C=15.0, dual=True, epsilon=0.01, loss="epsilon_insensitive", tol=0.1)),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.75, tol=0.001)),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.1, max_depth=1, min_child_weight=6, n_estimators=100, nthread=1, objective="reg:squarederror", subsample=0.6500000000000001)),
    MinMaxScaler(),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.2, min_samples_leaf=2, min_samples_split=4, n_estimators=100)),
    StackingEstimator(estimator=LinearSVR(C=5.0, dual=True, epsilon=0.1, loss="epsilon_insensitive", tol=0.0001)),
    MaxAbsScaler(),
    RandomForestRegressor(bootstrap=False, max_features=0.05, min_samples_leaf=1, min_samples_split=4, n_estimators=100)
)

parameters = {'base_estimator': base,
              'n_estimators': 100,          #default = 50
              'learning_rate': 0.3,         #default = 1.0
              'loss': 'linear',
Example #25
# Delete the first 5 columns since they provide no information
x = np.delete(x, 1, axis=1)
x = np.delete(x, 1, axis=1)
x = np.delete(x, 1, axis=1)
x = np.delete(x, 1, axis=1)
x = np.delete(x, 1, axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.33,
                                                    shuffle=True)

# Preprocess the data
preproc = [("missing", SimpleImputer()), ("var", VarianceThreshold(0.01)),
           ("poly", PolynomialFeatures(1)), ("standardize", StandardScaler())]

pipe = Pipeline(preproc + [('model', SGDRegressor())])

params_grid = [{
    "model": [SGDRegressor(max_iter=500)],
    "model__loss": [
        'huber', 'squared_loss', 'epsilon_insensitive',
        'squared_epsilon_insensitive'
    ],
    "model__penalty": ['l1', 'l2'],
    "model__alpha":
    np.logspace(-5, 5, 5),
    "poly__degree": [1, 2]
}, {
    "model": [LinearRegression()],
    "poly__degree": [1, 2]
}, {
Example #26
    train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
    train_rmse = np.sqrt(train_mse)
    print('Linear Regression Train RMSE:', train_rmse)
    train_r2 = r2_score(y_train, y_train_pred)
    print('Linear Regression Train R^2:', train_r2)

    # predictions on the test set
    y_test_pred = lin_reg.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_test_pred)
    print('Linear Regression Test RMSE:', test_rmse)
    print('Linear Regression Test R^2:', test_r2)

    # LinearRegression vs SGDRegressor
    sgd_reg = SGDRegressor(random_state=1)  # create the model
    sgd_reg.fit(X_train, y_train)  # train the model
    y_train_pred = sgd_reg.predict(X_train)  # predictions on the training set
    # -> RMSE and R2-score on the training set
    y_test_pred = sgd_reg.predict(X_test)  # predictions on the test set
    # -> RMSE and R2-score on the test set

    # Use a scaler -> Pipeline
    pipe1 = Pipeline([('scaler', StandardScaler()),
                      ('regressor', LinearRegression())])
    pipe1.fit(X_train, y_train)  # fit
    y_train_pred = pipe1.predict(X_train)  # train predictions
    # -> Train RMSE, R2-score
    y_test_pred = pipe1.predict(X_test)  # test predictions

    scaler = StandardScaler()
Example #27
    X_train, X_test, y_train, y_test = split_data(X, y)
    X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test,
                                                       y_train, y_test)

    #print(X_train)
    #X_values = np.delete(raw_data, raw_data.shape[1]-1, 1)
    #Y_values = raw_data[:,raw_data.shape[1]-1]

    weights_sk = np.full(
        (1, X_train.shape[1]), 1.0
    )  #do not reuse the weights since sk-learn does inplace work with the coef_init matrix!
    intercept_sk = 1
    weights_own = np.full((1, X_train.shape[1]), 1.0)
    intercept_own = 1

    sk_gdc = SGDRegressor()
    sk_gdc.fit(
        X_train, y_train, coef_init=weights_sk, intercept_init=intercept_sk
    )  #coef_init is the same as our weights for comparison reasons (sklearn does not pass w_0!)
    print("Weights and intercept found by sk:", weights_sk, intercept_sk)

    own_gdc = OwnGradientDescentRegressor(debug_output=True)
    print(weights_own, weights_own.shape)
    weights_own, intercept_own = own_gdc.fit(X_train,
                                             y_train,
                                             coef_init=weights_own,
                                             intercept_init=intercept_own)
    print("Weights and intercept found by own:", weights_own, intercept_own)

    print("Prediction with sk-learn:", sk_gdc.predict(X_test))
    print("Prediction with own-imp:", own_gdc.predict(X_test))
Example #28
def run_sgd_reg():
    sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1)
    return sgd_reg.fit(X_train_scaled, y_train)
Example #29
    # print(appIndex, score, timerEnd-timerStart)

    return prediction


# initialize vars -------------------------------------------------------------

# define scaler for normalization
scaler = StandardScaler()

# define PCA for dimensionality reduction
pca = IncrementalPCA()

# define regression model
regressor = SGDRegressor(random_state=42, max_iter=1, tol=1e-3)
regressors, predictions = None, None

# load training data one batch at a time
trainFiles = glob(DATA_SUBFOLDER + '/trainBatch*')
timerStartLocal = time.time()
for fileNum, file in enumerate(trainFiles):

    # load batch
    timerStartLocal2 = time.time()
    data, labels = load(file)

    #init scaler and pca
    scaler.partial_fit(data)
    # pca.partial_fit(data)
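
# Sketch of the second pass this two-pass setup implies (an assumption, not in
# the original snippet): once the scaler/PCA have seen every batch, a second
# sweep trains the SGDRegressor out-of-core with partial_fit.
# for fileNum, file in enumerate(trainFiles):
#     data, labels = load(file)
#     regressor.partial_fit(scaler.transform(data), labels)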
print(' | Model                | Mean $R^2$ cv  | Std. dev. cv         | Mean coefficients  | Coef. std  | Coefficient interval     | fit time | cv time   |    ')
print('|:--------------------:|:--------------:|:--------------------:|:------------------:|:----------:|--------------------------|----------|:---------:|     ')
'''
cnt = 0  # counter for the number of algorithms launched
ajustes = list()

for a in alphas:
    for algoritmo in algoritmos:
        for penalizacion in penalizaciones:
            for aprendizaje in tasa_aprendizaje:
                Separador()

                SGD_REGRESSOR = SGDRegressor(alpha=a,
                                             max_iter=ITERACION_MAXIMAS,
                                             eta0=eta,
                                             learning_rate=aprendizaje,
                                             penalty=penalizacion,
                                             loss=algoritmo,
                                             shuffle=True,
                                             early_stopping=True)

                titulo = str(f'\n___SGD regression ({cnt})___\n' +
                             'algorithm: ' + algoritmo + '\n' +
                             'penalty: ' + penalizacion + '\n' +
                             'learning rate: ' + aprendizaje + '\n' + 'eta: ' +
                             str(eta) + '\n' + 'alpha: ' + str(a) + '\n')

                sgd = Evaluacion(SGD_REGRESSOR,
                                 x_train,
                                 y_train,
                                 k_folds,
                                 titulo,