Example #1
# assumes a feature matrix x and target y are defined in the enclosing scope
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score

def objective(**params):
    # note: the 'normalize' argument was removed in scikit-learn 1.2
    reg = ElasticNet(max_iter=1000)
    reg.set_params(**params)
    cval = cross_val_score(reg, x, y, scoring='r2', cv=3)
    cval[cval < 0] = 0  # clip negative R^2 scores to zero

    return -cval.mean()
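Since `objective` takes keyword arguments and returns a loss to minimize, it plugs straight into a keyword-style optimizer. A minimal sketch, assuming hyperopt (the search-space bounds are illustrative):

from hyperopt import fmin, tpe, hp

space = {'alpha': hp.loguniform('alpha', -6, 2),
         'l1_ratio': hp.uniform('l1_ratio', 0.0, 1.0)}
# hyperopt passes each sampled point as a single dict, so unpack it into objective
best = fmin(fn=lambda p: objective(**p), space=space,
            algo=tpe.suggest, max_evals=50)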
Example #2
def fun_en_fs(x, *args):
    X, y, flag, n_splits, random_seed = args
    n_samples, n_var = X.shape
    clf = ElasticNet(random_state=random_seed,
                     max_iter=10000)  # TODO: max_iter changed
    p = {'alpha': x[0], 'l1_ratio': x[1], 'positive': x[2] < 0.5}
    clf.set_params(**p)

    if len(x) <= 3:
        # no per-feature gates supplied: keep every variable
        ft = np.array([1 for i in range(n_var)])
        ft = np.where(ft > 0.5)
    else:
        # entries of x beyond the two hyperparameters act as 0/1 feature gates
        ft = np.array([1 if k > 0.5 else 0 for k in x[2::]])
        ft = np.where(ft > 0.5)
    try:
        cv = KFold(n_splits=n_splits,
                   shuffle=True,
                   random_state=int(random_seed))
        y_p = cross_val_predict(clf, X[:, ft].squeeze(), y, cv=cv, n_jobs=1)

        r = RMSE(y_p, y)
        r2 = MAPE(y_p, y)
        r3 = RRMSE(y_p, y)
        r4 = -r2_score(y, y_p)  # r2_score expects (y_true, y_pred)
    except Exception:
        y_p = [None]
        r = r2 = r3 = r4 = 1e12  # penalize configurations that fail to fit

    #print(r,'\t',p)
    if flag == 'eval':
        return r
    else:
        clf.fit(X[:, ft].squeeze(), y)
        return {
            'Y_TRUE': y,
            'Y_PRED': y_p,
            'EST_PARAMS': p,
            'PARAMS': x,
            'EST_NAME': 'EN',
            'ESTIMATOR': clf,
            'ACTIVE_VAR': ft,
            'DATA': X,
            'SEED': random_seed,
            'ERROR_TRAIN': {
                'RMSE': r,
                'MAPE': r2,
                'RRMSE': r3,
                'R2_SCORE': r4
            }
        }
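A quick smoke test for `fun_en_fs` on synthetic data; the RMSE/MAPE/RRMSE helpers (and the imports used above) are assumed to be in scope, as in the original module:

import numpy as np

rng = np.random.default_rng(42)
X_demo = rng.normal(size=(100, 6))
y_demo = X_demo @ rng.normal(size=6) + 0.1 * rng.normal(size=100)
# layout of x: [alpha, l1_ratio, gate_0, ..., gate_5]; gates > 0.5 keep a feature
x_demo = [0.1, 0.5, 1, 1, 1, 1, 1, 1]
print(fun_en_fs(x_demo, X_demo, y_demo, 'eval', 5, 42))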
Example #3
def elasticnet_coefs(X, Y, alphas):
    coefs = []
    enet_reg = ElasticNet()
    for a in alphas:
        enet_reg.set_params(alpha=a, l1_ratio=0.05)
        enet_reg.fit(X, Y)
        coefs.append(enet_reg.coef_)
    return coefs
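Typical use of `elasticnet_coefs` is tracing coefficient paths over an alpha grid; a small sketch with synthetic data and an illustrative grid:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression

X_demo, Y_demo = make_regression(n_samples=80, n_features=5, noise=5.0, random_state=0)
alphas = np.logspace(-3, 1, 30)
coef_paths = elasticnet_coefs(X_demo, Y_demo, alphas)  # one coef_ vector per alpha

plt.plot(alphas, coef_paths)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('coefficient value')
plt.show()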
Example #4
class boroReg:
    def __init__(self, X, y, idx, pipe_X, pipe_y):
        self.X = X[idx, :]  # shift to fix 1 indexing using np broadcasting
        self.y = y[idx, :]
        self._gridSearch = None
        self.pipeline_X = pipe_X
        self.pipeline_y = pipe_y
        self._searchSpace = None
        self._params = None
        self.lm = ElasticNet()

    def __imputeVals(self, in_df):
        return imputeVals(in_df)

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        self._searchSpace = params

        self._gridSearch = GridSearchCV(self.lm,
                                        params,
                                        cv=cv,
                                        scoring="neg_mean_squared_error",
                                        n_jobs=njobs,
                                        verbose=verbose)
        self._gridSearch.fit(self.X, self.y)

    def getBestParams(self):
        if self._gridSearch is not None:
            return self._gridSearch.best_params_
        else:
            raise ValueError("gridSearch() has not been run yet")

    def getBestScore(self):
        if self._gridSearch is not None:
            return self._gridSearch.best_score_
        else:
            raise ValueError("gridSearch() has not been run yet")

    def fitModel(self, params):
        self._params = params

        self.lm.set_params(**params)
        self.lm.fit(self.X, self.y)

    def __invert(self, y):
        return np.exp(self.pipeline_y.inverse_transform(y))

    def getTrainScore(self):
        return self.lm.score(self.X, self.y)

    def predict(self, test_X):
        piped_X = self.pipeline_X.transform(self.__imputeVals(test_X))
        preds = self.lm.predict(piped_X)
        return self.__invert(preds)
Example #5
class ElasticNetModel(Model):
    def create_model(self):
        self.elastic_net = ElasticNet()

    def fit(self, train_x, train_y):
        self.elastic_net.fit(train_x, train_y)

    def set_config(self, config):
        self.elastic_net.set_params(**config)

    def predict(self, test_x):
        return self.elastic_net.predict(test_x)
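How the wrapper is meant to be driven; a sketch, assuming the `Model` base class allows no-argument construction and does not call `create_model` itself:

model = ElasticNetModel()
model.create_model()                                # instantiate the underlying estimator
model.set_config({'alpha': 0.01, 'l1_ratio': 0.5})  # forwarded to ElasticNet.set_params
model.fit(train_x, train_y)                         # train_x / train_y assumed defined
preds = model.predict(test_x)                       # test_x assumed defined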
Example #6
def main():

    list_file_path = sorted(glob.glob(os.path.join(DATA_DIR, 'train_all_join_4/*gz')))

    df = pandas.read_csv(list_file_path[0], compression='gzip')
    df = df.fillna(0)
    data = df[LIST_FEATURE_COLUMN_NAME].values
    target = df[TARGET_COLUMN_NAME].values

    model = ElasticNet(random_state=0,
                       alpha=0.001,
                       # warm_start=True,
                       max_iter=1000)

    params = {'alpha': [0.0001, 0.001, 0.01],
              'l1_ratio': [0.2, 0.5, 0.7]}
    cv = GridSearchCV(model, params, scoring=bimbo_scoring, n_jobs=1, refit=False, verbose=10)
    cv.fit(data, target)
    logger.info('best_params: %s' % cv.best_params_)
    # apply the winning parameters before the incremental warm-start fits
    model.set_params(warm_start=True, **cv.best_params_)

    for i in range(1, len(list_file_path)):
        logger.info('%s: %s' % (i, list_file_path[i]))

        test_df = pandas.read_csv(list_file_path[i], compression='gzip')
        test_df = test_df.fillna(0)
        test_data = test_df[LIST_FEATURE_COLUMN_NAME].values
        test_target = test_df[TARGET_COLUMN_NAME].values

        model.fit(data, target)

        predict = model.predict(data)
        predict = numpy.where(predict < 0, 0, predict)
        score = bimbo_score_func(predict, target)
        logger.info('INSAMPLE score: %s' % score)

        predict = model.predict(test_data)
        predict = numpy.where(predict < 0, 0, predict)
        score = bimbo_score_func(predict, test_target)
        logger.info('score: %s' % score)

        # model.set_params(n_estimators=n_estimators)

        df = test_df
        data = test_data
        target = test_target

    with open('lasso_model_4.pkl', 'wb') as f:
        pickle.dump(model, f, -1)
Example #7
def test_enet_toy():
    # Test ElasticNet for various parameters of alpha and l1_ratio.
    # Actually, the parameter alpha = 0 should not be allowed. However,
    # we test it as a border case.
    # ElasticNet is tested with and without precomputed Gram matrix

    X = np.array([[-1.], [0.], [1.]])
    Y = [-1, 0, 1]       # just a straight line
    T = [[2.], [3.], [4.]]  # test sample

    # this should be the same as lasso
    clf = ElasticNet(alpha=1e-8, l1_ratio=1.0)
    clf.fit(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [1])
    assert_array_almost_equal(pred, [2, 3, 4])
    assert_almost_equal(clf.dual_gap_, 0)

    clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=100,
                     precompute=False)
    clf.fit(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [0.50819], decimal=3)
    assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3)
    assert_almost_equal(clf.dual_gap_, 0)

    clf.set_params(max_iter=100, precompute=True)
    clf.fit(X, Y)  # with Gram
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [0.50819], decimal=3)
    assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3)
    assert_almost_equal(clf.dual_gap_, 0)

    clf.set_params(max_iter=100, precompute=np.dot(X.T, X))
    clf.fit(X, Y)  # with Gram
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [0.50819], decimal=3)
    assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3)
    assert_almost_equal(clf.dual_gap_, 0)

    clf = ElasticNet(alpha=0.5, l1_ratio=0.5)
    clf.fit(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [0.45454], 3)
    assert_array_almost_equal(pred, [0.9090, 1.3636, 1.8181], 3)
    assert_almost_equal(clf.dual_gap_, 0)
Example #8
def init_model(model="", parameters=None):
    parameters = parameters or {}  # avoid a mutable default argument
    if model == "elastic_net":
        regressor = ElasticNet()
    elif model == "sgd_regressor":
        regressor = SGDRegressor()
    elif model == "ridge":
        regressor = Ridge()
    elif model == 'neural_network':
        return neural_network(parameters)
    else:
        regressor = ElasticNet()
    # get all available parameters
    available_params = set(regressor.get_params().keys()).intersection(
        set(parameters.keys()))
    params = {a_p: parameters[a_p] for a_p in available_params}
    regressor.set_params(**params)
    return regressor
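Because of the key-intersection step, callers can hand `init_model` one shared config dict and each estimator picks out only the keys it recognizes; a hypothetical call:

config = {'alpha': 0.05, 'l1_ratio': 0.3, 'hidden_layers': 2}  # 'hidden_layers' is silently dropped
reg = init_model("elastic_net", config)
print(reg.get_params()['alpha'], reg.get_params()['l1_ratio'])  # -> 0.05 0.3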
Example #9
def test_warm_start_convergence():
    X, y, _, _ = build_dataset()
    model = ElasticNet(alpha=1e-3, tol=1e-3).fit(X, y)
    n_iter_reference = model.n_iter_

    # This dataset is not trivial enough for the model to converge in one pass.
    assert n_iter_reference > 2

    # Check that n_iter_ is invariant to multiple calls to fit
    # when warm_start=False, all else being equal.
    model.fit(X, y)
    n_iter_cold_start = model.n_iter_
    assert n_iter_cold_start == n_iter_reference

    # Fit the same model again, using a warm start: the optimizer just performs
    # a single pass before checking that it has already converged
    model.set_params(warm_start=True)
    model.fit(X, y)
    n_iter_warm_start = model.n_iter_
    assert n_iter_warm_start == 1
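The same mechanism makes regularization paths cheap: refitting along a decreasing alpha grid with warm_start=True reuses the previous coefficients as the starting point. A sketch on synthetic data:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet

X_demo, y_demo = make_regression(n_samples=200, n_features=20, noise=1.0, random_state=0)
path_model = ElasticNet(warm_start=True, tol=1e-3)
for alpha in np.logspace(0, -4, 10):    # sweep from strong to weak regularization
    path_model.set_params(alpha=alpha)
    path_model.fit(X_demo, y_demo)      # each fit starts from the previous coef_
    print(alpha, path_model.n_iter_)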
Example #10
def roi_loop(X, y, params):
    alpha = params['alpha']
    l1_ratio = params['l1_ratio']
    results = pd.DataFrame(
        columns=['alpha', 'l1_ratio', 'edge', 'r2_test', 'r2_train'])
    row = 0
    for a in alpha:
        for l in l1_ratio:
            # note: 'normalize' was removed in scikit-learn 1.2; on current versions,
            # standardize X beforehand (e.g. with StandardScaler) instead
            elastic = ElasticNet(normalize=True, max_iter=params['max_iter'])
            elastic.set_params(alpha=a, l1_ratio=l)
            print('PARAMS alpha: {}, l1_ratio: {}'.format(a, l))
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.2,
                                                                random_state=0)
            elastic.fit(X_train, y_train)
            r2_test = r2_score(y_test, elastic.predict(X_test))
            r2_train = r2_score(y_train, elastic.predict(X_train))
            results.loc[row] = [a, l, params['y'], r2_test, r2_train]
            row += 1

    results.to_csv(params['name'])
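`roi_loop` expects a single params dict carrying both the search grid and output metadata; a hypothetical call (key names taken from the function body):

params = {
    'alpha': [0.01, 0.1, 1.0],
    'l1_ratio': [0.2, 0.5, 0.8],
    'max_iter': 5000,
    'y': 'roi_label',          # value written to the 'edge' column
    'name': 'roi_results.csv'  # output path for the results table
}
roi_loop(X, y, params)         # X, y assumed defined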
Example #11
class modelEN(base_model):
    '''
    ElasticNet Regressor model with best parameters
    '''
    def __init__(self, target_range):
        self.name = 'ElasticNet'
        self.postclip = True
        self.target_range = target_range
        self.fit_cols = None  # Better to use all features
        self.trained = False

        #self.alpha, self.l1_ratio = 0.404, 1.0 #CV search, but worsens RMSE...
        self.alpha, self.l1_ratio = 1.0, 0.5

        self.max_iter = 1000
        self.en_params = {
            'max_iter': self.max_iter,
            'random_state': 0,  # changing this could generate ensembling options
            'alpha': self.alpha,
            'l1_ratio': self.l1_ratio,
            'tol': 0.0001,
            'fit_intercept': True,
            # 'normalize' was removed in scikit-learn 1.2 and is omitted here
            'positive': False,
            'precompute': False,
            'selection': 'cyclic',
            'copy_X': True,
            'warm_start': False
        }

        self.model = ElasticNet()
        self.model = self.model.set_params(**self.en_params)

    def _raw_fit(self, X, y):
        self.model = self.model.fit(X.values, y)
        self.trained = True

    def _raw_predict(self, X):
        return self.model.predict(X.values)
Example #12
100 * percentnulldf[percentnulldf > 0].sort_values(ascending=False)


# note: 'normalize=True' was removed in scikit-learn 1.2; scale the features beforehand on current versions
elasticnet = ElasticNet(alpha=0.1, l1_ratio=0.5, normalize=True)
N_alpha = 100
N_rho   = 10
alphaRange = np.logspace(-10, 2, N_alpha)
rhoRange   = np.linspace(0.1,1, N_rho) # we avoid very small rho by starting at 0.1
scores     = np.zeros((N_rho, N_alpha))
prices = pd.Series(clean_train.SalePrice)
for alphaIdx, alpha in enumerate(alphaRange):
    for rhoIdx, rho in enumerate(rhoRange):
        elasticnet.set_params(alpha=alpha, l1_ratio=rho)
        elasticnet.fit(df_fin_features, prices)
        scores[rhoIdx, alphaIdx] = elasticnet.score(df_fin_features, prices)
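With the score grid filled in, the best (rho, alpha) pair can be read off directly; a short follow-up using the arrays above:

best_rho_idx, best_alpha_idx = np.unravel_index(scores.argmax(), scores.shape)
print('best l1_ratio:', rhoRange[best_rho_idx], 'best alpha:', alphaRange[best_alpha_idx])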


from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
Example #13
from sklearn.linear_model import Lasso
def elastic_net_regression(X_train,
                           X_test,
                           y_train,
                           y_test,
                           outputs=False,
                           plots=False):
    # storing the values of the coefficients and the corresponding value of mse for each value of lambda
    elastic_net = ElasticNet(max_iter=MAX_ITER)
    coefficients = []
    mse_training = []
    for lambda_value in LAMBDA_VALUES:
        elastic_net.set_params(alpha=lambda_value)
        elastic_net.fit(X_train, y_train)
        # storing the values for each value of lambda for the plots
        coefficients.append(elastic_net.coef_)
        mse_training.append(
            mean_squared_error(y_train, elastic_net.predict(X_train)))

    # define the model evaluation method
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

    # using cross validation to determine best value for lambda
    cv_elastic_net = ElasticNetCV(alphas=LAMBDA_VALUES, cv=cv)
    cv_elastic_net.fit(X_train, y_train)
    # extracting the optimal value of lambda corresponding to the min(MSE)
    optimal_lambda = cv_elastic_net.alpha_
    optimal_l1_ratio = cv_elastic_net.l1_ratio_

    # fitting the elastic net model with the optimal value of lambda found via cross-validation
    elastic_net_model = ElasticNet(alpha=optimal_lambda,
                                   l1_ratio=optimal_l1_ratio).fit(
                                       X_train, y_train)

    # predicting y values of the training set
    y_train_predicted = elastic_net_model.predict(X_train)
    train_set_mse = mean_squared_error(y_train, y_train_predicted)
    train_set_score = elastic_net_model.score(X_train, y_train)

    # predicting y values of the test set
    y_test_predicted = elastic_net_model.predict(X_test)
    test_set_mse = mean_squared_error(y_test, y_test_predicted)
    test_set_score = elastic_net_model.score(X_test, y_test)

    if outputs:
        # output and analysis
        print('Elastic Net regression coefficients:',
              np.round(elastic_net_model.coef_, 4))
        print('Optimal lambda:', round(optimal_lambda, 4))
        print('Training set: MSE:', round(train_set_mse, 4), ', R2:',
              round(train_set_score, 4))
        print('Test set: MSE:', round(test_set_mse, 4), ', R2:',
              round(test_set_score, 4))

    if plots:
        # plot that visualizes how the coefficients are shrunk
        ax = plt.gca()
        ax.plot(np.log(LAMBDA_VALUES), coefficients)
        plt.vlines(x=np.log(optimal_lambda),
                   ymin=np.min(coefficients),
                   ymax=np.max(coefficients),
                   linestyles='dashed',
                   color='black')
        plt.axis('tight')
        plt.xlabel('log(λ)')
        plt.ylabel('Coefficients')
        plt.title('Elastic Net parameters shrinkage')
        plt.show()

        # plot for optimal value of lambda obtained with cross validation
        plt.plot(np.log(LAMBDA_VALUES), mse_training)
        plt.vlines(x=np.log(optimal_lambda),
                   ymin=np.min(mse_training),
                   ymax=np.max(mse_training),
                   linestyles='dashed',
                   color='black')
        plt.axis('tight')
        plt.xlabel('log(λ)')
        plt.ylabel('MSE')
        plt.title("Elastic Net optimal value of λ using cross-validation")
        plt.show()

    return train_set_mse, test_set_mse
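A hypothetical driver for `elastic_net_regression`, assuming the module-level constants `MAX_ITER` and `LAMBDA_VALUES` live in the same module as the function:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

MAX_ITER = 10000
LAMBDA_VALUES = np.logspace(-4, 1, 50)

X_all, y_all = make_regression(n_samples=300, n_features=10, noise=10.0, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.25, random_state=1)
train_mse, test_mse = elastic_net_regression(X_tr, X_te, y_tr, y_te, outputs=True)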
Example #14
X = df.iloc[:, 0:8].values.tolist()
y = df.iloc[:, 8].values.tolist()

# Split train and test data
X_train, X_test = X[:n_samples_train], X[n_samples_train:]
y_train, y_test = y[:n_samples_train], y[n_samples_train:]

###############################################################################
# Compute train and test errors
alphas = np.logspace(-5, 1, 60)
enet = ElasticNet(l1_ratio=1)
train_errors = list()
test_errors = list()
for alpha in alphas:
    enet.set_params(alpha=alpha)
    enet.fit(X_train, y_train)
    train_errors.append(enet.score(X_train, y_train))
    test_errors.append(enet.score(X_test, y_test))

i_alpha_optim = np.argmax(test_errors)
alpha_optim = alphas[i_alpha_optim]
print("Optimal regularization parameter : %s" % alpha_optim)

# Estimate the coef_ on full data with optimal regularization parameter
enet.set_params(alpha=alpha_optim)
enet.fit(X, y)  # refit on the full dataset before persisting the model

joblib.dump(enet, "train_model.m")
###############################################################################
# Plot results functions
Example #15
class linReg:
    def __init__(self, in_df):
        df = self.__imputeVals(in_df.copy())
        self.X = df.drop(columns=["SalePrice"]).copy()
        self.y = np.log(df.SalePrice.values.reshape(-1, 1))

        self._gridSearch = None
        self.pipeline_X = self.__make_pipe()
        #self.pipeline_y = StandardScaler()
        self.pipeline_y = PowerTransformer()
        self._searchSpace = None
        self._params = None
        self.lm = ElasticNet()

    def __imputeVals(self, in_df):
        return imputeVals(in_df)

    def __make_pipe(self):
        nonePipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value="None"), OneHotEncoder(drop="first"))
        zeroPipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value=0), OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value=0), PowerTransformer())

        regressionPipeline = ColumnTransformer([
            ("setNone", nonePipeline, fillNone),
            ("setZero", zeroPipeline, fillZeroCat),
            ("transformed", scalePipeline, fillZeroCont),
            ("dictImputed", make_pipeline(dictImputer(imputeDict),
                                          OneHotEncoder(drop="first")), list(imputeDict.keys())),
            ("bool", "passthrough", imputeBool),
            ("categoricalInts", "passthrough", cat_to_int),
            ("dropped", "drop", dropList)
        ], remainder="drop")
        return regressionPipeline

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        self._searchSpace = params
        #self._params = None

        piped_X = self.pipeline_X.fit_transform(self.X)
        piped_y = self.pipeline_y.fit_transform(self.y)
        self._gridSearch = GridSearchCV(
            self.lm, params, cv=cv, scoring="neg_mean_squared_error", n_jobs=njobs, verbose=verbose)
        self._gridSearch.fit(piped_X, piped_y)

    def getBestParams(self):
        if self._gridSearch is not None:
            return self._gridSearch.best_params_
        else:
            raise ValueError("gridSearch() has not been run yet")

    def getBestScore(self):
        if self._gridSearch is not None:
            return self._gridSearch.best_score_
        else:
            raise ValueError("gridSearch() has not been run yet")

    def fitModel(self, params):
        piped_X = self.pipeline_X.fit_transform(self.X)
        piped_y = self.pipeline_y.fit_transform(self.y)
        self._params = params

        self.lm.set_params(**params)
        self.lm.fit(piped_X, piped_y)

    def __invert(self, y):
        return np.exp(self.pipeline_y.inverse_transform(y))

    def getTrainScore(self):
        piped_X = self.pipeline_X.transform(self.X)
        piped_y = self.pipeline_y.transform(self.y)
        return self.lm.score(piped_X, piped_y)

    # Root Mean Square Log Error
    def getRMSLE(self):
        piped_X = self.pipeline_X.transform(self.X)
        preds = self.lm.predict(piped_X).reshape(-1, 1)
        preds = self.pipeline_y.inverse_transform(preds)
        # self.y already holds log(SalePrice), so the MSE of (self.y, preds) is the mean squared log error
        return np.sqrt(mean_squared_error(self.y, preds))

    def predict(self, test_X):
        piped_X = self.pipeline_X.transform(self.__imputeVals(test_X))
        preds = self.lm.predict(piped_X).reshape(-1,1)
        return self.__invert(preds)
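End-to-end, the class is driven like this; a sketch in which `train` and `test` are hypothetical DataFrames with a SalePrice column, and `imputeVals` plus the column lists are importable as in the original module:

lr = linReg(train)
lr.gridSearch({'alpha': np.logspace(-4, 0, 20), 'l1_ratio': [0.1, 0.5, 0.9]})
print(lr.getBestParams(), lr.getBestScore())

lr.fitModel(lr.getBestParams())
print('train R^2:', lr.getTrainScore(), 'RMSLE:', lr.getRMSLE())
sale_price_preds = lr.predict(test)  # back on the original price scale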
Example #16
class Victor():
    def __init__(self, bcupath='backup', cv=3, max_evals=20):
        """Initialize data"""
        subfolder = basedir.split('/')[:-1]
        self.bcupath = os.path.join('/', *subfolder, bcupath)

        try:
            self.load_model()
        except Exception:
            self.scaler = StandardScaler()
            self.pca = PCA()
            self.estimator = ElasticNet(random_state=0)
            self.rmsecv = 1e20
            self.trained = False
            self.cv = cv
            self.columns = ['wl_{}'.format(x) for x in range(950, 1530 + 1, 2)]
            self.max_evals = max_evals
            self.lmbda = 0

    def load_model(self):
        """Load the trained models"""
        with open(os.path.join(self.bcupath, 'misc-estimator.pkl'),
                  'rb') as handle:
            saved = pickle.load(handle)

        self.rmsecv = saved['rmsecv']
        self.trained = saved['trained']
        self.cv = saved['cv']
        self.columns = saved['columns']
        self.lmbda = saved['lmbda']
        self.max_evals = saved['max_evals']

        self.scaler = joblib.load(os.path.join(self.bcupath, 'scaler.pkl'))
        self.pca = joblib.load(os.path.join(self.bcupath, 'pca.pkl'))
        self.estimator = joblib.load(
            os.path.join(self.bcupath, 'estimator.pkl'))
        #print('loaded :', self.scaler.mean_)

    def save_model(self):
        """Save the trained model"""
        tosave = {
            'rmsecv': self.rmsecv,
            'trained': self.trained,
            'cv': self.cv,
            'columns': self.columns,
            'lmbda': self.lmbda,
            'max_evals': self.max_evals,
        }

        with open(os.path.join(self.bcupath, 'misc-estimator.pkl'),
                  'wb') as handle:
            pickle.dump(tosave, handle, protocol=pickle.HIGHEST_PROTOCOL)

        #print('saved : ', self.scaler.mean_)
        joblib.dump(self.scaler, os.path.join(self.bcupath, 'scaler.pkl'))
        joblib.dump(self.pca, os.path.join(self.bcupath, 'pca.pkl'))
        joblib.dump(self.estimator, os.path.join(self.bcupath,
                                                 'estimator.pkl'))

    def fit(self, dataset):
        """fit the model"""

        X = dataset[self.columns]
        y = dataset['target']

        ybc, self.lmbda = stats.boxcox(y)

        ## HyperOpt features
        def objective(params):
            hyperparams = {
                'alpha': params['alpha'],
                'l1_ratio': params['l1_ratio'],
                'random_state': 0,
            }

            elnet = ElasticNet(**hyperparams)
            scaler = StandardScaler()
            pca = PCA(random_state=0)

            Xscaled = scaler.fit_transform(X)
            Xpca = pca.fit_transform(Xscaled)

            preds = cross_val_predict(elnet, Xpca, ybc, cv=self.cv, n_jobs=-2)
            score = mean_squared_error(inv_boxcox(preds, self.lmbda), y)

            return score

        space = {
            'alpha': hp.loguniform('alpha', -10, 2),
            'l1_ratio': hp.loguniform('l1_ratio', -20, 0),
        }

        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=self.max_evals)

        ## Hyperopt best results and training
        params = {
            'alpha': best['alpha'],
            'l1_ratio': best['l1_ratio'],
            'random_state': 0,
        }

        print(params)

        self.estimator.set_params(**params)
        Xscaled = self.scaler.fit_transform(X)
        Xpca = self.pca.fit_transform(Xscaled)
        self.estimator.fit(Xpca, ybc)

        ## Performance measurement
        self.trained = True
        preds = cross_val_predict(self.estimator,
                                  Xpca,
                                  ybc,
                                  cv=self.cv,
                                  n_jobs=-2)
        self.rmsecv = mean_squared_error(inv_boxcox(preds, self.lmbda), y)**.5

        ## save the model
        self.save_model()

    def predict(self, dataset):
        """Predict from X"""
        X = dataset[self.columns]
        Xscaled = self.scaler.transform(X)
        Xpca = self.pca.transform(Xscaled)
        predsbc = self.estimator.predict(Xpca)
        return inv_boxcox(predsbc, self.lmbda)
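A hypothetical round trip with Victor, assuming the module's `basedir` global points at an existing backup directory; the dataset needs the wl_950 … wl_1530 spectral columns plus a target column, per the constructor:

import numpy as np
import pandas as pd

cols = ['wl_{}'.format(x) for x in range(950, 1530 + 1, 2)]
rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.random((50, len(cols))), columns=cols)
demo['target'] = rng.random(50) + 1.0  # boxcox requires strictly positive targets

v = Victor(max_evals=5)
v.fit(demo)                 # hyperopt search, refit, and save
print(v.predict(demo)[:5])  # predictions back on the original target scale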
Example #17
lasso = make_pipeline(
    RobustScaler(), Lasso(alpha=gs_lasso.best_params_['alpha'],
                          random_state=1))
lasso.fit(x_train, y_train)
score_lasso = lasso.score(x_train, y_train)
rmse_lasso = np.sqrt(mean_squared_error(y_train, lasso.predict(x_train)))
lasso_pred = np.expm1(lasso.predict(x_test))

# ElasticNet
ElNet = ElasticNet()
para_ElNet = {
    "alpha": np.logspace(-3.8, -3.3, 10),
    "l1_ratio": np.linspace(0.7, 0.9, 10)
}
ElNet.set_params(max_iter=5000)
gs_ElNet = GridSearchCV(ElNet,
                        para_ElNet,
                        cv=10,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1)
gs_ElNet.fit(x_train, y_train)
gs_ElNet.best_params_
#{'alpha': 0.0003880510732210184, 'l1_ratio': 0.9}

ElNet.set_params(alpha=gs_ElNet.best_params_['alpha'],
                 l1_ratio=gs_ElNet.best_params_['l1_ratio'])
ElNet.fit(x_train, y_train)
score_ElNet = ElNet.score(x_train, y_train)
rmse_ElNet = np.sqrt(mean_squared_error(y_train, ElNet.predict(x_train)))
ElNet_pred = np.expm1(ElNet.predict(x_test))
Example #18
plt.plot(m_log_alphas, np.log10(model.mse_path_), ':')
plt.plot(m_log_alphas, np.log10(model.mse_path_.mean(axis=-1)), 'k', label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',label='alpha: CV estimate')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent')
plt.axis('tight')
plt.show()

# note: 'normalize=True' was removed in scikit-learn 1.2; scale X beforehand on current versions
elastic = ElasticNet(max_iter=10000, normalize=True, positive=True)
coefs = []

for a in alphas:
    elastic.set_params(alpha=a)
    elastic.fit(scale(X_train), y_train)
    coefs.append(elastic.coef_)
 
ax = plt.gca()
ax.plot(alphas*2, coefs)
ax.set_xscale('log')

plt.xlabel('Alpha')
plt.ylabel('Coefficients')
plt.axvline(model.alpha_, linestyle='--', color='k',label='alpha: CV estimate')
plt.title('Optimal Alpha Parameters')
plt.show()
en.set_params(alpha=model.alpha_)  # ElasticNet's parameter is 'alpha' (singular)
en.fit(X_train, y_train)
mean_squared_error(y_test, en.predict(X_test))
Example #19
m_log_alphas = -np.log10(model.alphas_)

plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha: CV estimate')
plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.show()

enet = ElasticNet()  # set_params/fit must be called on an instance, not on the class itself
coefs = []
for a in alphas:
    enet.set_params(alpha=a)
    enet.fit(scale(X_train), y_train)
    coefs.append(enet.coef_)
 

label_list=['Na Library', 'Cl Library', 'Water Library', 'Fe Library', 'Cu Library'] 
ax = plt.gca()
lineObjects = ax.plot(alphas, coefs)
ax.set_xscale('log')
plt.ticklabel_format(axis='y', style='sci', scilimits=(0,0))
plt.axvline(en.alpha_, linestyle='--', color='k',label='alpha: CV estimate')
plt.locator_params(axis='y', nbins=10)
plt.xlabel('Alpha')
Example #20
# model.add(Dropout(0.2))
# model.add(LSTM(input_shape=(None,50),units=50,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(input_shape=(None,50),units=50,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(50,return_sequences=False))
# model.add(Dropout(0.2))
# model.add(Dense(units=1))
# model.add(Activation("linear"))
# start = time.time()
# model.compile(loss="mse", optimizer="rmsprop")
# print("Time : ", time.time() - start)
# #print(model.layers)

# model.fit(X_train,y_train,batch_size=50,epochs=10,validation_split=0.05)

# note: 'normalize=True' was removed in scikit-learn 1.2; standardize X beforehand on current versions
model = ElasticNet(l1_ratio=0.5, normalize=True, max_iter=15000)
model.set_params(alpha=0.001)
model.fit(X_train, y_train)

y_test = model.predict(X_test)
prediction_result = y_test
speed_id = [x for x in range(len(y_test))]
result = pd.DataFrame({'id': speed_id, 'speed': prediction_result})
result.to_csv('submission.csv', index=False)
print(y_test)
Example #21
class ElasticReg(customRegressor):
    def __init__(self, in_df, zoning, utilities, frontage, qualPow):

        super(ElasticReg, self).__init__()
        from lm_features import impute_shell
        ## Because we're currying in python now
        self._imputeVals = impute_shell(frontage=frontage,
                                        zoning=zoning,
                                        utilities=utilities,
                                        qualPow=qualPow)
        tempDF = self._imputeVals(in_df.copy())
        self.X = tempDF.drop(columns=["SalePrice"]).copy()
        self.y = np.log(tempDF.SalePrice.values.reshape(-1, 1))

        self.pipeline_X = self._make_pipe()
        self.pipeline_X.fit(self.X)
        self.pipeline_y = StandardScaler()
        self.pipeline_y.fit(self.y)

    def _rmOutliers(self, x, y):
        outliers = ((y > 4000) & (y < 5E5))
        out = x[~(outliers)]

        return out

    def _make_pipe(self):
        import lm_features as f
        nonePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value="None"),
            OneHotEncoder(drop="first"))
        zeroPipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            PowerTransformer())

        regressionPipeline = ColumnTransformer(
            [("setNone", nonePipeline, f.fillNone),
             ("setZero", zeroPipeline, f.fillZeroCat),
             ("transformed", scalePipeline, f.fillZeroCont),
             ("dictImputed",
              make_pipeline(
                  self.dictImputer(f.imputeDict),
                  OneHotEncoder(drop="first")), list(f.imputeDict.keys())),
             ("bool", "passthrough", f.imputeBool),
             ("categoricalInts", "passthrough", f.cat_to_int),
             ("dropped", "drop", f.dropList)],
            remainder="drop")
        return make_pipeline(regressionPipeline, RobustScaler())

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        self._searchSpace = params

        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)

        self._gridSearchObject = GridSearchCV(ElasticNet(),
                                              params,
                                              cv=cv,
                                              scoring="neg_mean_squared_error",
                                              n_jobs=njobs,
                                              verbose=verbose)
        self._gridSearchObject.fit(piped_X, piped_y)

    def fitModel(self, params):
        self.model = ElasticNet()
        self._params = params

        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)

        self.model.set_params(**params)
        self.model.fit(piped_X, piped_y)

    def getTrainRsquared(self):
        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)
        return self.model.score(piped_X, piped_y)
Example #22
print(u_test, std_test)
print(u_val, std_val)

# best alphas and l1
best_alphas = []
best_l1s = []
a_lasso = []
b_ridge = []
for p in best_params:
    best_alphas.append(p['alpha'])
    best_l1s.append(p['l1_ratio'])
    a_lasso.append(p['alpha'] * p['l1_ratio'])
    b_ridge.append(p['alpha'] - p['alpha'] * p['l1_ratio'])
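The a_lasso / b_ridge bookkeeping above follows from scikit-learn's ElasticNet objective, whose penalty splits into an L1 and an L2 part:

# penalty(w) = alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||_2^2
# hence a_lasso = alpha * l1_ratio and b_ridge = alpha - alpha * l1_ratio = alpha * (1 - l1_ratio)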

# make two submissions to compare
# model 1: alpha = 0.09102981779915217, l1 = 0.035448275862068966
# equivalent to a_lasso = 0.003226850093018222, b_ridge = 0.08780296770613395

# model 2: alpha = 0.005428675439323859, l1 = 0.8277586206896551,
# equivalent to a_lasso = 0.004493632893826525, b_ridge = 0.0009350425454973344
enet.set_params(alpha=best_alphas[2], l1_ratio=best_l1s[2])
enet.fit(X, y)
cols = Xd_train.columns
coefs = sorted(list(zip(cols, enet.coef_)),
               key=lambda t: abs(t[1]),
               reverse=True)
coefs = pd.DataFrame(coefs, columns=['Feature', 'Coef'])
print(len(coefs[np.abs(coefs['Coef']) > 0]))
submissiondata = make_prediction(enet, scaler)
submissiondata.to_csv("yq_submission8_enet2.csv", index=False)