import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -0.9491441900056168
exported_pipeline = make_pipeline(
    SelectFromModel(
        estimator=ExtraTreesRegressor(max_features=0.8500000000000001,
                                      n_estimators=100),
        threshold=0.0), MaxAbsScaler(), LassoLarsCV(normalize=False))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
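# For reference, a simplified sketch of what set_param_recursive (used above)
# does: walk every pipeline step and pin the given parameter on any estimator
# that exposes it, including estimators nested inside meta-estimators such as
# SelectFromModel. This is an illustration, not tpot's exact source.
def set_param_recursive_sketch(pipeline_steps, parameter, value):
    for _, obj in pipeline_steps:
        for attr in ('steps', 'transformer_list', 'estimators'):
            if hasattr(obj, attr):  # recurse into Pipeline/FeatureUnion/ensembles
                set_param_recursive_sketch(getattr(obj, attr), parameter, value)
        if hasattr(obj, 'estimator') and hasattr(obj.estimator, parameter):
            setattr(obj.estimator, parameter, value)  # e.g. SelectFromModel's inner model
        if hasattr(obj, parameter):
            setattr(obj, parameter, value)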
    vec = TfidfVectorizer()
    clf.fit(vec.fit_transform(docs), y)
    expl = explain_prediction(clf, docs[0], vec=vec)
    assert 'supported' in expl.error


@pytest.mark.parametrize(['reg'], [
    [ElasticNet(random_state=42)],
    [ElasticNetCV(random_state=42)],
    [HuberRegressor()],
    [Lars()],
    [LarsCV(max_n_alphas=10)],
    [Lasso(random_state=42)],
    [LassoCV(n_alphas=10)],
    [LassoLars(alpha=0.1)],
    [LassoLarsCV(max_n_alphas=10)],
    [LassoLarsIC()],
    [LinearRegression()],
    [LinearRegression(fit_intercept=False)],
    [LinearSVR(random_state=42)],
    [OrthogonalMatchingPursuit(n_nonzero_coefs=10)],
    [OrthogonalMatchingPursuitCV()],
    [PassiveAggressiveRegressor(C=0.1)],
    [Ridge(random_state=42)],
    [RidgeCV()],
    [SGDRegressor(**SGD_KWARGS)],
    [TheilSenRegressor()],
    [SVR(kernel='linear')],
    [NuSVR(kernel='linear')],
])
def test_explain_linear_regression(boston_train, reg):
    ...  # body not included in this snippet
Example #3
def set_learning_method(config, X_train, y_train):
    """
    Instantiates the sklearn's class corresponding to the value set in the 
    configuration file for running the learning method.
    
    TODO: use reflection to instantiate the classes
    
    @param config: configuration object
    @return: an estimator with fit() and predict() methods
    """
    estimator = None
    scorers = None

    learning_cfg = config.get("learning", None)
    if learning_cfg:
        p = learning_cfg.get("parameters", None)
        o = learning_cfg.get("optimize", None)
        scorers = set_scorer_functions(learning_cfg.get("scorer", ['mae', 'rmse']))

        method_name = learning_cfg.get("method", None)  # get the method name
        if method_name == "SVR":
            if o:
                tune_params = set_optimization_params(o)
                print(tune_params)
                estimator = optimize_model(SVR(), X_train,
                                           y_train, tune_params, scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))

            elif p:
                estimator = SVR(C=p.get("C", 10),
                                epsilon=p.get('epsilon', 0.01),
                                kernel=p.get('kernel', 'rbf'),
                                degree=p.get('degree', 3),
                                gamma=p.get('gamma', 0.0034),
                                tol=p.get('tol', 1e-3),
                                verbose=False)
            else:
                estimator = SVR()

        elif method_name == "RF":  #RandomForest
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(RandomForestRegressor(), X_train,
                                           y_train, tune_params, scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))
            elif p:
                pass
            else:
                estimator = RandomForestRegressor(n_estimators=200, n_jobs=-1)

        elif method_name == "GB":  #Gradient Boosting
            if o:
                pass
            elif p:
                pass
            else:
                estimator = GradientBoostingRegressor()

        elif method_name == "GP":  #GaussianProcess
            if o:
                pass
            elif p:
                pass
            else:
                estimator = GaussianProcessRegressor()

        elif method_name == "MLP":  #MLP
            if o:
                pass
            elif p:
                pass
            else:
                estimator = MLPRegressor()

        elif method_name == "Lasso":
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(Lasso(), X_train,
                                           y_train, tune_params, scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))
            elif p:
                estimator = Lasso(alpha=p.get('alpha', 1.0))
            else:
                estimator = Lasso()

        elif method_name == "SVC":
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(SVC(), X_train,
                                           y_train, tune_params, scorers,
                                           o.get('cv', 5),
                                           o.get('verbose', True),
                                           o.get('n_jobs', 1))

            elif p:
                estimator = SVC(C=p.get('C', 1.0),
                                kernel=p.get('kernel', 'rbf'),
                                degree=p.get('degree', 3),
                                gamma=p.get('gamma', 0.0),
                                coef0=p.get('coef0', 0.0),
                                tol=p.get('tol', 1e-3),
                                verbose=p.get('verbose', False))
            else:
                estimator = SVC()

        elif method_name == "LassoCV":
            if p:
                estimator = LassoCV(eps=p.get('eps', 1e-3),
                                    n_alphas=p.get('n_alphas', 100),
                                    normalize=p.get('normalize', False),
                                    precompute=p.get('precompute', 'auto'),
                                    max_iter=p.get('max_iter', 1000),
                                    tol=p.get('tol', 1e-4),
                                    cv=p.get('cv', 10),
                                    verbose=False)
            else:
                estimator = LassoCV()

        elif method_name == "LassoLars":
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(LassoLars(), X_train,
                                           y_train, tune_params, scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))

            elif p:
                estimator = LassoLars(alpha=p.get('alpha', 1.0),
                                      fit_intercept=p.get(
                                          'fit_intercept', True),
                                      verbose=p.get('verbose', False),
                                      normalize=p.get('normalize', True),
                                      max_iter=p.get('max_iter', 500),
                                      fit_path=p.get('fit_path', True))
            else:
                estimator = LassoLars()

        elif method_name == "LassoLarsCV":
            if p:
                estimator = LassoLarsCV(max_iter=p.get('max_iter', 500),
                                        normalize=p.get('normalize', True),
                                        max_n_alphas=p.get(
                                            'max_n_alphas', 1000),
                                        n_jobs=p.get('n_jobs', 1),
                                        cv=p.get('cv', 10),
                                        verbose=False)
            else:
                estimator = LassoLarsCV()

    return estimator, scorers
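# A minimal sketch of the reflection idea from the TODO in the docstring
# above: resolve the estimator class by name instead of hard-coding one
# branch per method. The module list and the flat parameter dict are
# assumptions for illustration; abbreviations such as "RF" or "GB" would
# still need an alias table mapping them to class names.
import importlib

def instantiate_by_name(method_name, params=None):
    for module_name in ("sklearn.svm", "sklearn.linear_model",
                        "sklearn.ensemble", "sklearn.neural_network",
                        "sklearn.gaussian_process"):
        module = importlib.import_module(module_name)
        cls = getattr(module, method_name, None)
        if cls is not None:
            return cls(**(params or {}))
    raise ValueError("Unknown learning method: %s" % method_name)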
def get_regression_scores(X_train, X_test, Y_train, Y_test):
    pipelines = []
    pipelines.append(('ScaledLR',
                      Pipeline([('Scaler', StandardScaler()),
                                ('LR', LinearRegression())])))
    pipelines.append(('ScaledRIDGE',
                      Pipeline([('Scaler', StandardScaler()),
                                ('RIDGE', Ridge())])))
    pipelines.append(('ScaledLASSO',
                      Pipeline([('Scaler', StandardScaler()),
                                ('LASSO', Lasso())])))
    pipelines.append(('ScaledLASSOCV',
                      Pipeline([('Scaler', StandardScaler()),
                                ('LASSOCV', LassoCV())])))
    pipelines.append(('ScaledLASSOLarsCV',
                      Pipeline([('Scaler', StandardScaler()),
                                ('LASSOLarsCV', LassoLarsCV())])))
    pipelines.append(('ScaledEN',
                      Pipeline([('Scaler', StandardScaler()),
                                ('EN', ElasticNet())])))
    pipelines.append(('ScaledBAYESIAN',
                      Pipeline([('Scaler', StandardScaler()),
                                ('BAYESIAN', BayesianRidge())])))
    pipelines.append(('ScaledKNN',
                      Pipeline([('Scaler', StandardScaler()),
                                ('KNN', KNeighborsRegressor())])))
    pipelines.append(('ScaledCART',
                      Pipeline([('Scaler', StandardScaler()),
                                ('CART', DecisionTreeRegressor())])))
    pipelines.append(('ScaledGBM',
                      Pipeline([('Scaler', StandardScaler()),
                                ('GBM', GradientBoostingRegressor())])))

    results = []
    names = []

    for name, model in pipelines:
        ts = time.time()
        kfold = KFold(n_splits=10, shuffle=True, random_state=21)
        cv_results = cross_val_score(model,
                                     X_train,
                                     Y_train,
                                     cv=kfold,
                                     scoring='neg_mean_squared_error')
        cv_results_abs = cross_val_score(model,
                                         X_train,
                                         Y_train,
                                         cv=kfold,
                                         scoring='neg_mean_absolute_error')
        # cv_results_sq_log = cross_val_score(model, X_train, Y_train, cv = kfold, scoring = 'neg_mean_squared_log_error')
        cv_results_median_abs = cross_val_score(
            model,
            X_train,
            Y_train,
            cv=kfold,
            scoring='neg_median_absolute_error')
        cv_r2 = cross_val_score(model,
                                X_train,
                                Y_train,
                                cv=kfold,
                                scoring='r2')
        cv_explained_variance = cross_val_score(model,
                                                X_train,
                                                Y_train,
                                                cv=kfold,
                                                scoring='explained_variance')
        ts_2 = time.time()
        results.append(cv_results)
        names.append(name)
        msg = "%f (%f)" % (cv_results.mean(), cv_results.std())
        msg_abs = "%f (%f)" % (cv_results_abs.mean(), cv_results_abs.std())
        # # msg_sq_log = "%f (%f)" % (cv_results_sq_log.mean(), cv_results_sq_log.std())
        msg_median_abs = "%f (%f)" % (cv_results_median_abs.mean(),
                                      cv_results_median_abs.std())
        msg_r2 = "%f (%f)" % (cv_r2.mean(), cv_r2.std())
        msg_explained_variance = "%f (%f)" % (cv_explained_variance.mean(),
                                              cv_explained_variance.std())
        print(name)
        print(msg_explained_variance)
        print(msg_abs)
        print(msg)
        # print(msg_sq_log)
        print(msg_median_abs)
        print(msg_r2)
        print("%f" % (ts_2 - ts))
        print('\n')
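# Hypothetical usage of get_regression_scores above; the CSV path placeholder,
# the 'target' column name and the 80/20 split are assumptions for illustration.
if __name__ == '__main__':
    import pandas as pd
    from sklearn.model_selection import train_test_split
    data = pd.read_csv('PATH/TO/DATA/FILE')
    X, Y = data.drop('target', axis=1), data['target']
    get_regression_scores(*train_test_split(X, Y, test_size=0.2))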
Example #5
predictors['DMARRIED0'] = preprocessing.scale(
    predictors['DMARRIED0'].astype('float64'))
predictors['DMARRIED1'] = preprocessing.scale(
    predictors['DMARRIED1'].astype('float64'))
predictors['DUNCOV0'] = preprocessing.scale(
    predictors['DUNCOV0'].astype('float64'))
predictors['DUNCOV1'] = preprocessing.scale(
    predictors['DUNCOV1'].astype('float64'))

# split data into train and test sets
pred_train, pred_test, resp_train, resp_test = train_test_split(
    predictors, target, test_size=.3, random_state=123)

# specify the lasso regression model
# precompute=True helpful for large data sets
model = LassoLarsCV(cv=10, precompute=True).fit(pred_train, resp_train)

# print variable names and regression coefficients
print(dict(zip(predictors.columns, model.coef_)))

# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()  # set up axes
plt.plot(m_log_alphas, model.coef_path_.T)  # alpha on x axis, change in regression coefficients on y axis
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
            color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.legend()
plt.show()
Example #6
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was: -109.53604510235976
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True,
                                                        max_features=0.3,
                                                        min_samples_leaf=11,
                                                        min_samples_split=18,
                                                        n_estimators=100)),
        make_pipeline(
            make_union(
                make_union(
                    make_union(
                        StackingEstimator(estimator=make_pipeline(
                            StandardScaler(),
                            SelectPercentile(score_func=f_regression,
                                             percentile=20), MaxAbsScaler(),
                            RidgeCV())), FunctionTransformer(copy)),
                    FunctionTransformer(copy)), StandardScaler()),
            MaxAbsScaler(), StackingEstimator(estimator=RidgeCV()),
            PolynomialFeatures(degree=2,
                               include_bias=False,
                               interaction_only=False))),
    LassoLarsCV(normalize=False))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
    ivars = []  # projection features for the sample player-years
    ivars2 = []
    depvars = []
    columns = []

    for pyear in player_years:
        ivars.append([pt_projs[pyear][system] for system in proj_systems])
        depvars.append(pt_actuals[pyear]['actual'])

    for pyear in pt_projs_curr.keys():
        ivars2.append(
            [pt_projs_curr[pyear][system] for system in proj_systems])

    x = numpy.array(ivars)
    x2 = numpy.array(ivars2)
    y = numpy.array(depvars)
    model_pt = LassoLarsCV(cv=cv_num)
    model_pt.fit(x, y)

    print("Rough PT model, to choose sample")
    for system, coef in zip(proj_systems, model_pt.coef_):
        print("%40s : %f" % (system, coef))
    print("%40s : %f" % ('intercept', model_pt.intercept_))

    sample_proj_pt_arr = model_pt.predict(x)

    curr_proj_pt_arr = model_pt.predict(x2)

    sample_proj_pt = dict(zip(player_years, sample_proj_pt_arr))
    curr_proj_pt = dict(zip(pt_projs_curr.keys(), curr_proj_pt_arr))

    models = {}
Example #8
def _build_distance_estimator(X, y, w2v, PoS, NER, regressor, verbose=1):
    """Build a vector reprensation of a pair of signatures."""
    if w2v == 'glove':
        PairVecTransformer = PairGloveTransformer
    elif w2v == 'spacy':
        PairVecTransformer = PairSpacyVecTransformer
    elif w2v == 'polyglot':
        PairVecTransformer = PairPolyglotVecTransformer
    else:
        raise ValueError('error passing w2v argument value')

    if PoS == 'polyglot':
        get_nouns = polyglot_nouns
        get_verbs = polyglot_verbs
        get_words = polyglot_words
        get_particle = polyglot_particle
        get_interjection = polyglot_interjection
        get_symbol = polyglot_symbol
        get_numbers = polyglot_numbers
        get_proper_nouns = polyglot_proper_nouns
        get_pronouns = polyglot_pronouns
        get_auxiliary_verbs = polyglot_auxiliary_verbs
        get_adjectives = polyglot_adjectives
        get_adverbs = polyglot_adverbs
        get_punctuation = polyglot_punctuation
        get_determiner = polyglot_determiner
        get_coordinating_conjunction = polyglot_coordinating_conjunction
        get_adpositions = polyglot_adpositions
        get_others = polyglot_others
        get_subordinating_conjunctions = polyglot_subordinating_conjunctions
    elif PoS == 'spacy':
        get_nouns = spacy_noun
        get_verbs = spacy_verb
        get_words = spacy_tokens
        get_particle = spacy_part
        get_interjection = spacy_intj
        get_symbol = spacy_sym
        get_numbers = spacy_num
        get_proper_nouns = spacy_propn
        get_pronouns = spacy_pron
        get_auxiliary_verbs = spacy_aux
        get_adjectives = spacy_adj
        get_adverbs = spacy_adv
        get_punctuation = spacy_punct
        get_determiner = spacy_det
        get_coordinating_conjunction = spacy_conj
        get_adpositions = spacy_adp
        get_others = spacy_x
        get_subordinating_conjunctions = spacy_sconj
    else:
        raise ValueError('error passing PoS argument value')

    transformer = FeatureUnion([
        ("get_nouns",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_nouns),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_verbs",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_verbs),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_words",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_words),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_particle",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_particle),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_interjection",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_interjection),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_symbol",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_symbol),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("num_diff",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=Pipeline([
                  ("rsn", FuncTransformer(func=replace_spelled_numbers)),
                  ("get_num", FuncTransformer(func=get_numbers)),
                  ("to_num", FuncTransformer(func=to_numeric)),
              ]),
                              groupby=None)),
             ('1st_nm_comb', NumCombiner()),
         ])),
        ("get_proper_nouns",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_proper_nouns),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_pronouns",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_pronouns),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_auxiliary_verbs",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_auxiliary_verbs),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("adjectives_glove",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_adjectives),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("adverbs_glove",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_adverbs),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_punctuation",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_punctuation),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_determiner",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_determiner),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_coordinating_conjunction",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_coordinating_conjunction),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_adpositions",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_adpositions),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_others",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_others),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_subordinating_conjunctions",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_subordinating_conjunctions),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_eol",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_eol),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_space",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_space),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_organizations",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_organizations),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_persons",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_persons),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_locations",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_locations),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_groups",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_groups),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_facilities",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_facilities),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_geo_locations",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_geo_locations),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_products",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_products),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_events",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_events),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_work_of_arts",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_work_of_arts),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_laws",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_laws),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_languages",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_languages),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("sent_tfidf",
         Pipeline([
             ("pairs",
              PairTransformer(element_transformer=Pipeline(
                  [("1st_verb",
                    FuncTransformer(
                        func=get_text)), ("shaper", Shaper(newshape=(-1, ))),
                   ("tf-idf",
                    TfidfVectorizer(analyzer="char_wb",
                                    ngram_range=(2, 3),
                                    dtype=np.float32,
                                    decode_error="replace",
                                    stop_words="english"))]))),
             ("combiner", CosineSimilarity())
         ])),
        ("sent_len_diff",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(dtype=None,
                                                                  func=len),
                              groupby=None)),
             ('abs_diff', AbsoluteDifference()),
         ])),
    ])

    # Train a regressor on these vectors
    if regressor == 'lasso':
        classifier = LassoLarsCV(cv=5, max_iter=512, n_jobs=-1)
    elif regressor == 'RF':
        classifier = RandomForestRegressor(n_jobs=-1,
                                           max_depth=8,
                                           n_estimators=1024)
    else:
        raise ValueError('Error passing the regressor type')

    # Fit and return the whole pipeline
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
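# Hypothetical call of _build_distance_estimator above; the pair matrix
# X_pairs, the target distances y_dist and the all-spacy configuration are
# assumptions for illustration.
distance_estimator = _build_distance_estimator(X_pairs, y_dist,
                                               w2v='spacy', PoS='spacy',
                                               NER='spacy', regressor='lasso')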
plt.axvline(-np.log10(model.alpha_), linestyle='--',
            color='k',
            label='alpha: CV estimate')

plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent '
          '(train time: %.2fs)' % t_cv)
plt.axis('tight')
plt.ylim(2300, 4000)
plt.show()

# #############################################################################
# LassoLarsCV: least angle regression
t1 = time.time()
model = LassoLarsCV(cv=10)
model.fit(x, y)
t_lasso_lars_cv = time.time() - t1
alphas_log = -np.log10(model.cv_alphas_)

plt.figure()
plt.plot(alphas_log, model.mse_path_, ':')
plt.plot(alphas_log,
         model.mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
            color='k',
            label='alpha CV')
Example #10
#!/usr/bin/env python

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV

data = pd.read_csv("dataset.csv", header=0)

X = data.loc[:, ["Commune", "Etage", "Superficie", "Piece"]].values
Y = data.loc[:, "Prix"].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

regressor = LassoLarsCV(cv=15)
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
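# A variant of the single train/test score above that reports a
# cross-validated R^2 instead, under the same dataset.csv assumptions.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(LassoLarsCV(cv=15), X, Y, scoring='r2', cv=5)
print(cv_scores.mean(), cv_scores.std())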
Example #11
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                           min_samples_leaf=5),
                     random_state=13,
                     n_estimators=3,
                     max_features=0.5), "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy="median"), "DummyAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
build_auto(HuberRegressor(), "HuberAuto")
build_auto(LarsCV(), "LarsAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LassoLarsCV(), "LassoLarsAuto")
build_auto(OptimalLGBMRegressor(objective="regression",
                                n_estimators=17,
                                num_iteration=11),
           "LGBMAuto",
           num_iteration=11)
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
    BaggingRegressor(LinearRegression(),
                     random_state=13,
                     max_features=0.75), "LinearRegressionEnsembleAuto")
build_auto(OrthogonalMatchingPursuitCV(), "OMPAuto")
build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=3),
           "RandomForestAuto",
           flat=True)
build_auto(RidgeCV(), "RidgeAuto")
#####################################################################
## Scikit Learn                                                    ##
#####################################################################

lasso_model = LassoCV()
lasso_model.fit(x_train_values, y_train_values)
lasso_model_predictions = lasso_model.predict(x_test_values)
generate_submission_file(lasso_model_predictions, test_data["Id"],
                         "../results/" + user + "_LassoCV.csv")

lars_model = LarsCV()
lars_model.fit(x_train_values, y_train_values)
lars_model_predictions = lars_model.predict(x_test_values)
generate_submission_file(lars_model_predictions, test_data["Id"],
                         "../results/" + user + "_LarsCV.csv")

lassolars_model = LassoLarsCV()
lassolars_model.fit(x_train_values, y_train_values)
lassolars_model_predictions = lassolars_model.predict(x_test_values)
generate_submission_file(lassolars_model_predictions, test_data["Id"],
                         "../results/" + user + "_LassoLarsCV.csv")

en_model = ElasticNetCV()
en_model.fit(x_train_values, y_train_values)
en_model_predictions = en_model.predict(x_test_values)
generate_submission_file(en_model_predictions, test_data["Id"],
                         "../results/" + user + "_ElasticNetCV.csv")

#####################################################################
## XGBoost                                                         ##
#####################################################################
level_1_models = [
    XgbWrapper(seed=SEED, params=xgb_params1, cv_fold=4),
    XgbWrapper(seed=SEED, params=xgb_params2, cv_fold=4),
    #XgbWrapper(seed=SEED, params=xgb_params3),
    XgbWrapper(seed=SEED, params=xgb_params4, cv_fold=4)
]

# level_1_models = level_1_models + [SklearnWrapper(clf=KNeighborsRegressor,  params=knr_params1),
#                  SklearnWrapper(clf=KNeighborsRegressor,  params=knr_params2),
#                  SklearnWrapper(clf=KNeighborsRegressor,  params=knr_params3),
#                  SklearnWrapper(clf=KNeighborsRegressor,  params=knr_params4)]

level_1_models = level_1_models + [
    SklearnWrapper(make_pipeline(ZeroCount(),
                                 LassoLarsCV(normalize=True))),  #LB 0.55797
    SklearnWrapper(
        make_pipeline(
            StackingEstimator(estimator=LassoLarsCV(normalize=True)),
            StackingEstimator(
                estimator=GradientBoostingRegressor(learning_rate=0.001,
                                                    loss="huber",
                                                    max_depth=3,
                                                    max_features=0.55,
                                                    min_samples_leaf=18,
                                                    min_samples_split=14,
                                                    subsample=0.7)),
            LassoLarsCV()))
]

params_list = [
pl.imshow(bg.get_data()[:, :, 10], interpolation="nearest", cmap='gray')
pl.imshow(np.ma.masked_less(sbrain.get_data()[:, :, 10], 1e-6),
          interpolation="nearest",
          cmap='hot')
plot_lines(contour[:, :, 10])
pl.axis('off')
pl.subplots_adjust(left=0., right=1., bottom=0., top=1.)
pl.savefig('encoding_scores.pdf')
pl.savefig('encoding_scores.eps')
pl.clf()

### Compute receptive fields

from sklearn.linear_model import LassoLarsCV

lasso = LassoLarsCV(max_iter=10)

p = (4, 2)
# Mask for chosen pixel
pixmask = np.zeros((10, 10), dtype=bool)
pixmask[p] = 1

for index in [1780, 1951, 2131, 1935]:
    rf = lasso.fit(y_train, X_train[:, index]).coef_.reshape(10, 10)
    pl.figure(figsize=(8, 8))
    pl.imshow(rf, vmin=0, interpolation="nearest", cmap='hot')
    plot_lines(pixmask, linewidth=6)
    pl.axis('off')
    pl.subplots_adjust(left=0., right=1., bottom=0., top=1.)
    pl.savefig('encoding_%d.pdf' % index)
    pl.savefig('encoding_%d.eps' % index)
Example #15
    #     pro = classify_model_001.predict_proba(X_testset_001[i])
    #     print(pro[0])
print(class_one, predict_one)
print(class_two, predict_two)

## Build the regression models
### Build the regression model for targets below 0.003
from sklearn.linear_model import BayesianRidge, RANSACRegressor, RidgeCV, Ridge, LassoLarsCV
X_trainset_0003 = []
y_trainset_0003 = []
for i in range(len(y_trainset)):
    if y_trainset[i] < 0.003:
        X_trainset_0003.append(X_trainset[i])
        y_trainset_0003.append(y_trainset[i])

reg_0003 = LassoLarsCV()
reg_0003.fit(X_trainset_0003, y_trainset_0003)

X_testset_0003 = []
y_testset_0003 = []
for i in range(len(y_testset)):
    if y_testset[i] < 0.003:
        X_testset_0003.append(X_testset[i])
        y_testset_0003.append(y_testset[i])
reg_0003_result = reg_0003.predict(X_testset_0003)
mse_0003 = 0.0
for i in range(len(y_testset_0003)):
    print(reg_0003_result[i], y_testset_0003[i])
    mse_0003 += abs(reg_0003_result[i] - y_testset_0003[i])
# note: despite the name, this accumulates absolute error, so the value
# printed below is a mean absolute error rather than an MSE
print(mse_0003 / len(y_testset_0003))
Example #16
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(
                self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prediction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X),
                                              (-1, 1)), X_transformed))

        return X_transformed
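# Hypothetical shape check for the transform above, assuming the surrounding
# StackingEstimator class wraps a fitted estimator (as in tpot): a regressor
# contributes one prediction column, so n features become n + 1.
import numpy as np
from sklearn.linear_model import LinearRegression

X_demo = np.random.rand(8, 4)
y_demo = np.random.rand(8)
se = StackingEstimator(estimator=LinearRegression())
print(se.fit(X_demo, y_demo).transform(X_demo).shape)  # expected: (8, 5)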


stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001,
                                                          loss="huber",
                                                          max_depth=3,
                                                          max_features=0.55,
                                                          min_samples_leaf=18,
                                                          min_samples_split=14,
                                                          subsample=0.7)),
    LassoLarsCV())

stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)
'''R2 Score on the entire Train data when averaging'''

print('R2 score on train data:')
print(
# Split the data into training and test sets for building a better prediction
# model (cv here refers to the long-deprecated sklearn.cross_validation module;
# current scikit-learn provides train_test_split in sklearn.model_selection)
X_train, X_test, y_train, y_test = cv.train_test_split(predictors, y, test_size=0.2)
#print (X_train.shape, y_train.shape)
#print (X_test.shape, y_test.shape)
#Fitting the model
lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)
lasso = Lasso(alpha=1)
res = lasso.fit(X_train, y_train)
#print("Coefficients lasso training fit of", res.coef_.tolist())
print('Lasso:', lasso)


# specify the lasso regression model: cv=10 gives 10-fold cross-validation
# (in each round one fold is the validation set and the remaining nine
# estimate the model) and precompute=False skips the precomputed Gram matrix
model = LassoLarsCV(cv=10, precompute=False).fit(X_train, y_train)

# print variable names and regression coefficients (zip pairs each column
# name with its coefficient; dict turns the pairs into a mapping)
print('Coefficients from lasso lars', dict(zip(X_train.columns, model.coef_)))

# Fit the regressor to the data
#las=lasso.fit(predictors, y)


#plot mean square error for each fold
print("Computing regularization path using the Lars lasso...")
m_log_alphascv = -np.log10(model.cv_alphas_)
#print("Log alphas:",m_log_alphascv,"MSE:",model.cv_mse_path_)
pyplot.figure()
pyplot.plot(m_log_alphascv, model.cv_mse_path_, ':')
pyplot.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
Example #18
plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent '
          '(train time: %.2fs)' % t_lasso_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)

# #############################################################################
# LassoLarsCV: least angle regression

# Compute paths
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
model = LassoLarsCV(cv=20).fit(X, y)
t_lasso_lars_cv = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.cv_alphas_)

plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
        rmse_l.append(rmse)
        cplx_l.append(cplx)

        fw = open(f'models/{dataname}_{algname}_{fold}_{it}.pkl', 'wb')
        pickle.dump(model, fw)
        fw.close()
        print(f'it: {it}, {rmse}, {cplx}')

    return dataset_l, algoritmo_l, fold_l, mae_l, rmse_l, cplx_l


dataset_l, algoritmo_l, fold_l = [], [], []
mae_l, rmse_l, cplx_l = [], [], []

algname = 'IT-ELM (Lasso)'
modelCV = LassoLarsCV(n_jobs=-1)
model_fn = ITELM

#for dataname in fnames:
dataname = sys.argv[1]
fold = int(sys.argv[2])
print(f'====================\nData set: {dataname}\n====================\n')

dat_l, alg_l, f_l, ab_l, sq_l, cp_l = run_gridSearch(dataname, fold, model_fn,
                                                     algname, modelCV)

dataset_l += dat_l
algoritmo_l += alg_l
fold_l += f_l
mae_l += ab_l
rmse_l += sq_l
Example #20
                    tvt_modifier_baseline_reps, tvt_modifier_return_mean
                ])
            if rep in none_model_reps:
                predictions, model, results = validation_tools.make_predictions(
                    train, validate, test, metrics, None, run_type=run_type)
            else:

                model_fname = f"./models/{d}__{s}__{rep}__model.pkl"

                if to_train:

                    kfold = KFold(n_splits=10, random_state=42, shuffle=True)

                    model_to_pass = LassoLarsCV(fit_intercept=True,
                                                normalize=True,
                                                n_jobs=-1,
                                                max_n_alphas=6000,
                                                cv=kfold)
                else:
                    model_to_pass = joblib.load(model_fname)

                predictions, model, results = validation_tools.make_predictions(
                    train,
                    validate,
                    test,
                    metrics,
                    model=model_to_pass,
                    run_type=run_type,
                    to_train=to_train)
                if to_train:
                    joblib.dump(model, model_fname)
Example #21
    mseOLS = np.mean((bh['PRICE'] - lr.predict(x))**2)
    R2OLS = lr.score(x, y)
    print(mseOLS)  ## MSE of the OLS model ##
    print(R2OLS)   ## R² of the OLS model ##

    ### 3) LARS ###

    import time
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LassoLarsCV
    from sklearn import linear_model

    ## Computing regularization path using the Lars lasso... ##
    t1 = time.time()
    model = LassoLarsCV(cv=10).fit(x, y)
    t_lasso_lars_cv = time.time() - t1



    # Display results
    m_log_alphas = -np.log10(model.cv_alphas_)

    plt.figure()
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.legend()
plt.xlabel(r"$\alpha$")
plt.ylabel("Mean square error")
plt.title(
    "Mean square error on each fold: coordinate descent (train time: %.2fs)" %
    t_lasso_cv)
plt.axis("tight")
plt.ylim(ymin, ymax)

# #############################################################################
# LassoLarsCV: least angle regression

# Compute paths
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
model = LassoLarsCV(cv=20, normalize=False).fit(X, y)
t_lasso_lars_cv = time.time() - t1

# Display results
plt.figure()
plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ":")
plt.semilogx(
    model.cv_alphas_ + EPSILON,
    model.mse_path_.mean(axis=-1),
    "k",
    label="Average across the folds",
    linewidth=2,
)
plt.axvline(model.alpha_, linestyle="--", color="k", label="alpha CV")
plt.legend()
                              learning_rate=0.05,
                              subsample=0.8),
    XGBRegressor(seed=0,
                 n_estimators=500,
                 max_depth=10,
                 learning_rate=0.05,
                 subsample=0.8,
                 colsample_bytree=0.75),
    XGBRegressor(seed=0,
                 n_estimators=500,
                 max_depth=7,
                 learning_rate=0.05,
                 subsample=0.8,
                 colsample_bytree=0.75),
    LassoCV(alphas=[1, 0.1, 0.001, 0.0005]),
    KNeighborsRegressor(n_neighbors=5),
    KNeighborsRegressor(n_neighbors=10),
    KNeighborsRegressor(n_neighbors=15),
    KNeighborsRegressor(n_neighbors=25),
    KNeighborsRegressor(n_neighbors=35),
    LassoLarsCV(),
    ElasticNet(),
    SVR()
]

ensem = ensemble(n_folds=5, stacker=Ridge(), base_models=base_models)

X_train, X_test, y_train = data_preprocess(train, test)
y_pred, score = ensem.fit_predict(X_train, X_test, y_train)

create_submission(np.expm1(y_pred), score)
Example #24
    return df_fea2, uni_feature


df_fea2, uni_feature = featureSelectSVC(X, y)
print(uni_feature)

## RandomizedLasso, feature stability selection
## (RandomizedLasso and lasso_stability_path were removed in scikit-learn
## 0.21, so this snippet needs an older scikit-learn release)
from sklearn.linear_model import (RandomizedLasso, lasso_stability_path,
                                  LassoLarsCV)
import warnings
from sklearn.exceptions import ConvergenceWarning

with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)
    warnings.simplefilter('ignore', ConvergenceWarning)
    lars_cv = LassoLarsCV(cv=6).fit(X, y)

alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
names = df_merge3.columns.tolist()[:-1]
print(sorted(zip(map(lambda x: round(x, 4), clf.scores_), names),
             reverse=True))

from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier()
clf = clf.fit(X, y)
df_tree = pd.DataFrame(clf.feature_importances_)
df_tree['fea_index'] = df_merge3.columns.tolist()[:-1]
df_tree.columns = ["weight", "feature_index"]
df_tree.sort_values("weight").tail(10)
Example #25
def QuickML_Ensembling(X_train,
                       y_train,
                       X_test,
                       y_test='',
                       modeltype='Regression',
                       Boosting_Flag=False,
                       scoring='',
                       verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set(only numerics).
    """
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Voting models
    estimators = []
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging1', model5, metrics1))
        else:
            model5 = LassoLarsCV(cv=scv)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('LassoLarsCV', model5, metrics1))
        model6 = LassoCV(alphas=np.logspace(-10, -1, 50),
                         cv=scv,
                         random_state=seed)
        results2 = model6.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = rmse(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('LassoCV', model6, metrics2))
        model7 = RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv)
        results3 = model7.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = rmse(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('RidgeCV', model7, metrics3))
        ## Create an ensemble model ####
        if Boosting_Flag:
            model8 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging2', model8, metrics4))
        else:
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                min_samples_leaf=2, max_depth=1, random_state=seed),
                                       n_estimators=NUMS,
                                       random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if verbose >= 2:
            print('QuickML_Ensembling Model results:')
            print(
                '    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                % (estimator_names[0], metrics1, estimator_names[1], metrics2,
                   estimator_names[2], metrics3, estimator_names[3], metrics4))
    else:
        if scoring == '':
            scoring = 'accuracy'
        scv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            model5 = ExtraTreesClassifier(n_estimators=NUMS,
                                          min_samples_leaf=2,
                                          random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging', model5, metrics1))
        else:
            model5 = LogisticRegressionCV(Cs=np.linspace(0.01, 100, 20),
                                          cv=scv,
                                          scoring=scoring,
                                          random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Logistic Regression', model5, metrics1))
        model6 = LinearDiscriminantAnalysis()
        results2 = model6.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = accu(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('Linear Discriminant', model6, metrics2))
        # Naive Bayes cannot handle negative feature values, so fall back to a
        # shallow decision tree whenever any numeric column contains negatives.
        float_cols = X_train.columns[(X_train.dtypes == float).values].tolist()
        int_cols = X_train.columns[(X_train.dtypes == int).values].tolist()
        if (X_train[float_cols + int_cols] < 0).astype(int).sum().sum() > 0:
            model7 = DecisionTreeClassifier(max_depth=5)
        elif modeltype == 'Binary_Classification':
            model7 = GaussianNB()
        else:
            model7 = MultinomialNB()
        results3 = model7.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = accu(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('Naive Bayes', model7, metrics3))
        if Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present. So choose a Bagging here.
            model8 = ExtraTreesClassifier(n_estimators=NUMS,
                                          min_samples_leaf=2,
                                          random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging', model8, metrics4))
        else:
            ## Create an ensemble model ####
            model8 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                        n_estimators=NUMS,
                                        random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if not isinstance(y_test, str):
            if verbose >= 2:
                print('QuickML_Ensembling Model results:')
                print(
                    '    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                    % (estimator_names[0], metrics1, estimator_names[1],
                       metrics2, estimator_names[2], metrics3,
                       estimator_names[3], metrics4))
        else:
            if verbose >= 1:
                print('QuickML_Ensembling completed.')
    stacks = np.c_[results1, results2, results3, results4]
    if verbose == 1:
        print('    Time taken for Ensembling: %0.1f seconds' %
              (time.time() - start_time))
    return estimator_names, stacks
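
# A minimal usage sketch for QuickML_Ensembling on synthetic data (not from
# the original source). It assumes the function's own imports and helpers
# (time, the sklearn models, and the rmse metric) are already in scope.
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=10, noise=0.5, random_state=0)
X = pd.DataFrame(X, columns=['f%d' % i for i in range(X.shape[1])])
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

names, stacks = QuickML_Ensembling(X_tr, y_tr, X_te, y_te,
                                   modeltype='Regression',
                                   Boosting_Flag=False,
                                   verbose=2)
# `stacks` holds one column of test-set predictions per base model
print(names, stacks.shape)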


#########################################################
def bagging_LassoLarsCV(X, Y, vrbl_names, n_estimators, p_smpl, n_jobs, max_n_estimators):
    """Fit a bagging ensemble of LassoLarsCV models and keep the members that
    select at least one predictor. Note: n_estimators and p_smpl are accepted
    but unused below (max_n_estimators and a fixed 50% sample fraction are
    used instead); vrbl_names must be an indexable array of column names."""
    from sklearn.model_selection import KFold
    from sklearn.ensemble import BaggingRegressor
    from sklearn.linear_model import LassoLarsCV, LinearRegression

    cv = KFold(n_splits=5, shuffle=True)
    
    
    # Accept either pandas objects or plain arrays
    X = np.squeeze(getattr(X, 'values', X))
    Y = np.squeeze(getattr(Y, 'values', Y))
    
    max_feats = int(X.shape[1]/3)
    eps = 2e-10

    fitted_ensemble = BaggingRegressor(
        base_estimator=LassoLarsCV(cv=cv, eps=eps, max_iter=200, n_jobs=1),
        # base_estimator=LinearRegression(n_jobs=1),
        n_estimators=max_n_estimators,  # number of fittings
        max_samples=0.5,                # 50% of training rows per member
        max_features=max_feats,         # N/3 variables drawn per member
        bootstrap=False,                # sample without replacement
        bootstrap_features=False,
        oob_score=False,
        n_jobs=n_jobs,
        random_state=70,
        verbose=1).fit(X, Y)

    # Recover which rows/columns each ensemble member saw, and derive the
    # complementary out-of-sample row indices for each member.
    all_sample_indices = np.arange(X.shape[0])
    feature_indices = fitted_ensemble.estimators_features_
    sample_indices = fitted_ensemble.estimators_samples_
    outofs_indices = []
    for smp in sample_indices:
        outofs_indices.append(all_sample_indices[~np.isin(all_sample_indices, smp)])

    final_ensemble = []
    for i, estimator in enumerate(fitted_ensemble.estimators_):
        f_indices = feature_indices[i]
        s_indices = sample_indices[i]
        o_indices = outofs_indices[i]
        a_indices = all_sample_indices
        true_indices = np.abs(estimator.coef_) > 0

        # Definition of success in fitting: at least one predictor
        # needs to be found
        if true_indices.sum() > 0:
            estimator_predictors = vrbl_names[f_indices][true_indices]
            n_predictors = true_indices.sum()

            # Skill over the full sample; calc_corr is a helper defined
            # elsewhere in the source
            all_sample_score = calc_corr(Y[a_indices],
                                         estimator.predict(X[a_indices][:, f_indices]))

            # Append results and fitted models to the result list
            final_ensemble.append([estimator, estimator_predictors,
                                   f_indices, n_predictors, all_sample_score])

    return final_ensemble
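
# The helper calc_corr used above is defined elsewhere in the source; the
# Pearson-correlation stand-in below and the synthetic-data call are only an
# illustrative sketch, not part of the original code.
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

def calc_corr(y_true, y_pred):
    # correlation between observations and predictions
    return np.corrcoef(np.squeeze(y_true), np.squeeze(y_pred))[0, 1]

X, y = make_regression(n_samples=300, n_features=12, n_informative=4,
                       noise=1.0, random_state=0)
cols = np.array(['x%d' % i for i in range(X.shape[1])])
members = bagging_LassoLarsCV(pd.DataFrame(X, columns=cols), pd.Series(y),
                              cols, n_estimators=10, p_smpl=0.5,
                              n_jobs=1, max_n_estimators=10)
print('%d ensemble members kept' % len(members))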
    'silent': 1
}
# NOTE: Make sure that the outcome column is labeled 'y' in the data file

dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

'''Train the stacked models then predict the test data'''

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(learning_rate=0.00900000000000005, loss="huber", max_depth=6, max_features=0.69000000000000005,
                                            min_samples_leaf=16, min_samples_split=14, subsample=0.8000000000001)),
    # The final pipeline step must expose predict(), so the decision tree is
    # left unwrapped (StackingEstimator is a transformer without predict)
    DecisionTreeRegressor(max_depth=4, min_samples_leaf=6, min_samples_split=13)
)

exported_pipeline.fit(finaltrainset, y_train)
results = exported_pipeline.predict(finaltestset)

'''R2 Score on the entire Train data when averaging'''

print('R2 score on train data:')
print(r2_score(y_train, exported_pipeline.predict(finaltrainset) * 0.2855 + model.predict(dtrain) * 0.7145))

'''Average the predictions on test data of both models, then save them to a csv file'''
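
# The snippet stops before the averaging/saving step it announces; below is
# a hedged sketch. The 'ID' column and the file name are assumptions; the
# blend weights are the same ones used in the train-data R2 check above.
y_blend = results * 0.2855 + y_pred * 0.7145
sub = pd.DataFrame({'ID': test['ID'], 'y': y_blend})  # 'ID' column assumed
sub.to_csv('stacked_models_blend.csv', index=False)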
Example #28
lasso_lars = grid.best_estimator_
np.sum(lasso_lars.coef_ != 0)  # number of features retained by the lasso

plt.scatter(range(X_poly.shape[1]),
            lasso_lars.coef_,
            c=np.sign(lasso_lars.coef_),
            cmap="bwr_r")

######## Yellowbrick

from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import LassoLarsCV

### Find optimal alpha

lassolars_yb = AlphaSelection(LassoLarsCV())
lassolars_yb.fit(X, y)
lassolars_yb.poof()
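
# For reference, the alpha the visualizer highlights can also be read
# directly off a fitted LassoLarsCV (a minimal sketch on the same X, y):
print(LassoLarsCV().fit(X, y).alpha_)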

### RVF plot

lasso_yb = ResidualsPlot(lasso_lars, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()

### Prediction Error

lasso_yb = PredictionError(lasso_lars, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
predictors['x284_2012'] = preprocessing.scale(
    predictors['x284_2012'].astype('float64'))

# check that the predictors for the lasso regression are standardized
# (mean = 0, sd = 1) by printing each column's summary statistics
for col in predictors:
    print(predictors[col].describe())

# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors,
                                                              target,
                                                              test_size=.3,
                                                              random_state=123)

# specify the lasso regression model
model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))

# dictionary of predictor variables retained in the model
dictionaryValues = dict(zip(predictors.columns, model.coef_))

# copy of the predictors, to be reduced to the variables the lasso retained
predictorsNew = predictors.copy()

# list of predictor names with non-zero coefficients in the lasso model
listofVals = []
for name, coef in dictionaryValues.items():
    if coef != 0.0:
        listofVals.append(name)
        print(name, coef)
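
# Hedged continuation (not in the original snippet): reduce the working copy
# to just the predictors the lasso retained
predictorsNew = predictorsNew[listofVals]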
                                               shuffle=False)
    x_train = ml.loc[train_index]
    y_train = ml_outs.loc[train_index]
    x_test = ml.loc[test_index]
    y_test = ml_outs.loc[test_index]

    # Scale
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # Implement model: several candidates were tried; each assignment below
    # overrides the previous one, so only LinearRegression is actually used
    linreg = Lars()          # better
    linreg = LarsCV()        # slightly better
    linreg = LassoLarsCV()   # about the same
    linreg = LinearRegression()
    linreg.fit(x_train, y_train)
    predictions = linreg.predict(x_test)

    # Plot predictions and y_test
    plt.figure()
    plt.plot(predictions, label='Predictions')
    plt.plot(pd.Series(predictions).rolling(5).mean(),
             label='rolling predictions')
    plt.plot(y_test.values,
             label='Shifted currencies (y_test values)',
             color='grey')
    plt.plot(cu.loc[test_index, currency].values, label='UNSHIFTED')
    plt.legend()
    plt.show()
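
    # A small hedged addition (sketch, not in the original): score the fold
    from sklearn.metrics import r2_score
    print('R2 on this test fold: %0.3f' % r2_score(y_test, predictions))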