Example no. 1
# Imports assumed by this snippet (load_config, make_cut_list, prepare_data,
# TrainModel and save_obj are project helpers defined elsewhere):
import argparse
import os
from os import path

import joblib
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import (AdaBoostClassifier, AdaBoostRegressor,
                              RandomForestClassifier)
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


def main():

    # Read arguments
    parser = argparse.ArgumentParser(
        description="Build model for regression/classification")
    parser.add_argument("--config_file", type=str, required=True)
    parser.add_argument(
        "--max_events",
        type=int,
        default=-1,
        help="maximum number of events for training",
    )
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--wave",
        dest="mode",
        action="store_const",
        const="wave",
        default="tail",
        help="if set, use wavelet cleaning",
    )
    mode_group.add_argument(
        "--tail",
        dest="mode",
        action="store_const",
        const="tail",
        help="if set, use tail cleaning, otherwise wavelets",
    )
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    # Type of model (regression or classification)
    model_type = cfg["General"]["model_type"]

    # Import parameters
    data_dir = cfg["General"]["data_dir"]
    outdir = cfg["General"]["outdir"]
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    cam_ids = cfg["General"]["cam_id_list"]
    table_name_template = cfg["General"]["table_name_template"]
    table_name = [table_name_template + cam_id for cam_id in cam_ids]

    # List of features
    feature_list = cfg["FeatureList"]

    # Optimisation parameters
    method_name = cfg["Method"]["name"]
    tuned_parameters = [cfg["Method"]["tuned_parameters"]]
    scoring = "explained_variance"
    cv = cfg["Method"]["cv"]

    # Split fraction
    train_fraction = cfg["Split"]["train_fraction"]

    if model_type in "regressor":
        data_file = cfg["General"]["data_file"].format(args.mode)
        filename = path.join(data_dir, data_file)

        # List of cuts
        cuts = make_cut_list(cfg["SigFiducialCuts"])
        init_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None))

        # Name of target
        target_name = cfg["Method"]["target_name"]

    elif model_type in "classifier":
        data_sig_file = cfg["General"]["data_sig_file"].format(args.mode)
        data_bkg_file = cfg["General"]["data_bkg_file"].format(args.mode)
        filename_sig = path.join(data_dir, data_sig_file)
        filename_bkg = path.join(data_dir, data_bkg_file)

        # List of cuts
        sig_cuts = make_cut_list(cfg["SigFiducialCuts"])
        bkg_cuts = make_cut_list(cfg["BkgFiducialCuts"])

        # Model
        if method_name in "AdaBoostClassifier":
            init_model = AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=4))
        elif method_name in "RandomForestClassifier":
            init_model = RandomForestClassifier(
                n_estimators=500,
                max_depth=None,
                min_samples_split=0.05,
                max_features="sqrt",
                bootstrap=True,
                random_state=None,
                criterion="gini",
                class_weight="balanced_subsample",  # reweight events for each tree
            )
        use_same_number_of_sig_and_bkg_for_training = cfg["Split"][
            "use_same_number_of_sig_and_bkg_for_training"]

    print("### Using {} for model construction".format(method_name))

    models = dict()
    for idx, cam_id in enumerate(cam_ids):

        print("### Building model for {}".format(cam_id))

        if model_type in "regressor":
            # Load data
            data = pd.read_hdf(filename, table_name[idx], mode="r")
            data = prepare_data(ds=data, cuts=cuts)
            if args.max_events > 0:  # max_events == -1 means "use all events"
                data = data[0:args.max_events]

            # Init model factory
            factory = TrainModel(case=model_type,
                                 target_name=target_name,
                                 feature_name_list=feature_list)

            # Split data
            factory.split_data(data_sig=data, train_fraction=train_fraction)
            print("Training sample: sig {}".format(len(factory.data_train)))
            print("Test sample: sig {}".format(len(factory.data_test)))
        elif model_type in "classifier":
            # Load data
            data_sig = pd.read_hdf(filename_sig, table_name[idx], mode="r")
            data_bkg = pd.read_hdf(filename_bkg, table_name[idx], mode="r")

            # Add label
            data_sig = prepare_data(ds=data_sig, label=1, cuts=sig_cuts)
            data_bkg = prepare_data(ds=data_bkg, label=0, cuts=bkg_cuts)

            if args.max_events > 0:  # max_events == -1 means "use all events"
                data_sig = data_sig[0:args.max_events]
                data_bkg = data_bkg[0:args.max_events]

            # Init model factory
            factory = TrainModel(case=model_type,
                                 target_name="label",
                                 feature_name_list=feature_list)

            # Split data
            factory.split_data(
                data_sig=data_sig,
                data_bkg=data_bkg,
                train_fraction=train_fraction,
                force_same_nsig_nbkg=use_same_number_of_sig_and_bkg_for_training,
            )

            print("Training sample: sig {} and bkg {}".format(
                len(factory.data_train.query("label==1")),
                len(factory.data_train.query("label==0")),
            ))
            print("Test sample: sig {} and bkg {}".format(
                len(factory.data_test.query("label==1")),
                len(factory.data_test.query("label==0")),
            ))

        # Build model
        best_model = factory.get_optimal_model(init_model,
                                               tuned_parameters,
                                               scoring=scoring,
                                               cv=cv)

        if model_type in "classifier":
            # print report
            if model_type in "classifier":
                print(
                    classification_report(
                        factory.data_scikit["y_test"],
                        best_model.predict(factory.data_scikit["X_test"]),
                    ))

            # Calibrate model if necessary on test data
            if cfg["Method"]["calibrate_output"] is True:
                print("==> Calibrate classifier...")

                best_model = CalibratedClassifierCV(best_model,
                                                    method="sigmoid",
                                                    cv="prefit")

                best_model.fit(factory.data_scikit["X_test"],
                               factory.data_scikit["y_test"])

        # save model
        models[cam_id] = best_model
        outname = "{}_{}_{}_{}.pkl.gz".format(model_type, args.mode, cam_id,
                                              method_name)
        joblib.dump(best_model, path.join(outdir, outname))

        # save data
        save_obj(
            factory.data_scikit,
            path.join(
                outdir,
                "data_scikit_{}_{}_{}_{}.pkl.gz".format(
                    model_type, method_name, args.mode, cam_id),
            ),
        )
        factory.data_train.to_pickle(
            path.join(
                outdir,
                "data_train_{}_{}_{}_{}.pkl.gz".format(model_type, method_name,
                                                       args.mode, cam_id),
            ))
        factory.data_test.to_pickle(
            path.join(
                outdir,
                "data_test_{}_{}_{}_{}.pkl.gz".format(model_type, method_name,
                                                      args.mode, cam_id),
            ))
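# A minimal sketch of loading one of the per-camera models saved above back
# for inference (names hypothetical; joblib transparently handles the .pkl.gz
# compression used by joblib.dump):
# import joblib
# from os import path
# model = joblib.load(path.join(outdir, "regressor_tail_<cam_id>_AdaBoostRegressor.pkl.gz"))
# predictions = model.predict(data[feature_list])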
Example no. 2
#     for metric in results.keys():
#         print("%s: %.3f" % (metric, np.average(results[metric])))
# print("KnnClassifier")
# parameters= [3,5,15,121]
# for K in parameters:
#     print("n_estimators")
#     print(K)
#     reg = KNeighborsClassifier(n_neighbors=K,n_jobs=-1,algorithm='kd_tree',leaf_size=500)
#     cross_val_score(reg, X_new, y, scoring=scorer3, cv=KFold(n_splits=3))
#     results = scorer3.get_results()
#     for metric in results.keys():
#         print("%s: %.3f" % (metric, np.average(results[metric])))
'''-----------------------TEST-4--------------------------------------------'''
Regs = {
    "linera_model": linear_model.Ridge(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0, n_estimators=10),
    "GaussianNB": GaussianNB()
}
for name, reg in Regs.items():
    print(name)
    cross_val_score(reg,
                    X_new,
                    y,
                    scoring=scorer,
                    cv=KFold(n_splits=5))
    results = scorer.get_results()
    for metric in results.keys():
        print("%s: %.3f" % (metric, np.average(results[metric])))
'''--------------------------------------------------------------------------'''
'''-------------------------------------------------PART2---TEST1--------------------------------------------'''
'''test class weights'''
Example no. 3
test_id = test['id']
train_id = train['id']
train = train.drop(['id', 'labels'], axis=1)
test = test.drop('id', axis=1)
shift = 200

print("Script name:", sys.argv[0])
args = dict([arg.split('=', maxsplit=1) for arg in sys.argv[1:]])
print(args)

ESTIMATORS = {
    "encv": ElasticNetCV(),
    "rfr": RandomForestRegressor(n_estimators=250),
    "svr": SVR(C=1.0, epsilon=0.2),
    "gbr": GradientBoostingRegressor(n_estimators=250),
    "adb": AdaBoostRegressor(n_estimators=250),
    "knn4": KNeighborsRegressor(n_neighbors=4)
}

test_predictions = pd.DataFrame({'id': test_id, 'loss': np.nan})

name = args['classifier']
output = args.get("output", name + '_predictions.csv')

if name in ESTIMATORS:
    estimator = ESTIMATORS[name]
    estimator.fit(train, train_labels)
    test_labels = np.exp(estimator.predict(test)) - shift
    test_predictions = test_predictions.assign(loss=test_labels)
    test_predictions.to_csv(output, index=False)
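# [Sketch] `train_labels` (used in the fit above) is not defined in this
# snippet; the inverse transform np.exp(pred) - shift implies a shifted-log
# target, presumably built before the 'labels' column was dropped, e.g.:
# train_labels = np.log(train['labels'] + shift)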
Example no. 4
# MAE in $
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)
# checking r^2
from sklearn.metrics import r2_score

print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
print("Bagging train R^2:", bg.score(X_train, y_train))
print("Bagging test R^2:", bg.score(X_test, y_test))

#Adaboosting
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
print("AdaBoost test R^2:", regr.score(X_test, y_test))

#Decision
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
print("DecisionTree test R^2:", dt.score(X_test, y_test))

#gradientBoost
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
print("GradientBoosting train R^2:", gb.score(X_train, y_train))
print("GradientBoosting test R^2:", gb.score(X_test, y_test))
Example no. 5
rfReg = RandomForestRegressor(n_estimators=100)
rfReg.fit(trainX[varsUsed], trainY['score'])
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0,
                                loss='ls').fit(trainX[varsUsed], trainY['score'])
a3 = rfReg.predict(testX[varsUsed])
print(mean_squared_error(testY['score'], est.predict(testX[varsUsed])))
rfreg_tuned_parameters = [{'max_depth':[1,2,3],'n_estimators':[50,100,150,200]}]
rfregGS = ms.GridSearchCV(RandomForestRegressor(),rfreg_tuned_parameters,cv=5,scoring='neg_mean_squared_error')
rfregGS.fit(trainX[varsUsed],trainY['score'])
a1 = ms.ParameterGrid(rfreg_tuned_parameters)
scoresave = np.zeros(len(a1))
for i in range(len(a1)):
    rfregmgs = RandomForestRegressor(**a1[i])
    rfregmgs.fit(trainX[varsUsed],trainY['score'])    
    y_pred = rfregmgs.predict(testX[varsUsed])
    ndcg = util.ndcg.ndcg(testX[['srch_id', 'prop_id']], testY['score'], y_pred)
    scoresave[i] = ndcg
    
adaReg = AdaBoostRegressor()
adaReg.fit(trainX[varsUsed],trainY['score'])
print(mean_squared_error(testY['score'],adaReg.predict(testX[varsUsed])))
ada_tuned_parameters = [{'loss': ['linear', 'square'], 'learning_rate': [0.5, 1, 2], 'n_estimators': [25, 50, 100, 150]}]
adaGS = ms.GridSearchCV(AdaBoostRegressor(),ada_tuned_parameters,cv=5,scoring='neg_mean_squared_error')
adaGS.fit(trainX[varsUsed],trainY['score'])
print(adaGS.score(testX[varsUsed], testY['score']))
print(adaGS.best_params_)
print(adaGS.best_score_)
#predMat = pd.DataFrame()

    # step 4. score
    print('prediction score: ', end="")
    print_score(y_test, y_pred)
    print('{:.2f} seconds '.format(time() - start))

from sklearn.ensemble import AdaBoostRegressor

# Create the dataset
rng = np.random.RandomState(1)
# X = np.linspace(0, 6, 100)[:, np.newaxis]
# y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])

# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300,
                           random_state=rng)

regr_1.fit(X_train, y_train)
regr_2.fit(X_train, y_train)

# Predict
y_1 = regr_1.predict(X_train)
y_2 = regr_2.predict(X_train)

# Plot the results
plt.figure()
plt.scatter(X_train, y_train, c="k", label="training samples")
plt.plot(X_train, y_1, c="g", label="n_estimators=1", linewidth=2)
Example no. 7
X_train = train.drop(columns=['Cases', 'Date'])
y_train = train[target]
X_test = test.drop(columns=['Cases', 'Date'])
y_test = test[target]


import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostRegressor


pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    AdaBoostRegressor(n_estimators=200, random_state=42)
)


pipeline.fit(X_train, y_train)

y_test = y_test.fillna(y_test.mean())


y_pred = pipeline.predict(X_test)
print(r2_score(y_test, y_pred))

test['predicted_cases'] = y_pred
test_california = test[test['Province_State'].str.contains('California') & (test['Case_Type'].str.contains('Confirmed'))]
    if isinstance(clf, (DecisionTreeClassifier, OneVsRestClassifier)):
        if _graphviz.is_supported():
            assert '<svg' in expl_html
        else:
            assert '<svg' not in expl_html

    assert res == get_res()


@pytest.mark.parametrize(['reg'], [
    [DecisionTreeRegressor(random_state=42)],
    [ExtraTreesRegressor(random_state=42)],
    [GradientBoostingRegressor(learning_rate=0.075, random_state=42)],
    [RandomForestRegressor(random_state=42)],
    [AdaBoostRegressor(random_state=42)],
])
def test_explain_tree_regressor(reg, boston_train):
    X, y, feature_names = boston_train
    reg.fit(X, y)
    res = explain_weights(reg, feature_names=feature_names)
    expl_text, expl_html = format_as_all(res, reg)
    for expl in [expl_text, expl_html]:
        assert 'BIAS' not in expl
        assert 'LSTAT' in expl

    if isinstance(reg, DecisionTreeRegressor):
        assert '---> 50' in expl_text


@pytest.mark.parametrize(['clf'], [
#     RandomForestClassifier(),
#     AdaBoostClassifier(),
#     GradientBoostingClassifier()
#     ]for classifier in classifiers:
#     pipe = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', classifier)])
#     pipe.fit(X_train, y_train)
#     print(classifier)
#     print("model score: %.3f" % pipe.score(X_test, y_test))

classifiers = [
    SVR(),
    DecisionTreeRegressor(random_state=random_seed),
    RandomForestRegressor(random_state=random_seed),
    AdaBoostRegressor(random_state=random_seed),
    GaussianProcessRegressor(random_state=random_seed),
    LinearRegression(),
    MLPRegressor(random_state=random_seed)
]

grid_params = {
    'SVR': {
        'SVR__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'SVR__C': list(np.logspace(-5, 15, num=11, base=2)),
        'SVR__gamma': list(np.logspace(-15, 3, num=10, base=2)),
    },
    'DecisionTreeRegressor': {
        'DecisionTreeRegressor__criterion': ['mse', 'friedman_mse', 'mae'],
        'DecisionTreeRegressor__max_depth':
        list(np.linspace(1, 32, 32, endpoint=True)),
Example no. 10
    def set_models(self, modelos=None):
        if modelos is None:
            modelos = []
        rs = 1
        models = []
        if (self.problem_type == "Classification"):
            # Ensemble Methods
            if 'AdaBoostClassifier' in modelos:
                models.append(('AdaBoostClassifier',
                               AdaBoostClassifier(random_state=rs)))
            if 'GradientBoostingClassifier' in modelos:
                models.append(('GradientBoostingClassifier',
                               GradientBoostingClassifier(random_state=rs)))
            if 'BaggingClassifier' in modelos:
                models.append(
                    ('BaggingClassifier', BaggingClassifier(random_state=rs)))
            if 'RandomForestClassifier' in modelos:
                models.append(('RandomForestClassifier',
                               RandomForestClassifier(random_state=rs)))
            if 'ExtraTreesClassifier' in modelos:
                models.append(('ExtraTreesClassifier',
                               ExtraTreesClassifier(random_state=rs)))
            # Non linear Methods
            if 'KNeighborsClassifier' in modelos:
                models.append(('KNeighborsClassifier', KNeighborsClassifier()))
            if 'DecisionTreeClassifier' in modelos:
                models.append(('DecisionTreeClassifier',
                               DecisionTreeClassifier(random_state=rs)))
            if 'MLPClassifier' in modelos:
                models.append(('MLPClassifier',
                               MLPClassifier(max_iter=1000, random_state=rs)))
            if 'SVC' in modelos:
                models.append(('SVC', SVC(random_state=rs)))
            # Linear Methods
            if 'LinearDiscriminantAnalysis' in modelos:
                models.append(('LinearDiscriminantAnalysis',
                               LinearDiscriminantAnalysis()))
            if 'GaussianNB' in modelos:
                models.append(('GaussianNB', GaussianNB()))
            if 'LogisticRegression' in modelos:
                models.append(('LogisticRegression', LogisticRegression()))
            # Voting
            #estimators = []
            #estimators.append( ("Voting_GradientBoostingClassifier", GradientBoostingClassifier(random_state=rs)) )
            #estimators.append( ("Voting_ExtraTreesClassifier", ExtraTreesClassifier(random_state=rs)) )
            #voting = VotingClassifier(estimators)
            #if 'VotingClassifier'  in modelos:
            #    models.append( ('VotingClassifier', voting) )

        elif (self.problem_type == "Regression"):
            # Ensemble Methods
            if 'AdaBoostRegressor' in modelos:
                models.append(
                    ('AdaBoostRegressor', AdaBoostRegressor(random_state=rs)))
            if 'GradientBoostingRegressor' in modelos:
                models.append(('GradientBoostingRegressor',
                               GradientBoostingRegressor(random_state=rs)))
            if 'BaggingRegressor' in modelos:
                models.append(
                    ('BaggingRegressor', BaggingRegressor(random_state=rs)))
            if 'RandomForestRegressor' in modelos:
                models.append(('RandomForestRegressor',
                               RandomForestRegressor(random_state=rs)))
            if 'ExtraTreesRegressor' in modelos:
                models.append(('ExtraTreesRegressor',
                               ExtraTreesRegressor(random_state=rs)))
            # Non linear Methods
            if 'KNeighborsRegressor' in modelos:
                models.append(('KNeighborsRegressor', KNeighborsRegressor()))
            if 'DecisionTreeRegressor' in modelos:
                models.append(('DecisionTreeRegressor',
                               DecisionTreeRegressor(random_state=rs)))
            if 'MLPRegressor' in modelos:
                models.append(('MLPRegressor',
                               MLPRegressor(max_iter=1000, random_state=rs)))
            if 'SVR' in modelos:
                models.append(('SVR', SVR()))
            # Linear Methods
            if 'LinearRegression' in modelos:
                models.append(('LinearRegression', LinearRegression()))
            if 'BayesianRidge' in modelos:
                models.append(('BayesianRidge', BayesianRidge()))
        return models
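    # Usage sketch (assumptions: an instance with problem_type set, and
    # feature/target arrays X, y):
    # from sklearn.model_selection import cross_val_score
    # for name, model in obj.set_models(['AdaBoostRegressor', 'SVR']):
    #     print(name, cross_val_score(model, X, y, cv=5).mean())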
Example no. 11
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import OneHotEncoder, StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was: -805.6529764814633
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=DecisionTreeRegressor(
        max_depth=7, min_samples_leaf=4, min_samples_split=8)),
    StackingEstimator(estimator=DecisionTreeRegressor(
        max_depth=9, min_samples_leaf=11, min_samples_split=19)),
    OneHotEncoder(minimum_fraction=0.2, sparse=False, threshold=10),
    StackingEstimator(estimator=AdaBoostRegressor(
        learning_rate=0.5, loss="square", n_estimators=100)),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.8, tol=0.001)),
    KNeighborsRegressor(n_neighbors=3, p=2, weights="distance"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example no. 12
    y = np.array(y)

    # Define regressors to try: (clf, name) pairs

    classifiers = [
        (LinearRegression(n_jobs=-1), 'LinearRegression'),
        (RandomForestRegressor(n_estimators=100, n_jobs=-1,
                               random_state=0), "RandomForest"),
        (GradientBoostingRegressor(n_estimators=100,
                                   random_state=0), "GradientBoost"),
        (ExtraTreesRegressor(n_estimators=100, random_state=0), "ExtraTrees"),
        (DecisionTreeRegressor(random_state=0), "DecisionTrees"),
        (BaggingRegressor(n_estimators=100, n_jobs=-1,
                          random_state=0), "Bagging"),
        (AdaBoostRegressor(n_estimators=100, random_state=0), "AdaBoost")
        # ,
        #            (XGBRegressor(n_estimators=100, n_jobs=-1, random_state=0), "XGBoost")
    ]

    ######## SQUID Prediction

    # Store all evaluation results (name, MAE, R^2) here:
    squid_rocs = []

    for clf, name in classifiers:
        print("Evaluating %s regressor (squid)" % name)
        mae, r2 = cross_validate_and_plot(clf, X, y, cols, name + "_squid",
                                          splits)
        squid_rocs.append((name, mae, r2))
showPredictionValidation(y_train, y_test, X_test, X_valid, df_result)

print(mae(y_test, y_pred))

print(mse(y_test, y_pred))

print(r2(y_test, y_pred))

pipelines = []
# =============================================================================

pipelines.append(('DSTR', DecisionTreeRegressor()))
pipelines.append(('GBM', GradientBoostingRegressor()))
pipelines.append(('RDMF', RandomForestRegressor()))
pipelines.append(('ADAB', AdaBoostRegressor()))
pipelines.append(('ETR', ExtraTreesRegressor()))
pipelines.append(('BAGR', BaggingRegressor()))
pipelines.append(('KNNR', KNeighborsRegressor(n_neighbors=7)))
#pipelines.append(('LR', LinearRegression()))
#pipelines.append(('Ridge', Ridge()))
#pipelines.append(('Lasso', Lasso()))
#pipelines.append(('SVR', SVR()))

## =============================================================================


def apply_loocv(X_train, y_train, X_test, y_test):
    dict = {}
    results = []
    names = []
    return(x_train,y,x_test)

x_train , y ,x_test = get_train_test(tr,ts)

# In[13]:

from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV


# ### ADB

# In[14]:

params_adb = [{'learning_rate' : [1,1.2,1.5,1.7,2] ,'n_estimators' : [300,400,500]}]


# In[15]:

gsearch = GridSearchCV(estimator=AdaBoostRegressor(),
                       param_grid=params_adb,
                       scoring='neg_mean_squared_error',
                       n_jobs=50, cv=5, verbose=10)


# In[ ]:

gsearch.fit(x_train,y)


print(gsearch.best_params_)
print(gsearch.cv_results_)
Example no. 15
def price_predictions(ticker, start, end, forecast_out):
    file_path = symbol_to_path(ticker)
    df = pd.read_csv(file_path,
                     index_col="<DTYYYYMMDD>",
                     parse_dates=True,
                     usecols=[
                         "<DTYYYYMMDD>", "<OpenFixed>", "<HighFixed>",
                         "<LowFixed>", "<CloseFixed>", "<Volume>"
                     ],
                     na_values="nan")
    df = df.rename(
        columns={
            '<DTYYYYMMDD>': 'Date',
            "<OpenFixed>": 'Open',
            '<HighFixed>': 'High',
            '<LowFixed>': 'Low',
            '<CloseFixed>': 'Close',
            '<Volume>': 'Volume'
        })

    # columns order for backtrader type
    columnsOrder = ["Open", "High", "Low", "Close", "Volume", "OpenInterest"]
    # change the index by new index
    df = df.reindex(columns=columnsOrder)
    # change date index to increasing order
    df = df.sort_index()
    # take a part of dataframe
    df = df.loc[start:end]

    df['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
    bbwindow = 25
    vlwindow = 10
    mmtum = 10
    df['BB_Value'] = compute_indicator_bb(df, window=bbwindow)
    df['Volatility'] = compute_indicator_volatility(df, timeperiod=vlwindow)
    df['Momentum'] = talib.MOM(df['Close'].values, timeperiod=mmtum)
    df['OBV'] = talib.OBV(df['Close'].values,
                          df['Volume'].values.astype(np.float64))
    df['MACD'], _, _ = talib.MACD(df['Close'].values,
                                  fastperiod=12,
                                  slowperiod=26,
                                  signalperiod=9)
    _, df['STOCH'] = talib.STOCH(df['High'].values,
                                 df['Low'].values,
                                 df['Close'].values,
                                 fastk_period=14,
                                 slowk_period=1,
                                 slowd_period=5)
    df['MFI'] = talib.MFI(df['High'].values,
                          df['Low'].values,
                          df['Close'].values,
                          df['Volume'].values.astype(np.float64),
                          timeperiod=14)
    #    df['EMA3'] = pd.Series(pd.Series.ewm(df['Close'], span = 3, min_periods = 3-1).mean())
    #    df['EMA6'] = pd.Series(pd.Series.ewm(df['Close'], span = 6, min_periods = 6-1).mean())
    #    df['EMA18'] = pd.Series(pd.Series.ewm(df['Close'], span = 18,  min_periods = 18-1).mean())
    df['PDI'] = talib.PLUS_DI(df['High'].values,
                              df['Low'].values,
                              df['Close'].values,
                              timeperiod=14)
    df['NDI'] = talib.MINUS_DI(df['High'].values,
                               df['Low'].values,
                               df['Close'].values,
                               timeperiod=14)
    #    df = df[['Close', 'HL_PCT', 'PCT_change', 'Volume','BB_Value',
    #                        'Volatility', 'Momentum', 'MACD', 'STOCH', 'MFI', 'OBV']]
    #
    df = df[['Close', 'HL_PCT', 'PCT_change', 'Volume', 'BB_Value']]
    df.fillna(method="ffill", inplace=True)
    df.fillna(method="backfill", inplace=True)

    forecast_col = 'Close'

    # inplace: boolean, default False.
    # If True, fill in place. Note: this will modify any other views on this
    # object (e.g. a no-copy slice for a column in a DataFrame).
    # Forecast 1% of the data:
    # copy the data from the Close column into a new column,
    # shifted up by `forecast_out` rows.
    df['Target'] = df[forecast_col].shift(-forecast_out)
    # The drop call removes the Target label column;
    # axis=1 drops columns rather than index labels.
    X = np.array(df.drop(['Target'], axis=1))
    y_true = df[forecast_col][-forecast_out:]
    # Preprocessing Input Data
    X = preprocessing.scale(X)

    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler()
    #X = scaler.fit_transform(X)

    # Split the X and X_lately values out of the series
    X_lately = X[-forecast_out:]

    X = X[:-forecast_out]
    # Remove the NA values
    # df.dropna(inplace=True)
    # The target is the y vector taken from the Target column
    y = np.array(df['Target'].dropna())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #X_train, X_test, y_train, y_test = train_test_split(X, y)

    #from sklearn.preprocessing import MinMaxScaler
    #from sklearn.preprocessing import StandardScaler
    #scaler = MinMaxScaler()
    #scaler = StandardScaler()
    #X_train = scaler.fit_transform(X_train)
    #X_test = scaler.transform(X_test)
    #X_lately = scaler.transform(X_lately)

    n_neighbors = 5
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights='uniform')
    knn.fit(X_train, y_train)
    print('Train score KNN: ', knn.score(X_train, y_train),
          'Test score KNN : ', knn.score(X_test, y_test))
    forecast_set = knn.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)

    bagging = BaggingRegressor(DecisionTreeRegressor(),
                               n_estimators=50,
                               random_state=50)
    bagging.fit(X_train, y_train)
    print('Train score BAG: ', bagging.score(X_train, y_train),
          'Test score BAG : ', bagging.score(X_test, y_test))
    forecast_set = bagging.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)

    rf = RandomForestRegressor(n_estimators=50, random_state=50)
    rf.fit(X_train, y_train)
    print('Train score RF: ', rf.score(X_train, y_train), 'Test score RF : ',
          rf.score(X_test, y_test))
    forecast_set = rf.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)

    adaboost = AdaBoostRegressor(neighbors.KNeighborsRegressor(n_neighbors=5),
                                 n_estimators=30,
                                 random_state=0)

    #adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
    #                          n_estimators=30, random_state=0)
    adaboost.fit(X_train, y_train)
    print('Train score Ada: ', adaboost.score(X_train, y_train),
          'Test score Ada : ', adaboost.score(X_test, y_test))
    forecast_set = adaboost.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)
    def read_constructor_json(self):
        for i in self.pipeline_constructor_json['estimators']:
            model_not_found = False

            if self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'RandomForestRegressor':
                model = RandomForestRegressor(n_jobs=-1)
            elif self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'Lasso':
                model = Lasso()
            elif self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'LinearRegression':
                model = LinearRegression()
            elif self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'KNeighborsRegressor':
                model = KNeighborsRegressor()
            elif self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'AdaBoostRegressor':
                model = AdaBoostRegressor()
            else:
                model_not_found = True

            if model_not_found:
                print('Unidentified estimator: ' +
                      self.pipeline_constructor_json['estimators'][i]['model'])
            else:
                self.estimators.append({
                    'model':
                    model,
                    'parameters':
                    self.pipeline_constructor_json['estimators'][i]
                    ['parameters']
                })

        for i in self.pipeline_constructor_json['pre-estimators']:
            model_not_found = False

            if self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'VarianceThreshold':
                model = VarianceThreshold()
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'SelectKBest':
                model = CustomSelectKBest()
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'MinMaxScaler':
                model = MinMaxScaler()
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'StandardScaler':
                model = StandardScaler()
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'RFE':
                model = RFE(estimator=DecisionTreeRegressor())
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'SimpleImputer':
                model = SimpleImputer()
            else:
                model_not_found = True

            if model_not_found:
                print('Unidentified pre-estimator: ' +
                      self.pipeline_constructor_json['estimators'][i]['model'])
            else:
                self.pre_estimators.append({
                    'model':
                    model,
                    'parameters':
                    self.pipeline_constructor_json['pre-estimators'][i]
                    ['parameters']
                })
Example no. 17
predictors = studies_no_encoding.drop('aadb',axis=1)
rf = h2o.estimators.H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10)
rf.train(x=predictors, y=response, training_frame=h2o.H2OFrame(studies_no_encoding))

h2o.H2OFrame(studies_no_encoding)

X = studies.drop('aadb',axis=1)
y = studies['aadb']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# A single swappable 'clf' step, matched by the 'clf' keys in `parameters` below:
pipeline = Pipeline([
    ('clf', linear_model.Lasso()),
])
parameters = [
    {
        'clf': (linear_model.Lasso(),)
    }, {
        'clf': (AdaBoostRegressor(),),
        'clf__n_estimators': (1, 5, 25, 100)
    }
]
grid_search = GridSearchCV(pipeline, parameters, cv=10,
                           scoring='neg_mean_squared_error')
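# A minimal sketch of running the estimator-swapping search defined above
# (assumes the imports already used by this script plus X_scaled and y from
# the lines above; each candidate in `parameters` replaces the 'clf' step):
grid_search.fit(X_scaled, y)
print(grid_search.best_params_)
print(-grid_search.best_score_)  # scoring is neg_mean_squared_error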


print(len(studies))

## Don't expect linear models to do well
                                                        test_size=0.2,
                                                        random_state=1)
    return (X_train, X_test, y_train, y_test), column_names


'''Question 3: 3 classes of algorithm'''
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=1)

gdbr = GradientBoostingRegressor(learning_rate=0.1,
                                 loss='ls',
                                 n_estimators=100,
                                 random_state=1)

abr = AdaBoostRegressor(DecisionTreeRegressor(),
                        learning_rate=0.1,
                        loss='linear',
                        n_estimators=100,
                        random_state=1)

k_fold = KFold(n_splits=5, shuffle=True)


def cv_mse_r2(model):
    ''' Takes an instantiated model (estimator) and returns the average
        mean square error (mse) and coefficient of determination (r2) from
        kfold cross-validation.
        Parameters: estimator: model object
                    X_train: 2d numpy array
                    y_train: 1d numpy array
                    nfolds: the number of folds in the kfold cross-validation
        Returns:  mse: average mean_square_error of model over number of folds
Example no. 19
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase '
    #           'training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {},
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'presort': False,
            'learning_rate': 0.1,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'presort': False,
            'learning_rate': 0.1,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {},
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search is True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if
        # the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will '
            'be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
            max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(
            max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier()
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor()
        model_map['CatBoostClassifier'] = CatBoostClassifier()

    if model_name[:12] == 'DeepLearning':
        if keras_installed is False:
            # Suppress some level of logs if TF is installed (but allow it to not be installed,
            # and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except:
                # TODO: Fix bare Except
                pass

        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print(
            'It appears you are trying to use a library that is not available when we try to '
            'import it, or using a value for model_names that we do not recognize.'
        )
        raise e

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
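# Usage sketch (assumes this module's imports and installed-flag globals):
# model = get_model_from_name('RandomForestRegressor',
#                             training_params={'n_estimators': 100})
# model.fit(X_train, y_train)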
    'MSZoning', 'CentralAir', 'KitchenQual', 'Neighborhood', 'Condition1',
    'Heating'
]

numerical_transformer = SimpleImputer(strategy="mean")
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[(
    'num', numerical_transformer,
    numerical_features), ('cat', categorical_transformer,
                          categorical_features)])

regressor = AdaBoostRegressor(
    n_estimators=26,
    base_estimator=DecisionTreeRegressor(max_depth=20),
    learning_rate=1.36)

my_pipeline = Pipeline(steps=[('preprocessor',
                               preprocessor), ('model', regressor)])

# test_X.fillna({'KitchenQual': 'TA', 'MSZoning': 'RL'}, inplace=True)
# train_X, test_X = prepare_categorical_features(train_X, test_X, categorical_features, numerical_features)

print("Fitting regressor...")
# regressor.fit(train_X, train_y)
my_pipeline.fit(train_X, train_y)

print("Predicting labels...")
# test_predictions = regressor.predict(test_X)
test_predictions = my_pipeline.predict(test_X)
Example no. 21
                    param_grid=param_grid,
                    scoring=scoring,
                    cv=kfold)
grid_result = grid.fit(X=rescaledX, y=y_train)

print('Best: {} using {}'.format(grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('{} ({}) with {}'.format(mean, std, param))

# b) Ensemble algorithms
ensembles = {}  # models
ensembles['ScalerAB'] = Pipeline([('Scaler', StandardScaler()),
                                  ('AB', AdaBoostRegressor())])
ensembles['ScalerAB-KNN'] = Pipeline([
    ('Scaler', StandardScaler()),
    ('ABKNN',
     AdaBoostRegressor(base_estimator=KNeighborsRegressor(n_neighbors=3)))
])
ensembles['ScalerAB-LR'] = Pipeline([
    ('Scaler', StandardScaler()),
    ('ABLR', AdaBoostRegressor(base_estimator=LinearRegression()))
])
ensembles['ScalerRFR'] = Pipeline([('Scaler', StandardScaler()),
                                   ('RFR', RandomForestRegressor())])
ensembles['ScalerETR'] = Pipeline([('Scaler', StandardScaler()),
                                   ('ETR', ExtraTreesRegressor())])
ensembles['ScalerGBR'] = Pipeline([('Scaler', StandardScaler()),
                                   ('GBR', GradientBoostingRegressor())])
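# A minimal sketch of scoring the ensemble pipelines above with the same
# kfold/scoring objects used earlier in this script (X_train is an assumed
# name for the training features):
for name, model in ensembles.items():
    cv_results = cross_val_score(model, X_train, y_train,
                                 cv=kfold, scoring=scoring)
    print('{}: {} ({})'.format(name, cv_results.mean(), cv_results.std()))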
Example no. 22
# In[334]:

#Random Forest
RF = RandomForestRegressor(n_estimators=20, random_state=36)
RF.fit(X_train, y_train)
y_pred_RF = RF.predict(X_test)
print("Random Forest Regression R^2 value: " +
      str((r2_score(y_test, y_pred_RF))))
print("Random Forest Regression MSE value: " +
      str(mean_squared_error(y_test, y_pred_RF)))

# In[337]:

#Ada boost regressor
ADA = AdaBoostRegressor()
ADA.fit(X_train, y_train)
y_pred_ADA = ADA.predict(X_test)
print("Ada Boost Regression R^2 value: " + str(r2_score(y_test, y_pred_ADA)))
print("Ada Boost Regression MSE value: " +
      str(mean_squared_error(y_test, y_pred_ADA)))

# Random forest seems like the best regression model. As a result I am going to validate the results of the Random Forest model for comparison with classification techniques.

# In[338]:

r2_scores_random_forest = []
MSE_scores_random_forest = []

kf = KFold(n_splits=len(df_columns_list))
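# [Sketch] The validation loop itself is not in this snippet; a minimal
# version filling the lists above, assuming feature/target arrays X and y:
# for train_idx, test_idx in kf.split(X):
#     RF.fit(X[train_idx], y[train_idx])
#     pred = RF.predict(X[test_idx])
#     r2_scores_random_forest.append(r2_score(y[test_idx], pred))
#     MSE_scores_random_forest.append(mean_squared_error(y[test_idx], pred))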
Example no. 23
    GradientBoostingRegressor(
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
    SVR(
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR
    LinearSVR(
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html
    ElasticNet(
        alpha=0.001, max_iter=10000
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
    SGDRegressor(
        max_iter=10000, tol=1e-3
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html
    BayesianRidge(),  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.BayesianRidge.html
    KernelRidge(
        alpha=0.6, kernel='polynomial', degree=2, coef0=2.5
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.kernel_ridge.KernelRidge.html
    ExtraTreesRegressor(
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html
    XGBRegressor(),
    AdaBoostRegressor(
        n_estimators=50
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
    BaggingRegressor(
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
    DecisionTreeRegressor(
    ),  #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
    KNeighborsRegressor()
]  # https://scikit-learn.org/0.18/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

for m in models:
    print("- {}".format(m.__class__.__name__))
Example no. 24
files.download('SVR.csv')

from sklearn import metrics

#MAE
print(metrics.mean_absolute_error(y_test, prediction))

#MSE
print(metrics.mean_squared_error(y_test, prediction))

#RMSE
print(np.sqrt(metrics.mean_squared_error(y_test, prediction)))

from sklearn.ensemble import AdaBoostRegressor

model_ada = AdaBoostRegressor(n_estimators=100)
model_ada.fit(X_train, y_train)

prediction_ada = model_ada.predict(X_test)

from matplotlib import pyplot as plt
plt.plot(t, y_test, 'bs', t, prediction_ada, 'g^')
plt.xlabel('Samples')
plt.ylabel('prediction')
plt.title('adaBoost')

Data = [prediction_ada, y_test]

Data = pd.DataFrame(Data)
Data = Data.T
print(Data)
Example no. 25
                            max_iter=500,
                            n_jobs=-1)
    stda = StandardScaler(with_mean=False)
    # fit the scaler on the training data, then apply it to the test data
    processTrain = pd.DataFrame(stda.fit_transform(x_train))
    processTest = pd.DataFrame(stda.transform(x_test))
    lr_oof_train, lr_oof_test = sm.get_oof_tree(lr, processTrain, y_train,
                                                processTest, ntrain, ntest)
    # lr_oof_train,lr_oof_test = sm.get_oof_tree(lr,x_train,y_train,x_test,ntrain, ntest)

    # get_oof_regressor(clf, x_train, y_train, x_test, ntrain, ntest, NFOLDS = 5)
    rf = RandomForestRegressor(n_estimators=600,
                               max_depth=8,
                               n_jobs=-1,
                               random_state=SEED)
    ada = AdaBoostRegressor(n_estimators=60,
                            learning_rate=0.01,
                            loss='square',
                            random_state=SEED)
    gb = GradientBoostingRegressor(learning_rate=0.02,
                                   n_estimators=80,
                                   subsample=0.75,
                                   max_depth=6,
                                   random_state=SEED)
    et = ExtraTreesRegressor(n_estimators=150,
                             max_depth=8,
                             max_features='sqrt',
                             n_jobs=-1,
                             random_state=SEED)
    rf_reg_train, rf_reg_test = sm.get_oof_regressor(rf, x_train, y_train,
                                                     x_test, ntrain, ntest)
    ada_reg_train, ada_reg_test = sm.get_oof_regressor(ada, x_train, y_train,
                                                       x_test, ntrain, ntest)
Example no. 26
# more neighbours + weighting according to distance
train_display(KNeighborsRegressor(n_neighbors=5, weights='distance'), img)
train_display(KNeighborsRegressor(n_neighbors=25, weights='distance'), img)

# KNN
train_display(KNeighborsRegressor(n_neighbors=2, metric='canberra'), img)

# # gradient boosting
# train_display(XGBoostRegressor(max_depth=5, \
#                                n_estimators=100, \
#                                subsample=0.5, nthreads=4), img)

# # Gradient Boosting with deep trees
# train_display(XGBoostRegressor(max_depth=12, n_estimators=100, \
#                                subsample=0.5, nthreads=4, eta=0.1), img)

# # NN
# train_display(TheanetsRegressor(layers=[20, 20], hidden_activation='tanh',
#                                 trainers=[{'algo': 'adadelta', 'learning_rate': 0.01}]), img)

# AdaBoost over Decision Trees using random projections
base = make_pipeline(GaussianRandomProjection(n_components=10),
                     DecisionTreeRegressor(max_depth=10, max_features=5))
train_display(AdaBoostRegressor(base, n_estimators=50, learning_rate=0.05),
              img)

# Bagging over decision trees using random projections, sometimes referred to as Random Forest
base = make_pipeline(GaussianRandomProjection(n_components=15),
                     DecisionTreeRegressor(max_depth=12, max_features=5))
train_display(BaggingRegressor(base, n_estimators=100), img)
regressorDT = DecisionTreeRegressor(random_state=0)
regressorDT.fit(x_train, y_train)
y_predDT = regressorDT.predict(x_test)
y_trainpredDT = regressorDT.predict(x_train)
print(np.sqrt(metrics.mean_squared_error(y_test, y_predDT)))
print(np.sqrt(metrics.mean_squared_error(y_train, y_trainpredDT)))

print('Variance score: %.2f' % metrics.r2_score(y_test, y_predDT))

# AdaBoost
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(base_estimator=regressorDT,
                        learning_rate=1.0,
                        loss='linear',
                        n_estimators=50,
                        random_state=None)
ada.fit(x_train, y_train)
y_predada = ada.predict(x_test)
y_trainpredada = ada.predict(x_train)
print(np.sqrt(metrics.mean_squared_error(y_test, y_predada)))
print(np.sqrt(metrics.mean_squared_error(y_train, y_trainpredada)))

print('Variance score: %.2f' % metrics.r2_score(y_test, y_predada))
# ExtraTrees Regressor

from sklearn.ensemble import ExtraTreesRegressor

extra = ExtraTreesRegressor(n_estimators=10,
                            criterion='mse',
pipelines.append(('KNN',
                  Pipeline([('Scaler', StandardScaler()),
                            ('KNN', KNeighborsRegressor())])))

pipelines.append(('DTR',
                  Pipeline([('Scaler', StandardScaler()),
                            ('DTR', DecisionTreeRegressor())])))

pipelines.append(('RF',
                  Pipeline([('Scaler', StandardScaler()),
                            ('RF', RandomForestRegressor())])))

pipelines.append(('ADA',
                  Pipeline([('Scaler', StandardScaler()),
                            ('ADA', AdaBoostRegressor())])))

pipelines.append(
    ('SVR', Pipeline([('Scaler', StandardScaler()), ('SVR', SVR())])))
pipelines.append(
    ('SVR-RBF',
     Pipeline([('Scaler', StandardScaler()),
               ('SVR', SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1))])))
pipelines.append(('SVR-Linear',
                  Pipeline([('Scaler', StandardScaler()),
                            ('SVR', SVR(kernel='linear', C=100,
                                        gamma='auto'))])))
pipelines.append(('SVR-Poly',
                  Pipeline([('Scaler', StandardScaler()),
                            ('SVR',
                             SVR(kernel='poly',
Example no. 29
def train(classifier, X, Y, is_classf, outcome, fs_method, imp_method, 
    data_dir, results_dir, cv=10, verbose=0):
    results_path = os.path.join(results_dir, 
        'score_{}-{}-{}-{}.json'.format(classifier, outcome, fs_method, imp_method))
    if os.path.exists(results_path):
        if verbose:
            print("Model already trained. See {}".format(results_path))
        return
    if classifier == 'Linear':
        if is_classf:
            model = LogisticRegression()
        else:
            model = LinearRegression()
    elif classifier == 'Ridge':
        if is_classf:
            model = RidgeClassifierCV(alphas=(1e-3, 1e-2, 1e-1, 1, 10, 100))
        else:
            model = RidgeCV(alphas=(1e-3, 1e-2, 1e-1, 1, 10, 100))
        print("Finding best alpha...")
        model.fit(X, Y)
        best_alpha = model.alpha_
        print("Best alpha: {}".format(model.alpha_))
        if is_classf:
            model = RidgeClassifier(alpha=best_alpha) 
        else:
            model = Ridge(alpha=best_alpha) 
    elif classifier == 'AdaBoost':
        if is_classf:
            estimator = DecisionTreeClassifier(max_depth=1) 
            model = AdaBoostClassifier(estimator, n_estimators=100) 
        else: 
            estimator = DecisionTreeRegressor(max_depth=1)
            model = AdaBoostRegressor(estimator, n_estimators=100)
    elif classifier == 'RandomForest':
        if is_classf:
            model = RandomForestClassifier(n_estimators=50)
        else:
            model = RandomForestRegressor(n_estimators=50)
    elif classifier == 'SVM':
        if is_classf:
            model = SVC(kernel='linear', probability=True)
        else:
            model = SVR(kernel='linear')
    else:
        raise ValueError("model {} not available".format(classifier))

    scores = {}
    metric = brier_score_loss if is_classf else mean_squared_error
    metric_str = 'brier_loss' if is_classf else 'mse'
    if verbose:
        print("Training {}...".format(classifier))
        print("10-Fold cross validation...")
    start = time.time()

    kf = KFold(n_splits=cv)
    kf.get_n_splits(X)
    losses = []
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        if verbose:
            sys.stdout.write("\rFold {}/{}".format(i+1, cv))
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index] 
        model.fit(X_train, Y_train)
        if classifier == 'Ridge' and is_classf:
            d = model.decision_function(X_test)
            Y_pred = np.exp(d) / (1 + np.exp(d))
        else:
            Y_pred = model.predict_proba(X_test)[:,1] if is_classf else model.predict(X_test)
        losses.append(metric(Y_test, Y_pred))
    mean_loss = np.mean(losses)
    scores['cv_{}'.format(metric_str)] = mean_loss

    total = int(time.time()-start)
    if verbose:
        print("\nTraining took {}m{}s.".format(total // 60, total % 60))
        print("cv mean {}: {:.4f}".format(metric_str, mean_loss))
        print("Bootstrapping...B)")
    bs_losses = []
    for i in range(cv):
        if verbose:
            sys.stdout.write("\rSample {}/{}".format(i+1, cv))
        data = np.hstack((np.arange(len(X)).reshape(len(X), 1), X, Y.reshape(len(Y), 1)))
        train = resample(data, n_samples=int(0.7*len(X)))
        train_ids = set(train[:,0].astype(np.int64))
        train = train[:,1:]
        test = np.array([sample[1:] for sample in data if sample[0] not in train_ids])
        X_train, Y_train = train[:,:-1], train[:,-1]
        X_test, Y_test = test[:,:-1], test[:,-1]
        model.fit(X_train, Y_train)
        if classifier == 'Ridge' and is_classf:
            # from https://stackoverflow.com/questions/22538080/scikit-learn-ridge-classifier-extracting-class-probabilities
            d = model.decision_function(X_test)
            Y_pred = np.exp(d) / (1 + np.exp(d))
        else:
            Y_pred = model.predict_proba(X_test)[:,1] if is_classf else model.predict(X_test)
        bs_losses.append(metric(Y_test, Y_pred))
    mean_loss = np.mean(bs_losses)
    n = len(bs_losses)
    lower = mean_loss - 1.96*np.std(bs_losses)/np.sqrt(n)
    upper = mean_loss + 1.96*np.std(bs_losses)/np.sqrt(n)
    scores['bootstrap_{}'.format(metric_str)] = mean_loss
    scores['bootstrap_95_lower'] = lower
    scores['bootstrap_95_upper'] = upper

    if verbose:
        print("\nbootstrap mean {}: {:.4f}".format(metric_str, mean_loss))
        print("95% confidence interval: [{:.4f}, {:.4f}]".format(lower, upper))

    with open(results_path, 'w') as f: 
        json.dump(scores, f)
        if verbose:
            print("Successfully saved scores.")
def evaluateIndividualregressors(x, y, train_size_pct):
    """
    evaluateIndividualregressors
        x : The features of the dataset to be used for predictions
        y : The target class for each row in "x"
        train_size_pct : {float in the range(0.0, 1.0)} the percentage of the dataset that should be used for training
    """
    max_depth_x2 = MAX_DEPTH * 2
    n_neighbors_x2 = N_NEIGHBORS * 2

    lr1 = LinearRegression()
    rf_x2 = RandomForestRegressor(max_depth=max_depth_x2, random_state=SEED)
    et = ExtraTreesRegressor(max_depth=MAX_DEPTH, random_state=SEED)
    dectree = DecisionTreeRegressor(max_depth=MAX_DEPTH, random_state=SEED)
    knn = KNeighborsRegressor(n_neighbors=N_NEIGHBORS)
    knn_x2 = KNeighborsRegressor(n_neighbors=n_neighbors_x2)
    knn3 = KNeighborsRegressor(n_neighbors=20, metric='euclidean')
    dumm = DummyRegressor()
    knb = neighbors.KNeighborsRegressor()
    SVR1 = MultiOutputRegressor(NuSVR())
    ada1 = MultiOutputRegressor(AdaBoostRegressor())
    gpc1 = GaussianProcessRegressor()
    bag = BaggingRegressor(base_estimator=ExtraTreesRegressor(),
                           n_estimators=10,
                           random_state=0)
    svr1 = MultiOutputRegressor(SVR())
    r1 = Ridge()
    r2 = RidgeCV()
    xgbrf = MultiOutputRegressor(XGBRFRegressor())
    xgb = MultiOutputRegressor(XGBRegressor())
    gbr = MultiOutputRegressor(
        GradientBoostingRegressor(n_estimators=100,
                                  learning_rate=0.1,
                                  max_depth=1,
                                  random_state=0,
                                  loss='squared_error'))
    lasso = MultiTaskLassoCV(random_state=42)
    Bay = MultiOutputRegressor(linear_model.BayesianRidge())
    lassolars = linear_model.LassoLars(alpha=.1, normalize=False)
    linsvr = MultiOutputRegressor(LinearSVR())
    regressor_mapping = {
        '1-linear regression': lr1,
        f'2-RandomForest case2-{max_depth_x2}': rf_x2,
        f'3-ExtraTrees-{MAX_DEPTH}': et,
        f'4-DecisionTree-{MAX_DEPTH}': dectree,
        f'5-KNeighbors case1-{N_NEIGHBORS}': knn,
        f'5-KNeighbors case2-{n_neighbors_x2}': knn_x2,
        '6-knn case 3': knn3,
        '7-dummy-': dumm,
        '8-neighbors.KNeighbors-': knb,
        '9-NuSVR-': SVR1,
        '10- adaboost-': ada1,
        '11- GaussianProcessRegressor': gpc1,
        '12- bagging': bag,
        '13- svr1': svr1,
        '14- ridge': r1,
        '15- ridgecv': r2,
        '16- xgbrf': xgbrf,
        '17- xgboost': xgb,
        '18- GradientBoosting': gbr,
        '19- lasso': lasso,
        '20- BayesianRidge': Bay,
        '21- lassolars': lassolars,
        '22- linsvr': linsvr
    }

    for model_name, model in regressor_mapping.items():

        train_test_model(model_name, model, x, y, train_size_pct)
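    # [Sketch] train_test_model is defined elsewhere; a hypothetical minimal
    # version consistent with how it is called above:
    # def train_test_model(model_name, model, x, y, train_size_pct):
    #     x_tr, x_te, y_tr, y_te = train_test_split(
    #         x, y, train_size=train_size_pct, random_state=SEED)
    #     model.fit(x_tr, y_tr)
    #     print(model_name, r2_score(y_te, model.predict(x_te)))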