def train_mimic(self, training_data, mimic_env, save_model_dir, log_file):
        self.model = DecisionTreeRegressor(max_leaf_nodes=self.max_leaf_nodes,
                                           criterion=self.criterion,
                                           splitter=self.mode)
        self.model.fit(training_data[0], training_data[1])
        # self.print_tree()
        leaves_number = (self.model.tree_.node_count + 1) // 2  # binary tree: leaves = (nodes + 1) / 2
        print("Leaves number is {0}".format(leaves_number))
        predict_dictionary = {}
        predictions = self.model.predict(training_data[0])
        for predict_index in range(len(predictions)):
            predict_value = predictions[predict_index]
            if predict_value in predict_dictionary.keys():
                predict_dictionary[predict_value].append(predict_index)
            else:
                predict_dictionary.update({predict_value:[predict_index]})

        return_value_log = mimic_env.get_return(state=list(predict_dictionary.values()))
        return_value_log_struct = mimic_env.get_return(state=list(predict_dictionary.values()), apply_structure_cost=True)
        return_value_var_reduction = mimic_env.get_return(state=list(predict_dictionary.values()), apply_variance_reduction=True)
        mae, rmse = compute_regression_results(predictions=predictions, labels=training_data[1])
        # print("Training return:{0} with mae:{1} and rmse:{2}".format(return_value, mae, rmse), file=log_file)

        with open(save_model_dir, 'wb') as f:
            pickle.dump(obj=self.model, file=f)

        return return_value_log, return_value_log_struct, \
               return_value_var_reduction, mae, rmse, leaves_number
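The leaf count above uses the fact that a fitted sklearn decision tree is strictly binary, so leaves = (node_count + 1) / 2. A minimal standalone sketch of the same identity, checked against sklearn's own get_n_leaves() (synthetic data, illustrative only):

from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
model = DecisionTreeRegressor(max_leaf_nodes=8).fit(X, y)

# every internal node of a binary tree has exactly two children,
# so leaves = (total nodes + 1) / 2
assert model.get_n_leaves() == (model.tree_.node_count + 1) // 2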
Example #2
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, all_features, "target")
        if "depth" in parameters:
            model = DecisionTreeRegressor(max_depth = parameters["depth"], random_state=42)
        elif "leaf" in parameters:
            model = DecisionTreeRegressor(min_samples_leaf = parameters["leaf"], random_state=42)
        elif "max_leaf" in parameters:
            model = DecisionTreeRegressor(max_leaf_nodes = parameters["max_leaf"], random_state=42)
            
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
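evalOne relies on module-level locations, data, all_features, splitDataForXValidation and rmseEval. A hedged sketch of a driver over a small grid (the key names are the ones evalOne checks; the values are illustrative):

best_rmse, best_params = float("inf"), None
for params in ([{"depth": d} for d in (5, 10, 20)]
               + [{"max_leaf": n} for n in (15, 50, 100)]):
    rmse = evalOne(params)
    if rmse < best_rmse:
        best_rmse, best_params = rmse, params
print(best_params, best_rmse)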
Example #3
def getModels():
    models = {}
    models['dt'] = DecisionTreeRegressor(max_depth=50)
    models['rf1'] = RandomForestRegressor()
    models['rf2'] = RandomForestRegressor(n_estimators=128, max_depth=15)
    models['gbr'] = GradientBoostingRegressor(n_estimators=128,
                                              max_depth=5,
                                              learning_rate=1.0)
    # models['abr'] = AdaBoostRegressor(n_estimators=128)
    return models
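A sketch of consuming getModels(), assuming train/test splits X_train, X_test, y_train, y_test are already in scope:

from sklearn.metrics import mean_squared_error

for name, model in getModels().items():
    model.fit(X_train, y_train)
    mse = mean_squared_error(y_test, model.predict(X_test))
    print("{0}: test MSE = {1:.4f}".format(name, mse))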
Example #4
    def fit(self, X, y, sample_weight=None):
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        assert len(X) == len(y), 'Different lengths of X and y'
        X = pandas.DataFrame(X)
        y = numpy.array(column_or_1d(y), dtype=int)
        assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
        self.check_params()

        self.estimators = []
        self.scores = []

        n_samples = len(X)
        n_inbag = int(self.subsample * len(X))
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)

        # preparing for fitting in trees
        X = self.get_train_vars(X)
        self.n_features = X.shape[1]
        X, y = check_arrays(X, y)
        X = X.astype(DTYPE)
        y_pred = numpy.zeros(len(X), dtype=float)

        if self.init_estimator is not None:
            y_signed = 2 * y - 1
            self.init_estimator.fit(X, y_signed, sample_weight=sample_weight)
            y_pred += numpy.ravel(self.init_estimator.predict(X))

        for stage in range(self.n_estimators):
            # tree creation
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features,
                random_state=self.random_state,
                max_leaf_nodes=self.max_leaf_nodes)

            # tree learning
            residual = self.loss.negative_gradient(y_pred)
            train_indices = self.random_state.choice(n_samples, size=n_inbag, replace=False)

            tree.fit(X[train_indices], residual[train_indices],
                     sample_weight=sample_weight[train_indices], check_input=False)
            # update tree leaves
            if self.update_tree:
                self.loss.update_tree(tree.tree_, X=X, y=y, y_pred=y_pred, sample_weight=sample_weight,
                                      update_mask=numpy.ones(len(X), dtype=bool), residual=residual)

            y_pred += self.learning_rate * tree.predict(X)
            self.estimators.append(tree)
            self.scores.append(self.loss(y_pred))
        return self
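Prediction mirrors the training loop: the optional init estimator's output plus the learning-rate-scaled sum of the stored trees. A hedged companion sketch (decision_function is an assumed name; it is not shown in the original class):

    def decision_function(self, X):
        # same preprocessing as fit(), then accumulate stage outputs
        X = self.get_train_vars(pandas.DataFrame(X))
        X = numpy.array(X, dtype=DTYPE)
        y_pred = numpy.zeros(len(X), dtype=float)
        if self.init_estimator is not None:
            y_pred += numpy.ravel(self.init_estimator.predict(X))
        for tree in self.estimators:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred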
Example #5
 def addBoostIteration(self):
     rv = self.regressionValues()
     trees = []
     mask = numpy.array([True] * self.nF)
     for i in range(0, self.nF):
         mask[:] = True
         mask[i] = False
         tree = DecisionTreeRegressor(max_depth=self.max_depth)
         tree.fit(self.data[:, mask], rv[:, i])
         # newpsis[:, i] = tree.predict(self.data[:, mask])
         trees.append(tree)
     self.trees.append(trees)
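A matching prediction step as a hedged sketch: each stored tree predicts its feature's regression value from all other features, rebuilding the same mask used above (predict_iteration is a hypothetical name):

 def predict_iteration(self, data, it=-1):
     preds = numpy.empty((data.shape[0], self.nF))
     mask = numpy.array([True] * self.nF)
     for i, tree in enumerate(self.trees[it]):
         mask[:] = True
         mask[i] = False
         preds[:, i] = tree.predict(data[:, mask])
     return preds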
Example #6
    def set_params_dict(self, learner_params):

        if self.method == 'classification':
            self.learner = ensemble.AdaBoostClassifier(
                base_estimator=DecisionTreeClassifier(
                    max_depth=learner_params['base_estimator__max_depth'],
                    max_features=learner_params['base_estimator__max_features']
                ),
                n_estimators=int(learner_params['n_estimators']),
                learning_rate=learner_params['learning_rate'])

        elif self.method == 'regression':
            self.learner = ensemble.AdaBoostRegressor(
                base_estimator=DecisionTreeRegressor(
                    max_depth=learner_params['base_estimator__max_depth'],
                    max_features=learner_params['base_estimator__max_features']
                ),
                n_estimators=int(learner_params['n_estimators']),
                learning_rate=learner_params['learning_rate'])
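A usage sketch for the regression branch (model, X_train and y_train are assumed; the dictionary keys are the ones set_params_dict reads):

learner_params = {
    'base_estimator__max_depth': 4,
    'base_estimator__max_features': 0.8,
    'n_estimators': 100,
    'learning_rate': 0.05,
}
model.set_params_dict(learner_params)
model.learner.fit(X_train, y_train)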
Example #7
    def fit_stage(self, i, X, y):
        """Fit another stage of ``n_classes_`` trees to the boosting model. """

        # induce regression tree on residuals
        tree = DecisionTreeRegressor(criterion='friedman_mse',
                                     splitter='best',
                                     max_depth=self.max_depth,
                                     min_samples_split=self.min_samples_split,
                                     min_samples_leaf=self.min_samples_leaf,
                                     min_weight_fraction_leaf=0.,
                                     max_features=None,
                                     max_leaf_nodes=None,
                                     random_state=self.random_state,
                                     presort=False)

        tree.fit(X, y, check_input=False, X_idx_sorted=None)

        # add tree to ensemble
        self.estimators[i, 0] = tree
        self.n_estimated = i + 1
Example #8
    def set_params_list(self, learner_params, i):

        m_rf_size = int(learner_params[0])
        m_learn_rate = learner_params[1]
        m_dep = int(learner_params[2])
        m_feat = learner_params[3]

        if self.method == 'classification':
            self.learner = ensemble.AdaBoostClassifier(
                base_estimator=DecisionTreeClassifier(max_depth=m_dep,
                                                      max_features=m_feat),
                n_estimators=int(m_rf_size),
                learning_rate=m_learn_rate)

        elif self.method == 'regression':
            self.learner = ensemble.AdaBoostRegressor(
                base_estimator=DecisionTreeRegressor(max_depth=m_dep,
                                                     max_features=m_feat),
                n_estimators=int(m_rf_size),
                learning_rate=m_learn_rate)
Example #9
def sklearn_titanic_regression():
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.preprocessing import LabelEncoder
    import numpy as np
    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)
    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    total_num = total_df.shape[0]
    train_df = total_df.iloc[:int(total_num * 0.8)]
    test_df = total_df.iloc[int(total_num * 0.8):]

    clf = DecisionTreeRegressor()
    clf.fit(train_df.drop(['fare'], axis=1), train_df['fare'])
    pred = clf.predict(test_df.drop(['fare'], axis=1))
    truth = test_df['fare']
    mse = np.sum(np.square(pred - truth)) / test_df.shape[0]
    print(mse)
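The hand-rolled MSE above is equivalent to sklearn's helper; a minimal check with the same pred/truth arrays:

from sklearn.metrics import mean_squared_error
print(mean_squared_error(truth, pred))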
Example #10
def _hi_level_investigation(data):
    '''Perform high-level investigation.'''
    transformers = [
        transformer.OneHotTransformer(nucl=False),
        transformer.AminoAcidTransformer()
    ]

    estimators = [
        LinearRegression(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        ExtraTreesRegressor(),
        GradientBoostingRegressor(),
        SVR(kernel='poly')
    ]

    cv = 10

    for trnsfrmr, estimator in itertools.product(transformers, estimators):
        encoded = trnsfrmr.transform(data)
        X, y = encoded[:, 2:], encoded[:, 1]
        X = StandardScaler().fit_transform(X)

        scores = cross_val_score(estimator,
                                 X,
                                 y,
                                 scoring='neg_mean_squared_error',
                                 cv=cv,
                                 verbose=False)
        scores = np.sqrt(-scores)

        print('\t'.join([
            trnsfrmr.__class__.__name__, estimator.__class__.__name__,
            str((scores.mean(), scores.std()))
        ]))

    print()
Example #11
 def __regressor__(self, X_train, Y_train):
     self.ensemble = DecisionTreeRegressor(random_state=56)
     self.ensemble.fit(X_train, Y_train)
     print('Ensemble Model Ready')
Example #12
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)

if "Auto" in datasets:
	build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto")
	build_auto(ARDRegression(normalize = True), "BayesianARDAuto")
	build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto")
	build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = False)
	build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto")
	build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
	build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto")
	build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto")
	build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto")
	build_auto(HuberRegressor(), "HuberAuto")
	build_auto(LarsCV(), "LarsAuto")
	build_auto(LassoCV(random_state = 13), "LassoAuto")
	build_auto(LassoLarsCV(), "LassoLarsAuto")
	build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", num_iteration = 11)
	build_auto(LinearRegression(), "LinearRegressionAuto")
	build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
Example #13
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")


if "Auto" in datasets:
    build_auto(
        AdaBoostRegressor(DecisionTreeRegressor(random_state=13,
                                                min_samples_leaf=5),
                          random_state=13,
                          n_estimators=17), "AdaBoostAuto")
    build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
    build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
    build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
               "DecisionTreeAuto",
               compact=False)
    build_auto(
        BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                               min_samples_leaf=5),
                         random_state=13,
                         n_estimators=3,
                         max_features=0.5), "DecisionTreeEnsembleAuto")
    build_auto(DummyRegressor(strategy="median"), "DummyAuto")
    build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
Example #14
 'elastic_net': {
     'max_iter': [5, 10, 15],
     'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
     'l1_ratio': np.arange(0.0, 1.0, 0.1)
 },
 'extra_trees': {
     "n_estimators": [80],
     'max_depth': [
         30,
     ],
     'max_features': ['auto', 'sqrt', 'log2'],
     'min_samples_split': [0.01, 0.05, 0.10],
     'min_samples_leaf': [0.005, 0.05, 0.10],
 },
 'bagging': {
     "base_estimator": [DecisionTreeRegressor(max_depth=8)],
     "n_estimators": [200],
     "max_features": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
 },
 'sgd': {
     "alpha":
     [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.25, 0.50, 0.75, 1.0],
     "penalty": ["l1", "l2"],
     "loss": [
         'squared_loss', 'huber', 'epsilon_insensitive',
         'squared_epsilon_insensitive'
     ]
 },
 'linear_svr': {
     "C": [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100],
     "loss": ['epsilon_insensitive', 'squared_epsilon_insensitive']
Example #15
from sympy.core.numbers import RealNumber
from sympy.functions.elementary.piecewise import Piecewise
from sympy.core.symbol import Symbol
import numpy as np
import pandas
from sklearn.tree import DecisionTreeRegressor
from nose.tools import assert_almost_equal

# Create some data
m = 10000
X = np.random.normal(size=(m, 10))
thresh = np.random.normal(size=10)
X_transformed = X * (X > thresh)
beta = np.random.normal(size=10)
y = np.dot(X_transformed, beta) + np.random.normal(size=m)

# Train a decision tree regressor
model = DecisionTreeRegressor()
model.fit(X, y)
print(model.score(X, y))


# Inspect
def _sym_predict_decision_tree(model,
                               names,
                               current_node=0,
                               output_idx=0,
                               class_idx=0):
    left = model.tree_.children_left[current_node]
    right = model.tree_.children_right[current_node]
    if left == -1:
        assert right == -1
        left_expr = RealNumber(model.tree_.value[current_node, output_idx,
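The snippet is truncated above. A hedged completion sketch of the same idea, turning a fitted sklearn tree into a sympy Piecewise expression (sym_predict_tree is a stand-in name, not the original function):

def sym_predict_tree(model, names, node=0):
    tree = model.tree_
    left = tree.children_left[node]
    right = tree.children_right[node]
    if left == -1:  # leaf: children_left == children_right == -1
        return RealNumber(tree.value[node, 0, 0])
    split_var = Symbol(names[tree.feature[node]])
    threshold = RealNumber(tree.threshold[node])
    # sklearn sends samples left when the feature value is <= threshold
    return Piecewise(
        (sym_predict_tree(model, names, left), split_var <= threshold),
        (sym_predict_tree(model, names, right), True))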
Example #16
    def fit(self, X, y, sample_weight=None):
        shuffler = Shuffler(X, random_state=self.random_state)
        X, y = check_arrays(X,
                            y,
                            dtype=DTYPE,
                            sparse_format="dense",
                            check_ccontiguous=True)
        y = column_or_1d(y, warn=True)
        n_samples = len(X)
        n_inbag = int(self.subsample * n_samples)
        sample_weight = check_sample_weight(
            y, sample_weight=sample_weight).copy()
        self.random_state = check_random_state(self.random_state)

        # skipping all checks
        assert self.update_on in ['all', 'same', 'other', 'random']
        y_pred = numpy.zeros(len(y), dtype=float)

        self.classifiers = []
        self.learning_rates = []
        self.loss_values = []
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)
        iter_X = shuffler.generate(0.)

        prev_smearing = 1
        for iteration in range(self.n_estimators):
            if iteration % self.recount_step == 0:
                if prev_smearing > 0:
                    iter_smearing = interpolate(self.smearing, iteration,
                                                self.n_estimators)
                    prev_smearing = iter_smearing
                    iter_X = shuffler.generate(iter_smearing)
                    iter_X, = check_arrays(iter_X,
                                           dtype=DTYPE,
                                           sparse_format="dense",
                                           check_ccontiguous=True)
                    y_pred = numpy.zeros(len(y))
                    y_pred += sum(
                        cl.predict(X) * rate for rate, cl in zip(
                            self.learning_rates, self.classifiers))

            self.loss_values.append(
                self.loss(y, y_pred, sample_weight=sample_weight))
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=interpolate(self.max_depth, iteration,
                                      self.n_estimators),
                min_samples_split=self.min_samples_split,
                min_samples_leaf=interpolate(self.min_samples_leaf,
                                             iteration,
                                             self.n_estimators,
                                             use_log=True),
                max_features=self.max_features,
                random_state=self.random_state)

            sample_mask = _random_sample_mask(n_samples, n_inbag,
                                              self.random_state)
            loss_weight = sample_weight if self.weights_in_loss else numpy.ones(
                len(sample_weight))
            tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(
                len(sample_weight))
            residual = self.loss.negative_gradient(y,
                                                   y_pred,
                                                   sample_weight=loss_weight)

            tree.fit(numpy.array(iter_X)[sample_mask, :],
                     residual[sample_mask],
                     sample_weight=tree_weight[sample_mask],
                     check_input=False)
            # update tree leaves
            if self.update_tree:
                if self.update_on == 'all':
                    update_mask = numpy.ones(len(sample_mask), dtype=bool)
                elif self.update_on == 'same':
                    update_mask = sample_mask
                elif self.update_on == 'other':
                    update_mask = ~sample_mask
                else:  # random
                    update_mask = _random_sample_mask(n_samples, n_inbag,
                                                      self.random_state)
                self.loss.update_terminal_regions(tree.tree_,
                                                  X=iter_X,
                                                  y=y,
                                                  residual=residual,
                                                  pred=y_pred,
                                                  sample_mask=update_mask,
                                                  sample_weight=sample_weight)
            iter_learning_rate = interpolate(self.learning_rate,
                                             iteration,
                                             self.n_estimators,
                                             use_log=True)
            y_pred += iter_learning_rate * tree.predict(X)
            self.classifiers.append(tree)
            self.learning_rates.append(iter_learning_rate)

        return self
Example #17
def test_gb_quality(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples,
                                     n_features=n_features,
                                     distance=distance)
    testX, testY = generate_sample(n_samples=n_samples,
                                   n_features=n_features,
                                   distance=distance)

    # Multiplying by random matrix
    multiplier = numpy.random.normal(size=[n_features, n_features])
    shift = numpy.random.normal(size=[1, n_features]) * 5
    trainX = numpy.dot(trainX.values, multiplier) + shift
    testX = numpy.dot(testX.values, multiplier) + shift

    boosters = {
        'old_boost':
        GradientBoostingClassifier(n_estimators=100,
                                   min_samples_split=50,
                                   max_depth=5,
                                   subsample=0.3),
        'fast+old_tree':
        CommonGradientBoosting(n_estimators=100,
                               base_estimator=DecisionTreeRegressor(
                                   min_samples_split=50, max_depth=5)),
        'fast+neuro':
        TreeGradientBoostingClassifier(
            n_estimators=100,
            update_tree=True,
            base_estimator=FastNeuroTreeRegressor()),
        'fold+tree':
        FoldingGBClassifier(loss=BinomialDeviance(),
                            n_estimators=10,
                            update_tree=True,
                            base_estimator=FastNeuroTreeRegressor()),
        'ugb':
        uGradientBoostingClassifier(loss=AdaLossFunction(),
                                    n_estimators=100,
                                    min_samples_split=50,
                                    max_depth=5,
                                    update_tree=True,
                                    subsample=0.3)
    }

    for criterion in [
            'mse',  # 'fmse', # 'pvalue',
            # 'significance',
            'significance2',
            # 'gini',
            'entropy',
            'poisson'
    ]:
        boosters['fast-' + criterion[:4]] = TreeGradientBoostingClassifier(
            n_estimators=100,
            update_tree=True,
            base_estimator=FastTreeRegressor(criterion=criterion))

    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
Example #18
     'clf__max_depth': range(5, 200, 10),
     'clf__min_samples_split': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
     'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1],
     'clf__max_features': ['auto', 'sqrt', 'log2', None],
     'clf__max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60]
 },
 'random_forest': {
     'clf__n_estimators': range(5, 200, 10),
     'clf__max_depth': range(5, 200, 10),
     'clf__min_samples_split': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
     'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1],
     'clf__max_features': ['auto', 'sqrt', 'log2', None],
     'clf__max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60]
 },
 'ada_boost': {
     'clf__base_estimator': [DecisionTreeRegressor(max_depth=ii) for ii in range(10, 110, 10)],
     'clf__n_estimators': range(50, 200, 10),
     'clf__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
     'clf__loss': ['linear', 'square', 'exponential'],
 },
 'gradient_boost': {
     'clf__loss': ['ls', 'lad', 'huber', 'quantile'],
     'clf__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
     'clf__n_estimators': range(100, 350, 10),
     'clf__max_depth': range(5, 200, 10),
     'clf__min_samples_split': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
     'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1],
     'clf__max_features': ['auto', 'sqrt', 'log2', None],
     'clf__max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60]
 },
 'cat_boost': {
Example #19
# load the data
data = {}
columns = []
loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns)

all_features = deepcopy(columns)
all_features.remove("target")
all_features.remove("location")

output = open(OUTPUT_DATA_FILE, 'w')
output.write("location,observation,prediction\n")

for location in locations:
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, all_features, "target")
    model = DecisionTreeRegressor(max_depth=10, random_state=42)

    model.fit(trainX, trainY)
    prediction = model.predict(testX)

    for i in range(0, len(testY)):
        output.write(str(location))
        output.write(",")
        output.write(str(testY[i]))
        output.write(",")
        output.write(str(prediction[i]))
        output.write("\n")

output.close()
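The same file could be produced with the csv module instead of manual concatenation; a sketch under the same assumptions about locations, data and splitDataForXValidation:

import csv

with open(OUTPUT_DATA_FILE, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["location", "observation", "prediction"])
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        model = DecisionTreeRegressor(max_depth=10, random_state=42)
        model.fit(trainX, trainY)
        for obs, pred in zip(testY, model.predict(testX)):
            writer.writerow([location, obs, pred])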
Example #20
    # HistGradientBoostingClassifier(random_state=randomstate),    # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    AdaBoostRegressor(n_estimators=200, random_state=randomstate),
    GaussianProcessRegressor(normalize_y=True),
    ARDRegression(),
    # HuberRegressor(),   # epsilon:  greater than 1.0, default 1.35
    LinearRegression(n_jobs=5),
    PassiveAggressiveRegressor(
        random_state=randomstate),  # C: 0.25, 0.5, 1, 5, 10
    SGDRegressor(random_state=randomstate),
    TheilSenRegressor(n_jobs=5, random_state=randomstate),
    RANSACRegressor(random_state=randomstate),
    KNeighborsRegressor(
        weights='distance'),  # n_neighbors: 3, 6, 9, 12, 15, 20
    RadiusNeighborsRegressor(weights='distance'),  # radius: 1, 2, 5, 10, 15
    MLPRegressor(max_iter=10000000, random_state=randomstate),
    DecisionTreeRegressor(
        random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    ExtraTreeRegressor(random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    SVR()  # C: 0.25, 0.5, 1, 5, 10
]

selectors = [
    reliefF.reliefF,
    fisher_score.fisher_score,
    # chi_square.chi_square,
    JMI.jmi,
    CIFE.cife,
    DISR.disr,
    MIM.mim,
    CMIM.cmim,
    ICAP.icap,
    MRMR.mrmr,
Example #21
data['Sex'] = label_encoder.transform(data['Sex'])

enc = LabelEncoder()
label_encoder = enc.fit(data[pd.notnull(data['Floor'])]['Floor'].values)
transformed = label_encoder.transform(data[pd.notnull(
    data['Floor'])]['Floor'].values)
indexes = pd.notnull(data.Floor)
data.loc[indexes, 'Floor'] = transformed

enc = LabelEncoder()
label_encoder = enc.fit(data['Embarked'])
data['Embarked'] = label_encoder.transform(data['Embarked'])

## age prediction
# TODO: also try predicting from Title alone
regresor = DecisionTreeRegressor()
X_train_age = data[pd.notnull(data.Age)][['Title', 'SibSp', 'Parch']]
y_train_age = data[pd.notnull(data.Age)][['Age']]
regresor.fit(X_train_age, y_train_age)

# TODO: verify this age prediction; it seems to work OK
# data['AgePredicted'] = np.where(pd.isnull(data.Age), regresor.predict(data[['Title', 'SibSp', 'Parch']]), None)
data['Age'] = np.where(pd.isnull(data.Age),
                       regresor.predict(data[['Title', 'SibSp', 'Parch']]),
                       data['Age'])

## floor prediction
classifier = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2)
#

X_train_floor = data[pd.notnull(data.Floor)][['Embarked', 'Pclass']]
Example #22
    ])

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

housing_prepared = full_pipeline.fit_transform(housing)
housing_test_prepared = full_pipeline.fit_transform(housing_test)

model_maps = dict()
model_maps["Linear_Regression"] = LinearRegression()
model_maps["Logistic_Regression"] = LogisticRegression(random_state=42, n_jobs=-1)
model_maps["DecisionTreeRegressor"] = DecisionTreeRegressor(random_state=42)
model_maps["RandomForestRegressor"] = RandomForestRegressor(random_state=42, n_jobs=-1)
model_maps["SupportVectorRegressor"] = SVR(kernel="linear")

results = pd.DataFrame(columns=["Hardware", "ExpID", "RMSETrainCF", "RMSETest", "MAPETrainCF", "MAPETest", "p-value", "TrainTime(s)", "TestTime(s)", "Experiment description"])

mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)


def trainStep(algo, indx, name):
    print("starting " + str(name) + " training")
    results.loc[indx] = ["Corei3/8GB", indx + 1, 0, 0, 0, 0, 0, 0, 0, "Training " + str(name)]
    start_time = time.time()
    algo.fit(housing_prepared, housing_labels)
    results.loc[indx, "TrainTime(s)"] = time.time() - start_time
    print("ends " + str(name) + " training")
Example #23
			'AffinityPropagation':AffinityPropagation(),
			'AgglomerativeClustering':AgglomerativeClustering(),
			'BaggingClassifier':BaggingClassifier(),
			'BaggingRegressor':BaggingRegressor(),
			'BayesianGaussianMixture':BayesianGaussianMixture(),
			'BayesianRidge':BayesianRidge(),
			'BernoulliNB':BernoulliNB(),
			'BernoulliRBM':BernoulliRBM(),
			'Binarizer':Binarizer(),
			'Birch':Birch(),
			'CCA':CCA(),
			'CalibratedClassifierCV':CalibratedClassifierCV(),
			'DBSCAN':DBSCAN(),
			'DPGMM':DPGMM(),
			'DecisionTreeClassifier':DecisionTreeClassifier(),
			'DecisionTreeRegressor':DecisionTreeRegressor(),
			'DictionaryLearning':DictionaryLearning(),
			'ElasticNet':ElasticNet(),
			'ElasticNetCV':ElasticNetCV(),
			'EmpiricalCovariance':EmpiricalCovariance(),
			'ExtraTreeClassifier':ExtraTreeClassifier(),
			'ExtraTreeRegressor':ExtraTreeRegressor(),
			'ExtraTreesClassifier':ExtraTreesClassifier(),
			'ExtraTreesRegressor':ExtraTreesRegressor(),
			'FactorAnalysis':FactorAnalysis(),
			'FastICA':FastICA(),
			'FeatureAgglomeration':FeatureAgglomeration(),
			'FunctionTransformer':FunctionTransformer(),
			'GMM':GMM(),
			'GaussianMixture':GaussianMixture(),
			'GaussianNB':GaussianNB(),
Example #24
    11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20,
    332.09, 12.13
]

predictions = []
predictions2 = []
predictions3 = []
predictions4 = []
offset = int(0.7 * len(X))

for i in range(10):
    X, y = shuffle(boston.data, boston.target)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    regressor = GradientBoostingRegressor(max_depth=20, n_estimators=140)
    regressor2 = DecisionTreeRegressor(max_depth=6)
    regressor3 = LinearRegression()
    regressor4 = RandomForestRegressor()
    regressor.fit(X_train, y_train)
    regressor2.fit(X_train, y_train)
    regressor3.fit(X_train, y_train)
    regressor4.fit(X_train, y_train)
    y_pred = regressor.predict(x)
    y_pred2 = regressor2.predict(x)
    y_pred3 = regressor3.predict(x)
    y_pred4 = regressor4.predict(x)
    predictions.append(y_pred)
    predictions2.append(y_pred2)
    predictions3.append(y_pred3)
    predictions4.append(y_pred4)
    print "\nPrediction = " + str(y_pred)
Example #25
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name)


if "Auto" in datasets:
    build_auto(
        AdaBoostRegressor(DecisionTreeRegressor(random_state=13,
                                                min_samples_leaf=5),
                          random_state=13,
                          n_estimators=17), "AdaBoostAuto")
    build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
    build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
    build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
               "DecisionTreeAuto",
               compact=False)
    build_auto(
        BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                               min_samples_leaf=5),
                         random_state=13,
                         n_estimators=3,
                         max_features=0.5), "DecisionTreeEnsembleAuto")
    build_auto(DummyRegressor(strategy="median"), "DummyAuto")
    build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
Example #26
@author: TF
'''
import matplotlib.pyplot as plt
import numpy as np
from numpy import mat
from sklearn.tree import DecisionTreeRegressor


def plotfigure(X, X_test, y, yp):
    plt.figure()
    plt.scatter(X, y, c='k', label='data')
    plt.plot(X_test, yp, c='r', label='max_depth = 3', linewidth=2)
    plt.xlabel('data')
    plt.ylabel('target')
    plt.title('Decision Tree Regression')
    plt.legend()
    plt.show()


x = np.linspace(-5, 5, 200)
siny = np.sin(x)
X = mat(x).T
y = siny + np.random.rand(1, len(siny)) * 1.5
y = y.tolist()[0]
clf = DecisionTreeRegressor(max_depth=3)
clf.fit(X, y)

X_test = np.arange(-5.0, 5.0, 0.05)[:, np.newaxis]
yp = clf.predict(X_test)

plotfigure(X, X_test, y, yp)
Example #27
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)

if "Auto" in datasets:
	build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto")
	build_auto(ARDRegression(normalize = True), "BayesianARDAuto")
	build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto")
	build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = False)
	build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto")
	build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
	build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto")
	build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto")
	build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto")
	build_auto(HuberRegressor(), "HuberAuto")
	build_auto(LarsCV(), "LarsAuto")
	build_auto(LassoCV(random_state = 13), "LassoAuto")
	build_auto(LassoLarsCV(), "LassoLarsAuto")
	build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", num_iteration = 11)
	build_auto(LinearRegression(), "LinearRegressionAuto")
	build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
Example #28
store_pkl(auto_mapper, "Auto.pkl")

auto_X = auto[:, 0:9]
auto_y = auto[:, 9]

print(auto_X.dtype, auto_y.dtype)


def build_auto(regressor, name):
    regressor = regressor.fit(auto_X, auto_y)
    store_pkl(regressor, name + ".pkl")
    mpg = DataFrame(regressor.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")


build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
           "DecisionTreeAuto")
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                           min_samples_leaf=5),
                     random_state=13,
                     n_estimators=3,
                     max_features=0.5), "DecisionTreeEnsembleAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
Example #29
    def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
                   random_state, X_idx_sorted, X_csc=None, X_csr=None):
        """Fit another stage of ``n_classes_`` trees to the boosting model. """

        assert sample_mask.dtype == np.bool_
        loss = self.loss_
        original_y = y

        for k in range(loss.K):
            if loss.is_multi_class:
                y = np.array(original_y == k, dtype=np.float64)

            residual = loss.negative_gradient(y, y_pred, k=k,
                                              sample_weight=sample_weight)

            # induce regression tree on residuals
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter='best',
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                min_impurity_decrease=self.min_impurity_decrease,
                min_impurity_split=self.min_impurity_split,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                random_state=random_state,
                presort=self.presort)

            if self.subsample < 1.0:
                # no inplace multiplication!
                sample_weight = sample_weight * sample_mask.astype(np.float64)

            if X_csc is not None:
                tree.fit(X_csc, residual, sample_weight=sample_weight,
                         check_input=False, X_idx_sorted=X_idx_sorted)
            else:
                tree.fit(X, residual, sample_weight=sample_weight,
                         check_input=False, X_idx_sorted=X_idx_sorted)


            # update tree leaves
            if X_csr is not None:
                loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred,
                                             sample_weight, sample_mask,
                                             function(i), k=k)
            else:
                loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
                                             sample_weight, sample_mask,
                                             function(i), k=k)

            # add tree to ensemble
            self.estimators_[i, k] = tree

            # note: returns inside the class loop, i.e. after k == 0
            return y_pred
Example #30
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = rmseEval(testY, prediction)[1]
    lr2Rmse.append(rmse)
    print("\trmse: " + str(rmse))
    lr2Data[location] = {}
    for i in range(0, len(testY)):
        timestamp = testTimestamp[i]
        value = prediction[i]
        lr2Data[location][timestamp] = value

    # dtr
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, allFeatures, "target", timestampData)
    print("\tDTR #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = DecisionTreeRegressor(max_leaf_nodes=15, random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = rmseEval(testY, prediction)[1]
    dtrRmse.append(rmse)
    print("\trmse: " + str(rmse))
    dtrData[location] = {}
    for i in range(0, len(testY)):
        timestamp = testTimestamp[i]
        value = prediction[i]
        dtrData[location][timestamp] = value

    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, allFeatures, "target", timestampData)
    print("\tRFR #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,