def train_mimic(self, training_data, mimic_env, save_model_dir, log_file):
    self.model = DecisionTreeRegressor(max_leaf_nodes=self.max_leaf_nodes,
                                       criterion=self.criterion,
                                       splitter=self.mode)
    self.model.fit(training_data[0], training_data[1])
    # self.print_tree()
    # In a full binary tree, leaves = internal nodes + 1; use integer division.
    leaves_number = (self.model.tree_.node_count + 1) // 2
    print("Leaves number is {0}".format(leaves_number))

    # Group training-sample indices by predicted value (one group per leaf).
    predict_dictionary = {}
    predictions = self.model.predict(training_data[0])
    for predict_index in range(len(predictions)):
        predict_value = predictions[predict_index]
        if predict_value in predict_dictionary:
            predict_dictionary[predict_value].append(predict_index)
        else:
            predict_dictionary[predict_value] = [predict_index]

    return_value_log = mimic_env.get_return(state=list(predict_dictionary.values()))
    return_value_log_struct = mimic_env.get_return(state=list(predict_dictionary.values()),
                                                   apply_structure_cost=True)
    return_value_var_reduction = mimic_env.get_return(state=list(predict_dictionary.values()),
                                                      apply_variance_reduction=True)
    mae, rmse = compute_regression_results(predictions=predictions, labels=training_data[1])
    # print("Training return:{0} with mae:{1} and rmse:{2}".format(return_value, mae, rmse), file=log_file)

    with open(save_model_dir, 'wb') as f:
        pickle.dump(obj=self.model, file=f)

    return return_value_log, return_value_log_struct, \
        return_value_var_reduction, mae, rmse, leaves_number
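# A minimal, self-contained sketch of the leaf-grouping step above, using
# synthetic data in place of training_data and no mimic_env; the demo_* names
# are illustrative assumptions, not part of the original class.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
demo_X = rng.normal(size=(100, 3))
demo_y = demo_X[:, 0] + rng.normal(scale=0.1, size=100)

demo_tree = DecisionTreeRegressor(max_leaf_nodes=4).fit(demo_X, demo_y)
demo_pred = demo_tree.predict(demo_X)

# Group sample indices by predicted value; at most one group per leaf.
groups = {}
for idx, value in enumerate(demo_pred):
    groups.setdefault(value, []).append(idx)
print(len(groups), (demo_tree.tree_.node_count + 1) // 2)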
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        if "depth" in parameters:
            model = DecisionTreeRegressor(max_depth=parameters["depth"], random_state=42)
        elif "leaf" in parameters:
            model = DecisionTreeRegressor(min_samples_leaf=parameters["leaf"], random_state=42)
        elif "max_leaf" in parameters:
            model = DecisionTreeRegressor(max_leaf_nodes=parameters["max_leaf"], random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
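# A hedged usage sketch: sweep one hyperparameter at a time through evalOne.
# It assumes the surrounding script has already loaded `locations`, `data`,
# `all_features`, and the splitDataForXValidation/rmseEval helpers.
for depth in [5, 10, 15, 20]:
    print("max_depth={0} -> RMSE {1}".format(depth, evalOne({"depth": depth})))
for leaf in [1, 5, 10, 25]:
    print("min_samples_leaf={0} -> RMSE {1}".format(leaf, evalOne({"leaf": leaf})))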
def getModels():
    models = {}
    models['dt'] = DecisionTreeRegressor(max_depth=50)
    models['rf1'] = RandomForestRegressor()
    models['rf2'] = RandomForestRegressor(n_estimators=128, max_depth=15)
    models['gbr'] = GradientBoostingRegressor(n_estimators=128, max_depth=5, learning_rate=1.0)
    # models['abr'] = AdaBoostRegressor(n_estimators=128)
    return models
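# A hedged usage sketch: score every model from getModels() with 5-fold CV on
# synthetic data. It assumes the sklearn estimator imports used by getModels()
# are already in scope; the data setup is an illustrative assumption.
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score

demo_X, demo_y = make_regression(n_samples=500, n_features=8, noise=0.5, random_state=0)
for name, model in getModels().items():
    scores = cross_val_score(model, demo_X, demo_y,
                             scoring='neg_root_mean_squared_error', cv=5)
    print(name, -scores.mean())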
def fit(self, X, y, sample_weight=None):
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    assert len(X) == len(y), 'Different lengths of X and y'
    X = pandas.DataFrame(X)
    y = numpy.array(column_or_1d(y), dtype=int)
    assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
    self.check_params()

    self.estimators = []
    self.scores = []

    n_samples = len(X)
    n_inbag = int(self.subsample * len(X))
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)

    # preparing for fitting in trees
    X = self.get_train_vars(X)
    self.n_features = X.shape[1]
    X, y = check_arrays(X, y)
    X = X.astype(DTYPE)
    y_pred = numpy.zeros(len(X), dtype=float)

    if self.init_estimator is not None:
        y_signed = 2 * y - 1
        self.init_estimator.fit(X, y_signed, sample_weight=sample_weight)
        y_pred += numpy.ravel(self.init_estimator.predict(X))

    for stage in range(self.n_estimators):
        # tree creation
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            max_features=self.max_features,
            random_state=self.random_state,
            max_leaf_nodes=self.max_leaf_nodes)

        # tree learning
        residual = self.loss.negative_gradient(y_pred)
        train_indices = self.random_state.choice(n_samples, size=n_inbag, replace=False)

        tree.fit(X[train_indices], residual[train_indices],
                 sample_weight=sample_weight[train_indices], check_input=False)

        # update tree leaves
        if self.update_tree:
            self.loss.update_tree(tree.tree_, X=X, y=y, y_pred=y_pred,
                                  sample_weight=sample_weight,
                                  update_mask=numpy.ones(len(X), dtype=bool),
                                  residual=residual)

        y_pred += self.learning_rate * tree.predict(X)
        self.estimators.append(tree)
        self.scores.append(self.loss(y_pred))
    return self
def addBoostIteration(self):
    rv = self.regressionValues()
    trees = []
    mask = numpy.array([True] * self.nF)
    for i in range(0, self.nF):
        # Predict the i-th regression value from every feature except the i-th.
        mask[:] = True
        mask[i] = False
        tree = DecisionTreeRegressor(max_depth=self.max_depth)
        tree.fit(self.data[:, mask], rv[:, i])
        # newpsis[:, i] = tree.predict(self.data[:, mask])
        trees.append(tree)
    self.trees.append(trees)
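# A minimal, self-contained sketch of the leave-one-feature-out loop above:
# each tree predicts the i-th target column from every feature except the
# i-th. Synthetic data stands in for self.data and regressionValues().
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
demo_data = rng.normal(size=(200, 4))
demo_rv = rng.normal(size=(200, 4))  # stand-in for the regression values

demo_trees = []
for i in range(demo_data.shape[1]):
    mask = np.ones(demo_data.shape[1], dtype=bool)
    mask[i] = False
    t = DecisionTreeRegressor(max_depth=3)
    t.fit(demo_data[:, mask], demo_rv[:, i])
    demo_trees.append(t)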
def set_params_dict(self, learner_params):
    if self.method == 'classification':
        self.learner = ensemble.AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(
                max_depth=learner_params['base_estimator__max_depth'],
                max_features=learner_params['base_estimator__max_features']),
            n_estimators=int(learner_params['n_estimators']),
            learning_rate=learner_params['learning_rate'])
    elif self.method == 'regression':
        self.learner = ensemble.AdaBoostRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=learner_params['base_estimator__max_depth'],
                max_features=learner_params['base_estimator__max_features']),
            n_estimators=int(learner_params['n_estimators']),
            learning_rate=learner_params['learning_rate'])
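# Illustrative input for set_params_dict showing the keys it expects; the
# concrete values and the receiver name `tuner` are assumptions for the example.
example_params = {
    'base_estimator__max_depth': 4,
    'base_estimator__max_features': 0.8,
    'n_estimators': 100,
    'learning_rate': 0.1,
}
# tuner.set_params_dict(example_params)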
def fit_stage(self, i, X, y):
    """Fit another stage of ``n_classes_`` trees to the boosting model."""
    # induce regression tree on residuals
    tree = DecisionTreeRegressor(criterion='friedman_mse',
                                 splitter='best',
                                 max_depth=self.max_depth,
                                 min_samples_split=self.min_samples_split,
                                 min_samples_leaf=self.min_samples_leaf,
                                 min_weight_fraction_leaf=0.,
                                 max_features=None,
                                 max_leaf_nodes=None,
                                 random_state=self.random_state,
                                 presort=False)
    tree.fit(X, y, check_input=False, X_idx_sorted=None)
    # add tree to ensemble
    self.estimators[i, 0] = tree
    self.n_estimated = i + 1
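# The core of a boosting stage is fitting a tree to the current residuals.
# A minimal sketch of that idea on synthetic data, independent of the class
# above; squared loss is assumed for simplicity, so the negative gradient is
# just y - y_pred.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X_demo = rng.uniform(-1, 1, size=(200, 2))
y_demo = X_demo[:, 0] ** 2 + rng.normal(scale=0.05, size=200)

y_pred = np.zeros(200)
for stage in range(10):
    residual = y_demo - y_pred                  # negative gradient of squared loss
    stage_tree = DecisionTreeRegressor(max_depth=2).fit(X_demo, residual)
    y_pred += 0.1 * stage_tree.predict(X_demo)  # shrinkage / learning rate
print("train MSE:", np.mean((y_demo - y_pred) ** 2))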
def set_params_list(self, learner_params, i):
    m_rf_size = int(learner_params[0])
    m_learn_rate = learner_params[1]
    m_dep = int(learner_params[2])
    m_feat = learner_params[3]
    if self.method == 'classification':
        self.learner = ensemble.AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=m_dep, max_features=m_feat),
            n_estimators=int(m_rf_size),
            learning_rate=m_learn_rate)
    elif self.method == 'regression':
        self.learner = ensemble.AdaBoostRegressor(
            base_estimator=DecisionTreeRegressor(max_depth=m_dep, max_features=m_feat),
            n_estimators=int(m_rf_size),
            learning_rate=m_learn_rate)
def sklearn_titanic_regression():
    # Public import paths; sklearn.tree.tree and sklearn.preprocessing.label
    # are private modules that were removed in scikit-learn 0.24.
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.preprocessing import LabelEncoder
    import numpy as np
    import pandas as pd

    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)
    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    total_num = total_df.shape[0]
    train_df = total_df.iloc[:int(total_num * 0.8)]
    test_df = total_df.iloc[int(total_num * 0.8):]

    clf = DecisionTreeRegressor()
    clf.fit(train_df.drop(['fare'], axis=1), train_df['fare'])
    pred = clf.predict(test_df.drop(['fare'], axis=1))
    truth = test_df['fare']
    mse = np.sum(np.square(pred - truth)) / test_df.shape[0]
    print(mse)
def _hi_level_investigation(data):
    '''Perform high-level investigation.'''
    transformers = [transformer.OneHotTransformer(nucl=False),
                    transformer.AminoAcidTransformer()]
    estimators = [LinearRegression(),
                  DecisionTreeRegressor(),
                  RandomForestRegressor(),
                  ExtraTreesRegressor(),
                  GradientBoostingRegressor(),
                  SVR(kernel='poly')]
    cv = 10

    for trnsfrmr, estimator in itertools.product(transformers, estimators):
        encoded = trnsfrmr.transform(data)
        X, y = encoded[:, 2:], encoded[:, 1]
        X = StandardScaler().fit_transform(X)
        scores = cross_val_score(estimator, X, y,
                                 scoring='neg_mean_squared_error',
                                 cv=cv, verbose=False)
        scores = np.sqrt(-scores)
        print('\t'.join([trnsfrmr.__class__.__name__,
                         estimator.__class__.__name__,
                         str((scores.mean(), scores.std()))]))
    print()
def __regressor__(self, X_train, Y_train):
    self.ensemble = DecisionTreeRegressor(random_state=56)
    self.ensemble.fit(X_train, Y_train)
    print('Ensemble Model Ready')
("mapper", mapper), ("selector", SelectUnique()), ("regressor", regressor) ]) pipeline.fit(auto_X, auto_y) pipeline.configure(**pmml_options) if isinstance(regressor, XGBRegressor): pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5) else: pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13)) store_pkl(pipeline, name) mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"]) store_csv(mpg, name) if "Auto" in datasets: build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto") build_auto(ARDRegression(normalize = True), "BayesianARDAuto") build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto") build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = False) build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy = "median"), "DummyAuto") build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto") build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto") build_auto(HuberRegressor(), "HuberAuto") build_auto(LarsCV(), "LarsAuto") build_auto(LassoCV(random_state = 13), "LassoAuto") build_auto(LassoLarsCV(), "LassoLarsAuto") build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", num_iteration = 11) build_auto(LinearRegression(), "LinearRegressionAuto") build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")

if "Auto" in datasets:
    build_auto(
        AdaBoostRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                          random_state=13, n_estimators=17),
        "AdaBoostAuto")
    build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
    build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
    build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
               "DecisionTreeAuto", compact=False)
    build_auto(
        BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                         random_state=13, n_estimators=3, max_features=0.5),
        "DecisionTreeEnsembleAuto")
    build_auto(DummyRegressor(strategy="median"), "DummyAuto")
    build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
    'elastic_net': {
        'max_iter': [5, 10, 15],
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'l1_ratio': np.arange(0.0, 1.0, 0.1)
    },
    'extra_trees': {
        'n_estimators': [80],
        'max_depth': [30],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_split': [0.01, 0.05, 0.10],
        'min_samples_leaf': [0.005, 0.05, 0.10],
    },
    'bagging': {
        'base_estimator': [DecisionTreeRegressor(max_depth=8)],
        'n_estimators': [200],
        'max_features': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    },
    'sgd': {
        'alpha': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.25, 0.50, 0.75, 1.0],
        'penalty': ['l1', 'l2'],
        'loss': ['squared_loss', 'huber', 'epsilon_insensitive',
                 'squared_epsilon_insensitive']
    },
    'linear_svr': {
        'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100],
        'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive']
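# A hedged sketch of how a grid like the one above is typically consumed. The
# enclosing dict name `param_grids`, the ElasticNet choice, and the synthetic
# data are assumptions for illustration only.
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=0.3, random_state=0)
search = GridSearchCV(ElasticNet(), param_grid=param_grids['elastic_net'],
                      scoring='neg_mean_squared_error', cv=3)
search.fit(X_demo, y_demo)
print(search.best_params_)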
from sympy.core.numbers import RealNumber
from sympy.functions.elementary.piecewise import Piecewise
from sympy.core.symbol import Symbol
import numpy as np
import pandas
from nose.tools import assert_almost_equal
from sklearn.tree import DecisionTreeRegressor

# Create some data
m = 10000
X = np.random.normal(size=(m, 10))
thresh = np.random.normal(size=10)
X_transformed = X * (X > thresh)
beta = np.random.normal(size=10)
y = np.dot(X_transformed, beta) + np.random.normal(size=m)

# Train a decision tree regressor
model = DecisionTreeRegressor()
model.fit(X, y)
print(model.score(X, y))

# Inspect
def _sym_predict_decision_tree(model, names, current_node=0, output_idx=0, class_idx=0):
    left = model.tree_.children_left[current_node]
    right = model.tree_.children_right[current_node]
    if left == -1:
        assert right == -1
        left_expr = RealNumber(model.tree_.value[current_node, output_idx, class_idx])
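# A small, self-contained sketch of walking the tree_ arrays the way the
# function above starts to: children_left/children_right are -1 at leaves, and
# tree_.value holds the leaf prediction. Names here are illustrative.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

demo_X = np.random.normal(size=(50, 2))
demo_y = demo_X[:, 0]
demo_model = DecisionTreeRegressor(max_depth=2).fit(demo_X, demo_y)

def walk(tree, node=0, depth=0):
    left = tree.children_left[node]
    right = tree.children_right[node]
    if left == -1:  # leaf: emit its constant prediction
        print("  " * depth + "leaf -> {0:.3f}".format(tree.value[node, 0, 0]))
    else:           # split: go left when X[:, feature] <= threshold
        print("  " * depth + "x[{0}] <= {1:.3f}?".format(tree.feature[node],
                                                         tree.threshold[node]))
        walk(tree, left, depth + 1)
        walk(tree, right, depth + 1)

walk(demo_model.tree_)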
def fit(self, X, y, sample_weight=None):
    shuffler = Shuffler(X, random_state=self.random_state)
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
    y = column_or_1d(y, warn=True)
    n_samples = len(X)
    n_inbag = int(self.subsample * n_samples)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
    self.random_state = check_random_state(self.random_state)

    # skipping all checks
    assert self.update_on in ['all', 'same', 'other', 'random']
    y_pred = numpy.zeros(len(y), dtype=float)

    self.classifiers = []
    self.learning_rates = []
    self.loss_values = []
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)
    iter_X = shuffler.generate(0.)

    prev_smearing = 1
    for iteration in range(self.n_estimators):
        if iteration % self.recount_step == 0:
            if prev_smearing > 0:
                iter_smearing = interpolate(self.smearing, iteration, self.n_estimators)
                prev_smearing = iter_smearing
                iter_X = shuffler.generate(iter_smearing)
                iter_X, = check_arrays(iter_X, dtype=DTYPE, sparse_format="dense",
                                       check_ccontiguous=True)
                y_pred = numpy.zeros(len(y))
                y_pred += sum(cl.predict(X) * rate
                              for rate, cl in zip(self.learning_rates, self.classifiers))

        self.loss_values.append(self.loss(y, y_pred, sample_weight=sample_weight))
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=interpolate(self.max_depth, iteration, self.n_estimators),
            min_samples_split=self.min_samples_split,
            min_samples_leaf=interpolate(self.min_samples_leaf, iteration,
                                         self.n_estimators, use_log=True),
            max_features=self.max_features,
            random_state=self.random_state)

        sample_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
        loss_weight = sample_weight if self.weights_in_loss else numpy.ones(len(sample_weight))
        tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(len(sample_weight))
        residual = self.loss.negative_gradient(y, y_pred, sample_weight=loss_weight)

        tree.fit(numpy.array(iter_X)[sample_mask, :],
                 residual[sample_mask],
                 sample_weight=tree_weight[sample_mask],
                 check_input=False)

        # update tree leaves
        if self.update_tree:
            if self.update_on == 'all':
                update_mask = numpy.ones(len(sample_mask), dtype=bool)
            elif self.update_on == 'same':
                update_mask = sample_mask
            elif self.update_on == 'other':
                update_mask = ~sample_mask
            else:  # random
                update_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
            self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y,
                                              residual=residual, pred=y_pred,
                                              sample_mask=update_mask,
                                              sample_weight=sample_weight)

        iter_learning_rate = interpolate(self.learning_rate, iteration,
                                         self.n_estimators, use_log=True)
        y_pred += iter_learning_rate * tree.predict(X)
        self.classifiers.append(tree)
        self.learning_rates.append(iter_learning_rate)

    return self
def test_gb_quality(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    # Multiplying by random matrix
    multiplier = numpy.random.normal(size=[n_features, n_features])
    shift = numpy.random.normal(size=[1, n_features]) * 5
    trainX = numpy.dot(trainX.values, multiplier) + shift
    testX = numpy.dot(testX.values, multiplier) + shift

    boosters = {
        'old_boost': GradientBoostingClassifier(n_estimators=100, min_samples_split=50,
                                                max_depth=5, subsample=0.3),
        'fast+old_tree': CommonGradientBoosting(
            n_estimators=100,
            base_estimator=DecisionTreeRegressor(min_samples_split=50, max_depth=5)),
        'fast+neuro': TreeGradientBoostingClassifier(
            n_estimators=100, update_tree=True,
            base_estimator=FastNeuroTreeRegressor()),
        'fold+tree': FoldingGBClassifier(
            loss=BinomialDeviance(), n_estimators=10, update_tree=True,
            base_estimator=FastNeuroTreeRegressor()),
        'ugb': uGradientBoostingClassifier(
            loss=AdaLossFunction(), n_estimators=100, min_samples_split=50,
            max_depth=5, update_tree=True, subsample=0.3),
    }

    for criterion in ['mse',
                      # 'fmse',
                      # 'pvalue',
                      # 'significance',
                      'significance2',
                      # 'gini',
                      'entropy',
                      'poisson']:
        boosters['fast-' + criterion[:4]] = TreeGradientBoostingClassifier(
            n_estimators=100, update_tree=True,
            base_estimator=FastTreeRegressor(criterion=criterion))

    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
        'clf__max_depth': range(5, 200, 10),
        'clf__min_samples_split': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1],
        'clf__max_features': ['auto', 'sqrt', 'log2', None],
        'clf__max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60]
    },
    'random_forest': {
        'clf__n_estimators': range(5, 200, 10),
        'clf__max_depth': range(5, 200, 10),
        'clf__min_samples_split': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1],
        'clf__max_features': ['auto', 'sqrt', 'log2', None],
        'clf__max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60]
    },
    'ada_boost': {
        'clf__base_estimator': [DecisionTreeRegressor(max_depth=ii) for ii in range(10, 110, 10)],
        'clf__n_estimators': range(50, 200, 10),
        'clf__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'clf__loss': ['linear', 'square', 'exponential'],
    },
    'gradient_boost': {
        'clf__loss': ['ls', 'lad', 'huber', 'quantile'],
        'clf__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'clf__n_estimators': range(100, 350, 10),
        'clf__max_depth': range(5, 200, 10),
        'clf__min_samples_split': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1],
        'clf__max_features': ['auto', 'sqrt', 'log2', None],
        'clf__max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60]
    },
    'cat_boost': {
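# The 'clf__' prefix targets a step named 'clf' inside a Pipeline when such a
# grid is handed to a search object. A hedged sketch of that wiring; the
# pipeline layout and RandomizedSearchCV setup are assumptions, not part of
# the original file.
from sklearn.datasets import make_regression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

X_demo, y_demo = make_regression(n_samples=300, n_features=6, noise=0.4, random_state=0)
pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', DecisionTreeRegressor(random_state=0))])
grid = {'clf__max_depth': range(5, 200, 10),
        'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1]}
search = RandomizedSearchCV(pipe, grid, n_iter=10, cv=3, random_state=0)
search.fit(X_demo, y_demo)
print(search.best_params_)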
# load the data
data = {}
columns = []
loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns)
all_features = deepcopy(columns)
all_features.remove("target")
all_features.remove("location")

output = open(OUTPUT_DATA_FILE, 'w')
output.write("location,observation,prediction\n")

for location in locations:
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, all_features, "target")
    model = DecisionTreeRegressor(max_depth=10, random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    for i in range(0, len(testY)):
        output.write(str(location))
        output.write(",")
        output.write(str(testY[i]))
        output.write(",")
        output.write(str(prediction[i]))
        output.write("\n")

output.close()
    # HistGradientBoostingClassifier(random_state=randomstate),
    # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    AdaBoostRegressor(n_estimators=200, random_state=randomstate),
    GaussianProcessRegressor(normalize_y=True),
    ARDRegression(),
    # HuberRegressor(),  # epsilon: greater than 1.0, default 1.35
    LinearRegression(n_jobs=5),
    PassiveAggressiveRegressor(random_state=randomstate),  # C: 0.25, 0.5, 1, 5, 10
    SGDRegressor(random_state=randomstate),
    TheilSenRegressor(n_jobs=5, random_state=randomstate),
    RANSACRegressor(random_state=randomstate),
    KNeighborsRegressor(weights='distance'),  # n_neighbors: 3, 6, 9, 12, 15, 20
    RadiusNeighborsRegressor(weights='distance'),  # radius: 1, 2, 5, 10, 15
    MLPRegressor(max_iter=10000000, random_state=randomstate),
    DecisionTreeRegressor(random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    ExtraTreeRegressor(random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    SVR()  # C: 0.25, 0.5, 1, 5, 10
]

selectors = [
    reliefF.reliefF,
    fisher_score.fisher_score,
    # chi_square.chi_square,
    JMI.jmi,
    CIFE.cife,
    DISR.disr,
    MIM.mim,
    CMIM.cmim,
    ICAP.icap,
    MRMR.mrmr,
data['Sex'] = label_encoder.transform(data['Sex'])

enc = LabelEncoder()
label_encoder = enc.fit(data[pd.notnull(data['Floor'])]['Floor'].values)
transformed = label_encoder.transform(data[pd.notnull(data['Floor'])]['Floor'].values)
indexes = pd.notnull(data.Floor)
data.loc[indexes, 'Floor'] = transformed

enc = LabelEncoder()
label_encoder = enc.fit(data['Embarked'])
data['Embarked'] = label_encoder.transform(data['Embarked'])

## age prediction
# TODO: also try predicting from Title alone
regresor = DecisionTreeRegressor()
X_train_age = data[pd.notnull(data.Age)][['Title', 'SibSp', 'Parch']]
y_train_age = data[pd.notnull(data.Age)][['Age']]
regresor.fit(X_train_age, y_train_age)
# TODO: double-check this age prediction; it seems to work fine
# data['AgePredicted'] = np.where(pd.isnull(data.Age), regresor.predict(data[['Title', 'SibSp', 'Parch']]), None)
data['Age'] = np.where(pd.isnull(data.Age),
                       regresor.predict(data[['Title', 'SibSp', 'Parch']]),
                       data['Age'])

## deck/floor prediction
classifier = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2)
# X_train_floor = data[pd.notnull(data.Floor)][['Embarked', 'Pclass']]
])

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
# Use transform (not fit_transform) on the test set so the statistics learned
# from the training set are reused instead of being re-estimated.
housing_test_prepared = full_pipeline.transform(housing_test)

model_maps = dict()
model_maps["Linear_Regression"] = LinearRegression()
model_maps["Logistic_Regression"] = LogisticRegression(random_state=42, n_jobs=-1)
model_maps["DecisionTreeRegressor"] = DecisionTreeRegressor(random_state=42)
model_maps["RandomForestRegressor"] = RandomForestRegressor(random_state=42, n_jobs=-1)
model_maps["SupportVectorRegressor"] = SVR(kernel="linear")

results = pd.DataFrame(columns=["Hardware", "ExpID", "RMSETrainCF", "RMSETest",
                                "MAPETrainCF", "MAPETest", "p-value",
                                "TrainTime(s)", "TestTime(s)", "Experiment description"])
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

def trainStep(algo, indx, name):
    print("starting " + str(name) + " training")
    results.loc[indx] = ["Corei3/8GB", indx + 1, 0, 0, 0, 0, 0, 0, 0,
                         "Training " + str(name)]
    start_time = time.time()
    algo.fit(housing_prepared, housing_labels)
    results.loc[indx, "TrainTime(s)"] = time.time() - start_time
    print("ends " + str(name) + " training")
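# A hedged usage sketch: run trainStep over every model registered above. The
# enumeration order and index convention are assumptions for illustration.
for indx, (name, algo) in enumerate(model_maps.items()):
    trainStep(algo, indx, name)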
    'AffinityPropagation': AffinityPropagation(),
    'AgglomerativeClustering': AgglomerativeClustering(),
    'BaggingClassifier': BaggingClassifier(),
    'BaggingRegressor': BaggingRegressor(),
    'BayesianGaussianMixture': BayesianGaussianMixture(),
    'BayesianRidge': BayesianRidge(),
    'BernoulliNB': BernoulliNB(),
    'BernoulliRBM': BernoulliRBM(),
    'Binarizer': Binarizer(),
    'Birch': Birch(),
    'CCA': CCA(),
    'CalibratedClassifierCV': CalibratedClassifierCV(),
    'DBSCAN': DBSCAN(),
    'DPGMM': DPGMM(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'DictionaryLearning': DictionaryLearning(),
    'ElasticNet': ElasticNet(),
    'ElasticNetCV': ElasticNetCV(),
    'EmpiricalCovariance': EmpiricalCovariance(),
    'ExtraTreeClassifier': ExtraTreeClassifier(),
    'ExtraTreeRegressor': ExtraTreeRegressor(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'ExtraTreesRegressor': ExtraTreesRegressor(),
    'FactorAnalysis': FactorAnalysis(),
    'FastICA': FastICA(),
    'FeatureAgglomeration': FeatureAgglomeration(),
    'FunctionTransformer': FunctionTransformer(),
    'GMM': GMM(),
    'GaussianMixture': GaussianMixture(),
    'GaussianNB': GaussianNB(),
     11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20,
     332.09, 12.13]

predictions = []
predictions2 = []
predictions3 = []
predictions4 = []
offset = int(0.7 * len(X))

for i in range(10):
    X, y = shuffle(boston.data, boston.target)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    regressor = GradientBoostingRegressor(max_depth=20, n_estimators=140)
    regressor2 = DecisionTreeRegressor(max_depth=6)
    regressor3 = LinearRegression()
    regressor4 = RandomForestRegressor()

    regressor.fit(X_train, y_train)
    regressor2.fit(X_train, y_train)
    regressor3.fit(X_train, y_train)
    regressor4.fit(X_train, y_train)

    # predict expects a 2-D array, so wrap the single sample in a list
    y_pred = regressor.predict([x])
    y_pred2 = regressor2.predict([x])
    y_pred3 = regressor3.predict([x])
    y_pred4 = regressor4.predict([x])

    predictions.append(y_pred)
    predictions2.append(y_pred2)
    predictions3.append(y_pred3)
    predictions4.append(y_pred4)

    print("\nPrediction = " + str(y_pred))
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name)

if "Auto" in datasets:
    build_auto(
        AdaBoostRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                          random_state=13, n_estimators=17),
        "AdaBoostAuto")
    build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
    build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
    build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
               "DecisionTreeAuto", compact=False)
    build_auto(
        BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                         random_state=13, n_estimators=3, max_features=0.5),
        "DecisionTreeEnsembleAuto")
    build_auto(DummyRegressor(strategy="median"), "DummyAuto")
    build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
@author: TF
'''
import matplotlib.pyplot as plt
import numpy as np
# Public import path; sklearn.tree.tree was removed in scikit-learn 0.24.
from sklearn.tree import DecisionTreeRegressor

def plotfigure(X, X_test, y, yp):
    plt.figure()
    plt.scatter(X, y, c='k', label='data')
    # Label matches the depth actually used below.
    plt.plot(X_test, yp, c='r', label='max_depth = 3', linewidth=2)
    plt.xlabel('data')
    plt.ylabel('target')
    plt.title('Decision Tree Regression')
    plt.legend()
    plt.show()

x = np.linspace(-5, 5, 200)
siny = np.sin(x)
X = x.reshape(-1, 1)  # column vector; replaces numpy.mat from a star import
y = siny + np.random.rand(len(siny)) * 1.5

clf = DecisionTreeRegressor(max_depth=3)
clf.fit(X, y)

X_test = np.arange(-5.0, 5.0, 0.05)[:, np.newaxis]
yp = clf.predict(X_test)
plotfigure(X, X_test, y, yp)
("mapper", mapper), ("selector", SelectUnique()), ("regressor", regressor) ]) pipeline.fit(auto_X, auto_y) pipeline.configure(**pmml_options) if isinstance(regressor, XGBRegressor): pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5) else: pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13)) store_pkl(pipeline, name) mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"]) store_csv(mpg, name) if "Auto" in datasets: build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto") build_auto(ARDRegression(normalize = True), "BayesianARDAuto") build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto") build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = False) build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy = "median"), "DummyAuto") build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto") build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto") build_auto(HuberRegressor(), "HuberAuto") build_auto(LarsCV(), "LarsAuto") build_auto(LassoCV(random_state = 13), "LassoAuto") build_auto(LassoLarsCV(), "LassoLarsAuto") build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", num_iteration = 11) build_auto(LinearRegression(), "LinearRegressionAuto") build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
store_pkl(auto_mapper, "Auto.pkl")

auto_X = auto[:, 0:9]
auto_y = auto[:, 9]
print(auto_X.dtype, auto_y.dtype)

def build_auto(regressor, name):
    regressor = regressor.fit(auto_X, auto_y)
    store_pkl(regressor, name + ".pkl")
    mpg = DataFrame(regressor.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")

build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), "DecisionTreeAuto")
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                     random_state=13, n_estimators=3, max_features=0.5),
    "DecisionTreeEnsembleAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5), "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None), "GradientBoostingAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
               random_state, X_idx_sorted, X_csc=None, X_csr=None):
    """Fit another stage of ``n_classes_`` trees to the boosting model."""
    assert sample_mask.dtype == np.bool_  # np.bool was removed in NumPy >= 1.24

    loss = self.loss_
    original_y = y

    for k in range(loss.K):
        if loss.is_multi_class:
            y = np.array(original_y == k, dtype=np.float64)

        residual = loss.negative_gradient(y, y_pred, k=k, sample_weight=sample_weight)

        # induce regression tree on residuals
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter='best',
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            random_state=random_state,
            presort=self.presort)

        if self.subsample < 1.0:
            # no inplace multiplication!
            sample_weight = sample_weight * sample_mask.astype(np.float64)

        if X_csc is not None:
            tree.fit(X_csc, residual, sample_weight=sample_weight,
                     check_input=False, X_idx_sorted=X_idx_sorted)
        else:
            tree.fit(X, residual, sample_weight=sample_weight,
                     check_input=False, X_idx_sorted=X_idx_sorted)

        # update tree leaves (the two identical `if i == 0` / `else` branches
        # collapse to a single path)
        if X_csr is not None:
            loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred,
                                         sample_weight, sample_mask,
                                         function(i), k=k)
        else:
            loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
                                         sample_weight, sample_mask,
                                         function(i), k=k)

        # add tree to ensemble
        self.estimators_[i, k] = tree
        return y_pred  # note: returns inside the loop, after the first class's tree
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = rmseEval(testY, prediction)[1]
    lr2Rmse.append(rmse)
    print("\trmse: " + str(rmse))
    lr2Data[location] = {}
    for i in range(0, len(testY)):
        timestamp = testTimestamp[i]
        value = prediction[i]
        lr2Data[location][timestamp] = value

    # dtr
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, allFeatures, "target", timestampData)
    print("\tDTR #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = DecisionTreeRegressor(max_leaf_nodes=15, random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = rmseEval(testY, prediction)[1]
    dtrRmse.append(rmse)
    print("\trmse: " + str(rmse))
    dtrData[location] = {}
    for i in range(0, len(testY)):
        timestamp = testTimestamp[i]
        value = prediction[i]
        dtrData[location][timestamp] = value

    # rfr
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, allFeatures, "target", timestampData)
    print("\tRFR #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,