class ExtraTreesPreprocessorRegression(AutoSklearnPreprocessingAlgorithm):
    """Preprocessing component that fits an ``ExtraTreesRegressor``.

    Hyperparameters are coerced with ``int()`` / ``float()`` on construction,
    and the string spellings ``"None"``, ``"True"`` and ``"False"`` are
    accepted next to the real ``None`` / ``bool`` values, so the component can
    be built both from a sampled configuration and from plain Python defaults.

    ``fit`` trains the regressor; ``transform`` delegates to the fitted
    regressor's own ``transform`` method.
    NOTE(review): this relies on the installed scikit-learn still exposing
    ``transform`` on tree ensembles -- confirm against the pinned version.
    """

    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features,
                 max_leaf_nodes_or_max_depth="max_depth",
                 bootstrap=False, max_leaf_nodes=None, max_depth="None",
                 min_weight_fraction_leaf=0.0,
                 oob_score=False, n_jobs=1, random_state=None, verbose=0):
        """Validate and store the hyperparameters.

        Parameters
        ----------
        n_estimators, min_samples_leaf, min_samples_split, n_jobs, verbose :
            Converted with ``int()``.
        criterion :
            Must be ``"mse"``.
        max_features :
            Converted with ``float()``; ``fit`` turns it into a feature count.
        max_leaf_nodes_or_max_depth :
            ``"max_depth"`` uses ``max_depth`` and disables ``max_leaf_nodes``;
            any other value does the opposite.
        bootstrap :
            A bool, or the string ``"True"`` / ``"False"``.
        max_leaf_nodes, max_depth :
            ``None`` / ``"None"`` for "unlimited", otherwise int-convertible.
        min_weight_fraction_leaf :
            Stored on the instance; not forwarded to the estimator.
        oob_score, random_state :
            Stored unchanged and forwarded to the estimator.

        Raises
        ------
        ValueError
            If ``criterion`` is not ``"mse"`` or ``bootstrap`` is neither a
            bool nor ``"True"`` / ``"False"``.
        """
        self.n_estimators = int(n_estimators)
        self.estimator_increment = 10
        if criterion not in ("mse", ):
            raise ValueError("'criterion' is not in ('mse', ): "
                             "%s" % criterion)
        self.criterion = criterion

        # Only one of the two tree-size limits is ever active.
        if max_leaf_nodes_or_max_depth == "max_depth":
            self.max_leaf_nodes = None
            self.max_depth = self._optional_int(max_depth)
        else:
            self.max_leaf_nodes = self._optional_int(max_leaf_nodes)
            self.max_depth = None

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.max_features = float(max_features)
        # Previously accepted but dropped; kept as an attribute so the value
        # passed by the caller is at least inspectable.
        self.min_weight_fraction_leaf = min_weight_fraction_leaf

        # The signature default is the bool ``False``; the old string-only
        # check left ``self.bootstrap`` undefined for it.
        self.bootstrap = self._parse_bool(bootstrap, "bootstrap")

        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.preprocessor = None

    @staticmethod
    def _optional_int(value):
        """Return ``None`` for ``None`` / ``"None"``, else ``int(value)``."""
        if value is None or value == "None":
            return None
        return int(value)

    @staticmethod
    def _parse_bool(value, name):
        """Return ``value`` as a bool, accepting ``"True"`` / ``"False"``."""
        if isinstance(value, (bool, np.bool_)):
            return bool(value)
        if value == "True":
            return True
        if value == "False":
            return False
        raise ValueError("'%s' must be a bool or one of 'True', 'False': %r"
                         % (name, value))

    def fit(self, X, Y):
        """Fit the underlying ``ExtraTreesRegressor`` on ``X`` / ``Y``.

        The float ``max_features`` hyperparameter is scaled by
        ``ln(n_features) + 1``, truncated to an int and clamped to the range
        ``[1, n_features // 2]``.

        Returns
        -------
        self
        """
        from sklearn.ensemble import ExtraTreesRegressor

        num_features = X.shape[1]
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features, but never fewer than one.
        max_features = max(1, min(num_features // 2, max_features))

        self.preprocessor = ExtraTreesRegressor(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state)
        self.preprocessor.fit(X, Y)
        return self

    def transform(self, X):
        """Return ``X`` transformed by the fitted regressor.

        Raises
        ------
        NotImplementedError
            If ``fit`` has not been called yet.
        """
        if self.preprocessor is None:
            raise NotImplementedError
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {'shortname': 'ETR',
                'name': 'Extra Trees Regressor Preprocessing',
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build the ``ConfigurationSpace`` searched for this component.

        ``n_estimators``, ``criterion``, ``max_depth`` and
        ``min_weight_fraction_leaf`` are fixed; ``max_features``,
        ``min_samples_split``, ``min_samples_leaf`` and ``bootstrap`` are
        tunable.
        """
        cs = ConfigurationSpace()

        cs.add_hyperparameter(Constant("n_estimators", 100))
        cs.add_hyperparameter(Constant("criterion", "mse"))
        cs.add_hyperparameter(UniformFloatHyperparameter(
            "max_features", 0.5, 5, default=1))
        cs.add_hyperparameter(
            UnParametrizedHyperparameter(name="max_depth", value="None"))
        cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_split", 2, 20, default=2))
        cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_leaf", 1, 20, default=1))
        cs.add_hyperparameter(Constant(
            'min_weight_fraction_leaf', 0.))
        cs.add_hyperparameter(CategoricalHyperparameter(
            "bootstrap", ["True", "False"], default="False"))

        return cs
with open(out_filename+'_estimators_.txt','wt') as f: #f.write(xfr.estimators_) print >> f, xfr.estimators_ np.savetxt(out_filename+'_feature_importances_.txt',xfr.feature_importances_) print data_train.columns.shape,xfr.feature_importances_.shape with open(out_filename+'_fimp.txt','wt') as f: for feat,imp in zip(data_train.columns,xfr.feature_importances_): print >>f,"%s,%g"%(feat,imp) #with open(out_filename+'_feature_importances_.txt','wt') as f: #print >> f, xfr.feature_importances_ #with open('oob_score_.txt','wt') as f: #print >> f, xfr.oob_score_ #with open('oob_prediction_.txt','wt') as f: #print >> f, xfr.oob_prediction_ transformed_train = xfr.transform(data_train) transformed_test = xfr.transform(data_test) end = time.clock() print >> log, "time = ", end-start suffix = '_tr.csv' train_filename = (os.path.splitext(os.path.basename(sys.argv[1]))[0]+suffix) train = pd.DataFrame(transformed_train) train = pd.concat([data_train_in.ix[:,'target'],train],axis=1) train = pd.concat([data_train_in.ix[:,'id'],train],axis=1) train.to_csv(train_filename,index=0) test_filename = (os.path.splitext(os.path.basename(sys.argv[2]))[0]+suffix) test = pd.DataFrame(transformed_test) if 'target' in data_test_in: test = pd.concat([data_test_in.ix[:,'target'],test],axis=1)
class mixmodels:
    """Unweighted average of four tree-ensemble regressors.

    ``fit`` trains, in this order:

    1. a gradient-boosting regressor on all columns, then a second one on the
       columns kept by the first one's ``transform(..., threshold=...)``;
    2. the same pair with extra-trees regressors;
    3. a gradient-boosting and an extra-trees regressor on the columns
       selected by ``data_train.filter(like='var')``.

    ``predict`` / ``score`` / ``gini`` average the four final regressors.
    NOTE(review): relies on the estimators exposing ``transform`` with a
    ``threshold`` argument -- confirm against the pinned scikit-learn version.

    Parameters
    ----------
    nest :
        ``n_estimators`` for every regressor.
    max_depth :
        ``max_depth`` for every regressor (previously hard-coded to 7).
    threshold :
        Threshold passed to every ``transform`` call (previously hard-coded
        to ``"0.35*mean"``).
    """

    def __init__(self, nest=10, max_depth=7, threshold="0.35*mean"):
        self.nest = nest
        self.max_depth = max_depth
        self.threshold = threshold

    def _new_gbr(self):
        """Return an unfitted gradient-boosting regressor with shared settings."""
        return GradientBoostingRegressor(n_estimators=self.nest,
                                         max_depth=self.max_depth)

    def _new_xfr(self):
        """Return an unfitted extra-trees regressor with shared settings."""
        return ExtraTreesRegressor(n_estimators=self.nest,
                                   max_depth=self.max_depth)

    def _members(self, data_test):
        """Return ``(model, features)`` pairs for the four final regressors."""
        reduced_gbr = self.gbr.transform(data_test, threshold=self.threshold)
        reduced_xfr = self.xfr.transform(data_test, threshold=self.threshold)
        categorical = data_test[self.catcol]
        return [
            (self.gbr_tr_fit, reduced_gbr),
            (self.xfr_tr_fit, reduced_xfr),
            (self.gbr_cat_fit, categorical),
            (self.xfr_cat_fit, categorical),
        ]

    def fit(self, data_train, target):
        """Train all selector and final regressors; return ``self``.

        Besides the fitted models, the training target, the ``'var'`` column
        list and both reduced training matrices are kept on the instance.
        """
        self.target_train = target
        self.catcol = data_train.filter(like='var').columns.tolist()

        # Gradient boosting: select columns, then refit on the reduced set.
        self.gbr = self._new_gbr()
        self.gbr.fit(data_train, self.target_train)
        self.transformed_train_gbr = self.gbr.transform(
            data_train, threshold=self.threshold)
        self.gbr_tr_fit = self._new_gbr()
        self.gbr_tr_fit.fit(self.transformed_train_gbr, self.target_train)

        # Extra trees: same two-stage scheme.
        self.xfr = self._new_xfr()
        self.xfr.fit(data_train, self.target_train)
        self.transformed_train_xfr = self.xfr.transform(
            data_train, threshold=self.threshold)
        self.xfr_tr_fit = self._new_xfr()
        self.xfr_tr_fit.fit(self.transformed_train_xfr, self.target_train)

        # Models restricted to the 'var' columns.
        self.gbr_cat_fit = self._new_gbr()
        self.gbr_cat_fit.fit(data_train[self.catcol], self.target_train)
        self.xfr_cat_fit = self._new_xfr()
        self.xfr_cat_fit.fit(data_train[self.catcol], self.target_train)
        return self

    def predict(self, data_test):
        """Return the row-wise mean of the four predictions.

        The result is a ``pandas.Series`` named ``'target'``; because each
        prediction is wrapped in a fresh ``Series``, it carries a default
        positional index rather than ``data_test``'s index.
        """
        columns = [pd.Series(model.predict(features))
                   for model, features in self._members(data_test)]
        # Keyword ``axis`` works on old and new pandas; positional does not.
        mix_ave = pd.concat(columns, axis=1).mean(axis=1)
        mix_ave.name = 'target'
        return mix_ave

    def score(self, data_test, target_test):
        """Return the mean of the four regressors' ``score`` values."""
        scores = [model.score(features, target_test)
                  for model, features in self._members(data_test)]
        return sum(scores) / float(len(scores))

    def gini(self, data_test, target_test, weight_column='var11'):
        """Return the mean ``normalized_weighted_gini`` of the four regressors.

        ``weight_column`` names the column of ``data_test`` used as weights
        (default ``'var11'``, the previously hard-coded column).
        """
        weight = data_test[weight_column]
        # Fresh lists per call, as before, in case the metric mutates them.
        gns = [normalized_weighted_gini(target_test.tolist(),
                                        model.predict(features).tolist(),
                                        weight.tolist())
               for model, features in self._members(data_test)]
        return sum(gns) / float(len(gns))
with open(out_filename+'_estimators_.txt','wt') as f: #f.write(xfr.estimators_) print >> f, xfr.estimators_ np.savetxt(out_filename+'_feature_importances_.txt',xfr.feature_importances_) print data_train.columns.shape,xfr.feature_importances_.shape with open(out_filename+'_fimp.txt','wt') as f: for feat,imp in zip(data_train.columns,xfr.feature_importances_): print >>f,"%s,%g"%(feat,imp) #with open(out_filename+'_feature_importances_.txt','wt') as f: #print >> f, xfr.feature_importances_ #with open('oob_score_.txt','wt') as f: #print >> f, xfr.oob_score_ #with open('oob_prediction_.txt','wt') as f: #print >> f, xfr.oob_prediction_ transformed_train = xfr.transform(data_train,threshold="0.4*mean") transformed_test = xfr.transform(data_test,threshold="0.4*mean") end = time.clock() print >> log, "time = ", end-start suffix = '_tr.csv' train_filename = (os.path.splitext(os.path.basename(sys.argv[1]))[0]+suffix) train = pd.DataFrame(transformed_train) train = pd.concat([data_train_in.ix[:,'target'],train],axis=1) train = pd.concat([data_train_in.ix[:,'id'],train],axis=1) train.to_csv(train_filename,index=0) test_filename = (os.path.splitext(os.path.basename(sys.argv[2]))[0]+suffix) test = pd.DataFrame(transformed_test) if 'target' in data_test_in: test = pd.concat([data_test_in.ix[:,'target'],test],axis=1)
#pca = PCA(n_components=n_components) #X = pca.fit_transform(X) print(X.shape) print(type(X)) print(y.shape) print(type(y)) print(y.shape) estimator = Ridge() #selector = RFECV(estimator, step=1, cv=5) selector = ExtraTreesRegressor(n_estimators=50) selector = selector.fit(X, y) print("Optimal number of features : %d" % selector.n_features_) X = selector.transform(X) print(X.shape) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) # build a classifier clf = RandomForestRegressor(n_estimators=20) # use a full grid over all parameters param_grid = { "max_depth": [3, None], "max_features": [1, 3, 10], "min_samples_split": [2, 3, 10], "min_samples_leaf": [1, 3, 10],