def extratrees(df: pd.DataFrame, target: pd.DataFrame, test: pd.DataFrame, parameters: Dict):
    """Train 5-fold ExtraTrees models and return the averaged test probabilities.

    For every fold an ``ExtraTreesClassifier`` is fit on the training rows; its
    validation probabilities are stored in the out-of-fold rows and its test
    probabilities are accumulated, divided by the number of folds. The combined
    matrix (training rows first, test rows after) is written to
    ``data/09_oof/extra_n3.csv``.

    Args:
        df: Training features, one row per sample.
        target: Training labels, aligned positionally with ``df``.
        test: Test features.
        parameters: Currently unused.

    Returns:
        Array holding the fold-averaged probabilities of the test rows
        (9 columns).
    """
    n_splits = 5
    n_train = len(target)
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Rows [0, n_train) hold out-of-fold predictions, the rest the test average.
    oof = np.zeros((df.shape[0] + test.shape[0], 9))
    for trn_idx, val_idx in folds.split(df, target):
        train_x = df.iloc[trn_idx, :].values
        val_x = df.iloc[val_idx, :].values
        # KFold yields positions, so the labels are selected positionally as
        # well; plain ``target[idx]`` would be a label/column lookup.
        train_y = target.iloc[trn_idx].values
        val_y = target.iloc[val_idx].values
        classifier = ExtraTreesClassifier(n_jobs=14, n_estimators=100, max_depth=12)
        classifier.fit(train_x, train_y)
        y_hat = classifier.predict_proba(val_x)
        print(log_loss(val_y, y_hat))
        print(oof.shape, y_hat.shape)
        oof[val_idx] = y_hat
        pred = classifier.predict_proba(test.values)
        oof[n_train:, :] += pred / n_splits
    print(oof.shape)
    oof = pd.DataFrame(oof)
    oof.to_csv("data/09_oof/extra_{}.csv".format("n3"))
    return oof[n_train:].values
def train_ensemble(train_X, train_y, test_X):
    """Fit ExtraTrees on TF-IDF features and save train/test probabilities.

    The TF-IDF weights are learned from the training matrix only and then
    applied to the test matrix, so both sets are expressed in the same
    feature space as the one the model was trained on.

    Args:
        train_X: Training feature matrix (cast to float32 before weighting).
        train_y: Training labels.
        test_X: Test feature matrix with the same columns as ``train_X``.

    Side effects:
        Passes the test probabilities to ``save_prediction`` with
        ``./test/extratrees.stack.csv`` and the training probabilities with
        ``./train/extratrees.stack.csv``.
    """
    tfidf = TfidfTransformer()
    # ``astype`` already returns a new array, so the inputs stay untouched.
    train_set_X = tfidf.fit_transform(train_X.astype(np.float32)).toarray()
    # Reuse the IDF weights fitted on the training data; re-fitting on the
    # test set would scale its columns differently from what the model saw.
    test_set_X = tfidf.transform(test_X.astype(np.float32)).toarray()

    model = ExtraTreesClassifier(n_estimators=300, criterion="entropy",
                                 max_features=30, max_depth=25)
    model.fit(train_set_X, train_y)

    pred_y = model.predict_proba(test_set_X)
    save_prediction("./test/extratrees.stack.csv", pred_y)
    pred_y = model.predict_proba(train_set_X)
    save_prediction("./train/extratrees.stack.csv", pred_y)
def runET(train_X, train_y, test_X, test_y=None, test_X2=None, rounds=100, depth=20, leaf=10, feat=0.2, min_data_split_val=2, seed_val=0, job=-1):
    """Fit an ExtraTrees classifier and score positive-class probabilities.

    Args:
        train_X, train_y: Training features and labels.
        test_X: Features to predict.
        test_y: Optional labels for ``test_X``; when given, train and test
            ROC AUC are computed and printed.
        test_X2: Optional second feature set to predict.
        rounds, depth, leaf, feat, min_data_split_val, seed_val, job:
            Forwarded as ``n_estimators``, ``max_depth``, ``min_samples_leaf``,
            ``max_features``, ``min_samples_split``, ``random_state`` and
            ``n_jobs``.

    Returns:
        ``(test_preds, test_loss, test_preds2, model)``; ``test_loss`` is 0
        without ``test_y`` and ``test_preds2`` is 0 without ``test_X2``.
    """
    settings = dict(
        n_estimators=rounds,
        max_depth=depth,
        min_samples_split=min_data_split_val,
        min_samples_leaf=leaf,
        max_features=feat,
        n_jobs=job,
        random_state=seed_val,
    )
    model = ExtraTreesClassifier(**settings)
    model.fit(train_X, train_y)

    def positive_proba(features):
        # Column 1 of predict_proba is used as the score.
        return model.predict_proba(features)[:, 1]

    train_preds = positive_proba(train_X)
    test_preds = positive_proba(test_X)
    test_preds2 = positive_proba(test_X2) if test_X2 is not None else 0

    test_loss = 0
    if test_y is not None:
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth, leaf, feat : ", depth, leaf, feat)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2, model
def try_params(n_iterations, params):
    """Train an extra-trees model for a given budget and report its metrics.

    Reads the module-level ``trees_per_iteration``, ``x_train``, ``y_train``,
    ``x_test`` and ``y_test``.

    Args:
        n_iterations: Budget; multiplied by ``trees_per_iteration`` and
            rounded to obtain ``n_estimators``.
        params: Extra keyword arguments for the classifier.

    Returns:
        Dict with the test-set ``loss``/``log_loss`` (identical values) and
        ``auc``.
    """
    n_estimators = int(round(n_iterations * trees_per_iteration))
    # Single pre-formatted strings keep the output identical whether print is
    # a statement or a function.
    print("n_estimators: {}".format(n_estimators))
    pprint(params)

    clf = XT(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)
    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_train)[:, 1]
    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))
    print("\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc))

    p = clf.predict_proba(x_test)[:, 1]
    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))
    print("# testing | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc))

    return {'loss': ll, 'log_loss': ll, 'auc': auc}
def predict_et():
    """Fit ExtraTrees on the CSV training data and write test probabilities.

    Reads ``data/X_train.csv``, ``data/y_train.csv``, ``data/X_test.csv`` and
    ``data/y_test.csv``, prints train/test accuracy and the multiclass log
    loss on the test set, and writes the probabilities to ``et_output.csv``
    indexed by the test ``id`` column.
    """
    X = pd.read_csv('data/X_train.csv', header=0)
    y = pd.read_csv('data/y_train.csv', header=0)['fault_severity']
    testX = pd.read_csv('data/X_test.csv', header=0)
    testY = pd.read_csv('data/y_test.csv', header=0)['fault_severity']

    et = ExtraTreesClassifier(n_estimators=440, random_state=1)
    et.fit(X, y)
    print(et.score(X, y))
    print(et.score(testX, testY))

    # Predict once and reuse the result; the forest is deterministic after
    # fitting, so a second predict_proba call only repeats the work.
    test_proba = et.predict_proba(testX)
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    submission = pd.DataFrame(test_proba, index=testX.id, columns=pred_cols)
    print(multiclass_log_loss(testY.values, submission.values))
    submission.to_csv('et_output.csv', index_label='id')
def stack_extra_trees_layer2(features, labels, test_feature):
    """Train one ExtraTrees model per stacking fold and collect its scores.

    Args:
        features: Path of the ``.npy`` file with the training features.
        labels: Labels passed to ``stack_split`` together with the features.
        test_feature: Path of the ``.npy`` file with the test features.

    Returns:
        ``(fold_score, test_score)``: the concatenated column-1 probabilities
        of every held-out fold, and one list of column-1 test probabilities
        per fold model.

    Side effects:
        Dumps every fitted model to ``model_path`` with joblib.
    """
    features = np.load(features)
    test = np.load(test_feature)
    fold_split, feature_split, label_split = stack_split(features, labels, 5)
    fold_score = []
    test_score = []
    print("\nInitiate stack extra_trees")
    for i in range(len(fold_split)):
        number = i + 1
        print("\nProcessing random forest model number:{}".format(number))
        extra_trees = ExtraTreesClassifier(n_estimators=450, max_depth=4,
                                           criterion='entropy')
        extra_trees.fit(feature_split["feature_{}".format(number)],
                        label_split["label_{}".format(number)])
        print("Training complete")
        stack_score = extra_trees.predict_proba(
            fold_split["fold_{}".format(number)])
        print("fold score predicted")
        # Predict on the loaded test matrix, not on the file path argument.
        test_prediction = extra_trees.predict_proba(test)
        print("test score predicted")
        test_score.append(test_prediction[:, 1].tolist())
        fold_score += stack_score[:, 1].tolist()
        joblib.dump(extra_trees,
                    model_path + "ET_layer_2_model_{}.pkl".format(number))
        print("ET model nubmer:{}".format(number) + " complete")
    return fold_score, test_score
def objective(self, args):
    """Hyperopt objective: fit ExtraTrees and score the thresholded F1.

    The classifier is fit on ``self.X_train``; a decision threshold is chosen
    by ``self.find_best_threshold`` from the validation probabilities and
    applied to the test probabilities.

    Args:
        args: Raw search-space sample, converted by ``self.input_converter``.

    Returns:
        Dict with ``loss`` (negative thresholded F1 on the test set) and
        ``status``.

    Side effects:
        Appends ``(default_f1, thresholded_f1, threshold)`` to
        ``self.improved``, prints the scores and plots a confusion matrix.
    """
    args_ = self.input_converter(args)
    print(args_)

    # Default the seed to 42 unless the search space supplies its own. This
    # replaces a bare ``except:`` retry that existed only to survive a
    # duplicate ``random_state`` keyword.
    params = {'random_state': 42}
    params.update(args_)
    CLASSIFIER = ExtraTreesClassifier(**params)
    CLASSIFIER.fit(self.X_train, self.y_train)

    # Baseline: F1 of the classifier's own default decision rule.
    nt_preds = CLASSIFIER.predict(self.X_test)
    nt_score = f1_score(self.y_test, nt_preds)

    pred_ = CLASSIFIER.predict_proba(self.X_val)[:, 1]
    best, _ = self.find_best_threshold(pred_)

    preds = CLASSIFIER.predict_proba(self.X_test)[:, 1]
    test_preds = [1 if p > best else 0 for p in preds]
    score = f1_score(self.y_test, test_preds)
    self.improved.append((nt_score, score, best))

    for value in (nt_score, score, best):
        print('\n ============================ \n {} \n ============================ \n'.format(value))
    print('\n {} \n'.format(args_))

    cm = np.array(confusion_matrix(self.y_test, test_preds))
    plot_confusion_matrix(cm=cm, target_names=['nothing', 'spike'])
    return {'loss': -score, 'status': STATUS_OK}
class MyExtraTree(MyClassifier):
    """Thin wrapper around ``ExtraTreesClassifier`` with importance helpers."""

    def __init__(self, params=None):
        """Create the classifier.

        Args:
            params: Keyword arguments for ``ExtraTreesClassifier``. The mapping
                is copied so that ``update_params`` never mutates the caller's
                dict or a default shared between instances.
        """
        self._params = dict(params) if params else {}
        self._extree = ExtraTreesClassifier(**(self._params))

    def update_params(self, updates):
        """Merge ``updates`` into the parameters and rebuild the classifier."""
        self._params.update(updates)
        self._extree = ExtraTreesClassifier(**(self._params))

    def fit(self, Xtrain, ytrain):
        """Fit the wrapped classifier."""
        self._extree.fit(Xtrain, ytrain)

    def predict_proba(self, Xtest, option=None):
        """Return column 1 of the predicted probabilities (``option`` unused)."""
        return self._extree.predict_proba(Xtest)[:, 1]

    def predict_proba_multi(self, Xtest, option=None):
        """Return the full probability matrix (``option`` is unused)."""
        return self._extree.predict_proba(Xtest)

    def plt_feature_importance(self, fname_list, f_range=None):
        """Plot a horizontal bar chart of feature importances.

        Args:
            fname_list: Feature names, indexed like the training columns.
            f_range: Ranks (0 = most important) to show; all when empty.
        """
        importances = self._extree.feature_importances_
        # Spread of the importance across the individual trees (error bars).
        std = np.std([tree.feature_importances_
                      for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]
        fname_array = np.array(fname_list)
        if not f_range:
            f_range = list(range(indices.shape[0]))
        n_f = len(f_range)
        selected = indices[f_range]
        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[selected], color="b",
                 xerr=std[selected], ecolor='k', align="center")
        plt.yticks(range(n_f), fname_array[selected])
        plt.ylim([-1, n_f])
        plt.show()

    def list_feature_importance(self, fname_list, f_range=None, return_list=False):
        """Print the feature ranking and optionally return the ranked indices.

        Args:
            fname_list: Feature names, indexed like the training columns.
            f_range: Ranks (0 = most important) to list; all when empty.
            return_list: When true, return the feature indices of the listed
                ranks; otherwise nothing is returned.
        """
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]
        print('Extra tree feature ranking:')
        if not f_range:
            f_range = list(range(indices.shape[0]))
        for f in f_range:
            print('{0:d}. feature[{1:d}] {2:s} ({3:f})'.format(
                f + 1, indices[f], fname_list[indices[f]], importances[indices[f]]))
        if return_list:
            return [indices[f] for f in f_range]
def ERFC_Classifier(X_train, X_cv, X_test, Y_train, Y_cv, Y_test, Actual_DS):
    """Fit ExtraTrees, report CV accuracy/log loss and predict three sets.

    Args:
        X_train, Y_train: Training features and encoded labels.
        X_cv, Y_cv: Validation features and encoded labels.
        X_test: Test features.
        Y_test: Unused.
        Actual_DS: Additional feature set to predict.

    Returns:
        Three DataFrames with the class probabilities for ``X_cv``,
        ``X_test`` and ``Actual_DS``.
    """
    print("***************Starting Extreme Random Forest Classifier***************")
    t0 = time()
    clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    clf.fit(X_train, Y_train)

    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)
    print("Extreme Random Forest Classifier - {0:.2f}%".format(100 * score))

    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    # Normalise every row by its own total (axis=0 aligns the row sums with
    # the row index); 'pct' is then the share of the most frequent prediction
    # for each actual class.
    Summary['pct'] = Summary.divide(Summary.sum(axis=1), axis=0).max(axis=1) * 100
    print(Summary)

    # Check with log loss function
    epsilon = 1e-15
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=epsilon, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending Extreme Random Forest Classifier***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
def eval_param(params):
    """Evaluate one set of ExtraTrees parameters with 3-fold stratified CV.

    Uses the module-level ``train``, ``target`` and ``test`` arrays. The
    fold-averaged test probabilities are written to
    ``hyperopt_preds/pred<N>.txt`` and the score is appended to the global
    ``df_results`` table, which is saved to ``hyperopt_results_sgd.csv``.

    Args:
        params: Keyword arguments for ``ExtraTreesClassifier``.

    Returns:
        Hyperopt result dict whose ``loss`` is the mean log loss over folds.
    """
    global df_results, train, target, test
    print ("Training with params : ")
    print (params)
    random_state = 42
    avg_score = 0.
    n_folds = 3
    fold_test_probas = []
    skf = StratifiedKFold(target, n_folds=n_folds, random_state=random_state)
    for train_index, cv_index in skf:
        x_train, x_cv = train[train_index], train[cv_index]
        y_train, y_cv = target[train_index], target[cv_index]
        clf = ExtraTreesClassifier(**params).fit(x_train, y_train)
        # predict_proba only takes the feature matrix.
        predict_cv = clf.predict_proba(x_cv)
        avg_score += -log_loss(y_cv, predict_cv)
        # Keep the full (n_samples, n_classes) matrix; it cannot be added in
        # place to a 1-D buffer.
        fold_test_probas.append(clf.predict_proba(test))
    predict = np.sum(fold_test_probas, axis=0) / n_folds
    avg_score /= n_folds
    # store
    new_row = pd.DataFrame([np.append([avg_score], list(params.values()))],
                           columns=np.append(['score'], list(params.keys())))
    df_results = pd.concat([df_results, new_row], ignore_index=True)
    np.savetxt('hyperopt_preds/pred' + str(df_results.index.max()) + '.txt',
               predict, fmt='%s')
    df_results.to_csv('hyperopt_results_sgd.csv')
    print ("\tScore {0}\n\n".format(avg_score))
    return {'loss': - avg_score, 'status': STATUS_OK}
def extra_forest(train_data, var_count, y, validate, test_data):
    """Fit a fixed-configuration ExtraTrees model and predict two sets.

    Args:
        train_data: Training features.
        var_count: Unused.
        y: Training labels; flattened with ``np.ravel`` before fitting.
        validate: Validation features.
        test_data: Test features.

    Returns:
        ``(valid_pred, test_pred)`` class-probability matrices.
    """
    settings = {
        "n_estimators": 350,
        "max_depth": 10,
        "min_samples_leaf": 10,
        "random_state": 1234,
        "max_features": 0.75,
    }
    fitted = ExtraTreesClassifier(**settings)
    fitted.fit(train_data, np.ravel(y))
    return tuple(fitted.predict_proba(part) for part in (validate, test_data))
def et(train_data, train_label, val_data, val_label, test_data, name="extratrees_submission.csv"):
    """Train a 10-tree ExtraTrees model, report validation loss, save test output.

    Args:
        train_data, train_label: Training features and labels.
        val_data, val_label: Validation features and labels, scored with
            ``preprocess.evaluation``.
        test_data: Test features.
        name: File name handed to ``preprocess.saveResult``.
    """
    # Single pre-formatted strings keep the output identical whether print is
    # a statement or a function.
    print("start training ExtraTrees...")
    etClf = ExtraTreesClassifier(n_estimators=10)
    etClf.fit(train_data, train_label)

    # evaluate on validation set
    val_pred_label = etClf.predict_proba(val_data)
    logloss = preprocess.evaluation(val_label, val_pred_label)
    print("logloss of validation set: {}".format(logloss))

    print("Start classify test set...")
    test_label = etClf.predict_proba(test_data)
    preprocess.saveResult(test_label, filename=name)
def et(series, n_folds, clfparams, featureparams, aggregateparams, include,
       exclude, save_test_predictions, save_oob_predictions,
       skip_cross_validation, _run):
    """Cross-validate an ``ET`` classifier and optionally save predictions.

    Args:
        series: Prefix of the output file names.
        n_folds: Number of stratified folds.
        clfparams: Keyword arguments for ``ET``.
        featureparams: Keyword arguments for ``TelstraData``.
        aggregateparams: Keyword arguments for ``get_train_test_features``.
        include, exclude: Forwarded to ``TelstraData``.
        save_test_predictions: Refit on all training data and write the test
            probabilities to ``<series>_test_<timestamp>.csv``.
        save_oob_predictions: Write the out-of-fold probabilities to
            ``<series>_<timestamp>.csv`` (only possible when cross-validation
            ran).
        skip_cross_validation: Skip the folds and report a loss of 999.
        _run: Run object whose ``info`` mapping receives per-fold losses,
            feature names and averaged feature importances.

    Returns:
        Multiclass log loss of the out-of-fold predictions, or 999 when
        cross-validation is skipped.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Named ``timestamp`` so that the ``time`` module name is not shadowed.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    pred = None  # stays None when cross-validation is skipped
    if skip_cross_validation:
        loss = 999.
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        feature_importances_ = 0
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            clf = ET(**clfparams)
            clf.fit(Xtr, ytr)
            pred.iloc[itest, :] = clf.predict_proba(Xte)
            trainloss = multiclass_log_loss(ytr, clf.predict_proba(Xtr))
            _run.info['trainloss'].append(trainloss)
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            _run.info['loss'].append(loss)
            # Running mean of the importances over the folds.
            feature_importances_ = (feature_importances_
                                    + clf.feature_importances_ / n_folds)
        loss = multiclass_log_loss(y, pred.values)
        _run.info['features'] = list(Xtr.columns)
        _run.info['feature_importances'] = list(feature_importances_)
    # Optionally save oob predictions; there are none without cross-validation.
    if save_oob_predictions and pred is not None:
        filename = '{}_{}.csv'.format(series, timestamp)
        pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, timestamp)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = ET(**clfparams)
        clf.fit(Xtr, ytr)
        predtest = pd.DataFrame(clf.predict_proba(Xte), index=yte.index,
                                columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def et(series, n_folds, clfparams, featureparams, aggregateparams, include,
       exclude, save_test_predictions, save_oob_predictions,
       skip_cross_validation, _run):
    """Run stratified CV for an ``ET`` classifier; optionally save outputs.

    Args:
        series: Prefix of the output file names.
        n_folds: Number of stratified folds.
        clfparams: Keyword arguments for ``ET``.
        featureparams: Keyword arguments for ``TelstraData``.
        aggregateparams: Keyword arguments for ``get_train_test_features``.
        include, exclude: Forwarded to ``TelstraData``.
        save_test_predictions: Refit on the full training data and write the
            test probabilities to ``<series>_test_<timestamp>.csv``.
        save_oob_predictions: Write out-of-fold probabilities to
            ``<series>_<timestamp>.csv``; ignored when cross-validation is
            skipped because no such predictions exist then.
        skip_cross_validation: Skip the folds and report a loss of 999.
        _run: Run object whose ``info`` mapping collects the fold losses,
            feature names and averaged feature importances.

    Returns:
        Multiclass log loss over all out-of-fold predictions, or 999 when
        cross-validation is skipped.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Not called ``time`` to avoid hiding the module of the same name.
    stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    pred = None  # filled only when cross-validation runs
    if skip_cross_validation:
        loss = 999.
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        feature_importances_ = 0
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            clf = ET(**clfparams)
            clf.fit(Xtr, ytr)
            pred.iloc[itest, :] = clf.predict_proba(Xte)
            trainloss = multiclass_log_loss(ytr, clf.predict_proba(Xtr))
            _run.info['trainloss'].append(trainloss)
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            _run.info['loss'].append(loss)
            # Each fold contributes 1/n_folds of its importances.
            feature_importances_ = (feature_importances_
                                    + clf.feature_importances_ / n_folds)
        loss = multiclass_log_loss(y, pred.values)
        _run.info['features'] = list(Xtr.columns)
        _run.info['feature_importances'] = list(feature_importances_)
    # Optionally save oob predictions (requires that the folds were run).
    if save_oob_predictions and pred is not None:
        pred.to_csv('{}_{}.csv'.format(series, stamp), index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, stamp)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = ET(**clfparams)
        clf.fit(Xtr, ytr)
        predtest = pd.DataFrame(clf.predict_proba(Xte), index=yte.index,
                                columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def process_sylvine(Xtrain, ytrain, Xval, Xtest, global_params):
    """Select features, add Isomap components and predict with ExtraTrees.

    Args:
        Xtrain, ytrain: Training features (2-D array) and labels.
        Xval, Xtest: Validation and test features.
        global_params: Mapping; ``n_jobs`` is forwarded to the classifier.

    Returns:
        ``(yvalfinal, ytestfinal)``: rounded column-1 probabilities for the
        validation and test sets.
    """
    print('ITS A SYLVINE TIME')
    print('')

    # Boolean mask over the 20 input columns; six of them are kept.
    goods = np.array([
        False, False, False, False, False, False, True, False, True, True,
        False, False, False, False, True, True, False, False, False, True
    ])
    Xnewtrain = np.array(Xtrain[:, goods])
    Xnewtest = np.array(Xtest[:, goods])
    Xnewval = np.array(Xval[:, goods])

    t0 = time.time()
    iso = Isomap(n_neighbors=20, n_components=3).fit(Xnewtrain[:, :6])
    print('ISOSTAS !!!')
    # The whole quotient is the print argument: elapsed minutes.
    print((time.time() - t0) / 60.)

    t0 = time.time()
    Xisotrain = iso.transform(Xnewtrain[:, :6])
    Xisotest = iso.transform(Xnewtest[:, :6])
    Xisoval = iso.transform(Xnewval[:, :6])
    print('ISOSTAS RETURNED !!!')
    print((time.time() - t0) / 60.)

    # Append the three Isomap components to the selected columns.
    Xnewtrain = np.hstack((Xnewtrain, Xisotrain))
    Xnewtest = np.hstack((Xnewtest, Xisotest))
    Xnewval = np.hstack((Xnewval, Xisoval))

    modelrf = ExtraTreesClassifier(n_estimators=10000,
                                   n_jobs=global_params['n_jobs'])
    modelrf.fit(Xnewtrain, ytrain)
    print((time.time() - t0) / 60.)

    ytestrf = modelrf.predict_proba(Xnewtest)[:, 1]
    yvalrf = modelrf.predict_proba(Xnewval)[:, 1]
    ytestfinal = np.round(ytestrf)
    yvalfinal = np.round(yvalrf)
    return yvalfinal, ytestfinal
def predictSingle():
    """Train ExtraTrees on ``newTrain.csv`` and write ``output_et.csv``.

    The model is fit once to rank the features, refit after
    ``removeUnimporantFeat`` pruned the train and test frames, scored with
    10-fold cross-validated log loss (printed) and finally used to predict
    the class probabilities of ``newTest.csv``. Uses the module-level
    ``seed``.
    """
    train = pd.read_csv('newTrain.csv')
    train = train.drop(['AnimalID'], axis=1)
    label = train['OutcomeType']
    train = train.drop(['OutcomeType'], axis=1)

    test = pd.read_csv('newTest.csv')
    test_ids = test.ID  # not named ``id``: that would shadow the builtin
    test = test.drop(['ID'], axis=1)

    et = ExtraTreesClassifier(n_estimators=800, max_features='sqrt',
                              max_depth=10, min_samples_leaf=2,
                              random_state=seed)
    et.fit(train, label)
    plotFeatureImportance(et, train)
    train, test = removeUnimporantFeat(train, test, et)
    et.fit(train, label)

    # cross-validation: the folds are contiguous and unshuffled, so a seed has
    # no effect; recent scikit-learn rejects ``random_state`` when ``shuffle``
    # is False, hence it is not passed.
    kfold = KFold(n_splits=10)
    score = cross_val_score(et, train, label, scoring='neg_log_loss', cv=kfold)
    print(-score.mean())

    columns = et.classes_
    predictions = et.predict_proba(test)
    output_et = pd.DataFrame(predictions, columns=columns)
    output_et = pd.concat([test_ids, output_et], axis=1)
    output_et.to_csv('output_et.csv', index=False)
def _cascade_layer(self, X, y=None, layer=0):
    """Train or apply one cascade layer of forest pairs.

    With ``y`` given, ``self.n_cascadeRF`` pairs of (random forest,
    extra-trees) are fit and stored on ``self`` under
    ``_casprf<n_layer>_<i>`` / ``_cascrf<n_layer>_<i>``; each pair contributes
    the sum of its two out-of-bag decision functions. Without ``y`` the pairs
    stored for ``layer`` are loaded and each contributes the sum of its two
    ``predict_proba`` outputs.

    Args:
        X: Feature matrix.
        y: Labels; ``None`` selects prediction mode.
        layer: Index of the stored layer used in prediction mode.

    Returns:
        List with one summed probability array per forest pair.
    """
    n_cascadeRF = getattr(self, 'n_cascadeRF')
    prf_pred = []
    if y is not None:
        for irf in range(n_cascadeRF):
            # Build fresh estimators for every pair: refitting one shared
            # object would make every stored name point to the same (last
            # fitted) model.
            prf = RandomForestClassifier(
                n_estimators=100, max_features=8, bootstrap=True,
                criterion="entropy", min_samples_split=20, max_depth=None,
                class_weight='balanced', oob_score=True)
            crf = ExtraTreesClassifier(
                n_estimators=100, max_depth=None, bootstrap=True,
                oob_score=True)
            prf.fit(X, y)
            crf.fit(X, y)
            setattr(self, '_casprf{}_{}'.format(self.n_layer, irf), prf)
            setattr(self, '_cascrf{}_{}'.format(self.n_layer, irf), crf)
            # Out-of-place sum so the fitted ``oob_decision_function_`` of
            # the stored forest is not overwritten.
            prf_pred.append(prf.oob_decision_function_
                            + crf.oob_decision_function_)
    else:
        for irf in range(n_cascadeRF):
            prf = getattr(self, '_casprf{}_{}'.format(layer, irf))
            crf = getattr(self, '_cascrf{}_{}'.format(layer, irf))
            prf_pred.append(prf.predict_proba(X) + crf.predict_proba(X))
    return prf_pred
class ExtraTreesClassifierMetaPrim(primitive):
    """Ensemble primitive that replaces ``X`` by ExtraTrees class probabilities."""

    def __init__(self, random_state=0):
        super(ExtraTreesClassifierMetaPrim, self).__init__(name='ExtraTreesMetaClassifier')
        self.id = 61
        self.hyperparams = []
        self.type = 'ensemble'
        self.description = "An extra-trees classifier. This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = ExtraTreesClassifier(random_state=random_state, n_jobs=5)
        self.accept_type = 'c'

    def can_accept(self, data):
        """Delegate to ``can_accept_c`` for the 'Classification' task."""
        return self.can_accept_c(data, 'Classification')

    def is_needed(self, data):
        """Always report the primitive as needed."""
        return True

    def fit(self, data):
        """Fit the model on ``X``/``Y`` of the data returned by ``handle_data``."""
        prepared = handle_data(data)
        self.model.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Predict and return ``{0: output}``.

        ``output`` gains ``predictions`` and ``proba_predictions`` (a frame
        with one column per class); its ``X`` is replaced by the same
        probabilities under ``<class>_<name>Pred`` column names.
        """
        output = handle_data(data)
        features = output['X']
        output['predictions'] = self.model.predict(features)
        proba = self.model.predict_proba(features)
        classes = list(self.model.classes_)
        pred_columns = ["{}_{}Pred".format(c, self.name) for c in classes]
        output['X'] = pd.DataFrame(proba, columns=pred_columns)
        output['proba_predictions'] = pd.DataFrame(proba, columns=classes)
        # Self-assignment kept on purpose: it still fails when 'Y' is absent.
        output['Y'] = output['Y']
        return {0: output}
class ExtraTreeModel(BaseModel):
    """ExtraTrees model that takes a DataFrame holding features and label."""

    def __init__(self, model_params):
        # NOTE(review): ``super(BaseModel, self)`` skips ``BaseModel.__init__``
        # and calls its parent's initialiser instead - confirm whether
        # ``super(ExtraTreeModel, self)`` was intended.
        super(BaseModel, self).__init__()
        self.model = ExtraTreesClassifier(**model_params)

    def fit(self, data, dep_var_name=None):
        """Fit on ``data``, using column ``dep_var_name`` as the label.

        Exits the process through ``sys.exit`` when ``dep_var_name`` is
        missing.
        """
        if dep_var_name is None:
            sys.exit('dep_var_name is needed for fit function.')
        self.dep_var_name = dep_var_name
        labels = data[self.dep_var_name].values
        # ``drop`` returns a new frame, so the caller's data is left as is.
        features = data.drop(self.dep_var_name, axis=1)
        self.model.fit(features, labels)

    def predict(self, data):
        """Return column 1 of the predicted probabilities as a Series.

        The label column is removed first when present; the result carries
        the index of the feature frame.
        """
        has_label = self.dep_var_name in data.columns
        features = data.drop(self.dep_var_name, axis=1) if has_label else data
        # predict_proba yields a plain array, so the index is re-attached.
        scores = self.model.predict_proba(features)
        return pd.Series(scores[:, 1], index=features.index)
def ExtraTree_prediction(feature_data, result_data):
    """Print 5-fold cross-validated metrics of an ExtraTrees classifier.

    Every sample is predicted by the model of the fold that held it out; the
    collected predictions are scored with accuracy, specificity, sensitivity,
    MCC, ROC AUC and macro F1. Uses the module-level ``random_state``.

    Args:
        feature_data: Feature array, indexable by fold index arrays.
        result_data: Label array aligned with ``feature_data``.
    """
    n_splits = 5
    # Stratified sampling keeps the class proportions of the training and
    # test parts equal to those of the full data set; it needs the targets.
    splitter = StratifiedKFold(n_splits=n_splits)
    n_samples = feature_data.shape[0]
    all_pred = np.zeros(n_samples)
    all_proba = np.zeros(n_samples)
    for train_index, test_index in splitter.split(feature_data, result_data):
        class_weight = {0: 1, 1: 1}
        clf = ExtraTreesClassifier(random_state=random_state,
                                   class_weight=class_weight)
        clf.fit(feature_data[train_index], result_data[train_index].ravel())
        held_out = feature_data[test_index]
        all_pred[test_index] = clf.predict(held_out)
        all_proba[test_index] = clf.predict_proba(held_out)[:, 1]

    confmat = confusion_matrix(result_data, all_pred)
    # Rows are actual classes, columns predicted classes.
    sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
    sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
    acc = accuracy_score(result_data, all_pred)
    print('1. The acc score of the model {}\n'.format(acc))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    mcc = matthews_corrcoef(result_data, all_pred)
    print('4. The mcc score of the model {}\n'.format(mcc))
    auc = roc_auc_score(result_data, all_proba, average='macro')
    print('9. The auc score of the model {}\n'.format(auc))
    f1 = f1_score(result_data, all_pred, average='macro')
    print('5. The F-1 score of the model {}\n'.format(f1))
def train_classifier(prefix='atx', nside=32, ds=4, color_thresh=30, test_size=0.5):
    """Train an ExtraTrees classifier on colour features and plot its ROC.

    Args:
        prefix: Data set prefix; only ``'atx'`` has a colour set defined.
        nside: Forwarded to ``load_labeled``.
        ds: Forwarded to ``get_features``.
        color_thresh: Forwarded to ``get_features`` as ``thresh``.
        test_size: Fraction of the samples held out for evaluation.

    Raises:
        ValueError: If no colour set is known for ``prefix``.
    """
    if prefix == 'atx':
        color_name = 'pool'
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("no color set defined for prefix %r" % (prefix,))

    X_img, y = load_labeled(prefix=prefix, nside=nside, quick=False)
    colors = get_colors(name=color_name, quick=True)
    print('...getting features...')
    X = get_features(X_img, colors, ds=ds, thresh=color_thresh)
    print('...done getting features...')

    from sklearn.ensemble import ExtraTreesClassifier
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Older scikit-learn releases ship it in ``cross_validation``.
        from sklearn.cross_validation import train_test_split
    from sklearn import metrics

    rf = ExtraTreesClassifier(n_estimators=200, n_jobs=6, max_features=0.02)
    # Honour the ``test_size`` argument instead of a fixed 0.5.
    X_train, X_test, y_train, y_test, img_train, img_test = train_test_split(
        X, y, X_img, test_size=test_size)
    print('...fitting...')
    rf.fit(X_train, y_train)

    y_proba = rf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba)
    auc = metrics.auc(fpr, tpr)
    pl.clf()
    pl.plot(fpr, tpr, 'b-o')
    pl.plot(fpr, fpr / np.mean(y), 'r--')
    pl.ylim(0, 1)
    pl.xlim(0, 1)
    pl.title('AUC: %0.3f' % auc)
    for i, th in enumerate(thresholds):
        print('{} {} {}'.format(th, tpr[i], tpr[i] / fpr[i]))

    prob_thresh = 0.6
    # Positive test samples below / above the threshold; computed but not
    # used further in this function.
    wh_missed = np.where((y_proba < prob_thresh) & (y_test == 1))[0]
    wh_ok = np.where((y_proba > prob_thresh) & (y_test == 1))[0]
def load_train_data(r_seed):
    """Fit three classifiers and append their mean probabilities as features.

    The module-level ``train`` frame is split with ``test_size=0.8``; the
    models are fit on the smaller part and their averaged class probabilities
    for the larger part are added as ``new_feat_1`` .. ``new_feat_8``. Uses
    the module-level ``train_features`` and ``jj``.

    Args:
        r_seed: Random state of the train/validation split.

    Returns:
        ``(all_data, Y_valid, rbm1, rbm2, rbm3)``: the augmented validation
        frame, its labels and the three fitted models.
    """
    X_train, X_valid, Y_train, Y_valid = train_test_split(
        train[train_features], train['Response'], test_size=0.8,
        random_state=r_seed)
    rbm1 = ExtraTreesClassifier(n_estimators=500, max_features=0.4, n_jobs=32,
                                random_state=jj, verbose=1).fit(X_train, Y_train)
    rbm2 = RandomForestClassifier(n_estimators=300, max_features=0.28,
                                  n_jobs=32, verbose=1,
                                  random_state=jj).fit(X_train, Y_train)
    rbm3 = GradientBoostingClassifier(n_estimators=48, max_depth=11,
                                      subsample=0.8, min_samples_leaf=5,
                                      verbose=1,
                                      random_state=jj).fit(X_train, Y_train)

    # Unweighted mean of the three probability matrices.
    res_mean = (rbm1.predict_proba(X_valid) + rbm2.predict_proba(X_valid)
                + rbm3.predict_proba(X_valid))
    res_mean = res_mean / 3.0

    feats = ['new_feat_%d' % (i) for i in range(1, 9)]
    new_data = pd.DataFrame(res_mean, columns=feats)
    # Re-use the validation index so that concat aligns row by row.
    new_data.index = X_valid.index
    all_data = pd.concat([X_valid, new_data], axis=1)
    print(all_data.shape)
    return all_data, Y_valid, rbm1, rbm2, rbm3
def learn(x, y, test_x):
    """Fit ExtraTrees with the settings from ``variables`` and predict.

    Args:
        x, y: Training features and labels.
        test_x: Features to predict.

    Returns:
        ``(prediction_list, prediction_list_prob)``: predicted labels and
        class probabilities for ``test_x``.
    """
    clf = ExtraTreesClassifier(
        n_jobs=-1,
        n_estimators=variables.n_estimators_et,
        max_depth=variables.max_depth_et,
        random_state=0,
        min_samples_split=variables.min_samples_split_et,
        min_samples_leaf=variables.min_samples_leaf_et,
        max_features=variables.max_feature_et,
        max_leaf_nodes=variables.max_leaf_nodes_et,
        criterion=variables.criterion_et,
        min_impurity_split=variables.min_impurity_split_et,
        class_weight=variables.cw_et).fit(x, y)

    # Report the configuration on a single line, "label= value" pairs
    # separated by spaces.
    settings = [
        ("n_estimators=", variables.n_estimators_et),
        ("max_depth=", variables.max_depth_et),
        ("min_samples_split=", variables.min_samples_split_et),
        ("min_samples_leaf=", variables.min_samples_leaf_et),
        ("max_features=", variables.max_feature_et),
        ("max_leaf_nodes=", variables.max_leaf_nodes_et),
        ("criterion=", variables.criterion_et),
        ("min_impurity_split=", variables.min_impurity_split_et),
        ("class_weight=", variables.cw_et),
    ]
    print(" ".join("{} {}".format(label, value) for label, value in settings))

    prediction_list = clf.predict(test_x)
    prediction_list_prob = clf.predict_proba(test_x)
    return prediction_list, prediction_list_prob
def movement_interval(train_on=None, predict_on=None):
    """Classify sliding windows and flag the ones that contain a movement.

    Args:
        train_on: File names used for training; defaults to ``training1`` ..
            ``training4``.
        predict_on: File names to predict; defaults to the three
            ``validation*_lab`` files.

    Returns:
        DataFrame with ``sample_id``, ``frame`` (shifted by half the window
        length), one probability column per class and ``movement`` (1 when
        the most probable class is not class 0).
    """
    if train_on is None:
        train_on = ['training1', 'training2', 'training3', 'training4']
    if predict_on is None:
        predict_on = ['validation1_lab', 'validation2_lab', 'validation3_lab']
    window_shift = 5
    window_length = 40

    print('aggregated_skeletion_win')
    X_win = aggregated_skeletion_win(predict_on,
                                     agg_functions=['median', 'var', 'min', 'max'],
                                     window_shift=window_shift,
                                     window_length=window_length)
    X_win = X_win.fillna(0)

    print('train rf model')
    X, y = aggregated_skeletion(file_names=train_on,
                                agg_functions=['median', 'var', 'min', 'max'])
    X = X.fillna(0)
    y = np.array([gesture_to_id[gest] for gest in y])
    clf = ExtraTreesClassifier(n_estimators=1500, random_state=0, n_jobs=-1)
    clf.fit(X, y)
    # Release the training data before predicting.
    del X
    del y

    print('rf predict')
    y_pred = clf.predict_proba(X_win)
    df_out = pd.concat(
        [DataFrame.from_records(X_win.index.values.tolist(),
                                columns=['sample_id', 'frame']),
         DataFrame(y_pred)],
        axis=1)
    df_out['movement'] = np.array(np.argmax(y_pred, axis=1) != 0, dtype=int)
    # adjust for sliding window size: shift the frame by half a window
    df_out['frame'] = df_out['frame'] + window_length // 2
    return df_out
def test_multioutput():
    """Check estimators on multi-output problems."""
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
         [-2, 1], [-1, 1], [-1, 2], [2, -1], [1, -1], [1, -2]]
    y = [[-1, 0], [-1, 0], [-1, 0], [1, 1], [1, 1], [1, 1],
         [-1, 2], [-1, 2], [-1, 2], [1, 3], [1, 3], [1, 3]]
    T = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
    y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]]

    # The context manager restores the floating-point error settings even
    # when one of the assertions below fails.
    with np.errstate(divide="ignore"):
        # toy classification problem
        clf = ExtraTreesClassifier(random_state=0)
        y_hat = clf.fit(X, y).predict(T)
        assert_array_equal(y_hat, y_true)
        assert_equal(y_hat.shape, (4, 2))

        # One probability array per output, one column per class of it.
        proba = clf.predict_proba(T)
        assert_equal(len(proba), 2)
        assert_equal(proba[0].shape, (4, 2))
        assert_equal(proba[1].shape, (4, 4))

        log_proba = clf.predict_log_proba(T)
        assert_equal(len(log_proba), 2)
        assert_equal(log_proba[0].shape, (4, 2))
        assert_equal(log_proba[1].shape, (4, 4))

        # toy regression problem
        clf = ExtraTreesRegressor(random_state=5)
        y_hat = clf.fit(X, y).predict(T)
        assert_almost_equal(y_hat, y_true)
        assert_equal(y_hat.shape, (4, 2))
def main():
    """Train ExtraTrees on the training file and write a submission.

    Uses the module-level ``train_file``, ``test_file``, ``full_train_file``,
    ``submission_file``, ``feature_names``, ``trees_count`` and
    ``update_posteriors`` settings.
    """
    start = time.time()

    print("Reading the data from " + train_file)
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    # NOTE(review): ``compute_importances`` and ``min_samples_split=1`` are
    # presumably only accepted by old scikit-learn releases - confirm against
    # the pinned version.
    clf = ExtraTreesClassifier(n_estimators=trees_count,
                               max_features=len(feature_names),
                               max_depth=None, min_samples_split=1,
                               compute_importances=True, bootstrap=False,
                               random_state=0, n_jobs=-1, verbose=2)
    clf.fit(fea, data["OpenStatus"])

    print("Listing feature importances:")
    cu.list_feature_importance(clf, feature_names)

    print("Reading test file and making predictions: " + test_file)
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = clf.predict_proba(test_features)

    if (update_posteriors):
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
def test_multioutput():
    """Check estimators on multi-output problems."""
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
         [-2, 1], [-1, 1], [-1, 2], [2, -1], [1, -1], [1, -2]]
    y = [[-1, 0], [-1, 0], [-1, 0], [1, 1], [1, 1], [1, 1],
         [-1, 2], [-1, 2], [-1, 2], [1, 3], [1, 3], [1, 3]]
    T = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
    y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]]
    expected_shape = (4, 2)

    # toy classification problem
    classifier = ExtraTreesClassifier(random_state=0)
    predicted = classifier.fit(X, y).predict(T)
    assert_array_equal(predicted, y_true)
    assert_equal(predicted.shape, expected_shape)

    # Both probability flavours return one array per output: the first
    # output has 2 classes, the second has 4.
    for method in (classifier.predict_proba, classifier.predict_log_proba):
        per_output = method(T)
        assert_equal(len(per_output), 2)
        assert_equal(per_output[0].shape, (4, 2))
        assert_equal(per_output[1].shape, (4, 4))

    # toy regression problem
    regressor = ExtraTreesRegressor(random_state=5)
    predicted = regressor.fit(X, y).predict(T)
    assert_almost_equal(predicted, y_true)
    assert_equal(predicted.shape, expected_shape)
def bagged_set(X_ts, y_cs, seed, estimators, xt, yt=None):
    """Average the column-1 probabilities of several ExtraTrees models.

    Args:
        X_ts: Training features.
        y_cs: Training labels.
        seed: Base random seed; bag ``n`` is trained with ``seed + n`` so the
            bags differ but the run is reproducible. ``None`` leaves the
            models unseeded.
        estimators: Number of bagged models.
        xt: Features to predict.
        yt: Unused.

    Returns:
        1-D array with the mean probability over all bags, one per row of
        ``xt``.
    """
    # create array object to hold predictions
    baggedpred = np.zeros(xt.shape[0])
    # loop for as many times as we want bags
    for n in range(0, estimators):
        bag_seed = None if seed is None else seed + n
        model = ExtraTreesClassifier(n_estimators=1000, criterion="entropy",
                                     max_depth=12, min_samples_leaf=4,
                                     max_features=0.5, n_jobs=20,
                                     random_state=bag_seed)
        model.fit(X_ts, y_cs)
        preds = model.predict_proba(xt)[:, 1]
        # update bag's array
        baggedpred += preds
        print("completed: " + str(n))
    # divide with number of bags to create an average estimate
    baggedpred /= estimators
    return baggedpred
def eval_seq_model(out_file='eval_model.csv', window_shift=1, retrain=False):
    """Predict gesture probabilities per window and dump post-processed output.

    Args:
        out_file: Path handed to ``dump_predictions``.
        window_shift: Window step forwarded to ``aggregated_skeletion_win``.
        retrain: Fit a new model and cache it with joblib; otherwise the
            cached model is loaded.

    Returns:
        ``(df_pred, to_dump)``: the per-window probabilities indexed by the
        first element of every window key, and the result of applying
        ``postprocess`` to each first-level group of them.
    """
    filename = 'cache/joblib/rf_eval_model.joblib.pkl'
    file_names = ['training1', 'training3', 'training4',
                  'validation1_lab', 'validation3_lab']

    if not retrain:
        clf = joblib.load(filename)
    else:
        X, y = aggregated_skeletion(file_names=file_names,
                                    agg_functions=['median', 'var', 'min', 'max'])
        X = X.fillna(0)
        # Map gesture names to their numeric ids.
        y = np.array([gesture_to_id[gest] for gest in y])
        clf = ExtraTreesClassifier(n_estimators=500, random_state=0, n_jobs=-1)
        clf.fit(X, y)
        _ = joblib.dump(clf, filename, compress=9)

    X_win = aggregated_skeletion_win(['validation2_lab', 'training2'],
                                     agg_functions=['median', 'var', 'min', 'max'],
                                     window_shift=window_shift)
    window_proba = clf.predict_proba(X_win)
    # Only the first element of each window key is kept as the index.
    first_keys = [s for (s, _) in X_win.index]
    df_pred = DataFrame(window_proba, index=first_keys)
    to_dump = df_pred.groupby(level=0).apply(postprocess)
    dump_predictions(to_dump, out_path=out_file)
    return df_pred, to_dump
def extra_tree():
    """Cross-validate, fit and apply an ExtraTrees model on the bidder data.

    Prints the mean 5-fold ROC AUC, writes the column-1 test probabilities to
    ``data/submission_extra_tree.csv`` and prints the value returned by
    ``roc_auc`` for the training frame and the fitted model.
    """
    train_df, test_df = load_features()
    train_df = train_df.fillna(value=0)
    test_df = test_df.fillna(value=0)

    features = train_df.drop(["bidder_id", "outcome"], axis=1)
    labels = train_df["outcome"]
    test_matrix = test_df.drop(["bidder_id"], axis=1)

    print("Training extra_tree model")
    model = ExtraTreesClassifier(n_estimators=3000, max_features=10)
    print("Model trained")

    print("Cross validation score (extra_tree) : ")
    fold_scores = cross_val_score(model, features, labels, cv=5,
                                  scoring='roc_auc')
    cv_score = np.mean(fold_scores)
    print(cv_score)

    print("Generating submission file")
    # The final model is fit on all training rows.
    model.fit(features, labels)
    test_df['prediction'] = model.predict_proba(test_matrix)[:, 1]
    submission = test_df[['bidder_id', 'prediction']]
    submission.to_csv('data/submission_extra_tree.csv', index=False)
    print("Output file successfully created")

    print("Generating auc curve and auc score")
    auc = roc_auc(train_df, model)
    print("AUC score : " + str(auc))
def objective_etree(space):
    """Hyperopt objective: mean 10-fold log loss of an ExtraTrees model.

    Uses the module-level ``X_train_cl`` and ``y_train_cl`` frames.

    Args:
        space: Mapping with ``n_estimators``, ``max_depth``, ``max_features``,
            ``criterion`` and ``min_impurity_split``.

    Returns:
        Hyperopt result dict whose ``loss`` is the mean log loss over folds.
    """
    numfolds = 10
    total = 0
    kf2 = StratifiedKFold(n_splits=numfolds, shuffle=True, random_state=13)
    for train_index, test_index in kf2.split(X_train_cl, y_train_cl.IS_IT_GAMER):
        xtrain, xtest = X_train_cl.iloc[train_index], X_train_cl.iloc[test_index]
        ytrain, ytest = y_train_cl.iloc[train_index], y_train_cl.iloc[test_index]
        # A fresh estimator per fold: a shared ``warm_start=True`` forest keeps
        # the trees grown on the first fold, so later folds would be scored
        # by a model that has already seen their validation rows.
        etree = ExtraTreesClassifier(n_estimators=space['n_estimators'],
                                     max_depth=space['max_depth'],
                                     max_features=space['max_features'],
                                     criterion=space['criterion'],
                                     min_impurity_split=space['min_impurity_split'],
                                     random_state=13,
                                     n_jobs=-1)
        etree.fit(xtrain, ytrain.values.ravel())
        pred = etree.predict_proba(xtest)[:, 1]
        total += log_loss(ytest, pred)
    total = total / numfolds
    print (total)
    return {'loss': total, 'status': STATUS_OK}
def real_submodel(tag, by):
    """Fit an extra-trees model for one submodel and return P(label == 1).

    Data comes from ``load_n_clean_data(tag, by, load=False, cv=False)``.
    The probability of class ``1`` is stored in a new ``'Predicted'`` column
    of the test frame, and that column is returned.
    """
    print("Classify submodel_by_{}_{} ...".format(by, tag))
    X, y, X_test = load_n_clean_data(tag, by, load=False, cv=False)

    print("Build model ...")
    clf = ExtraTreesClassifier(
        n_jobs=-1,
        n_estimators=200,
        min_samples_leaf=9,
        max_depth=30,
        verbose=4,
    )
    clf.fit(X, y)
    pred = clf.predict_proba(X_test)
    print(pred)

    print("Dump precious stuff in case of crash ...")
    # Dumping of predictions / classifier is currently disabled; the import
    # is kept so the block behaves exactly as before.
    import pickle

    # Locate the predict_proba column that belongs to class label 1.
    positive_col = clf.classes_.tolist().index(1)
    X_test['Predicted'] = [row[positive_col] for row in pred]
    return X_test['Predicted']
def eval_gesture_model(retrain=False, window_shift=1, window_length=40,
                       train_on=['training1', 'training3', 'training4',
                                 'validation1_lab', 'validation3_lab'],
                       predict_on=['validation2_lab', 'training2']):
    """Return class probabilities and integer labels for the evaluation files.

    When ``retrain`` is true a 500-tree extra-trees model is fitted on the
    aggregated features of ``train_on`` and dumped with joblib; otherwise the
    model cached for this ``window_length`` is loaded.

    Returns:
        ``(y_pred, y_test)``: ``predict_proba`` output for ``predict_on`` and
        the labels mapped through ``gesture_to_id``.

    ``window_shift`` is accepted but not used by this function.
    """
    filename = 'cache/joblib/rf_eval_model' + str(window_length) + '.joblib.pkl'

    if not retrain:
        clf = joblib.load(filename)
    else:
        X, y = aggregated_skeletion(
            file_names=train_on,
            agg_functions=['median', 'var', 'min', 'max'],
            window_length=window_length)
        X = X.fillna(0)
        y = np.array([gesture_to_id[label] for label in y])
        clf = ExtraTreesClassifier(n_estimators=500, random_state=0, n_jobs=-1)
        clf.fit(X, y)
        _ = joblib.dump(clf, filename, compress=9)

    X_test, y_test = aggregated_skeletion(
        predict_on,
        agg_functions=['median', 'var', 'min', 'max'],
        window_length=window_length)
    X_test = X_test.fillna(0)
    y_test = np.array([gesture_to_id[label] for label in y_test])
    return clf.predict_proba(X_test), y_test
class ETClassifier(BaseClassifier):
    """Extra-trees wrapper configured from an options mapping."""

    def __init__(self, opt):
        """Build the estimator; every hyper-parameter falls back to a default
        when ``opt`` does not provide it."""
        super().__init__(opt)
        self.clf_name = 'ETClassifier'
        defaults = (
            ('n_estimators', 200),
            ('max_depth', 7),
            ('min_samples_leaf', 10),
            ('max_leaf_nodes', 63),
            ('min_samples_split', 2),
            ('bootstrap', True),
            ('class_weight', {0: 1, 1: 10}),
            ('random_state', 18520),
            ('n_jobs', 2),
        )
        settings = {key: opt.get(key, fallback) for key, fallback in defaults}
        self.clf = ExtraTreesClassifier(**settings)

    def fit(self, train_set, valid_set=None):
        """Fit on ``train_set[0]`` / ``train_set[1]``; ``valid_set`` is unused."""
        features, labels = train_set[0], train_set[1]
        self.clf.fit(features, labels)

    def predict_proba(self, x):
        """Return column 1 of the estimator's ``predict_proba`` output."""
        proba = self.clf.predict_proba(x)
        return proba[:, 1]

    def get_feat_imp(self):
        """Return the fitted estimator's ``feature_importances_``."""
        return self.clf.feature_importances_
def ef_predictedValue():
    """Fit extra trees on ``train_df`` and return column 1 of the
    probabilities predicted for ``test_df``.

    Uses the module-level ``train_df``, ``test_df``, ``features``,
    ``NoOfEstimators`` and ``NoJobs``; prints the feature importances.
    """
    print('----------ExtraForest----------')
    forest = ExtraTreesClassifier(n_estimators=NoOfEstimators, n_jobs=NoJobs)
    forest.fit(train_df[features], train_df['SeriousDlqin2yrs'])
    probabilities = forest.predict_proba(test_df[features])
    print('Feature Importance = %s' % forest.feature_importances_)
    return probabilities[:, 1]
def extratrees():
    """Train extra trees on the cached features and print the hold-out log loss.

    Four pickles are read from ``../cache``; the labels are integer-encoded,
    30% of the training rows are held out, and the log loss on that hold-out
    is printed.  The test matrix and device ids are loaded but not used here.
    """
    def _unpickle(path):
        # Read one cached object from disk.
        with open(path, 'rb') as f:
            return pickle.load(f)

    print('Load the featured Train/Test data..')
    train = _unpickle('../cache/sparse_train_xgb.p')
    test = _unpickle('../cache/sparse_test_xgb.p')
    Y = _unpickle('../cache/y.p')
    device_id = _unpickle('../cache/device.p')

    # Encode the group labels as integers.
    lable_group = LabelEncoder()
    Y = lable_group.fit_transform(Y)

    X_train, X_val, y_train, y_val = train_test_split(train, Y, test_size=.30)

    model = ExtraTreesClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
        criterion='entropy',
        n_jobs=32,
        verbose=20,
    )
    model.fit(X_train, y_train)

    x_val_prob = model.predict_proba(X_val)
    score = log_loss(y_val.tolist(), x_val_prob)
    print("ExtraTrees - Score : " + str(score))
def kfold_cv(X_train, y_train,idx,k): kf = StratifiedKFold(y_train,n_folds=k) xx=[] count=0 for train_index, test_index in kf: count+=1 X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:] gc.collect() y_train_cv, y_test_cv = y_train[train_index],y_train[test_index] y_pred=np.zeros(X_test_cv.shape[0]) m=0 for j in range(m): clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=200,seed=j*77,gamma=0.1) y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv)) #y_pred/=m; clf=ExtraTreesClassifier(n_estimators=700,max_features= 50,criterion= 'entropy',min_samples_split= 3, max_depth= 60, min_samples_leaf= 4,verbose=1,n_jobs=-1) #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100) clf.fit(X_train_cv,(y_train_cv)) y_pred=clf.predict_proba(X_test_cv).T[1] print y_pred.shape xx.append(llfun(y_test_cv,(y_pred))) ypred=y_pred yreal=y_test_cv idx=idx[test_index] print xx[-1]#,y_pred.shape break print xx,'average:',np.mean(xx),'std',np.std(xx) return ypred,yreal,idx#np.mean(xx)
def calc_prob(df_features_driver, df_features_other):
    """Score one driver's trips against trips taken from other drivers.

    The two feature frames are stacked (driver rows first), an extra-trees
    classifier is fitted on the ``Driver`` column, and the probability in
    column 1 of ``predict_proba`` is reported for the first 200 stacked rows.

    Args:
        df_features_driver: feature frame of the driver being scored.
        df_features_other: feature frame of the reference trips.

    Returns:
        DataFrame with the columns ``driver_trip`` and ``prob``.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # the stacked result is the same.
    df_train = pd.concat([df_features_driver, df_features_other])
    df_train.reset_index(inplace=True)
    df_train.Driver = df_train.Driver.astype(int)

    # 'sqrt' is what the former 'auto' setting meant for classifiers; 'auto'
    # itself is no longer accepted by recent scikit-learn releases.
    model = ExtraTreesClassifier(n_estimators=100, max_features='sqrt',
                                 random_state=0, n_jobs=2,
                                 criterion='entropy', bootstrap=True)

    # Everything from the fifth column onward is used as a feature.
    feature_columns = df_train.iloc[:, 4:]
    model.fit(feature_columns, df_train.Driver)

    df_submission = pd.DataFrame()
    df_submission['driver_trip'] = create_first_column(df_features_driver)

    # NOTE(review): 200 is presumably the number of rows in
    # df_features_driver -- confirm before passing frames of another size.
    probs_array = model.predict_proba(feature_columns[:200])
    df_submission['prob'] = np.array(probs_array[:, 1])
    return df_submission
def plot_confusion_matrix(model, relevant_features_new, y_new, threshold_classification):
    """Fit extra trees on the first of three stratified folds and plot
    confusion matrices for the held-out part.

    Two matrices (each followed by a printed accuracy) are produced: one
    restricted to the test rows selected by
    ``get_indexes_with_valid_predictions`` for ``threshold_classification``,
    and one for the complete held-out fold.

    Args:
        model: passed through to ``Base_Classification``.
        relevant_features_new: feature frame (indexed with ``.iloc``).
        y_new: label frame (indexed with ``.iloc[rows, :]``).
        threshold_classification: threshold handed to the validity filter.
    """
    extra_trees = ExtraTreesClassifier(n_estimators=1000, random_state=0)
    base_classification = Base_Classification(model, extra_trees)

    # random_state is deliberately omitted: with shuffle=False it has no
    # effect, and current scikit-learn raises ValueError when it is passed.
    sss = StratifiedKFold(n_splits=3, shuffle=False)
    # Only the first split is used.
    train_index, test_index = next(sss.split(relevant_features_new, y_new))
    x_train = relevant_features_new.iloc[train_index, :]
    x_test = relevant_features_new.iloc[test_index, :]
    y_train = y_new.iloc[train_index, :]
    y_test = y_new.iloc[test_index, :]

    extra_trees.fit(x_train, y_train)
    pred = extra_trees.predict_proba(x_test)
    pred = pd.DataFrame(pred, columns=extra_trees.classes_)

    valid_indexes = base_classification.get_accuracy.get_indexes_with_valid_predictions(
        pred, threshold_classification)
    x_test_valid = x_test.iloc[valid_indexes, :]
    y_test_valid = y_test.iloc[valid_indexes, :]

    base_classification.get_accuracy.plot_confusion_matrix(
        x_test_valid, y_test_valid, extra_trees)
    print("Accuracy => {}".format(extra_trees.score(x_test_valid, y_test_valid)))

    base_classification.get_accuracy.plot_confusion_matrix(
        x_test, y_test, extra_trees)
    print("Accuracy => {}".format(extra_trees.score(x_test, y_test)))
def eval_seq_model(out_file='eval_model.csv', window_shift=1, retrain=False):
    """Predict gesture probabilities over windowed sequences and dump them.

    The classifier is loaded from the joblib cache unless ``retrain`` is
    true, in which case it is refitted on the training files and cached.

    Returns:
        ``(df_pred, to_dump)``: the per-window probability frame and the
        result of applying ``postprocess`` per first-level index group.
    """
    filename = 'cache/joblib/rf_eval_model.joblib.pkl'

    if not retrain:
        clf = joblib.load(filename)
    else:
        file_names = [
            'training1', 'training3', 'training4', 'validation1_lab',
            'validation3_lab'
        ]
        X, y = aggregated_skeletion(
            file_names=file_names,
            agg_functions=['median', 'var', 'min', 'max'])
        X = X.fillna(0)
        y = np.array([gesture_to_id[label] for label in y])
        clf = ExtraTreesClassifier(n_estimators=500, random_state=0, n_jobs=-1)
        clf.fit(X, y)
        _ = joblib.dump(clf, filename, compress=9)

    X_win = aggregated_skeletion_win(
        ['validation2_lab', 'training2'],
        agg_functions=['median', 'var', 'min', 'max'],
        window_shift=window_shift)
    y_pred = clf.predict_proba(X_win)

    # Index the predictions by the first element of each X_win index entry.
    first_level = [s for (s, _) in X_win.index]
    df_pred = DataFrame(y_pred, index=first_level)
    to_dump = df_pred.groupby(level=0).apply(postprocess)
    dump_predictions(to_dump, out_path=out_file)
    return df_pred, to_dump
def ef_predictedValue():
    """Train an extra-trees model on the module-level ``train_df`` and return
    the second ``predict_proba`` column for ``test_df``.

    The feature importances of the fitted model are printed.
    """
    print('----------ExtraForest----------')
    training_inputs = train_df[features]
    training_target = train_df['SeriousDlqin2yrs']
    ef_clf = ExtraTreesClassifier(n_estimators=NoOfEstimators, n_jobs=NoJobs)
    ef_clf.fit(training_inputs, training_target)
    class_probabilities = ef_clf.predict_proba(test_df[features])
    print('Feature Importance = %s' % ef_clf.feature_importances_)
    return class_probabilities[:, 1]
def et_model(X_train, y_train, X_test, y_test=None):
    """Fit extra trees and return min-max scaled probabilities for ``X_test``.

    Column 1 of ``predict_proba`` is rescaled linearly so that its smallest
    value maps to 0 and its largest to 1.

    Args:
        X_train, y_train: training data and labels.
        X_test: rows to score.
        y_test: unused; kept for signature compatibility.

    Returns:
        1-D array of scaled probabilities.  If every predicted probability is
        identical the scaling is undefined and an array of zeros is returned
        instead of NaNs.
    """
    model = ExtraTreesClassifier(max_features='log2', n_estimators=1000,
                                 n_jobs=1).fit(X_train, y_train)
    predict = model.predict_proba(X_test)[:, 1]

    lowest = predict.min()
    span = predict.max() - lowest
    if span == 0:
        # Constant predictions: avoid the 0/0 division.
        return np.zeros_like(predict)
    # Plain array arithmetic replaces the per-element np.vectorize lambda.
    return (predict - lowest) / span
def extra(X, y, test):
    """Fit a 250-tree extra-trees model on ``(X, y)`` and return column 1 of
    its predicted probabilities for ``test``."""
    forest = ExtraTreesClassifier(
        n_estimators=250,
        max_depth=9,
        min_samples_split=6,
    )
    forest.fit(X, y)
    probabilities = forest.predict_proba(test)
    return probabilities[:, 1]
def et_prob(train, label, test):
    """Fit a 300-tree extra-trees model and return its class probabilities.

    Args:
        train: training features.
        label: training labels.
        test: rows to score.

    Returns:
        The full ``predict_proba`` matrix for ``test``.
    """
    # "sqrt" is what the former "auto" setting meant for classifiers; "auto"
    # itself is no longer accepted by recent scikit-learn releases.
    extratree = ExtraTreesClassifier(n_estimators=300, max_depth=None,
                                     max_features="sqrt", n_jobs=-1,
                                     random_state=2017, verbose=0)
    extratree.fit(train, label)
    predict = extratree.predict_proba(test)
    # Drop the fitted forest before returning.
    del extratree
    return predict
def extra_tree(self):
    """Train, persist and score an extra-trees classifier.

    Requires ``'et'`` to be present in ``params['model']['model_list']``.
    Hyper-parameters are read from ``params['model']['rf_params']`` and
    ``params['model']['dt_params']``.  The fitted model is dumped under
    ``<base_dir>models/`` and the test/train probability tables are written
    under ``<base_dir>predictions/``.

    Returns:
        dict with ``loss`` (negative test accuracy) and ``status``.

    Raises:
        Exception: if ``'et'`` is not in the model list.
    """
    if 'et' not in self.params['model']['model_list']:
        raise Exception('Extra Tree Classifier not listed in the model list parameter')

    rf_cfg = self.params['model']['rf_params']
    dt_cfg = self.params['model']['dt_params']
    clf = ExtraTreesClassifier(
        n_estimators=rf_cfg['n_estimators'],
        max_depth=dt_cfg['max_depth'],
        min_samples_split=dt_cfg['min_samples_split'],
        min_samples_leaf=dt_cfg['min_samples_leaf'],
        min_weight_fraction_leaf=dt_cfg['min_weight_fraction_leaf'],
        max_features=dt_cfg['max_features'],
        max_leaf_nodes=dt_cfg['max_leaf_nodes'],
        min_impurity_decrease=rf_cfg['min_impurity_decrease'],
        class_weight=dt_cfg['class_weight'],
    )

    if self.params['cross_validation']['time_based_test_split']:
        splitter = self.split_data_into_train_test_time_based
    else:
        splitter = self.split_data_into_train_test
    X_train, X_test, y_train, y_test = splitter()

    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Keep only column 1 of each probability row.
    test_prob = [row[1] for row in clf.predict_proba(X_test)]
    predictions = pd.DataFrame({'id': self.id_vals_test,
                                'date': self.date_vals_test,
                                'prob': test_prob,
                                'status': y_test})

    train_prob = [row[1] for row in clf.predict_proba(X_train)]
    predictions_train = pd.DataFrame({'id': self.id_vals_train,
                                      'date': self.date_vals_train,
                                      'prob': train_prob,
                                      'status': y_train})

    # Create the output directories on first use.
    base_dir = self.params['info']['base_dir']
    models_dir = base_dir + 'models/'
    predictions_dir = base_dir + 'predictions/'
    for directory in (models_dir, predictions_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    model_path = models_dir + 'trained_model_et.pkl'
    test_path = predictions_dir + 'predictions_et_test.csv'
    train_path = predictions_dir + 'predictions_et_train.csv'

    joblib.dump(clf, model_path)
    predictions.to_csv(test_path, index=False)
    predictions_train.to_csv(train_path, index=False)

    print('Saved trained model (Extra Trees): {}'.format(model_path))
    print('Written test predictions (Extra Trees): {}'.format(test_path))
    print('Written train predictions (Extra Trees): {}'.format(train_path))

    accuracy = accuracy_score(y_test, pred)
    return {'loss': -accuracy, 'status': STATUS_OK}
def et(train_data, train_label, val_data, val_label, test_data, name="extratrees_submission.csv"): print "start training ExtraTrees..." etClf = ExtraTreesClassifier(n_estimators=10) etClf.fit(train_data, train_label) #evaluate on validation set val_pred_label = etClf.predict_proba(val_data) logloss = preprocess.evaluation(val_label, val_pred_label) print "logloss of validation set:", logloss print "Start classify test set..." test_label = etClf.predict_proba(test_data) preprocess.saveResult(test_label, filename=name)
def load_train_data(train, train_y, ttf):
    """Build stacked features for rows ``offset:LINES`` from four base models.

    Four classifiers are fitted on rows ``0:offset-1``.  Their class
    probabilities for rows ``offset:LINES`` are appended to those rows'
    original features; the random-forest and xgboost probabilities are
    combined as the square root of their element-wise product.

    Returns:
        ``(X, y, rbm0, rbm1, rbm2, rbm3)`` with ``X`` as float32, ``y`` as
        int32 labels of rows ``offset:LINES``, and the four fitted models.

    ``offset`` and ``LINES`` are module-level values; ``ttf`` is unused.
    """
    X = train.copy()
    y = np.array(train_y[offset:LINES].copy(), dtype=np.int32)

    # NOTE(review): the fit slice stops at offset-1, so row offset-1 is in
    # neither the fit rows nor the stacked rows -- confirm this is intended.
    fit_X = X[0:offset - 1, :]
    fit_y = train_y[0:offset - 1]

    rbm0 = ExtraTreesClassifier(n_estimators=100, min_samples_split=5,
                                min_samples_leaf=5, n_jobs=8).fit(fit_X, fit_y)
    rbm1 = KNeighborsClassifier(n_neighbors=5).fit(fit_X, fit_y)
    rbm2 = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                  max_features='auto', bootstrap=False,
                                  oob_score=False, n_jobs=8,
                                  verbose=1).fit(fit_X, fit_y)
    rbm3 = xgb.XGBClassifier(n_estimators=300, max_depth=8, subsample=0.8,
                             min_child_weight=4, nthread=8).fit(fit_X, fit_y)

    stack_rows = X[offset:LINES, :]
    et_proba = rbm0.predict_proba(stack_rows)
    knn_proba = rbm1.predict_proba(stack_rows)
    rf_xgb_proba = np.power(
        rbm2.predict_proba(stack_rows) * rbm3.predict_proba(stack_rows),
        (1 / 2.0))
    X = np.hstack([stack_rows, et_proba, knn_proba, rf_xgb_proba])

    return np.array(X, dtype=np.float32), y, rbm0, rbm1, rbm2, rbm3
def extratree_cla(train_data, train_id, test_data, seed = None):
    """Fit a 1000-tree extra-trees model and predict classes and probabilities.

    Args:
        train_data: training features.
        train_id: training labels.
        test_data: rows to predict.
        seed: forwarded as the estimator's ``random_state``.

    Returns:
        ``(pred_class, pred_prob)``: the ``predict`` and ``predict_proba``
        outputs for ``test_data``.
    """
    # The former ``param_grid`` dict was never passed to any search and has
    # been removed as dead code.
    clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=4, random_state=seed)
    clf.fit(train_data, train_id)
    pred_class = clf.predict(test_data)
    pred_prob = clf.predict_proba(test_data)
    return pred_class, pred_prob
def build_XT1(X_train,y_train,X_cal,y_cal,X_test):
    """Average the test probabilities of three calibrated extra-trees models.

    Each model gets a random seed, is wrapped in an isotonic
    ``CalibratedClassifierCV`` (5-fold) and fitted on the training data.

    Args:
        X_train, y_train: training data and labels.
        X_cal, y_cal: unused; kept for signature compatibility.
        X_test: rows to score.

    Returns:
        ``(cal_prob, test_prob)``: ``cal_prob`` is always 0 and ``test_prob``
        is the mean ``predict_proba`` output over the three models.
    """
    n_models = 3
    cal_prob, test_prob = 0, 0
    for i in range(n_models):
        print("--Building and Training model %s" % i)
        seed = randrange(1, 10000)
        # min_samples_split=2 is the smallest value scikit-learn accepts; the
        # previous value of 1 is rejected by current releases and was treated
        # as 2 by the old ones that allowed it.
        model = ExtraTreesClassifier(n_estimators=500, criterion="entropy",
                                     min_samples_split=2, random_state=seed,
                                     n_jobs=-1)
        # The estimator is passed positionally so the call works whether the
        # keyword is spelled ``base_estimator`` or ``estimator``.
        model = CalibratedClassifierCV(model, method='isotonic',
                                       cv=5).fit(X_train, y_train)
        print("Model %s training complete." % i)
        test_prob += model.predict_proba(X_test)
    test_prob = test_prob / float(n_models)
    return (cal_prob, test_prob)
def extraTree(X, y, train, valid):
    """Fit extra trees on the ``train`` rows and evaluate on the ``valid`` rows.

    Prints accuracy, a classification report and ROC-AUC, writes the
    probabilities to ``y_extratree.csv`` and returns them.

    Returns:
        Column 1 of ``predict_proba`` for ``X[valid]``.
    """
    X_valid = X[valid]
    y_valid = y[valid]

    clf = ExtraTreesClassifier(
        n_jobs=-1,
        n_estimators=300,
        verbose=2,
        random_state=1,
        max_depth=10,
        bootstrap=True,
    )
    clf.fit(X[train], y[train])

    yhat = clf.predict(X_valid)
    yhat_prob = clf.predict_proba(X_valid)[:, 1]

    print("extra tree randomForest" + str(accuracy_score(y_valid, yhat)))
    print(classification_report(y_valid, yhat))
    print("extra tree randomForest roc_accuracy" + str(roc_auc_score(y_valid, yhat_prob)))

    np.savetxt("y_extratree.csv", yhat_prob)
    return yhat_prob
def train_predict(X_train, X_test, y_train, y_test, model_name, param):
    """Fit the model selected by ``model_name`` and predict for ``X_test``.

    Args:
        X_train, y_train: training data and labels.
        X_test: rows to predict.
        y_test: unused by the enabled branches.
        model_name: one of ``'clf_skl_lr'``, ``'clf_skl_etr'``,
            ``'clf_skl_rf'``, ``'clf_skl_gbm'`` (``'clf_xgb_tree'`` is
            recognised but disabled).
        param: mapping of hyper-parameters read by the chosen branch.

    Returns:
        ``predict`` output for the logistic-regression branch, column 1 of
        ``predict_proba`` for the tree-based branches.

    Raises:
        NotImplementedError: for ``'clf_xgb_tree'``, whose code is disabled.
        ValueError: for any other unrecognised ``model_name``.
    """
    if model_name == 'clf_xgb_tree':
        # The xgboost branch is disabled.  It previously fell through to the
        # return statement and failed with UnboundLocalError.
        print('no xgboost')
        raise NotImplementedError("model 'clf_xgb_tree' is disabled (no xgboost)")

    if model_name == "clf_skl_lr":
        lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5,
                                C=param['C'], fit_intercept=True,
                                intercept_scaling=1.0,
                                random_state=param['random_state'])
        lr.fit(X_train, y_train)
        # NOTE(review): this branch returns hard labels, unlike the others.
        return lr.predict(X_test)

    if model_name == 'clf_skl_etr':
        etr = ExtraTreesClassifier(n_estimators=int(param['n_estimators']),
                                   max_features=param['max_features'],
                                   n_jobs=param['n_jobs'],
                                   random_state=param['random_state'])
        etr.fit(X_train, y_train)
        return etr.predict_proba(X_test)[:, 1]

    if model_name == 'clf_skl_rf':
        # n_estimators is coerced to int here as well, matching the
        # extra-trees branch.
        rf = RandomForestClassifier(n_estimators=int(param['n_estimators']),
                                    max_features=param['max_features'],
                                    n_jobs=param['n_jobs'],
                                    random_state=param['random_state'])
        rf.fit(X_train, y_train)
        return rf.predict_proba(X_test)[:, 1]

    if model_name == 'clf_skl_gbm':
        gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                         max_features=param['max_features'],
                                         learning_rate=param['learning_rate'],
                                         max_depth=param['max_depth'],
                                         subsample=param['subsample'],
                                         random_state=param['random_state'])
        gbm.fit(X_train, y_train)
        return gbm.predict_proba(X_test)[:, 1]

    raise ValueError("unknown model_name: %r" % (model_name,))
def load_train_data(r_seed):
    """Fit three base models and append their averaged probabilities as features.

    The module-level ``train`` frame is split with ``test_size=0.8``; the
    models are fitted on the smaller part and their mean ``predict_proba``
    output for the larger part is added as columns ``new_feat_1`` to
    ``new_feat_8``.

    Returns:
        ``(all_data, Y_valid, rbm1, rbm2, rbm3)``.

    The estimators are seeded with the module-level ``jj``; ``r_seed`` only
    seeds the split.
    """
    X_train, X_valid, Y_train, Y_valid = train_test_split(
        train[train_features], train['Response'],
        test_size=0.8, random_state=r_seed)

    rbm1 = ExtraTreesClassifier(n_estimators=500, max_features=0.4, n_jobs=32,
                                random_state=jj, verbose=1).fit(X_train, Y_train)
    rbm2 = RandomForestClassifier(n_estimators=300, max_features=0.28,
                                  n_jobs=32, verbose=1,
                                  random_state=jj).fit(X_train, Y_train)
    rbm3 = GradientBoostingClassifier(n_estimators=48, max_depth=11,
                                      subsample=0.8, min_samples_leaf=5,
                                      verbose=1,
                                      random_state=jj).fit(X_train, Y_train)

    # Unweighted mean of the three probability matrices.
    proba_sum = (rbm1.predict_proba(X_valid)
                 + rbm2.predict_proba(X_valid)
                 + rbm3.predict_proba(X_valid))
    res_mean = proba_sum / 3.0

    feats = ['new_feat_%d' % (i) for i in range(1, 9)]
    new_data = pd.DataFrame(res_mean, columns=feats)
    new_data.index = X_valid.index

    all_data = pd.concat([X_valid, new_data], axis=1)
    print(all_data.shape)
    return all_data, Y_valid, rbm1, rbm2, rbm3
class Model_ETC:
    """Extra-trees model that is fitted on construction.

    CSR and COO sparse matrices are converted to dense arrays before being
    handed to the estimator; any other input is passed through unchanged.
    """

    def __init__(self, trainX, trainY, seed):
        """Fit a 500-tree model on ``(trainX, trainY)`` seeded with ``seed``."""
        self.model = ExtraTreesClassifier(
            n_estimators=500,
            random_state=seed
        )
        self.model.fit(self._densify(trainX), trainY)

    @staticmethod
    def _densify(X):
        """Return ``X`` as a dense array if it is a CSR/COO matrix, else ``X``."""
        # The public scipy.sparse names are used; the ``scipy.sparse.csr`` and
        # ``scipy.sparse.coo`` submodule paths are deprecated.
        if isinstance(X, (scipy.sparse.csr_matrix, scipy.sparse.coo_matrix)):
            return X.toarray()
        return X

    def predict(self, testX):
        """Return column 1 of the predicted probabilities for ``testX``."""
        predictions = self.model.predict_proba(self._densify(testX))[:, 1]
        return predictions
def etc_level2(train_x, train_y, test_x, seed):
    """Fit a 1000-tree entropy extra-trees model and return its class
    probabilities for ``test_x`` cast to float32."""
    settings = dict(
        n_estimators=1000,
        max_features=50,
        criterion='entropy',
        min_samples_split=4,
        max_depth=35,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=seed,
        verbose=2,
    )
    clf2 = ExtraTreesClassifier(**settings)
    clf2.fit(train_x, train_y)
    probabilities = clf2.predict_proba(test_x)
    return probabilities.astype(np.float32)
def cross_val(clf_name, X, y, n_folds=5, proba=False, score=accuracy_score, *params, **kwargs):
    """Cross-validate one named classifier and return out-of-fold predictions.

    Args:
        clf_name: short name selecting the classifier (see the chain below).
        X: feature matrix, indexed as ``X[rows, :]``.
        y: label array, indexed as ``y[rows]``.
        n_folds: number of stratified folds.
        proba: store ``predict_proba`` output instead of ``predict`` output.
        score: callable ``score(y_true, y_pred)`` printed per fold and overall.
        *params, **kwargs: forwarded to the selected estimator (the ``"sgd"``
            entry does not receive them).

    Returns:
        Out-of-fold predictions: shape ``y.shape`` when ``proba`` is false,
        ``(len(y), n_classes)`` when it is true.

    Raises:
        ValueError: if ``clf_name`` is not one of the known names.
    """
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=41)

    if clf_name == "extra":
        c = ExtraTreesClassifier(12, max_depth=23, max_features=10, n_jobs=-1, *params, **kwargs)
    elif clf_name == "grad":
        c = GradientBoostingClassifier(n_estimators=40, learning_rate=0.1, *params, **kwargs)
    elif clf_name == "cgrad":
        c = CalibratedClassifierCV(
            base_estimator=GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, *params, **kwargs),
            method='isotonic', cv=10)
    elif clf_name == "cmulti":
        c = CalibratedClassifierCV(
            base_estimator=MultinomialNB(alpha=alpha_multi, *params, **kwargs),
            method='isotonic', cv=10)
    elif clf_name == "multi":
        c = MultinomialNB(*params, **kwargs)
    elif clf_name == "bag":
        c = BaggingClassifier(
            base_estimator=MultinomialNB(alpha=0.5, *params, **kwargs),
            n_estimators=100, n_jobs=-1)
    elif clf_name == "bern":
        c = BernoulliNB(alpha=0.00000000001, *params, **kwargs)
    elif clf_name == "gauss":
        c = GaussianNB(*params, **kwargs)
    elif clf_name == "random":
        c = RandomForestClassifier(1200, max_depth=23, max_features=10, n_jobs=-1, *params, **kwargs)
    elif clf_name == "lda":
        c = LinearDiscriminantAnalysis(*params, **kwargs)
    elif clf_name == "logistic":
        c = LogisticRegression(C=1, *params, **kwargs)
    elif clf_name == "svm":
        c = LinearSVC(C=100, *params, **kwargs)
    elif clf_name == "knn":
        c = KNeighborsClassifier(n_neighbors=20, *params, **kwargs)
    elif clf_name == "near":
        c = NearestCentroid(*params, **kwargs)
    elif clf_name == "ridge":
        c = OneVsOneClassifier(RidgeClassifier(alpha=0.1, *params, **kwargs))
    elif clf_name == "sgd":
        c = SGDClassifier(loss="hinge", penalty="l2", n_iter=50, alpha=0.000001, fit_intercept=True, average=True)
    else:
        # Previously an unknown name left ``c`` unbound and failed later with
        # an UnboundLocalError.
        raise ValueError("unknown clf_name: %r" % (clf_name,))

    if proba:
        # predict_proba yields one column per class, so the buffer must be
        # 2-D; the old 1-D buffer could not hold the result.
        y_pred = np.zeros((y.shape[0], len(np.unique(y))))
    else:
        y_pred = np.zeros(y.shape)

    score_list = []
    for i, (train, test) in enumerate(cv):
        c.fit(X[train, :], y[train])
        if proba:
            y_pred[test] = c.predict_proba(X[test, :])
        else:
            y_pred[test] = c.predict(X[test, :])
        score_list.append(score(y[test], y_pred[test]))
        print(score_list[i])
    print("Final score",score(y,y_pred))
    return y_pred
class MyExtraTree(MyClassifier):
    """Extra-trees classifier driven by a keyword-parameter dict."""

    def __init__(self, params=None):
        """Create the estimator from ``params``.

        Args:
            params: keyword arguments for ``ExtraTreesClassifier``.  When
                omitted, a new empty dict is created per instance.  The old
                ``dict()`` default was one shared object, so
                ``update_params`` on one instance leaked into all others.
                A dict supplied by the caller is stored by reference, as
                before.
        """
        self._params = {} if params is None else params
        self._extree = ExtraTreesClassifier(**(self._params))

    def update_params(self, updates):
        """Merge ``updates`` into the parameters and rebuild the estimator."""
        self._params.update(updates)
        self._extree = ExtraTreesClassifier(**(self._params))

    def fit(self, Xtrain, ytrain):
        """Fit the underlying estimator."""
        self._extree.fit(Xtrain, ytrain)

    def predict_proba(self, Xtest, option = None):
        """Return column 1 of the predicted probabilities; ``option`` is unused."""
        return self._extree.predict_proba(Xtest)[:, 1]
def etclassifier(training_samples, eval_samples, do_grid_search=True): X_train, Y_train = training_samples X_eval, Y_eval = eval_samples clf = ExtraTreesClassifier(max_depth=None, n_estimators=1000, min_weight_fraction_leaf=0.0, max_features=None, min_samples_split=16, criterion='gini', min_samples_leaf=2, max_leaf_nodes=None, oob_score=False, bootstrap=True, n_jobs=10, random_state=None, verbose=0, warm_start=False, class_weight=None) to_be_tuned_parameters = { #'n_estimators':[500, 2000, 4000], 'max_features':['log2', 'auto', None], 'min_samples_split':[2, 8, 16], 'min_samples_leaf': [1, 2], } if do_grid_search: clf = GridSearchCV(clf, to_be_tuned_parameters, cv=5, n_jobs=5, scoring='log_loss') #Best parameters set found on development set: #() #{'max_features': None, 'min_samples_split': 10, 'n_estimators': 1000, 'min_samples_leaf': 2} print(clf) clf.fit(X_train, Y_train) if do_grid_search: print("Best parameters set found on development set:") print() print(clf.best_params_) print() print("Grid scores on development set:") print() for params, mean_score, scores in clf.grid_scores_: print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)) else: scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=5, n_jobs=5, scoring='log_loss') print scores, np.mean(scores), np.median(scores) Y_eval = clf.predict(X_eval) Y_prob = clf.predict_proba(X_eval) return Y_eval, Y_prob
def testET(X_train, X_test, y_train, y_test):
    """Fit an extra-trees classifier and score the test rows.

    :param X_train: training data
    :param X_test: test data
    :param y_train: training labels
    :param y_test: test labels, used only to print the ROC-AUC
    :return: column 1 of the predicted probabilities for the test data
    """
    forest = ExtraTreesClassifier(
        n_estimators=1000,
        max_features=5,
        n_jobs=-1,
        verbose=False,
    )
    forest.fit(X_train, y_train)
    probabilities = forest.predict_proba(X_test)
    yhat = probabilities[:, 1]
    auc = metrics.roc_auc_score(y_test, yhat)
    print('ET AUC:', auc)
    return yhat
def main():
    """Train extra trees on the training CSV and write per-class probabilities.

    The model is fitted on every column except ``TripType`` and
    ``VisitNumber``.  The test CSV is indexed by ``VisitNumber`` and the
    ``predict_proba`` matrix is written with one ``TripType_<n>`` column per
    class.
    """
    # Read the training dataset.
    traindataset = pd.read_csv('/usr3/graduate/xysun/walmart/traindata.csv')
    target = traindataset['TripType']
    # axis is passed by keyword: the positional form is gone in pandas 2.0.
    traindata_feature = traindataset.drop(['TripType', 'VisitNumber'], axis=1)

    etc = ExtraTreesClassifier(n_estimators=500, bootstrap=False, n_jobs=-1)
    etc = etc.fit(traindata_feature, target)

    # Score the test dataset.
    testdataset = pd.read_csv('/usr3/graduate/xysun/walmart/testdata.csv')
    testdata = testdataset.set_index('VisitNumber')
    result = etc.predict_proba(testdata)

    etc_csv = pd.DataFrame(result, index=testdata.index)
    etc_csv.index.name = "VisitNumber"
    # One column per trip type, in this fixed order.
    trip_types = [3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 18, 19] + list(range(20, 45)) + [999]
    etc_csv.columns = ["TripType_%d" % trip_type for trip_type in trip_types]

    # NOTE(review): the output directory is spelled 'xysunn' while the inputs
    # are read from 'xysun' -- confirm the path.
    # ``sep`` is the to_csv keyword for the field separator; ``delimiter`` is
    # rejected by current pandas.
    etc_csv.to_csv('/usr3/graduate/xysunn/walmart/etc_csv.csv', header=True, index=True, sep=',')
def cross_val(training_df,frac):
    """Return the ROC-AUC of an averaged random-forest / extra-trees blend.

    ``shuffle_and_sample(training_df, frac)`` provides the train and test
    parts.  Both models are fitted on the module-level ``features`` columns
    against ``"sponsored0"`` and their column-1 probabilities are averaged.
    """
    train_cv, test_cv = shuffle_and_sample(training_df, frac)

    def _positive_proba(estimator):
        # Fit on the training part, score the test part; the estimator is
        # released as soon as this helper returns.
        estimator.fit(train_cv[features], train_cv["sponsored0"])
        return estimator.predict_proba(test_cv[features])[:, 1]

    rf_pred = _positive_proba(
        RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1))
    et_pred = _positive_proba(
        ExtraTreesClassifier(n_estimators=100, random_state=1, n_jobs=-1))

    test_probs = (rf_pred + et_pred) / 2
    true_labels = test_cv["sponsored0"].values
    return roc_auc_score(true_labels, test_probs)