def get_model(rc="r", model="LIN"):
    """Construct an unfitted scikit-learn estimator for a task type and model key.

    Args:
        rc: Task selector: ``"r"`` builds a regressor, ``"c"`` a classifier.
        model: Model key. Regressors accept ``"LIN"``, ``"LASSO"``, ``"RF"``,
            ``"RFX"``, ``"RFXX"`` and ``"SV"``; classifiers accept ``"LIN"``,
            ``"RF"``, ``"RFX"``, ``"RFXX"`` and ``"SV"``.

    Returns:
        The newly constructed estimator.

    Raises:
        RuntimeError: If ``rc`` or ``model`` is not one of the values above.
    """
    if rc == "r":
        ## REGRESSORS
        if model == "LIN":
            rgrsr = linear_model.LinearRegression()
        elif model == "LASSO":
            rgrsr = linear_model.LassoLars(alpha=0.0)
        elif model == "RF":
            rgrsr = ensemble.RandomForestRegressor(
                n_estimators=120, bootstrap=True, max_depth=None,
                oob_score=True)
        elif model == "RFX":
            rgrsr = ensemble.ExtraTreesRegressor(
                n_estimators=120, bootstrap=True, max_depth=None,
                oob_score=True, max_features=None)
        elif model == "RFXX":
            rgrsr = ensemble.ExtraTreesRegressor(
                n_estimators=12, bootstrap=True, max_depth=None,
                max_features=None)
        elif model == "SV":
            rgrsr = svm.SVR(epsilon=0.0, tol=0.1)
        else:
            raise RuntimeError("Invalid model [%s]" % (model, ))
    elif rc == "c":
        ## CLASSIFIERS
        if model == "LIN":
            rgrsr = LinearDiscriminantAnalysis()
        elif model == "RF":
            rgrsr = ensemble.RandomForestClassifier(
                n_estimators=120, bootstrap=True, max_depth=None,
                oob_score=True)
        elif model == "RFX":
            rgrsr = ensemble.ExtraTreesClassifier(
                n_estimators=120, bootstrap=True, max_depth=None,
                oob_score=True, max_features=None)
        elif model == "RFXX":
            rgrsr = ensemble.ExtraTreesClassifier(
                n_estimators=12, bootstrap=True, max_depth=None,
                max_features=None)
        elif model == "SV":
            rgrsr = svm.SVC()
        else:
            raise RuntimeError("Invalid model [%s]" % (model, ))
    else:
        # Without this branch an unknown ``rc`` reached ``return rgrsr`` with
        # the name unbound and surfaced as an UnboundLocalError.
        raise RuntimeError("Invalid rc [%s]" % (rc, ))
    return rgrsr
def _init_model(self, parms=None):
    """Return a ``(label, classifier)`` pair for the configured forest type.

    When ``self.args.extreme`` is truthy the pair is
    ``('ExtremeRF', ExtraTreesClassifier)``; otherwise it is
    ``('RF', RandomForestClassifier)``. If ``parms`` is not None it is
    unpacked as keyword arguments to the classifier constructor.
    """
    from sklearn import ensemble

    if self.args.extreme:
        label, factory = 'ExtremeRF', ensemble.ExtraTreesClassifier
    else:
        label, factory = 'RF', ensemble.RandomForestClassifier

    estimator = factory() if parms is None else factory(**parms)
    return label, estimator
def get_alg(alg, mdl_config):
    """Build an unfitted classifier for the algorithm key ``alg``.

    Args:
        alg: One of ``'skrf'``, ``'skrfp'``, ``'sket'``, ``'sketp'``,
            ``'skgbc'``, ``'xgb'``, ``'knn'`` or ``'sklr'``.
        mdl_config: Mapping of hyper-parameter overrides; it is read with
            ``.get`` so missing keys fall back to the per-algorithm defaults
            written below.

    Returns:
        The constructed classifier.

    Raises:
        ValueError: If ``alg`` is not a supported key.
    """
    if alg == 'skrf':
        clf = ensemble.RandomForestClassifier(
            n_estimators=mdl_config.get('n_estimators', 500),
            max_features=mdl_config.get('max_features', 0.35),
            max_depth=mdl_config.get('max_depth', 15),
            n_jobs=-1,
        )
    elif alg == 'skrfp':
        clf = ensemble.RandomForestClassifier(
            n_estimators=mdl_config.get('n_estimators', 500),
            max_features=mdl_config.get('max_features', 0.35),
            max_depth=mdl_config.get('max_depth', 15),
            criterion='entropy',
            n_jobs=-1,
        )
    elif alg == 'sket':
        clf = ensemble.ExtraTreesClassifier(
            n_estimators=mdl_config.get('n_estimators', 500),
            max_features=mdl_config.get('max_features', 0.5),
            max_depth=mdl_config.get('max_depth', 15),
            n_jobs=-1,
        )
    elif alg == 'sketp':
        clf = ensemble.ExtraTreesClassifier(
            n_estimators=mdl_config.get('n_estimators', 500),
            max_features=mdl_config.get('max_features', 0.5),
            max_depth=mdl_config.get('max_depth', 11),
            criterion='entropy',
            n_jobs=-1,
        )
    elif alg == 'skgbc':
        clf = ensemble.GradientBoostingClassifier(
            n_estimators=mdl_config.get('n_estimators', 30),
            max_depth=mdl_config.get('max_depth', 5))
    elif alg == 'xgb':
        # https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py
        clf = xgb.XGBClassifier(
            n_estimators=mdl_config.get('n_estimators', 30),
            max_depth=mdl_config.get('max_depth', 7),
            learning_rate=mdl_config.get('learning_rate', 0.1),
            objective="multi:softprob",
            silent=True)
    elif alg == 'knn':
        clf = KNeighborsClassifier(
            n_neighbors=25, weights='distance', metric='manhattan', n_jobs=-1)
    elif alg == 'sklr':
        clf = linear_model.LogisticRegression(
            multi_class='multinomial', solver='lbfgs')
    else:
        # Previously an unknown key fell through to ``return clf`` with the
        # name unbound (UnboundLocalError).
        raise ValueError("Unknown algorithm %r" % (alg,))
    return clf
def make_etc_model(train_df, synergy_score, n_estimators, max_features):
    """Create an ExtraTreesClassifier and pass it through ``fit``.

    The classifier is built with the given ``n_estimators`` and
    ``max_features`` and ``n_jobs=-1``. The return value is whatever
    ``fit(model, train_df, synergy_score)`` returns.
    """
    from sklearn import ensemble

    forest_kwargs = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'n_jobs': -1,
    }
    untrained = ensemble.ExtraTreesClassifier(**forest_kwargs)
    return fit(untrained, train_df, synergy_score)
def find_best_features_extratree(self, participants, X, calibrations, y, **kwargs):
    """Rank the columns of ``X`` by ExtraTreesClassifier feature importance.

    Keyword options:
        fields: passed as ``include=`` to ``self.load_all_features``; defaults
            to ``utils.all_body_fields() + utils.all_body_orientation_fields()``.
        load_features: when truthy, ``X`` is first replaced by the result of
            ``self.load_all_features``.

    ``X`` is restricted to ``self.add_features``, missing values (with
    infinities treated as missing) are filled with the column mean, and the
    data is standardised before fitting. The target is the label-encoded
    string form of the ``utils.target_fields()`` columns of ``y``.

    Note: a ``'target'`` column is written into ``y`` in place.

    Returns:
        A pandas Series of importances indexed by column, sorted ascending.
    """
    from sklearn import ensemble, preprocessing

    fallback_fields = (
        utils.all_body_fields() + utils.all_body_orientation_fields()
    )
    fields = kwargs.get('fields', fallback_fields)
    if kwargs.get('load_features', False):
        X = self.load_all_features(participants, X, calibrations, y,
                                   include=fields)

    X = X[self.add_features]
    feature_names = copy.deepcopy(X.columns)
    with pd.option_context('mode.use_inf_as_na', True):
        X = X.fillna(X.mean())

    target_columns = y[utils.target_fields()]
    y['target'] = target_columns.apply(lambda xs: str(tuple(xs)), axis=1)
    encoded_target = preprocessing.LabelEncoder().fit_transform(y['target'])

    standardised = StandardScaler().fit_transform(X)
    forest = ensemble.ExtraTreesClassifier()
    forest.fit(standardised, encoded_target)

    ranking = pd.Series(forest.feature_importances_, index=feature_names)
    return ranking.sort_values()
def random_forest(training_feature, training_target, test_feature, test_target):
    """Grid-search an ExtraTreesClassifier and print its feature importances.

    ``n_estimators`` is searched over 10, 20, ..., 200 with 5-fold
    cross-validation scored by accuracy (``n_jobs=6``). The importances of the
    best estimator are printed one per line.

    ``test_feature`` and ``test_target`` are accepted but not used here.
    """
    values = range(10, 210, 10)
    parameters = [{'n_estimators': values}]
    dt = grid_search.GridSearchCV(ensemble.ExtraTreesClassifier(), parameters,
                                  cv=5, scoring="accuracy", n_jobs=6)
    dt.fit(training_feature, training_target)

    model = dt.best_estimator_
    for item in model.feature_importances_:
        # Single-argument call form prints identically on Python 2 and 3.
        print(item)
def get_algorithms():
    """Return a dict mapping short keys to default-constructed classifiers.

    Every call builds fresh estimator instances. The SVC and NuSVC entries are
    created with ``probability=True``; all other estimators use their default
    constructor arguments.
    """
    algorithms = {}

    # Ensemble methods
    algorithms["ada"] = ensemble.AdaBoostClassifier()
    algorithms["bc"] = ensemble.BaggingClassifier()
    algorithms["etc"] = ensemble.ExtraTreesClassifier()
    algorithms["gbc"] = ensemble.GradientBoostingClassifier()
    algorithms["rfc"] = ensemble.RandomForestClassifier()

    # Gaussian processes
    algorithms["gpc"] = gaussian_process.GaussianProcessClassifier()

    # Linear models
    algorithms["lr"] = linear_model.LogisticRegressionCV()
    algorithms["pac"] = linear_model.PassiveAggressiveClassifier()
    algorithms["rcc"] = linear_model.RidgeClassifierCV()
    algorithms["sgd"] = linear_model.SGDClassifier()
    algorithms["per"] = linear_model.Perceptron()

    # Naive Bayes
    algorithms["bnb"] = naive_bayes.BernoulliNB()
    algorithms["gnb"] = naive_bayes.GaussianNB()

    # Nearest neighbour
    algorithms["knn"] = neighbors.KNeighborsClassifier()

    # SVM
    algorithms["svc"] = svm.SVC(probability=True)
    algorithms["nvc"] = svm.NuSVC(probability=True)
    algorithms["lvc"] = svm.LinearSVC()

    # Trees
    algorithms["dtc"] = tree.DecisionTreeClassifier()
    algorithms["ets"] = tree.ExtraTreeClassifier()

    # Discriminant analysis
    algorithms["lda"] = discriminant_analysis.LinearDiscriminantAnalysis()
    algorithms["qda"] = discriminant_analysis.QuadraticDiscriminantAnalysis()

    return algorithms
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50,
          depth_val=None, split_val=2, leaf_val=1, feat_val='auto',
          jobs_val=4, random_state_val=0):
    """Fit an entropy-criterion ExtraTreesClassifier and predict on ``test_X``.

    Args:
        train_X, train_y: Training features and labels.
        test_X: Features to predict on.
        test_y: Labels for ``test_X``; passed to ``log_loss`` when
            ``validation`` is truthy.
        validation: When truthy, log-losses are computed and printed.
        n_est_val, depth_val, split_val, leaf_val, feat_val, jobs_val,
        random_state_val: Forwarded to the classifier as ``n_estimators``,
            ``max_depth``, ``min_samples_split``, ``min_samples_leaf``,
            ``max_features``, ``n_jobs`` and ``random_state``.

    Returns:
        ``(pred_test_y, loss)`` when ``validation`` is truthy, otherwise just
        ``pred_test_y``. Predictions are column 1 of ``predict_proba``.
    """
    clf = ensemble.ExtraTreesClassifier(
        n_estimators=n_est_val,
        max_depth=depth_val,
        min_samples_split=split_val,
        min_samples_leaf=leaf_val,
        max_features=feat_val,
        criterion='entropy',
        n_jobs=jobs_val,
        random_state=random_state_val)
    clf.fit(train_X, train_y)

    pred_train_y = clf.predict_proba(train_X)[:, 1]
    pred_test_y = clf.predict_proba(test_X)[:, 1]

    if not validation:
        return pred_test_y

    train_loss = log_loss(train_y, pred_train_y)
    loss = log_loss(test_y, pred_test_y)
    # One pre-formatted argument keeps the output identical on Python 2 and 3
    # (the original was a Python 2 print statement).
    print("%s %s %s" % ("Train, Test loss : ", train_loss, loss))
    return pred_test_y, loss
def learn(feat_set, y, method='LogReg'):
    """Train one of several models on ``feat_set``.

    Args:
        feat_set: Feature matrix passed to the estimator's ``fit``.
        y: Targets; ``y.ravel()`` is used for the supervised methods and
            ignored for ``'mbK-means'``.
        method: ``'mbK-means'`` (MiniBatchKMeans, 16 clusters), ``'LogReg'``
            (L1 multinomial logistic regression with the saga solver) or
            ``'ParForest'`` (ExtraTreesClassifier with 1000 trees).

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'mbK-means':
        clasf = cluster.MiniBatchKMeans(
            n_clusters=16, init='k-means++', max_iter=300,
            random_state=None).fit(feat_set)
    elif method == 'LogReg':
        clasf = linear_model.LogisticRegression(
            penalty='l1',
            class_weight='balanced',
            solver='saga',
            multi_class='multinomial',
            # Was the string 'False', which is truthy and not a valid bool.
            warm_start=False,
            max_iter=100,
            n_jobs=-1).fit(feat_set, y.ravel())
    elif method == 'ParForest':
        clasf = ensemble.ExtraTreesClassifier(
            n_estimators=1000, max_features=128, n_jobs=-1,
            random_state=0).fit(feat_set, y.ravel())
    else:
        # Previously an unknown method reached ``return clasf`` unbound.
        raise ValueError("Unknown method %r" % (method,))
    return clasf
def create_model_from_signatures(sig_csv_path, model_out, sig_datatype=np.int32):
    """
    Train an ExtraTreesClassifier on a class-signature .csv and save it.

    The signatures file is the kind produced by extract_features_to_csv. It is
    read through ``load_signatures`` and the fitted model is written with
    ``joblib.dump``.

    Parameters
    ----------
    sig_csv_path
        Path to the signatures file.
    model_out
        Destination for the serialised model.
    sig_datatype
        Datatype passed to ``load_signatures``. Defaults to int32.

    Notes
    -----
    The hyper-parameters below were arrived at by tpot.
    """
    tpot_params = {
        'bootstrap': False,
        'criterion': "gini",
        'max_features': 0.55,
        'min_samples_leaf': 2,
        'min_samples_split': 16,
        'n_estimators': 100,
        'n_jobs': 4,
        'class_weight': 'balanced',
    }
    classifier = ens.ExtraTreesClassifier(**tpot_params)
    signature_features, signature_labels = load_signatures(sig_csv_path,
                                                           sig_datatype)
    classifier.fit(signature_features, signature_labels)
    joblib.dump(classifier, model_out)
def ExtraTrees(X_train, X_test, y_train, y_test):
    """Overwrite each data set in ``X_train`` / ``X_test`` with its score.

    For every index ``i`` a default ExtraTreesClassifier is fitted on
    ``(X_train[i], y_train[i])``. ``X_train[i]`` is then replaced by the
    classifier's ``score`` on that training pair and ``X_test[i]`` by its
    ``score`` on ``(X_test[i], y_test[i])``.

    Both containers are modified in place and returned as
    ``(X_train, X_test)``.
    """
    n_sets = len(X_train)
    for idx in range(n_sets):
        forest = ensemble.ExtraTreesClassifier()
        forest.fit(X_train[idx], y_train[idx])

        train_score = forest.score(X_train[idx], y_train[idx])
        X_train[idx] = train_score

        test_score = forest.score(X_test[idx], y_test[idx])
        X_test[idx] = test_score
    return X_train, X_test
def run(self):
    """Train an ExtraTreesClassifier, report importances and dump predictions.

    Steps, in order:
      1. Fit on the 'train' split with class weights taken from
         ``core.weights``.
      2. Aggregate feature importances (mean/max/min/sum) grouped by the part
         of each column name before the first '.', and print the table sorted
         by 'sum'.
      3. Print the sample-weighted log-loss on the 'valid' split.
      4. Write column-1 probabilities for the 'merge' and 'test' splits to CSV
         files under ``cache/XTC_<base_name>/``.
      5. Write the importance report to ``self.output()``.
    """
    self.output().makedirs()

    train_X, train_y, feature_cols = self.load_data('train')
    forest = ensemble.ExtraTreesClassifier(
        n_estimators=self.n_trees,
        n_jobs=-1,
        verbose=10,
        bootstrap=True,
        min_samples_leaf=10,
        oob_score=False,
        class_weight=dict(enumerate(core.weights)))
    forest.fit(train_X, train_y)

    importances = pandas.Series(forest.feature_importances_,
                                index=feature_cols)
    group_keys = [ix.split('.')[0] for ix in importances.index]
    importance_aggs = importances.groupby(group_keys).agg(
        ['mean', 'max', 'min', 'sum'])
    report_data = str(importance_aggs.sort_values('sum'))
    print(report_data)

    valid_X, valid_y, _ = self.load_data('valid')
    valid_pred = forest.predict_proba(valid_X)[:, 1]
    valid_weights = core.weights[valid_y]
    loss = metrics.log_loss(valid_y, valid_pred, sample_weight=valid_weights)
    print(colors.green | str(loss))

    prediction_targets = (
        ('merge', 'cache/XTC_%s/merge_predictions.csv'),
        ('test', 'cache/XTC_%s/predictions.csv'),
    )
    for split, path_template in prediction_targets:
        split_X, _, _ = self.load_data(split)
        split_pred = forest.predict_proba(split_X)[:, 1]
        pandas.Series(split_pred).to_csv(path_template % self.base_name)

    with self.output().open('w') as f:
        f.write(report_data)
def train_time(name, train_samples, attribute_candidates, label_attribute):
    """Print the CPU time needed to construct and fit three tree models.

    A DecisionTreeClassifier, a RandomForestClassifier and an
    ExtraTreesClassifier are each built and fitted on the selected columns of
    ``train_samples``. One line per model is printed as
    ``<name>,<model>,<milliseconds>``, measured with ``time.process_time``.
    """
    features = train_samples[attribute_candidates].values
    labels = train_samples[label_attribute].values

    def cpu_seconds_to_fit(make_estimator):
        # Construction is included in the timed region, as in each model run.
        started = time.process_time()
        estimator = make_estimator()
        estimator.fit(features, labels)
        return time.process_time() - started

    tree_secs = cpu_seconds_to_fit(lambda: tree.DecisionTreeClassifier())
    forest_secs = cpu_seconds_to_fit(
        lambda: ensemble.RandomForestClassifier())
    extra_secs = cpu_seconds_to_fit(
        lambda: ensemble.ExtraTreesClassifier(
            n_estimators=100, criterion='gini', min_samples_leaf=2,
            max_features='sqrt'))

    print(f'{name},decision_tree,{int(tree_secs * 1000)}')
    print(f'{name},random_forest,{int(forest_secs * 1000)}')
    print(f'{name},extremely_randomized_trees,{int(extra_secs * 1000)}')
def forget(name, train_samples, attribute_candidates, label_attribute):
    """Time full retraining after repeatedly dropping one training row.

    The samples are shuffled, then ``len(train_samples) / 1000`` (truncated)
    repetitions are run. Each repetition removes the first remaining row,
    retrains a decision tree, a random forest and an extra-trees classifier
    from scratch, and prints ``<name>,<model>,<microseconds>`` of CPU time per
    model.
    """
    def cpu_seconds_to_fit(make_estimator, features, labels):
        # Construction is included in the timed region.
        started = time.process_time()
        estimator = make_estimator()
        estimator.fit(features, labels)
        return time.process_time() - started

    train_samples = train_samples.sample(frac=1).reset_index(drop=True)
    repetitions = int(len(train_samples) / 1000)

    for _ in range(repetitions):
        train_samples = train_samples.iloc[1:]
        features = train_samples[attribute_candidates].values
        labels = train_samples[label_attribute].values

        tree_secs = cpu_seconds_to_fit(
            lambda: tree.DecisionTreeClassifier(), features, labels)
        forest_secs = cpu_seconds_to_fit(
            lambda: ensemble.RandomForestClassifier(), features, labels)
        extra_secs = cpu_seconds_to_fit(
            lambda: ensemble.ExtraTreesClassifier(
                n_estimators=100, criterion='gini', min_samples_leaf=2,
                max_features='sqrt'),
            features, labels)

        print(f'{name},decision_tree,{int(tree_secs * 1000000)}')
        print(f'{name},random_forest,{int(forest_secs * 1000000)}')
        print(
            f'{name},extremely_randomized_trees,{int(extra_secs * 1000000)}'
        )
def learning(dataset, blocking = True):
    """Plot a learning curve for an entropy-criterion ExtraTreesClassifier.

    ``model_selection.learning_curve`` is run with 10-fold cross-validation on
    ``dataset.training_features`` / ``dataset.training_labels`` for ten
    training fractions between 0.1 and 0.9. The plotted "loss" for each
    training size is ``1 - mean(score)`` over the folds.

    Args:
        dataset: Object exposing ``training_features`` and ``training_labels``.
        blocking: When true, ``plt.show()`` is called after drawing.
    """
    classifier = ensemble.ExtraTreesClassifier(criterion='entropy')
    train_sizes = np.linspace(0.1, 0.9, 10)
    train_size_abs, train_scores, test_scores = model_selection.learning_curve(
        classifier,
        dataset.training_features,
        dataset.training_labels,
        cv=10,
        train_sizes=train_sizes)

    # One row of fold scores per training size; reduce each row to a loss.
    train_losses = [1 - np.array(a).mean() for a in train_scores]
    test_losses = [1 - np.array(a).mean() for a in test_scores]

    plt.figure()
    plt.grid()
    plt.xlabel('Training Set Size')
    plt.ylabel('Loss')
    plt.title(TITLE)
    plt.plot(train_size_abs, train_losses)
    plt.plot(train_size_abs, test_losses)
    plt.legend(['Training', 'Testing'])
    if blocking:
        plt.show()
def chaos_feature_importance(x_all, y_feat, shadow_selector, feat_dic=None,
                             **chaos_args):
    """Accumulate feature scores over repeated ExtraTrees fits.

    Each iteration restricts ``x_all`` to the original columns plus the
    features selected so far, calls ``chaos_gen`` on it, fits an
    ExtraTreesClassifier on the numeric columns of the rows picked by
    ``shadow_selector``, and adds the per-feature values yielded by
    ``get_clf_feat`` into ``feat_dic``.

    Args:
        x_all: Feature frame; must expose ``.columns``.
        y_feat: Targets for the selected rows.
        shadow_selector: Row selector applied as ``x_all[shadow_selector]``.
        feat_dic: Optional dict of running totals, updated in place. A new
            dict is created when omitted.
        **chaos_args: Optional ``chaos_feat_iter`` (10), ``chaos_n_estimators``
            (10), ``chaos_nb_features`` (30), ``chaos_gen_iter`` (20) and
            ``chaos_dummy_max`` (20).

    Returns:
        The accumulated ``feat_dic`` mapping feature name to summed value.
    """
    # A ``{}`` default would be shared between calls and, because nothing was
    # returned, its contents were unreachable for default callers.
    if feat_dic is None:
        feat_dic = {}

    chaos_feat_iter = chaos_args.get('chaos_feat_iter', 10)
    chaos_n_estimators = chaos_args.get('chaos_n_estimators', 10)
    chaos_nb_features = chaos_args.get('chaos_nb_features', 30)
    chaos_gen_iter = chaos_args.get('chaos_gen_iter', 20)
    chaos_dummy_max = chaos_args.get('chaos_dummy_max', 20)

    ori_numcols = x_all.columns
    sel_numcols = []
    for _ in range(chaos_feat_iter):
        x_all = x_all[list(set(ori_numcols) | set(sel_numcols))]
        clf = ensemble.ExtraTreesClassifier(n_estimators=chaos_n_estimators,
                                            n_jobs=-1)
        numcols = get_num_cols(x_all)
        catcols = [c for c in x_all.columns
                   if x_all[c].dtype.name == 'object']
        # NOTE(review): chaos_gen presumably adds generated columns to x_all
        # in place, since the numeric columns are re-read afterwards -- confirm.
        chaos_gen(x_all, numcols, catcols, gen_iter=chaos_gen_iter,
                  dummy_max=chaos_dummy_max)
        numcol2 = get_num_cols(x_all)
        x_feat = x_all[shadow_selector][numcol2]
        clf.fit(
            x_feat.replace(np.inf, 0).replace(-np.inf, 0).fillna(-1),
            y_feat)
        for f, v in get_clf_feat(clf, x_all, nb_feat=chaos_nb_features):
            sel_numcols.append(f)
            if f in feat_dic:
                feat_dic[f] += v
            else:
                feat_dic[f] = v
    return feat_dic
def extreme_randomize_applied(Train, New, treshold, bootstrap=False,
                              n_estimators: int = 200, max_depth: int = 50,
                              oob_score: bool = False,
                              class_weight='balanced_subsample',
                              sampling=None, label='FRAUDE'):
    """Fit an ExtraTreesClassifier on ``Train`` and score the rows of ``New``.

    Args:
        Train: Training frame containing the ``label`` column and an
            ``'id_siniestro'`` column (dropped before fitting). It is no longer
            modified by this function.
        New: Frame to score; must contain ``'id_siniestro'``.
        treshold: Probabilities strictly greater than this are flagged as 1.
        bootstrap, n_estimators, max_depth, oob_score, class_weight:
            Forwarded to ``ExtraTreesClassifier``.
        sampling: ``None`` for no resampling, ``'ALLKNN'`` to call
            ``under_sampling``, anything else is passed as ``model=`` to
            ``over_sampling``.
        label: Name of the target column in ``Train``.

    Returns:
        DataFrame indexed like ``New`` with columns ``'id_siniestro'``,
        ``'probabilidad'`` and ``'Treshold'`` (the 0/1 flag).
    """
    yTrain = Train[[label]]
    # Drop the label on a copy; ``del Train[label]`` altered the caller's frame.
    xTrain = Train.drop(label, axis=1)

    if sampling == 'ALLKNN':
        xTrain, yTrain = under_sampling(xTrain, yTrain)
    elif sampling is not None:
        xTrain, yTrain = over_sampling(xTrain, yTrain, model=sampling)

    # Sizes are derived from the data; clamp to 1 because rounding can give 0
    # on small inputs and scikit-learn rejects 0 for these parameters.
    min_sample_leaf = max(1, int(round(len(xTrain.index) * 0.005)))
    min_sample_split = min_sample_leaf * 10
    max_features = max(1, int(round(len(xTrain.columns) / 3)))

    fileModel = ensemble.ExtraTreesClassifier(
        criterion='entropy',
        bootstrap=bootstrap,
        min_samples_leaf=min_sample_leaf,
        min_samples_split=min_sample_split,
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        oob_score=oob_score,
        random_state=531,
        verbose=1,
        class_weight=class_weight,
        n_jobs=1)
    fileModel.fit(
        xTrain.drop(['id_siniestro'], axis=1).values, yTrain.values)

    y_hat_New = fileModel.predict_proba(
        New.drop('id_siniestro', axis=1).values)
    # Discard the first probability column and keep the remainder.
    y_hat_New = np.delete(y_hat_New, 0, axis=1)
    df_proba = pd.DataFrame(y_hat_New, columns=['probabilidad'],
                            index=New.index)
    df_proba = pd.concat([New['id_siniestro'], df_proba], axis=1)

    y_hat_New = (y_hat_New > treshold).astype(int)
    print(y_hat_New)
    df_proba_threshold = pd.DataFrame(y_hat_New, columns=['Treshold'],
                                      index=New.index)
    print(df_proba_threshold)
    df_proba_conc = pd.concat([df_proba, df_proba_threshold], axis=1)
    print(df_proba_conc)
    return df_proba_conc
def ReduceClassesFeature(class_id1, class_id2, data_X_1, data_X_2,
                         importance_treshold):
    """Select important features for a two-class problem and rebalance it.

    An ExtraTreesClassifier is fitted on the two stacked 2D arrays. Features
    whose importance exceeds ``importance_treshold`` are kept, and for some
    class-id combinations one array is randomly subsampled to at most
    ``remainder_ratio`` times the size of the other.

    Args:
        class_id1, class_id2: Class ids of ``data_X_1`` and ``data_X_2``.
        data_X_1, data_X_2: 2D numpy arrays with one row per sample.
        importance_treshold: Strict lower bound on feature importance.

    Returns:
        ``[data_X_1, data_X_2, classifier, feature_select_mask]``, where the
        arrays are the reduced (and possibly subsampled) inputs.
    """
    X = np.vstack((data_X_1, data_X_2))
    y = np.asarray([class_id1] * len(data_X_1) + [class_id2] * len(data_X_2))
    ET_classfier = ensemble.ExtraTreesClassifier()
    ET_classfier.fit(X, y)
    importance = ET_classfier.feature_importances_
    feature_select_mask = (importance > importance_treshold)
    # The stacked copies are not needed after fitting.
    del X, y

    data_X_1 = data_X_1[:, feature_select_mask]
    data_X_2 = data_X_2[:, feature_select_mask]

    remainder_ratio = 1
    if class_id1 < 2 and class_id2 > 1:
        # class_id1 = 0,1  class_id2 = 2,3,4: very imbalanced data, so sample
        # the big set.
        rand_idx = np.random.permutation(len(data_X_1))
        data_X_1 = data_X_1[rand_idx[0:remainder_ratio * len(data_X_2)], :]
    elif class_id2 < 2 and class_id1 > 1:
        # class_id2 = 0,1  class_id1 = 2,3,4 (does not happen in the
        # one-vs-one models).
        rand_idx = np.random.permutation(len(data_X_2))
        data_X_2 = data_X_2[rand_idx[0:remainder_ratio * len(data_X_1)], :]
    elif class_id1 == 0 and class_id2 == 1:
        # The permutation must span data_X_2, the array being indexed; it was
        # sized by data_X_1, which either overran data_X_2 or only ever drew
        # from its leading rows.
        rand_idx = np.random.permutation(len(data_X_2))
        data_X_2 = data_X_2[rand_idx[0:remainder_ratio * len(data_X_1)], :]

    # Single pre-formatted argument: same text on Python 2 and 3.
    print("%s %s" % ('reduct shape', np.shape(data_X_1)))
    print("%s %s" % ('mask', feature_select_mask))
    return [data_X_1, data_X_2, ET_classfier, feature_select_mask]
def setUp(self):
    """Create the estimator list and a small random data set for the tests.

    ``self.modellist`` holds one-estimator / default instances from LightGBM,
    XGBoost, scikit-learn SVM, CatBoost, scikit-learn ensembles and linear
    models. ``self.x_df`` has 21 rows with random integer columns ``x1`` and
    ``x2``; ``self.y_df`` is 1 for ids below 10 and 0 otherwise.
    """
    self.modellist = [
        lgb.LGBMRegressor(n_estimators=1),
        lgb.LGBMClassifier(n_estimators=1),
        xgb.XGBRegressor(n_estimators=1),
        # Was a second XGBRegressor; every other library is listed as a
        # regressor/classifier pair, so the classifier was never exercised.
        xgb.XGBClassifier(n_estimators=1),
        svm.SVR(kernel='linear'),
        svm.SVC(kernel='linear'),
        cb.CatBoostRegressor(n_estimators=1),
        cb.CatBoostClassifier(n_estimators=1),
        ske.GradientBoostingRegressor(n_estimators=1),
        ske.GradientBoostingClassifier(n_estimators=1),
        ske.ExtraTreesRegressor(n_estimators=1),
        ske.ExtraTreesClassifier(n_estimators=1),
        ske.RandomForestRegressor(n_estimators=1),
        ske.RandomForestClassifier(n_estimators=1),
        skl.LogisticRegression(),
        skl.LinearRegression(),
    ]

    df = pd.DataFrame(range(0, 21), columns=['id'])
    df['y'] = df['id'].apply(lambda x: 1 if x < 10 else 0)
    df['x1'] = np.random.randint(1, 123, df.shape[0])
    df['x2'] = np.random.randint(1, 3, df.shape[0])
    df = df.set_index('id')
    self.x_df = df[['x1', 'x2']]
    self.y_df = df['y'].to_frame()
def model_training(complete_tag_count, prediction, predicted_class,
                   nonpredicted_class, language):
    """Fit an ExtraTreesClassifier on tag counts and pickle it.

    Args:
        complete_tag_count: Per-user tag counts; converted with ``np.array``
            and used as the feature matrix.
        prediction: Name of the predicted attribute; used in the file name.
        predicted_class: Target labels.
        nonpredicted_class: Currently unused.
        language: Language code; used in the file name.

    The model is written to ``models/<language>-<prediction>-base.p``.
    """
    tag_total = np.array(complete_tag_count)
    clf = ensemble.ExtraTreesClassifier(
        n_estimators=1000,
        max_depth=None,
        # 2 is the smallest integer scikit-learn documents as valid here; the
        # previous value of 1 is outside that range.
        min_samples_split=2,
        random_state=0,
        criterion='entropy',
        n_jobs=9)
    clf.fit(tag_total, predicted_class)
    with open('models/'+language+'-'+prediction+'-base.p', 'wb') as clffile:
        pickle.dump(clf, clffile)
def extra_trees(x_train, y_train, x_test, cleaned_features):
    """Fit an ExtraTreesClassifier and return predictions plus importances.

    The classifier uses the module-level ``seed`` as its ``random_state``.

    Returns:
        ``(predicted, feature_df, clf)``: predictions for ``x_test``, a
        one-row DataFrame (index ``'feature_importances'``, columns
        ``cleaned_features``) and the fitted classifier.
    """
    forest = ensemble.ExtraTreesClassifier(random_state=seed)
    forest.fit(x_train, y_train)
    test_predictions = forest.predict(x_test)

    importance_table = pd.DataFrame(columns=cleaned_features)
    importance_table.loc['feature_importances'] = forest.feature_importances_
    return test_predictions, importance_table, forest
def select_features(x, y, data):
    '''
    Use a tree classifier to select the most relevant features.

    An ExtraTreesClassifier is fitted on ``(x, y)`` and ``SelectFromModel``
    decides how many features are kept. Those features are returned in order
    of decreasing importance.

    Column names are looked up as ``data.columns[2 + i]`` for feature index
    ``i``.  NOTE(review): this assumes the first two columns of ``data`` are
    not features of ``x`` -- confirm against the caller.

    Returns:
        A list of ``[feature_name, importance_percent]`` pairs, where the name
        is stripped of surrounding whitespace and the importance is scaled by
        100. The list is empty when no feature is selected.
    '''
    feature_select = ske.ExtraTreesClassifier().fit(x, y)
    importances = feature_select.feature_importances_

    selector = SelectFromModel(feature_select, prefit=True)
    nb_features = selector.transform(x).shape[1]

    # Indices of the nb_features most important features, best first.
    top_indices = numpy.argsort(importances)[::-1][:nb_features]
    return [
        [str(data.columns[2 + idx]).strip(), importances[idx] * 100]
        for idx in top_indices
    ]
def rank_features_etc(X, Y, columns):
    """Rank features with an ExtraTreesClassifier and plot their importances.

    A 100-tree, depth-10 classifier (``random_state=0``) is fitted on
    ``(X, Y)``. A bar chart of the importances in descending order is shown,
    with error bars equal to the standard deviation of the per-tree
    importances and x tick labels taken from ``columns``.

    Returns:
        Array of feature indices sorted by decreasing importance.
    """
    # supervised ranking
    forest = ensemble.ExtraTreesClassifier(n_estimators=100, max_depth=10,
                                           random_state=0)
    forest.fit(X, Y)

    importances = forest.feature_importances_
    per_tree = [member.feature_importances_ for member in forest.estimators_]
    spread = np.std(per_tree, axis=0)
    order = np.argsort(importances)[::-1]

    # Plot the feature importances of the forest
    n_features = X.shape[1]
    positions = range(n_features)
    plt.figure(figsize=(15, 5))
    plt.title("ETC Feature Importances")
    plt.bar(positions, importances[order], color="r", yerr=spread[order],
            align="center")
    plt.xticks(positions, np.array(columns)[order], rotation='vertical')
    plt.xlim([-1, n_features + 1])
    plt.show()
    return order
def model_init(self, reDefine_clf=None, used_clf=None):
    """Build the dictionary of candidate classifiers.

    Args:
        reDefine_clf: Optional mapping of key -> estimator that overrides or
            extends the built-in entries.
        used_clf: Optional collection of keys; when non-empty only those
            entries are returned.

    Returns:
        dict mapping short key to an unfitted classifier instance.
    """
    clfs = {
        'xgb': xgboost.XGBClassifier(n_estimators=100, max_depth=3),
        'lgb': lightgbm.LGBMClassifier(n_estimators=100, max_depth=3),
        'gbdt': ensemble.GradientBoostingClassifier(n_estimators=100,
                                                    max_depth=3),
        'rf': ensemble.RandomForestClassifier(n_estimators=200, max_depth=6),
        'logit': linear_model.LogisticRegression(),
        'svc': svm.SVC(probability=True),
        'adbt': ensemble.AdaBoostClassifier(
            base_estimator=tree.DecisionTreeClassifier(max_depth=3)),
        'bagg': ensemble.BaggingClassifier(n_estimators=100),
        'ext': ensemble.ExtraTreesClassifier(n_estimators=100, max_depth=3),
        # 'perceptron':linear_model.Perceptron(),
        'dt': tree.DecisionTreeClassifier(),
        'knn': neighbors.KNeighborsClassifier(),
        'network': neural_network.MLPClassifier(),
    }

    # ``None`` defaults replace the shared mutable ``{}`` / ``[]`` defaults;
    # both are treated as "nothing supplied", exactly like the empty values.
    if reDefine_clf:
        clfs.update(reDefine_clf)
    if used_clf:
        return {key: value for key, value in clfs.items() if key in used_clf}
    return clfs
def et1(train2, y, test2, v, z):
    """Accumulate ExtraTrees predictions over seeded shuffle splits.

    For each of 3 seeds (13, 14, 15) and each of 5 ShuffleSplit folds, a
    2500-tree, depth-15 classifier is fitted on the training part. Its
    validation-fold probabilities (column 1, passed through ``pconvert``) are
    added into ``v`` and its test-set probabilities into ``z``, both under a
    column named after this function. The per-fold log-loss is printed, then
    the array of scores with its mean and standard deviation.

    Finally ``z[cname]`` is divided by ``num_splits * num_seeds`` and
    ``v[cname]`` by ``num_seeds``. ``v`` and ``z`` are modified in place;
    nothing is returned.
    """
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0

    num_seeds, num_splits, base_seed = 3, 5, 13
    scores = []
    for seed in range(base_seed, base_seed + num_seeds):
        splitter = model_selection.ShuffleSplit(n_splits=num_splits,
                                                random_state=seed)
        fold_indices = splitter.split(train2, y)
        for step, (itrain, ival) in enumerate(fold_indices, start=1):
            clf = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            clf.fit(train2[itrain], y[itrain])

            val_proba = clf.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(val_proba)

            score = metrics.log_loss(y[ival], val_proba)
            print(cname, 'seed %d step %d: '%(seed, step), score, now())
            scores.append(score)

            z[cname] += pconvert(clf.predict_proba(test2)[:, 1])

    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10,
          leaf=5, feat=0.3):
    """Fit a 300-tree ExtraTreesClassifier and predict on up to two test sets.

    Args:
        train_X, train_y: Training features and labels.
        test_X: Primary features to predict on.
        test_y: Optional labels for ``test_X``; when given, ROC AUC on the
            train and test sets is computed and printed.
        test_X2: Optional second feature set to predict on.
        depth, leaf, feat: Forwarded as ``max_depth``, ``min_samples_leaf``
            and ``max_features``.

    Returns:
        ``(test_preds, test_loss, test_preds2)``. ``test_loss`` is the test
        ROC AUC, or 0 when ``test_y`` is None. ``test_preds2`` is None when
        ``test_X2`` is None. Predictions are column 1 of ``predict_proba``.
    """
    model = ensemble.ExtraTreesClassifier(
        n_estimators=300,
        max_depth=depth,
        min_samples_split=10,
        min_samples_leaf=leaf,
        max_features=feat,
        n_jobs=6,
        random_state=0)
    model.fit(train_X, train_y)

    train_preds = model.predict_proba(train_X)[:, 1]
    test_preds = model.predict_proba(test_X)[:, 1]
    # test_X2 defaults to None; predicting on it unconditionally failed for
    # every caller that relied on the default.
    test_preds2 = None
    if test_X2 is not None:
        test_preds2 = model.predict_proba(test_X2)[:, 1]

    test_loss = 0
    if test_y is not None:
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        # Single pre-formatted argument: same text on Python 2 and 3.
        print("%s %s %s %s" % ("Depth, leaf, feat : ", depth, leaf, feat))
        print("%s %s %s" % ("Train and Test loss : ", train_loss, test_loss))
    return test_preds, test_loss, test_preds2
def feature_importance(trainX, trainY, testX, testY, columns):
    """
    Print and plot ExtraTrees feature importances for the training set.

    A 250-tree classifier (``random_state=1729``) is fitted on
    ``(trainX, trainY)``. Features are listed in decreasing importance using
    the names in ``columns`` and drawn as a bar chart whose error bars are the
    standard deviation of the per-tree importances. ``testX`` and ``testY``
    are not used.
    """
    ## Feature selection
    forest = ensemble.ExtraTreesClassifier(random_state=1729,
                                           n_estimators=250, n_jobs=-1)
    forest.fit(trainX, trainY)

    importances = forest.feature_importances_
    per_tree = [member.feature_importances_ for member in forest.estimators_]
    spread = np.std(per_tree, axis=0)
    order = np.argsort(importances)[::-1]

    n_features = trainX.shape[1]
    positions = range(n_features)

    # Print the feature ranking
    print("Feature ranking:")
    for rank in positions:
        feature_idx = order[rank]
        print("%d. %s (%f)" % (rank + 1, columns[feature_idx],
                               importances[feature_idx]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(positions, importances[order], color="r", yerr=spread[order],
            align="center")
    plt.xticks(positions, order)
    plt.xlim([-1, n_features])
    plt.show()
def ensemble_models():
    """Cross-validate, fit and report on four ensemble classifiers.

    The models are gradient boosting, AdaBoost, a random forest and extra
    trees (the last two with 100 trees and ``max_features=3``), all with
    ``random_state=1``. Each one is cross-validated on the module-level
    ``X`` / ``Y`` for accuracy and ROC AUC, then fitted on
    ``X_train`` / ``Y_train`` and evaluated on ``X_test`` / ``Y_test`` through
    the ``classification_report`` and ``confusion_matrix`` helpers.
    """
    # Here let's implement some ensemble methods to potentially improve
    # accuracy and get a better idea of the inherent structure of the data.
    n_trees = 100
    models = [
        # Boosting ensembles
        ('Gradient Boosted Machine',
         ensemble.GradientBoostingClassifier(random_state=1)),
        ('AdaBoost Classifier',
         ensemble.AdaBoostClassifier(random_state=1)),
        # Bagging ensembles
        # Even though the decision tree didn't do so well, a random forest
        # might.
        ('Random Forest',
         ensemble.RandomForestClassifier(n_estimators=n_trees,
                                         max_features=3, random_state=1)),
        ('Extra Trees Classifier',
         ensemble.ExtraTreesClassifier(n_estimators=n_trees,
                                       max_features=3, random_state=1)),
    ]

    # Fit & evaluate models
    for name, model in models:
        # Different model metrics
        for scoring in ('accuracy', 'roc_auc'):
            cross_validation(name, model, X, Y, scoring)

        # Fit model and make predictions
        fitted_model = model.fit(X_train, Y_train)
        Y_pred = fitted_model.predict(X_test)

        # Classification report & Confusion Matrix
        classification_report(name, Y_test, Y_pred)
        confusion_matrix(name, Y_test, Y_pred)
def get_curve_per_model(model):
    """Refit ``model`` at several forest sizes and build an AMS curve for each.

    For ``n_estimators`` in 25, 50, 100 and 200 a fresh RandomForest or
    ExtraTrees classifier (chosen by ``model.algorithm``) is given the tuned
    values of every parameter named in ``model.param_ranges``, fitted on the
    training data with ``model.W_train`` as the third ``fit`` argument, and
    timed. AMS curves are then computed with ``ams_vs_cutoff`` for the
    originally trained classifier followed by the four refits.

    Returns:
        ``(curves, thresh, runtime)``: the list of curves, the thresholds
        returned for the last model, and the fit times in seconds for the
        four refits.

    Raises:
        ValueError: If ``model.algorithm`` is neither ``'RF'`` nor ``'ET'``.
    """
    _models = [model.trained_classifier]
    runtime = []
    for k in [25, 50, 100, 200]:
        if model.algorithm == 'RF':
            base_model = ensemble.RandomForestClassifier(n_jobs=4, verbose=1)
        elif model.algorithm == 'ET':
            base_model = ensemble.ExtraTreesClassifier(n_jobs=4, verbose=1)
        else:
            # Previously base_model was left unbound for unknown algorithms.
            raise ValueError(
                "Unsupported algorithm %r" % (model.algorithm,))

        tuned_params = model.trained_classifier.get_params()
        for param_name in model.param_ranges.keys():
            setattr(base_model, param_name, tuned_params[param_name])
        # Applied last so it wins even if n_estimators is in param_ranges.
        setattr(base_model, 'n_estimators', k)
        print(base_model)

        start = timeit.default_timer()
        base_model.fit(model.X_train, model.Y_train, model.W_train)
        stopped = timeit.default_timer()
        runtime.append(stopped - start)
        _models.append(base_model)

    curves = []
    for m in _models:
        print('Constructing AMS curve for models with n_estimators (this could take several minutes)')
        train_score = m.predict_proba(model.X_train)[:, 1]
        test_score = m.predict_proba(model.X_test)[:, 1]
        curve, thresh = ams_vs_cutoff(model.X_test, model.Y_test,
                                      model.W_test, train_score, test_score)
        curves.append(curve)
    return curves, thresh, runtime
def cross_validate_model(X_train, Y_train):
    """
    Compare several classifiers on a hold-out split and pick the best one.

    The data is split 50/50 (``random_state=41``). Each candidate is fitted on
    the first half and scored on the second half by the mean squared error
    between the labels and column 1 of ``predict_proba``. A random-forest leaf
    encoding followed by logistic regression is also evaluated and printed,
    but it does not take part in the selection.

    Returns:
        Index of the candidate with the lowest MSE, in the order: quadratic
        discriminant analysis, L1 logistic regression, random forest,
        AdaBoost, MLP, extra trees.
    """
    # train_test_split returns (X_fit, X_holdout, y_fit, y_holdout).
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        X_train, Y_train, test_size=0.5, random_state=41)
    # A second, differently seeded split supplies the evaluation half for the
    # stacked random-forest + logistic-regression model.
    _, second_holdout_X, _, second_holdout_y = train_test_split(
        X_train, Y_train, test_size=0.5, random_state=0)

    # Random-forest leaf indices, one-hot encoded, feeding a logistic model.
    rf = ensemble.RandomForestClassifier(n_estimators=50, max_depth=5)
    rf_enc = OneHotEncoder()
    rf_lm = sklinear.LogisticRegression()
    rf.fit(fit_X, fit_y)
    rf_enc.fit(rf.apply(fit_X))
    rf_lm.fit(rf_enc.transform(rf.apply(holdout_X)), holdout_y)
    y_predict_rf_lm = rf_lm.predict_proba(
        rf_enc.transform(rf.apply(second_holdout_X)))
    mse_rf_lm = metrics.mean_squared_error(second_holdout_y,
                                           y_predict_rf_lm[:, 1])
    print('MSE RandomForestClassifier followed by LogisticRegression is %f'
          % (mse_rf_lm))

    # Candidate classifiers, paired with the labels used in the report.
    candidates = [
        ('clf_quaddis',
         discriminant_analysis.QuadraticDiscriminantAnalysis()),
        # The L1 penalty needs a solver that supports it; liblinear does,
        # whereas the lbfgs default of newer scikit-learn rejects it.
        ('clf_logreg',
         sklinear.LogisticRegression(penalty='l1', solver='liblinear')),
        ('clf_random_forest',
         ensemble.RandomForestClassifier(n_estimators=50, max_depth=10)),
        ('clf_adaboost', ensemble.AdaBoostClassifier(n_estimators=50)),
        ('clf_mlpc', neural_network.MLPClassifier()),
        ('clf_extra_tree',
         ensemble.ExtraTreesClassifier(n_estimators=50, bootstrap=True)),
    ]

    # Fit and score each method. A 1-D array keeps each entry a scalar so it
    # formats cleanly with %f.
    method_mse = np.zeros(len(candidates))
    for i, (label, method) in enumerate(candidates):
        method.fit(fit_X, fit_y)
        method_predict = method.predict_proba(holdout_X)
        method_mse[i] = metrics.mean_squared_error(holdout_y,
                                                   method_predict[:, 1])
        print('MSE for %s while cross validation : %f'
              % (label, method_mse[i]))

    # We return the method which has the minimum mse
    return np.argmin(method_mse)