def test_fit_predict_ensemble(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state)(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    features = np.stack([logreg_pred, random_forest_pred], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def train_ensemble(allele_table_path, amr_path, antibiotic, gene_path=None,
                   core_cutoff=10, bootstrap_instances=0.8, bootstrap_features=0.5,
                   num_models=500, log_iter=50):
    '''
    Trains a random subspace ensemble of SVMs to predict AMR from the allele
    and/or gene content of a strain for a given antibiotic.

    Parameters
    ----------
    allele_table_path : str
        Path to binary table containing allelic content of strains as CSV,
        where rows = alleles, columns = strains. Will interpret blanks as 0s.
    amr_path : str
        Path to table containing AMR profiles of strains as CSV, where
        rows = strains, columns = drugs. Will convert "Resistant" and
        "Intermediate" to 1s, "Susceptible" to 0s, and ignore blanks.
    antibiotic : str
        Drug to train the model for, must be in the columns of amr_path
    gene_path : str
        Similar to allele_table_path, but for gene content. If provided, will
        select the non-core alleles used by the model, as defined by
        core_cutoff, at the gene level rather than for each allele
        individually (default None)
    core_cutoff : int
        Cutoff used to distinguish core from non-core genetic features when
        selecting features for modeling (default 10)
    bootstrap_instances : float
        Fraction of total strains sampled for training, must be <1.0 (default 0.8)
    bootstrap_features : float
        Fraction of total genetic features sampled for training, uses all
        if 1.0 (default 0.5)
    num_models : int
        Number of individual models to train for the ensemble (default 500)
    log_iter : int
        Print a message after this many models have been trained (default 50)

    Returns
    -------
    ensemble : RSE
        Trained RSE object
    df_features : DataFrame
        DataFrame of binary matrix encoding genetic features of each strain
    df_amr : DataFrame
        DataFrame of binary vector encoding AMR phenotypes of each strain
    '''
    ''' Reduce data to strains with AMR data for selected antibiotic '''
    df_features, df_amr = __prepare_amr_data__(allele_table_path, amr_path, antibiotic,
                                               gene_path=gene_path, core_cutoff=core_cutoff)

    ''' Train and return ensemble '''
    ensemble = RSE(num_models=num_models, bootstrap_instances=bootstrap_instances,
                   bootstrap_features=bootstrap_features)
    ensemble.fit(df_features.values, df_amr.values)
    return ensemble, df_features, df_amr
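# A minimal usage sketch for train_ensemble. The CSV paths and antibiotic name
# below are illustrative placeholders (assumptions), not files shipped with this
# code; any drug appearing in the columns of the AMR table would work.
rse_model, df_features, df_amr = train_ensemble(
    allele_table_path='allele_table.csv',  # assumed path: alleles x strains binary CSV
    amr_path='amr_profiles.csv',           # assumed path: strains x drugs AMR CSV
    antibiotic='ciprofloxacin',            # assumed drug; must be a column of the AMR table
    num_models=500, log_iter=50)
print('Feature matrix:', df_features.shape, '| AMR labels:', df_amr.shape)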
def test_fit_predict_ensemble_with_proba_features(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123
    n_estimators = 5

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state, function="predict_proba")(x, y_t)
    y2 = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state, function="apply"
    )(x, y_t)
    features = Concatenate(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    features = np.concatenate([logreg_proba, random_forest_leafidx], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
#knn = KNeighborsClassifier(n_neighbors=5)
#knn.fit(train_data, train_targets)
#print('KNN Score: ', knn.score(test_data, test_targets))

ext_forest = ExtraTreesClassifier(n_estimators=200, n_jobs=-1)
ext_forest.fit(train_data, train_targets)
print('ExtraTrees Score: ', ext_forest.score(test_data, test_targets))
#ext_forest_predicted = ext_forest.predict(test_data)
#print_metrics(test_targets, ext_forest_predicted)

randFor_class = ensemble.RandomForestClassifier(n_estimators=200, random_state=1)
randFor_class.fit(train_data, train_targets)
score = randFor_class.score(test_data, test_targets)
print('RandomForest Score: ', score)
#randFor_pred = randFor_class.predict(test_data)
#print_metrics(test_targets, randFor_pred)

# Named voting_clf to avoid shadowing the sklearn.ensemble module imported as `ensemble`
voting_clf = VotingClassifier(estimators=[('RandomForest', randFor_class),
                                          ('ExtraTrees', ext_forest)],
                              voting='soft')
voting_clf.fit(train_data, train_targets)
print('Ensemble Score: ', voting_clf.score(test_data, test_targets))
#ensemble_predicted = voting_clf.predict(test_data)
#print_metrics(test_targets, ensemble_predicted)

save_classifier(ext_forest, file_name='best_classifier')
breast_cancer = datasets.load_breast_cancer()
offset = int(0.6 * len(breast_cancer.data))
X_train = breast_cancer.data[:offset]
Y_train = breast_cancer.target[:offset]
X_test = breast_cancer.data[offset:]
Y_test = breast_cancer.target[offset:]

# Set up a Decision Tree Classifier so that it learns a tree of limited depth
classifier = DecisionTreeClassifier(max_depth=10, min_samples_split=2,
                                    max_leaf_nodes=15, criterion='entropy',
                                    min_samples_leaf=1)
# Named ada_boost to avoid shadowing the sklearn.ensemble module imported as `ensemble`
ada_boost = ensemble.AdaBoostClassifier(classifier, n_estimators=15, learning_rate=.5)

start = time()
# Fit the learner to the training data
ada_boost.fit(X_train, Y_train)
end = time()
print("\nLearner took {:.4f} s".format(end - start))

# Find the classification error on the training set
start = time()
train_err = 1 - ada_boost.score(X_train, Y_train)
end = time()
print("\nTraining took {:.4f} s".format(end - start))

start = time()
test_err = 1 - ada_boost.score(X_test, Y_test)
end = time()
print("\nTesting took {:.4f} s".format(end - start))

print("Train err: {:.4f}, Test err: {:.4f}".format(train_err, test_err))
X_train = preprocessing.normalize(dft.iloc[:offset, 1:])
Y_train = dft.iloc[:offset, 0]
X_test = preprocessing.normalize(dft.iloc[offset:, 1:])
Y_test = dft.iloc[offset:, 0]

# Set up a Decision Tree Classifier so that it learns a tree of limited depth
classifier = tree.DecisionTreeClassifier(criterion='entropy', max_depth=20,
                                         min_samples_leaf=1)
# Named ada_boost to avoid shadowing the sklearn.ensemble module imported as `ensemble`
ada_boost = ensemble.AdaBoostClassifier(classifier, n_estimators=1, learning_rate=1.)

start = time()
# Fit the learner to the training data
ada_boost.fit(X_train, Y_train)
end = time()
print("\nLearner took {:.4f} s".format(end - start))

# Find the classification error on the training set
start = time()
train_err = 1 - ada_boost.score(X_train, Y_train)
end = time()
print("\nTraining took {:.4f} s".format(end - start))

start = time()
test_err = 1 - ada_boost.score(X_test, Y_test)
end = time()
print("\nTesting took {:.4f} s".format(end - start))

print("Train err: {:.4f}, Test err: {:.4f}".format(train_err, test_err))
# feature_worth(models['GradientBoostingRegressor'], train)
#
# > Example: Compare validation error between models
# for model_key in models:
#     print("> " + model_key)
#     model = models[model_key]
#     if "NeuralNetwork" in model_key:
#         error = cv(train_, 5, model)
#     else:
#         error = cv(train, 5, model)
#     print(model_key, error)

del models['NeuralNetwork']
del models['NeuralNetwork_w_Momentum']

# Named stacked to avoid shadowing the sklearn.ensemble module used for the combiner
stacked = Ensemble(
    models=models,
    combiner=ensemble.GradientBoostingRegressor(
        n_estimators=100, random_state=0
    )
)
stacked.fit(train)

d = dict(
    zip(
        models['GradientBoostingRegressor'].variables['casual'],
        stacked.feature_importances_
    )
)

prediction = stacked.predict(test)
output('12.0th.csv', prediction, test)