Example #1
def test_fit_predict_ensemble(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state)(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    features = np.stack([logreg_pred, random_forest_pred], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
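As an aside, scikit-learn (0.22+) ships a built-in StackingClassifier that expresses the same topology more compactly. A minimal sketch follows; note that, unlike the snippet above, StackingClassifier trains its final estimator on cross-validated predictions, so its output will generally not match the in-sample stacking shown here.

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
mask = y != 2  # same binary reduction as above
X, y = X[mask], y[mask]

# stack_method="predict" feeds hard class labels to the final estimator,
# mirroring the Stack(axis=1) step above; the default would use predict_proba
stack = StackingClassifier(
    estimators=[
        ("logreg", LogisticRegression(random_state=123)),
        ("rf", RandomForestClassifier(random_state=123)),
    ],
    final_estimator=LogisticRegression(random_state=123),
    stack_method="predict",
)
stack.fit(X, y)
y_pred = stack.predict(X)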
Example #2
def train_ensemble(allele_table_path,
                   amr_path,
                   antibiotic,
                   gene_path=None,
                   core_cutoff=10,
                   bootstrap_instances=0.8,
                   bootstrap_features=0.5,
                   num_models=500,
                   log_iter=50):
    ''' 
    Trains a random subspace ensemble of SVMs to predict AMR from the allele 
    and/or gene content of a strain for a given antibiotic.

    Parameters
    ----------
    allele_table_path : str
        Path to binary table containing allelic content of strains as CSV, where
        rows = alleles, columns = strains. Will interpret blanks as 0s.
    amr_path : str
        Path to table containing AMR profiles of strains as CSV, where rows = strains, 
        columns = drugs. Will convert "Resistant" and "Intermediate" to 1s, 
        "Susceptible" to 0s, and ignore blanks.
    antibiotic : str
        Drug to train model for, must be in columns of amr_path
    gene_path : str
        Similar to allele_table_path, but for gene content. If provided, will
        select and model non-core features as defined by core_cutoff at the gene
        level, rather than each allele individually (default None)
    core_cutoff : int
        Threshold used to separate core from non-core genetic features; the
        model is trained on the non-core features (default 10)
    bootstrap_instances : float
        Fraction of total strains sampled for training, must be <1.0 (default 0.8)
    bootstrap_features : float
        Fraction of total genetic features sampled for training, uses all if 1.0 (default 0.5)
    num_models : int
        Number of individual models to train for ensemble (default 500)
    log_iter : int
        Print a message after this many models have been trained (default 50)

    Returns
    -------
    ensemble : RSE
        Trained RSE object
    df_features : DataFrame
        DataFrame of binary matrix encoding genetic features of each strain
    df_amr : DataFrame
        DataFrame of binary vector encoding AMR phenotypes of each strain
    '''
    # Reduce data to strains with AMR data for the selected antibiotic
    df_features, df_amr = __prepare_amr_data__(allele_table_path,
                                               amr_path,
                                               antibiotic,
                                               gene_path=gene_path,
                                               core_cutoff=core_cutoff)
    # Train and return ensemble
    ensemble = RSE(num_models=num_models,
                   bootstrap_instances=bootstrap_instances,
                   bootstrap_features=bootstrap_features)
    ensemble.fit(df_features.values, df_amr.values)
    return ensemble, df_features, df_amr
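A hypothetical call, assuming CSV inputs laid out as the docstring describes (the file names and antibiotic are illustrative, not from the original project):

# Illustrative paths and antibiotic name only
ensemble, df_features, df_amr = train_ensemble(
    allele_table_path="alleles.csv",   # rows = alleles, columns = strains
    amr_path="amr_profiles.csv",       # rows = strains, columns = drugs
    antibiotic="ciprofloxacin",
    core_cutoff=10,
    num_models=500,
)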
Example #3
def test_fit_predict_ensemble_with_proba_features(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123
    n_estimators = 5

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state, function="predict_proba")(x, y_t)
    y2 = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state, function="apply"
    )(x, y_t)
    features = Concatenate(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    features = np.concatenate([logreg_proba, random_forest_leafidx], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
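One caveat worth flagging: RandomForestClassifier.apply returns raw leaf indices of shape (n_samples, n_estimators), and the meta-learner above consumes them as if they were ordinal values. A common refinement (not part of the original test) is to one-hot encode the leaf indices first; a minimal sketch, reusing the fitted random_forest from above:

from sklearn.preprocessing import OneHotEncoder

# Leaf indices are categorical, not ordinal, so encode each tree's column
# before handing the features to the meta-learner
encoder = OneHotEncoder(handle_unknown="ignore")
leaf_features = encoder.fit_transform(random_forest.apply(x_data))
meta = sklearn.linear_model.LogisticRegression(random_state=random_state)
meta.fit(leaf_features, y_t_data)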
Example #4
#knn = KNeighborsClassifier(n_neighbors=5)
#knn.fit(train_data, train_targets)
#print('KNN Score: ', knn.score(test_data, test_targets))

ext_forest = ExtraTreesClassifier(n_estimators=200, n_jobs=-1)
ext_forest.fit(train_data, train_targets)
print('ExtraTree Score: ', ext_forest.score(test_data, test_targets))

#ext_forest_predicted = ext_forest.predict(test_data)
#print_metrics(test_targets, ext_forest_predicted)

randFor_class = ensemble.RandomForestClassifier(n_estimators=200,
                                                random_state=1)
randFor_class.fit(train_data, train_targets)
score = randFor_class.score(test_data, test_targets)

print('RandomForest Score: ', score)
#randFor_pred = randFor_class.predict(test_data)
#print_metrics(test_targets, randFor_pred)

# named voting_clf so the sklearn.ensemble module is not shadowed
voting_clf = VotingClassifier(estimators=[('RandomForest', randFor_class),
                                          ('ExtraTrees', ext_forest)],
                              voting='soft')  # soft voting averages class probabilities
voting_clf.fit(train_data, train_targets)
print('Ensemble Score: ', voting_clf.score(test_data, test_targets))

#ensemble_predicted = voting_clf.predict(test_data)
#print_metrics(test_targets, ensemble_predicted)

save_classifier(ext_forest, file_name='best_classifier')
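save_classifier is a project-local helper whose definition is not shown; one plausible implementation, sketched here with joblib (an assumption, not the original code):

import joblib

def save_classifier(clf, file_name="classifier"):
    # Assumed behavior: persist the fitted estimator to disk for later reuse
    joblib.dump(clf, file_name + ".pkl")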
Example #5
breast_cancer = datasets.load_breast_cancer()

offset = int(0.6*len(breast_cancer.data))
X_train = breast_cancer.data[:offset]
Y_train = breast_cancer.target[:offset]
X_test = breast_cancer.data[offset:]
Y_test = breast_cancer.target[offset:]

# Set up a Decision Tree base learner with maximum depth 10
classifier = DecisionTreeClassifier(max_depth=10, min_samples_split=2,
                                    max_leaf_nodes=15, criterion='entropy',
                                    min_samples_leaf=1)
ada = ensemble.AdaBoostClassifier(classifier, n_estimators=15, learning_rate=.5)

start = time()
# Fit the learner to the training data
ada.fit(X_train, Y_train)
end = time()
print("\nFitting took {:.4f} s".format(end - start))


# Compute the classification error (1 - accuracy) on the training set
start = time()
train_err = 1 - ada.score(X_train, Y_train)
end = time()
print("\nScoring the training set took {:.4f} s".format(end - start))
print("Training error: {:.4f}".format(train_err))

start = time()
test_err = 1 - ada.score(X_test, Y_test)
end = time()
print("\nScoring the test set took {:.4f} s".format(end - start))
print("Test error: {:.4f}".format(test_err))
Example #6
# .ix was removed from pandas; .iloc assumes purely positional indexing here
X_train = preprocessing.normalize(dft.iloc[:offset, 1:])
Y_train = dft.iloc[:offset, 0]
X_test = preprocessing.normalize(dft.iloc[offset:, 1:])
Y_test = dft.iloc[offset:, 0]

# Set up a Decision Tree base learner with maximum depth 20
classifier = tree.DecisionTreeClassifier(criterion='entropy',
                                         max_depth=20,
                                         min_samples_leaf=1)
ada = ensemble.AdaBoostClassifier(classifier,
                                  n_estimators=1,
                                  learning_rate=1.)

start = time()
# Fit the learner to the training data
ada.fit(X_train, Y_train)
end = time()
print("\nFitting took {:.4f} s".format(end - start))

# Compute the classification error (1 - accuracy) on the training set
start = time()
train_err = 1 - ada.score(X_train, Y_train)
end = time()
print("\nScoring the training set took {:.4f} s".format(end - start))

start = time()
test_err = 1 - ada.score(X_test, Y_test)
end = time()
print("\nScoring the test set took {:.4f} s".format(end - start))

print "Train err: {:.4f}", train_err
Example #7
    # feature_worth(models['GradientBoostingRegressor'], train)

    # # > Example: Compare validation error between models
    # for model_key in models:
    #     print "> "+model_key
    #     model = models[model_key]
    #     if "NeuralNetwork" in model_key:
    #         error = cv(train_, 5, model)
    #     else:
    #         error = cv(train, 5, model)
    #     print(model_key, error)

    # Drop the neural-network models before stacking
    del models['NeuralNetwork']
    del models['NeuralNetwork_w_Momentum']
    # named stacked so the sklearn.ensemble module is not shadowed
    stacked = Ensemble(
        models=models,
        combiner=ensemble.GradientBoostingRegressor(
            n_estimators=100,
            random_state=0
        )
    )
    stacked.fit(train)
    # Pair the model's input variable names with the combiner's feature importances
    d = dict(
        zip(
            models['GradientBoostingRegressor'].variables['casual'],
            stacked.feature_importances_
        )
    )
    prediction = stacked.predict(test)
    output('12.0th.csv', prediction, test)
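The Ensemble wrapper is project-specific and its definition is not shown; its fit(train) signature suggests it extracts the targets internally. As a rough, hypothetical reconstruction of the pattern (base model predictions stacked as features for a gradient-boosted combiner), using a plain (X, y) interface instead:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

class StackedRegressor:
    """Hypothetical sketch: combine base model predictions with a meta-model."""

    def __init__(self, models, combiner=None):
        self.models = models
        self.combiner = combiner or GradientBoostingRegressor(random_state=0)

    def _stack(self, X):
        # One feature column per base model's prediction
        return np.column_stack([m.predict(X) for m in self.models.values()])

    def fit(self, X, y):
        for m in self.models.values():
            m.fit(X, y)
        self.combiner.fit(self._stack(X), y)
        return self

    def predict(self, X):
        return self.combiner.predict(self._stack(X))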