Ejemplo n.º 1
0
def naive_bayes_k(k, sequence_origin='DairyDB', primers_origin='DairyDB', taxonomy_level: int = 1,
                  selected_primer: str = 'V4',
                  model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence', test_size=0.2):
    """
    Fit a Gaussian Naive Bayes classifier on k-mer data and report its statistics.

    The train/test split comes from ``ETL_NB_k_mer``; the predictions on the test
    split are handed to ``main_stats_model`` together with a description of the run.

    :param k: k-mer size; forwarded to the ETL step and substituted into
        ``model_preprocessing`` and the reported model name.
    :param sequence_origin: forwarded to ``ETL_NB_k_mer`` and ``main_stats_model``.
    :param primers_origin: forwarded to ``ETL_NB_k_mer`` and ``main_stats_model``.
    :param taxonomy_level: forwarded to ``ETL_NB_k_mer`` and ``main_stats_model``.
    :param selected_primer: forwarded to ``ETL_NB_k_mer`` and ``main_stats_model``.
    :param model_preprocessing: description template; ``{}`` is replaced by ``k``.
    :param test_size: forwarded to ``main_stats_model`` only; it is not passed to
        the ETL step here.
    :return: the ``(test_size, prop_main_class, accuracy)`` triple produced by
        ``main_stats_model``.
    """
    preprocessing_label = model_preprocessing.format(k)

    # The same four dataset selectors are used for loading and for reporting.
    dataset_options = dict(sequence_origin=sequence_origin,
                           primers_origin=primers_origin,
                           taxonomy_level=taxonomy_level,
                           selected_primer=selected_primer)

    X_train, X_test, y_train, y_test = ETL_NB_k_mer(k=k, **dataset_options)

    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    reported_test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='Naive Bayes - NB({})'.format(k),
        model_parameters=classifier.get_params(),
        model_preprocessing=preprocessing_label,
        test_size=test_size,
        **dataset_options)

    return reported_test_size, prop_main_class, accuracy
Ejemplo n.º 2
0
def random_forest_k_default(
        k=4,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2,
        max_depth=None,
        n_estimators=None):
    """
    Fit a Random Forest classifier with preset hyper-parameters on k-mer data.

    The train/test split comes from ``ETL_RF_k_mer``; predictions on the test
    split and the fitted forest's feature importances are passed to
    ``main_stats_model``.

    :param k: k-mer size; forwarded to the ETL step, used in the model name and
        to cap ``max_features`` at ``min(50, 4**k)``.
    :param sequence_origin: forwarded to ``ETL_RF_k_mer`` and ``main_stats_model``;
        also takes part in the default ``max_depth`` choice.
    :param primers_origin: forwarded to ``ETL_RF_k_mer`` and ``main_stats_model``.
    :param taxonomy_level: forwarded to ``ETL_RF_k_mer`` and ``main_stats_model``;
        also takes part in the default ``max_depth`` choice.
    :param selected_primer: forwarded to ``ETL_RF_k_mer`` and ``main_stats_model``;
        also takes part in the default ``max_depth`` choice.
    :param model_preprocessing: description template; ``{}`` is replaced by ``k``.
    :param test_size: forwarded to ``main_stats_model`` only.
    :param max_depth: tree depth limit. When ``None`` it is chosen as 10 for
        ``taxonomy_level >= 5``; 20 for ``taxonomy_level >= 3`` with
        ``selected_primer == 'sequence'`` and an empty ``sequence_origin``;
        50 otherwise.
    :param n_estimators: number of trees; 200 when ``None``.
    :return: ``(RF, test_size, prop_main_class, accuracy)`` where ``RF`` is the
        fitted classifier and the rest comes from ``main_stats_model``.
    """
    preprocessing_label = model_preprocessing.format(k)

    # The same four dataset selectors are used for loading and for reporting.
    dataset_options = dict(sequence_origin=sequence_origin,
                           primers_origin=primers_origin,
                           taxonomy_level=taxonomy_level,
                           selected_primer=selected_primer)
    X_train, X_test, y_train, y_test = ETL_RF_k_mer(k=k, **dataset_options)

    if max_depth is None:
        whole_sequence_without_origin = (selected_primer == 'sequence'
                                         and sequence_origin == '')
        if taxonomy_level >= 5:
            max_depth = 10
        elif taxonomy_level >= 3 and whole_sequence_without_origin:
            max_depth = 20
        else:
            max_depth = 50

    n_estimators = 200 if n_estimators is None else n_estimators

    # Original author's note: "30 for max_depth is not backed-up".
    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    max_features=min(50, 4**k),
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    bootstrap=False,
                                    n_jobs=-1)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)

    reported_test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='RF_{}'.format(k),
        model_parameters=forest.get_params(),
        model_preprocessing=preprocessing_label,
        test_size=test_size,
        k=k,
        feature_importances=forest.feature_importances_,
        **dataset_options)

    return forest, reported_test_size, prop_main_class, accuracy
Ejemplo n.º 3
0
def xgboost_k_grid_search_cv(
        k=4,
        param_grid=None,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2):
    """
    Tune an XGBoost classifier with a 3-fold grid search on k-mer data.

    The train/test split comes from ``ETL_k_mer``. The best estimator found by
    ``GridSearchCV`` predicts the test split, and the results are passed to
    ``main_stats_model`` with ``save_csv=True`` and ``save_model=True``.

    :param k: k-mer size; forwarded to the ETL step and used in the model name.
    :param param_grid: grid handed to ``GridSearchCV``. When ``None`` a
        single-point grid mirroring the fixed settings of ``xgboost_k_default``
        (``max_depth=3``, ``n_estimators=100``, learning rate ``0.3``) is used,
        because ``GridSearchCV`` does not accept ``None``.
    :param sequence_origin: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param primers_origin: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param taxonomy_level: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param selected_primer: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param model_preprocessing: description template; ``{}`` is replaced by ``k``.
    :param test_size: forwarded to ``main_stats_model`` only.
    :return: the ``(test_size, prop_main_class, accuracy)`` triple produced by
        ``main_stats_model``.
    """
    model_preprocessing = model_preprocessing.format(k)

    if param_grid is None:
        # GridSearchCV rejects param_grid=None, so the default call used to
        # fail after the (expensive) ETL step. Fall back to a one-point grid.
        param_grid = {
            'max_depth': [3],
            'n_estimators': [100],
            'learning_rate': [0.3],
        }

    X_train, X_test, y_train, y_test = ETL_k_mer(
        k=k,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer)

    XGB = XGBClassifier()

    grid_search = GridSearchCV(estimator=XGB,
                               param_grid=param_grid,
                               cv=3,
                               n_jobs=8,
                               verbose=2)
    grid_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ has already been retrained
    # on the whole training set, so it can predict without another fit.
    XGB_opt = grid_search.best_estimator_
    y_pred = XGB_opt.predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='XGB_CV_{}'.format(k),
        model_parameters=grid_search.best_params_,
        model_preprocessing=model_preprocessing,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer,
        test_size=test_size,
        feature_importances=XGB_opt.feature_importances_,
        k=k,
        save_csv=True,
        xgb_model=XGB_opt,
        save_model=True)

    return test_size, prop_main_class, accuracy
Ejemplo n.º 4
0
def xgboost_k_default(
        k=4,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2):
    """
    Fit an XGBoost classifier with fixed hyper-parameters on k-mer data.

    The train/test split comes from ``ETL_k_mer``. The fitted model predicts the
    test split and is passed, with its feature importances, to
    ``main_stats_model`` (called with ``save_model=True`` and ``save_tree=20``).

    :param k: k-mer size; forwarded to the ETL step and used in the model name.
    :param sequence_origin: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param primers_origin: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param taxonomy_level: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param selected_primer: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param model_preprocessing: description template; ``{}`` is replaced by ``k``.
    :param test_size: forwarded to ``main_stats_model`` only.
    :return: the ``(test_size, prop_main_class, accuracy)`` triple produced by
        ``main_stats_model``.
    """
    preprocessing_label = model_preprocessing.format(k)

    # The same four dataset selectors are used for loading and for reporting.
    dataset_options = dict(sequence_origin=sequence_origin,
                           primers_origin=primers_origin,
                           taxonomy_level=taxonomy_level,
                           selected_primer=selected_primer)
    X_train, X_test, y_train, y_test = ETL_k_mer(k=k, **dataset_options)

    booster = XGBClassifier(silent=0, eta=0.3, max_depth=3, n_estimators=100)
    booster.fit(X_train, y_train)
    y_pred = booster.predict(X_test)

    reported_test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='XGB_{}'.format(k),
        model_parameters=booster.get_params(),
        model_preprocessing=preprocessing_label,
        test_size=test_size,
        k=k,
        feature_importances=booster.feature_importances_,
        xgb_model=booster,
        save_model=True,
        save_tree=20,
        **dataset_options)

    # Drop the references to the model and data before returning.
    del booster, X_train, X_test, y_train, y_test, y_pred

    return reported_test_size, prop_main_class, accuracy
Ejemplo n.º 5
0
def random_forest_k_grid_search_cv(
        k=5,
        param_grid=None,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2):
    """
    Tune a Random Forest classifier with a 3-fold grid search on k-mer data.

    The train/test split comes from ``ETL_RF_k_mer``. The best estimator found
    by ``GridSearchCV`` predicts the test split, and the results are passed to
    ``main_stats_model`` with ``save_csv=True`` and ``save_model=True``.

    :param k: k-mer size; forwarded to the ETL step and used in the model name.
    :param param_grid: grid handed to ``GridSearchCV``. When ``None`` a
        single-point default grid is built (see the comments in the body).
    :param sequence_origin: forwarded to ``ETL_RF_k_mer`` and ``main_stats_model``.
    :param primers_origin: forwarded to ``ETL_RF_k_mer`` and ``main_stats_model``.
    :param taxonomy_level: forwarded to ``ETL_RF_k_mer`` and ``main_stats_model``.
    :param selected_primer: forwarded to ``ETL_RF_k_mer`` and ``main_stats_model``.
    :param model_preprocessing: description template; ``{}`` is replaced by ``k``.
    :param test_size: forwarded to ``main_stats_model`` only.
    :return: the ``(test_size, prop_main_class, accuracy)`` triple produced by
        ``main_stats_model``.
    """
    model_preprocessing = model_preprocessing.format(k)
    if param_grid is None:
        param_grid = {
            # Number of trees in the forest -- checked as often the best option.
            'n_estimators': [200],
            # Number of features considered at every split. 'sqrt' is what the
            # former 'auto' value meant for classifiers; 'auto' itself is no
            # longer accepted by recent scikit-learn releases.
            'max_features': ['sqrt'],
            # Maximum number of levels in a tree -- unlimited was checked as
            # best option (original note: memory errors may require capping
            # it at 30).
            'max_depth': [None],
            # Minimum number of samples required to split a node -- kept at 2
            # (instead of trying 2, 5, 10) because of unbalanced classes.
            'min_samples_split': [2],
            # Minimum number of samples required at each leaf -- kept at 1
            # (instead of trying 1, 2, 4) because of unbalanced classes.
            'min_samples_leaf': [1],
            # Whether each tree is trained on a bootstrap sample -- False was
            # checked as best option. The key must be spelled 'bootstrap',
            # otherwise GridSearchCV rejects it as an invalid parameter.
            'bootstrap': [False],
        }

    X_train, X_test, y_train, y_test = ETL_RF_k_mer(
        k=k,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer)

    RF = RandomForestClassifier()
    grid_search = GridSearchCV(estimator=RF,
                               param_grid=param_grid,
                               cv=3,
                               n_jobs=2,
                               verbose=1)
    grid_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ has already been retrained
    # on the whole training set, so it can predict without another fit.
    RF_opt = grid_search.best_estimator_
    y_pred = RF_opt.predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='RF_CV_{}'.format(k),
        model_parameters=grid_search.best_params_,
        model_preprocessing=model_preprocessing,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer,
        test_size=test_size,
        feature_importances=RF_opt.feature_importances_,
        save_model=True,
        rf_model=RF_opt,
        k=k,
        save_csv=True)

    return test_size, prop_main_class, accuracy