# Example #1 (score: 0)
def run_xgboost_classifier(load=False, model_no=1):
    """Train (or load) an XGBoost classifier on the RNASeq data and evaluate it.

    :param load: whether or not to load a pre-trained model
    :param model_no: if load, which model to load
    """
    data_RNASeq_labels = load_data_RNASeq(proc=False,
                                          label=False,
                                          raw_count=True)
    print(data_RNASeq_labels.iloc[1:3])
    # the 'gene' column is an identifier, not a feature
    data_RNASeq_labels = data_RNASeq_labels.drop(columns=['gene'])

    data_labels = data_RNASeq_labels['label']
    data_RNASeq = data_RNASeq_labels.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        data_RNASeq, data_labels)

    if load:
        print("\nload pre-trained no.%d model" % model_no)
        xgb_model = load_model(model_no)
    else:
        print("\ntraining a XGBoost classifier ...")
        xgb_model = xgb.XGBClassifier(min_child_weight=MIN_CHILD_WEIGHT,
                                      gamma=G,
                                      subsample=SUBSAMPLE,
                                      n_estimators=NO_TREES,
                                      max_depth=MAX_DEPTH)
        # model name records the hyperparameters used
        # (fixed typo: 'subample' -> 'subsample'; %s already stringifies)
        xgbt_name = "min_child_weight=%s,gamma=%s,subsample=%s,n_estimators=%s,max_depth=%s" % (
            MIN_CHILD_WEIGHT, G, SUBSAMPLE, NO_TREES, MAX_DEPTH)

        xgb_model.fit(X_train, y_train)

        print("\ntraining DONE. \n\nsaving the XGBoost classifier ...")
        save_model(xgb_model, xgbt_name)

    print("\ntesting the XGBoost classifier ...")
    y_pred = xgb_model.predict(X_test)
    print(mean_squared_error(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    # show & save the top 50 important features
    show_important_feature(xgb_model, data_RNASeq, img=False)
    # draw the precision recall curve for the classifier
    draw_precision_recall_curve(y_test, y_pred)
def run_random_forest(load=False, model_no=1):
    """Train (or load) a Random Forest classifier on the RNASeq data and evaluate it.

    :param load: whether or not to load a pre-trained model
    :param model_no: if load, which model to load
    """
    data_RNASeq_labels = load_data_RNASeq(proc=False,
                                          label=False,
                                          raw_count=True)
    print(data_RNASeq_labels.iloc[1:3])
    data_RNASeq_labels = data_RNASeq_labels.drop(columns=['gene'])

    # separate the target column from the feature matrix
    data_labels = data_RNASeq_labels['label']
    data_RNASeq = data_RNASeq_labels.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        data_RNASeq, data_labels)

    if not load:
        print("\ntraining a Random Forest classifier ...")
        forest = RandomForestClassifier(n_estimators=NO_TREES,
                                        random_state=0,
                                        max_features=MAX_FEATURES,
                                        criterion=CRITERION,
                                        n_jobs=NO_JOBS)
        forest.fit(X_train, y_train)
        # name the saved model after its hyperparameters
        forest_name = "n_estimators=%s,max_features=%s,criterion=%s,n_jobs=%s" % (
            NO_TREES, MAX_FEATURES, CRITERION, NO_JOBS)
        print("\ntraining DONE.\n\nsaving the RF classifier ...")
        save_model(forest, forest_name)
    else:
        print("\nload pre-trained no.%d model" % model_no)
        forest = load_model(model_no)

    print("\ntesting the Random Forest classifier ...\n")
    y_pred = forest.predict(X_test)
    print("Accuracy on training set: %.3f" % forest.score(X_train, y_train))
    print("Accuracy on test set: %.3f" % forest.score(X_test, y_test))

    # show & save the top 50 important features
    show_important_feature(forest, data_RNASeq, img=False)
    # draw the precision recall curve for the classifier
    draw_precision_recall_curve(y_test, y_pred)
def run_gradient_boost(load=False, model_no=1):
    """Train (or load) a Gradient Boosting Tree classifier on the RNASeq data and evaluate it.

    :param load: whether or not to load a pre-trained model
    :param model_no: if load, which model to load
    """
    data_RNASeq_labels = load_data_RNASeq(proc=False,
                                          label=False,
                                          raw_count=True)
    print(data_RNASeq_labels.iloc[1:3])
    data_RNASeq_labels = data_RNASeq_labels.drop(columns=['gene'])

    # split target out of the feature matrix
    data_labels = data_RNASeq_labels['label']
    data_RNASeq = data_RNASeq_labels.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        data_RNASeq, data_labels)

    if not load:
        print("\ntraining a Gradient Boosting Tree classifier ...")
        gbrt = GradientBoostingClassifier(n_estimators=NO_TREES,
                                          random_state=0,
                                          max_features=MAX_FEATURES,
                                          max_depth=MAX_DEPTH,
                                          learning_rate=LEARNING_RATE)
        gbrt.fit(X_train, y_train)
        # name the saved model after its hyperparameters
        gbrt_name = "n_estimators=%s,max_features=%s,max_depth=%s,learning_rate=%s" % (
            str(NO_TREES), MAX_FEATURES, str(MAX_DEPTH), str(LEARNING_RATE))
        print("\ntraining DONE.\n\nsaving the GB classifier ...")
        save_model(gbrt, gbrt_name)
    else:
        print("\nload pre-trained no.%d model" % model_no)
        gbrt = load_model(model_no)

    print("\ntesting the Gradient Boosting Tree classifier ...\n")
    y_pred = gbrt.predict(X_test)
    print("Accuracy on training set: %.3f" % gbrt.score(X_train, y_train))
    print("Accuracy on test set: %.3f" % gbrt.score(X_test, y_test))

    # show & save the top 50 important features
    show_important_feature(gbrt, data_RNASeq, img=False)
    # draw the precision recall curve for the classifier
    draw_precision_recall_curve(y_test, y_pred)
def tune_hyperparameters():
    """Grid-search hyperparameters for the Gradient Boosting Tree classifier.

    Writes the best parameter set and its CV score to
    ./results/best_params_gradient_boost.txt.
    """
    # load the data
    data_RNASeq_labels = load_data_RNASeq(proc=False,
                                          label=False,
                                          raw_count=True)
    print(data_RNASeq_labels.iloc[1:3])
    data_RNASeq_labels = data_RNASeq_labels.drop(columns=['gene'])

    data_labels = data_RNASeq_labels['label']
    data_RNASeq = data_RNASeq_labels.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        data_RNASeq, data_labels)

    parameters = {
        "n_estimators": [200, 400, 500, 600, 800],
        "max_features": ['log2', 'sqrt'],
        "max_depth": [3, 5, 8],
        "learning_rate": [0.05, 0.1, 0.2]
    }

    print(
        "\nrunning the Grid Search for Gradient Boosting Tree classifier ...")
    clf = GridSearchCV(GradientBoostingClassifier(),
                       parameters,
                       cv=2,
                       n_jobs=NO_JOBS,
                       verbose=10)

    clf.fit(X_train, y_train)
    # training-set accuracy of the refit best estimator
    print(clf.score(X_train, y_train))
    print(clf.best_params_)

    # save tuned hyperparameters
    # (fixed: params and score were concatenated with no separator,
    # making the results file unreadable)
    with smart_open("./results/best_params_gradient_boost.txt",
                    'w',
                    encoding='utf-8') as f:
        f.write(str(clf.best_params_) + "\n" + str(clf.best_score_) + "\n")
    print("\nbest hyperparameters for GBRT has been written to file.")
# Example #5 (score: 0)
def tune_hyperparameters():
    """Grid-search hyperparameters for the XGBoost classifier.

    Writes the best parameter set and its CV score to
    ./results/best_params_xgboost.txt.
    """
    # load the data
    data_RNASeq_labels = load_data_RNASeq(proc=False,
                                          label=False,
                                          raw_count=True)
    print(data_RNASeq_labels.iloc[1:3])
    data_RNASeq_labels = data_RNASeq_labels.drop(columns=['gene'])

    # split the target column out of the feature matrix
    data_labels = data_RNASeq_labels['label']
    data_RNASeq = data_RNASeq_labels.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        data_RNASeq, data_labels)

    # candidate values for each XGBoost hyperparameter
    params = {
        "min_child_weight": [1, 5, 10],
        "gamma": [0.5, 1, 2],
        "subsample": [0.6, 0.8, 1.0],
        "max_depth": [3, 5],
        "n_estimators": [50, 200]
    }

    print("\nrunning the Grid Search for XGBoost classifier ...")
    searcher = GridSearchCV(xgb.XGBClassifier(),
                            params,
                            cv=2,
                            n_jobs=NO_JOBS,
                            verbose=10)
    searcher.fit(X_train, y_train)

    print(searcher.best_score_)
    print(searcher.best_estimator_)

    # save tuned hyperparameters
    with smart_open("./results/best_params_xgboost.txt", 'w',
                    encoding='utf-8') as f:
        f.write(str(searcher.best_params_) + str(searcher.best_score_))
    print("\nbest hyperparameters for XGBOOST has been written to file.")
def tune_hyperparameters():
    """Grid-search hyperparameters for the Random Forest classifier.

    Writes the best parameter set and its CV score to
    ./results/best_params_random_forest.txt.
    """
    # load the data
    data_RNASeq_labels = load_data_RNASeq(proc=False,
                                          label=False,
                                          raw_count=True)
    print(data_RNASeq_labels.iloc[1:3])
    data_RNASeq_labels = data_RNASeq_labels.drop(columns=['gene'])

    # split the target column out of the feature matrix
    data_labels = data_RNASeq_labels['label']
    data_RNASeq = data_RNASeq_labels.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        data_RNASeq, data_labels)

    # candidate values for each Random Forest hyperparameter
    search_space = {
        "n_estimators": [100, 200, 300, 400, 500],
        "max_features": [0.1, 0.2, 0.25, 0.3, 0.4, 0.5],
        "criterion": ["entropy"]
    }

    print("\nrunning the Grid Search for Random Forest classifier ...")
    searcher = GridSearchCV(RandomForestClassifier(),
                            search_space,
                            cv=2,
                            n_jobs=NO_JOBS,
                            verbose=10)
    searcher.fit(X_train, y_train)

    print(searcher.score(X_train, y_train))
    print(searcher.best_params_)

    # save tuned hyperparameters
    with smart_open("./results/best_params_random_forest.txt",
                    'w',
                    encoding='utf-8') as f:
        f.write(str(searcher.best_params_) + str(searcher.best_score_))
    print("\nbest hyperparameters for RF has been written to file.")
def survival_analysis_with_all_RNASeq(model_type):
    """Run survival analysis on every important gene signature of a model.

    :param model_type: which trained model produced the important-feature
        file: 'rf', 'gbrt', or 'xgbt'. Any other value prints an error and
        returns without doing anything.
    """
    data_RNASeq_labels = load_data_RNASeq()
    data_RNASeq_labels = data_RNASeq_labels.drop(columns=['gene'])

    # each model type stores its ranked important features in its own file
    # (the three loading branches were identical copies — collapsed here)
    feature_files = {
        'rf': IMPORTANT_FEATURE_RANDOM_FOREST,
        'gbrt': IMPORTANT_FEATURE_GRADIENT_BOOST,
        'xgbt': IMPORTANT_FEATURE_XGBOOST,
    }
    if model_type not in feature_files:
        print(
            "\nPlease indicate the type of model you have trained to produce important features"
        )
        # bail out: the original fell through and saved an empty p-value list
        return

    feature_list = []  # list of gene signatures as (index, name) pairs
    # file format: whitespace-separated columns, feature index in col 2
    # and feature name in col 3 — TODO confirm against the writer
    for line in smart_open(feature_files[model_type], 'r', encoding='utf-8'):
        fields = line.split()
        feature_list.append((fields[2], fields[3]))

    log_p_values = []
    for i in range(len(feature_list)):
        log_p_values.append(
            survival_analysis_with_one_RNASeq(model_type, data_RNASeq_labels,
                                              feature_list, i))

    print(log_p_values)
    save_log_p_values(model_type, log_p_values)