Example #1
    def evaluateModel(self, mdl):

        y_pred = mdl.predict(self.x_test)

        r_test = pearson(self.y_test, y_pred)
        rho_test = spearman(self.y_test, y_pred)
        rmse_test = rmse(self.y_test, y_pred)
        ci_test = ci(self.y_test, y_pred)
        auc_test = average_AUC(self.y_test, y_pred)

        y_pred_ext = mdl.predict(self.x_ext)

        r_ext = pearson(self.y_ext, y_pred_ext)
        rho_ext = spearman(self.y_ext, y_pred_ext)
        rmse_ext = rmse(self.y_ext, y_pred_ext)
        ci_ext = ci(self.y_ext, y_pred_ext)
        auc_ext = average_AUC(self.y_ext, y_pred_ext)

        print('Test Set Results')
        print(
            f'r_test: {r_test:.3f}, rho_test: {rho_test:.3f}, rmse_test: {rmse_test:.3f}, '
            f'ci_test: {ci_test:.3f}, auc_test: {auc_test:.3f}')

        print('Metz dataset results')
        print(
            f'r_ext: {r_ext:.3f}, rho_ext: {rho_ext:.3f}, rmse_ext: {rmse_ext:.3f}, '
            f'ci_ext: {ci_ext:.3f}, auc_ext: {auc_ext:.3f}')
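The metric helpers used throughout these examples (pearson, spearman, rmse, ci, average_AUC; a later snippet refers to them via an `ev` module) are not shown on this page. Below is a minimal sketch of plausible scipy/sklearn-based equivalents; the concordance-index loop and the pKd thresholds in average_AUC are assumptions, not the project's actual definitions.

# Hedged sketch of the evaluation helpers, assuming scipy/sklearn equivalents;
# the real module may differ (e.g. in the thresholds used by average_AUC).
import numpy as np
from scipy import stats
from sklearn.metrics import mean_squared_error, roc_auc_score

def pearson(y, f):
    return stats.pearsonr(y, f)[0]             # Pearson correlation coefficient

def spearman(y, f):
    return stats.spearmanr(y, f)[0]            # Spearman rank correlation

def rmse(y, f):
    return np.sqrt(mean_squared_error(y, f))   # root mean squared error

def ci(y, f):
    # Concordance index: fraction of correctly ordered pairs (ties count 0.5).
    y, f = np.asarray(y), np.asarray(f)
    num, den = 0.0, 0.0
    for i in range(len(y)):
        for j in range(i):
            if y[i] != y[j]:
                den += 1
                hi, lo = (i, j) if y[i] > y[j] else (j, i)
                num += (f[hi] > f[lo]) + 0.5 * (f[hi] == f[lo])
    return num / den if den else 0.0

def average_AUC(y, f, thresholds=np.arange(6.0, 8.1, 0.5)):
    # Assumed thresholds: binarize affinities at several pKd cutoffs and
    # average the resulting ROC AUCs, skipping degenerate cutoffs.
    aucs = [roc_auc_score(y > t, f) for t in thresholds
            if 0 < np.sum(y > t) < len(y)]
    return float(np.mean(aucs))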
Example #2
def external_set(external_data):
    """
    This function is used to evaluate the model on external data set
    
    Parameters
    ----------
    Features : np.array
        Protein and ligand features are concatenated and used as input file
    
    Output
    ------
        Prints model performance  on external set in various evaluation metrics
    """
    ext_data = np.load(external_data)
    x_ext = ext_data[:, :-1]
    y_ext = ext_data[:,-1:].ravel()
    model_name = MODEL_DIR+"/xgb.mdl"
    model = joblib.load(model_name)
    y_pred_ext = model.predict(x_ext)
    
    print('Evaluating the external set')
    
    PEARSON_R = pearson(y_ext, y_pred_ext)
    SPEARMAN_R = spearman(y_ext, y_pred_ext)
    RMSE = rmse(y_ext, y_pred_ext)
    Conc_Index = ci(y_ext,y_pred_ext)
    auc = average_AUC(y_ext, y_pred_ext)
    
    print("PEARSON_R {:0.3f}: ".format(PEARSON_R))
    print("SPEARMAN_R {:0.3f}: ".format(SPEARMAN_R))
    print("RMSE {:0.3f}: ".format(RMSE))
    print("Conc_Index {:0.3f}: ".format(Conc_Index))
    print("Avg_AUC {:0.3f}: ".format(auc))
Example #3
def train_test(dataset):
    """
    This function is used to trains the model by grid search method and evaluates the test set
    
    Parameters
    ----------
    Features : np.array
        Protein and ligand features are concatenated and used as input file
    
    Output
    ------
        Saves the best model and also prints its performance in various evaluation metrics
    """
    
    data_set = np.load(dataset)
    X = data_set[:, :-1]
    y = data_set[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
    
#    x_train = np.load('x_train.npy')
#    y_train = np.load('y_train.npy')
#    x_test = np.load('x_test.npy')
#    y_test = np.load('y_test.npy')
    
    param_grid = {'n_estimators':[1000, 500, 100], 'objective':['reg:linear'],
                  'colsample_bytree':[0.3, 0.6, 1.0], 'learning_rate':[0.1, 0.001, 0.005, 1.0],\
                  'subsample':[0.8, 1.0], 'max_depth':[3, 5, 10 ], 'alpha':[0,10], 'gamma':[0, 1, 5]}
    xgbr = xgb.XGBRegressor()
#    xgbr = RandomizedSearchCV(estimator = xgbr, param_distributions = param_grid, n_iter = 10, cv = 5)
    xgbr = GridSearchCV(estimator=xgbr, param_grid=param_grid, cv= 5)
    
    print('XGBoost model fitting started')
    
    xgbr = xgbr.fit(x_train, y_train)
    print(xgbr.best_params_)
    best_model = xgbr.best_estimator_
    model_name = MODEL_DIR+"/xgb.mdl"
    joblib.dump(best_model, model_name)
    y_pred = best_model.predict(x_test)
    
    print('XGBoost model is saved')
    
    PEARSON_R = pearson(y_test, y_pred)
    SPEARMAN_R = spearman(y_test, y_pred)
    RMSE = rmse(y_test, y_pred)
    Conc_Index = ci(y_test,y_pred)
    Avg_AUC = average_AUC(y_test, y_pred)
    
    print("PEARSON_R{:0.3f}: ".format(PEARSON_R))
    print("SPEARMAN_R{:0.3f}: ".format(SPEARMAN_R))
    print("RMSE{:0.3f}: ".format(RMSE))
    print("Conc_Index{:0.3f}".format(Conc_Index))
    print("Avg_AUC{:0.3f}".format(Avg_AUC))
Example #4
def train_test(dataset):
    """
    This function is used to trains the model by grid search method and evaluates the test set
    
    Parameters
    ----------
    Features : np.array
        Protein and ligand features are concatenated and used as input file
    
    Output
    ------
        Saves the best model and also prints its performance in various evaluation metrics
    """
    
    data_set = np.load(dataset)
    x = data_set[:, :-1]
    y = data_set[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)
#    x_train = np.load('x_train.npy')
#    y_train = np.load('y_train.npy')
#    x_test = np.load('x_test.npy')
#    y_test = np.load('y_test.npy')
    rfr = RandomForestRegressor(n_jobs=-1)
    param_grid={'n_estimators':[50,100,200,400,600,800], 'max_features': ['auto','sqrt','log2',None],\
                'min_samples_split':[2, 5, 10]}
    print "Starting model fitting"
    #rfr = RandomizedSearchCV(estimator=rfr, param_distributions=param_grid, n_iter=10, cv= 5)
    rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 5)
    
    rfr = rfr.fit(x_train, y_train)
    print(rfr.best_params_)
    best_model = rfr.best_estimator_
    model_name = MODEL_DIR+"/rfr.mdl"
    joblib.dump(best_model, model_name)
    y_pred = best_model.predict(x_test)
#    r2=r2_score(y_test, y_pred)
    
    print "RFR model is saved"
    
    PEARSON_R = pearson(y_test, y_pred)
    SPEARMAN_R = spearman(y_test, y_pred)
    RMSE = rmse(y_test, y_pred)
    Conc_Index = ci(y_test,y_pred)
    Avg_AUC = average_AUC(y_test, y_pred)
    
    print "PEARSON_R {:0.3f}: ".format(PEARSON_R)
    print "SPEARMAN_R {:0.3f}: ".format(SPEARMAN_R)
    print "RMSE {:0.3f}: ".format(RMSE)
    print "Conc_Index {:0.3f}: ".format(Conc_Index)
    print "Avg_AUC {:0.3f}: ".format(Avg_AUC)
Example #5
                                  random_state=1,
                                  verbose=False)
    model.fit(train_x, train_y)
    joblib.dump(model, os.path.join(MODEL_DIR, 'model_' + str(i + 1)))

    #train_predict = model.predict(train_x)
    #train_r2 = r2_score(y_pred=train_predict, y_true=train_y)
    test_predict = model.predict(test_x)
    #test_r2 = r2_score(y_pred=test_predict, y_true=test_y)
    #print("  TRAIN R2: {:.2}, TEST R2: {:.2}".format(train_r2, test_r2))

    RMSE = rmse(test_y, test_predict)
    rmse_list.append(RMSE)
    PEARSON = pearson(test_y, test_predict)
    pearson_list.append(PEARSON)
    SPEARMAN = spearman(test_y, test_predict)
    spearman_list.append(SPEARMAN)
    F1 = f1(test_y, test_predict)
    f1_list.append(F1)
    CI = ci(test_y, test_predict)
    ci_list.append(CI)
    AVG_AUC = average_AUC(test_y, test_predict)
    auc_list.append(AVG_AUC)

    print(
        "RMSE: {:.2f} PEARSON: {:.2f} SPEARMAN: {:.2f}, F1: {:.2f}, CI: {:.2f}, AVG AUC: {:.2f}"
        .format(RMSE, PEARSON, SPEARMAN, F1, CI, AVG_AUC))

print(
    "MEAN RMSE: {:.2}, PEARSON: {:.2}, SPEARMAN: {:.2}, F1: {:.2}, CI: {:.2}, AVG AUC: {:.2}"
    .format(np.mean(rmse_list), np.mean(pearson_list), np.mean(spearman_list),
            np.mean(f1_list), np.mean(ci_list), np.mean(auc_list)))
    def test_spearman(self):
        self.assertEqual(ev.spearman(self.actual, self.predicted),
                         0.37569026743498013)

args = parser.parse_args()

if __name__ == '__main__':
    
    
    if args.status == "VALIDATED":
        sub_df = pd.read_csv(args.submission_file)
        gs_df = pd.read_csv(args.goldstandard_file)
        combined_df = pd.merge(sub_df, gs_df, how='inner')
        actual = combined_df["pKd_[M]"]
        predicted = combined_df["pKd_[M]_pred"]
        
        rmse = ev.rmse(actual, predicted)
        spearman = ev.spearman(actual, predicted)
        average_auc = ev.average_AUC(actual, predicted)
        
        rounded_rmse = round(rmse, 3)
        rounded_spearman = round(spearman, 3)
        rounded_average_auc = round(average_auc, 3)
        
        result = {
            "prediction_file_status":"SCORED",
            "rmse": rmse,
            "spearman": spearman,
            "average_auc": average_auc,
            "rounded_rmse": rounded_rmse,
            "rounded_spearman": rounded_spearman,
            "rounded_average_auc": rounded_average_auc}
            
def get_scores(labels,
               predictions,
               validation_test,
               total_training_loss,
               total_validation_test_loss,
               epoch,
               fold_epoch_results,
               fold=None):
    score_dict = {
        "rm2": None,
        "CI (DEEPDTA)": None,
        "MSE": None,
        "RMSE": None,
        "Pearson": None,
        "Spearman": None,
        "CI (Challenge)": None,
        "Average AUC": None,
        "Precision 5.0": None,
        "Recall 5.0": None,
        "F1-Score 5.0": None,
        "Accuracy 5.0": None,
        "MCC 5.0": None,
        "Precision 6.0": None,
        "Recall 6.0": None,
        "F1-Score 6.0": None,
        "Accuracy 6.0": None,
        "MCC 6.0": None,
        "Precision 7.0": None,
        "Recall 7.0": None,
        "F1-Score 7.0": None,
        "Accuracy 7.0": None,
        "MCC 7.0": None,
    }

    score_dict = {
        "rm2": None,
        "CI (DEEPDTA)": None,
        "MSE": None,
        "RMSE": None,
        "Pearson": None,
        "Spearman": None,
        "CI (Challenge)": None,
        "Average AUC": None,
        "Precision 10uM": None,
        "Recall 10uM": None,
        "F1-Score 10uM": None,
        "Accuracy 10uM": None,
        "MCC 10uM": None,
        "Precision 1uM": None,
        "Recall 1uM": None,
        "F1-Score 1uM": None,
        "Accuracy 1uM": None,
        "MCC 1uM": None,
        "Precision 100nM": None,
        "Recall 100nM": None,
        "F1-Score 100nM": None,
        "Accuracy 100nM": None,
        "MCC 100nM": None,
        "Precision 30nM": None,
        "Recall 30nM": None,
        "F1-Score 30nM": None,
        "Accuracy 30nM": None,
        "MCC 30nM": None,
    }
    score_list = get_list_of_scores()

    score_dict["rm2"] = get_rm2(np.asarray(labels), np.asarray(predictions))
    score_dict["CI (DEEPDTA)"] = get_cindex(np.asarray(labels),
                                            np.asarray(predictions))
    score_dict["MSE"] = mse(np.asarray(labels), np.asarray(predictions))
    score_dict["RMSE"] = rmse(np.asarray(labels), np.asarray(predictions))
    score_dict["Pearson"] = pearson(np.asarray(labels),
                                    np.asarray(predictions))
    score_dict["Spearman"] = spearman(np.asarray(labels),
                                      np.asarray(predictions))
    score_dict["CI (Challenge)"] = ci(np.asarray(labels),
                                      np.asarray(predictions))
    score_dict["Average AUC"] = average_AUC(np.asarray(labels),
                                            np.asarray(predictions))

    prec_rec_f1_acc_mcc_threshold_dict = prec_rec_f1_acc_mcc(
        np.asarray(labels), np.asarray(predictions))
    for key in prec_rec_f1_acc_mcc_threshold_dict.keys():
        score_dict[key] = prec_rec_f1_acc_mcc_threshold_dict[key]
    """
    lst_calculated_scores = []
    for scr in score_list:
        lst_calculated_scores.append(score_dict[scr])
    """

    if fold is not None:
        fold_epoch_results[-1].append(score_dict)
        print("Fold:{}\tEpoch:{}\tTraining Loss:{}\t{} Loss:{}".format(
            fold + 1, epoch, total_training_loss, validation_test,
            total_validation_test_loss))
    else:
        fold_epoch_results.append(score_dict)
        print("Epoch:{}\tTraining Loss:{}\t{} Loss:{}".format(
            epoch, total_training_loss, validation_test,
            total_validation_test_loss))
    for scr in score_list:
        print("{} {}:\t{}".format(validation_test, scr, score_dict[scr]))
    """
    print("{} RM2:\t{}".format(validation_test, deep_dta_rm2))
    print("{} MSE\t{}".format(validation_test, deep_dta_mse))
    print("{} RMSE\t{}".format(validation_test, rmse_score))
    print("{} c-index\t{}".format(validation_test, deep_dta_cindex))
    print("{} Pearson:\t{}".format(validation_test, pearson_score))
    print("{} Spearman:\t{}".format(validation_test, spearman_score))
    print("{} Ci:\t{}".format(validation_test, ci_score))
    print("{} Average_AUC:\t{}".format(validation_test, ave_auc_score))

    for key in prec_rec_f1_acc_mcc_threshold_dict.keys():
        
    """
    return score_dict
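A hypothetical call with synthetic affinities, assuming get_scores and its helper functions (get_rm2, get_cindex, prec_rec_f1_acc_mcc, get_list_of_scores, mse, rmse, pearson, spearman, ci, average_AUC) are importable from the same module; the loss values and array sizes are made up for illustration:

# Hypothetical usage; all numeric values below are synthetic.
import numpy as np

fold_epoch_results = []                                    # one score_dict appended per epoch
labels = np.random.uniform(4.0, 9.0, size=256)             # e.g. pKd affinities
predictions = labels + np.random.normal(0.0, 0.5, 256)     # noisy predictions
scores = get_scores(labels, predictions, "Validation",
                    total_training_loss=12.3,
                    total_validation_test_loss=4.5,
                    epoch=1,
                    fold_epoch_results=fold_epoch_results)
print(scores["RMSE"], scores["Spearman"])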