Example #1
def print_metrics(model, train_dataset, test_dataset, train_result):
    model.train(False)

    # Column 0 of dataset.data is the label; the remaining columns are features.
    test_preds = train_utils.get_preds(test_dataset.data[:, 1:], model)
    test_AUC = train_utils.compute_AUC(test_dataset.data[:, :1], test_preds)
    test_PRAUC = train_utils.compute_PRAUC(test_dataset.data[:, :1], test_preds)

    test_accuracy = train_utils.compute_accuracy(test_dataset.data[:, :1], test_preds)

    test_TP, test_TN, test_FN, test_FP = train_utils.compute_confusion(test_dataset.data[:, :1], test_preds)


    # Train-split metrics are estimated on the first 1,000 rows only.
    train_preds = train_utils.get_preds(train_dataset.data[:1000, 1:], model)
    train_AUC = train_utils.compute_AUC(train_dataset.data[:1000, :1], train_preds)
    train_PRAUC = train_utils.compute_PRAUC(train_dataset.data[:1000, :1], train_preds)

    train_accuracy = train_utils.compute_accuracy(train_dataset.data[:1000, :1], train_preds)

    train_result.test_AUC_list.append("%.04f" % test_AUC)
    train_result.test_PRAUC_list.append("%.04f" % test_PRAUC)
    train_result.test_accuracy_list.append("%.04f" % test_accuracy)

    train_result.test_TP_list.append("%.04f" % test_TP)
    train_result.test_TN_list.append("%.04f" % test_TN)
    train_result.test_FN_list.append("%.04f" % test_FN)
    train_result.test_FP_list.append("%.04f" % test_FP)


    return train_AUC, test_AUC, test_PRAUC, train_accuracy, test_accuracy, test_preds, test_TP, test_TN, test_FN, test_FP
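train_utils is the repository's helper module and compute_confusion is not shown on this page; below is a minimal sketch of one plausible implementation, assuming probability predictions thresholded at 0.5. Note that sklearn's raveled binary confusion matrix comes back in (TN, FP, FN, TP) order, which the sketch reorders to match the (TP, TN, FN, FP) unpacking used above.

import numpy as np
from sklearn.metrics import confusion_matrix

def compute_confusion(target, preds, threshold=0.5):
    # Hypothetical stand-in for train_utils.compute_confusion: `target` is an
    # (N, 1) column of 0/1 labels, `preds` holds predicted probabilities.
    y_true = np.asarray(target).ravel().astype(int)
    y_pred = (np.asarray(preds).ravel() >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tp, tn, fn, fp  # reorder to the (TP, TN, FN, FP) convention above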
Example #2
def graph():

    test_dataset1 = Dataset(split=9, fold=10, phase="test", filename="top10_no_space.csv", use_data_dropout=False)
    test_dataset2 = Dataset(split=9, fold=10, phase="test", filename="top20_no_space.csv", use_data_dropout=False)
    test_dataset3 = Dataset(split=9, fold=10, phase="test", filename="emr_no_space.csv", use_data_dropout=False)
    test_dataset4 = Dataset(split=9, fold=10, phase="test", filename="medical_data_6_no_space.csv", use_data_dropout=False)


    loadpath1 = "/content/drive/My Drive/research/frontiers/checkpoints/graph/top10.pt"
    loadpath2 = "/content/drive/My Drive/research/frontiers/checkpoints/graph/top20.pt"
    loadpath3 = "/content/drive/My Drive/research/frontiers/checkpoints/graph/emr.pt"
    loadpath4 = "/content/drive/My Drive/research/frontiers/checkpoints/graph/medical_data_6.pt"


    model1 = torch.load(loadpath1)
    model2 = torch.load(loadpath2)
    model3 = torch.load(loadpath3)
    model4 = torch.load(loadpath4)

    model1.eval()
    model2.eval()
    model3.eval()
    model4.eval()

    pred1 = train_utils.get_preds(test_dataset1.data[:, 1:], model1)
    pred2 = train_utils.get_preds(test_dataset2.data[:, 1:], model2)
    pred3 = train_utils.get_preds(test_dataset3.data[:, 1:], model3)
    pred4 = train_utils.get_preds(test_dataset4.data[:, 1:], model4)

    f, axes = plt.subplots(1, 1, sharex=True, sharey=True)
    f.set_size_inches((8, 8))

    from sklearn.metrics import roc_curve

    fpr1, tpr1, _ = roc_curve(test_dataset1.data[:, :1], pred1)
    fpr2, tpr2, _ = roc_curve(test_dataset2.data[:, :1], pred2)
    fpr3, tpr3, _ = roc_curve(test_dataset3.data[:, :1], pred3)
    fpr4, tpr4, _ = roc_curve(test_dataset4.data[:, :1], pred4)

    axes.plot(fpr4, tpr4, label='All variables (AUC 0.870)', color='r')
    axes.plot(fpr2, tpr2, label='Top 20 variables (AUC 0.856)', color='darkgoldenrod')
    axes.plot(fpr1, tpr1, label='Top 10 variables (AUC 0.836)', color='forestgreen')
    axes.plot(fpr3, tpr3, label='Variables related to medical record \n(AUC 0.839)', color='darkviolet')

    axes.plot([0, 1], [0, 1], linestyle="--", color='k')
    axes.set_title("Receiver Operating Characteristic Curve", fontsize=14)
    axes.set_xlabel('False positive rate', fontsize=12)
    axes.set_ylabel('True positive rate', fontsize=12)
    axes.legend(loc="lower right", fontsize=12)

    savepath = "/content/drive/My Drive/research/frontiers/checkpoints/graph/result.tiff"

    plt.savefig(savepath, dpi=300)
    plt.show()
    print(savepath)
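The four checkpoints above are whole pickled module objects, so torch.load only succeeds when the defining model classes are importable at load time. A sketch of the more portable state_dict pattern, assuming the get_classifier_model factory used in Examples #4-#6 (its arguments here are placeholders):

import torch

# Hypothetical: persist only the weights, then rebuild the architecture and
# load them back instead of unpickling the full module object.
torch.save(model1.state_dict(), loadpath1)

model1 = get_classifier_model(model_name, feature_size, nchs, activation)
model1.load_state_dict(torch.load(loadpath1))
model1.eval()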
Example #3
def compute_contributing_variables(model, test_dataset):
    print("Evaluating contributing variables")
    model.train(False)
    variable_by_column = np.load("../datasets/sleep1_no_space_columnnames.npy")
    #variable_by_column = np.array([v.replace("HE_ast", "HE_alt") for v in variable_by_column])
    assert variable_by_column.shape[0] == test_dataset.data.shape[1] - 1
    variables = np.unique(variable_by_column)
    AUCs = []
    print("Computing variable contributions")
    print(variables)
    for variable in variables:
        # Zero out every column that encodes this variable, then re-measure
        # AUC; the larger the drop, the more the variable contributed.
        corresponding_indices = (variable_by_column == variable)
        #print("zeroing %s" % str(np.where(corresponding_indices)))
        val_data = test_dataset.data[:, 1:].copy()
        val_data[:, corresponding_indices] = 0.0
        #print((val_data[:, :17] ** 2).mean())
        #val_data = val_data * len(variables) / (len(variables) - 1)
        preds = train_utils.get_preds(val_data, model)
        target = test_dataset.data[:, :1]
        test_AUC = train_utils.compute_AUC(target, preds)
        print("%s %f" % (variable, test_AUC))
        AUCs.append(test_AUC)

    # Sort ascending by post-ablation AUC: the variables whose removal hurts
    # the model most (the strongest contributors) come first.
    sorting_indices = np.argsort(AUCs)
    sorted_variables = [variables[i] for i in sorting_indices]
    sorted_AUCs = [AUCs[i] for i in sorting_indices]

    sorted_pairs = list(zip(sorted_variables, sorted_AUCs))
    for i, (v, auc) in enumerate(sorted_pairs[:20]):
        print("%03d: %s %f" % (i, v, auc))
        
    return list(zip(variables, AUCs))
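A minimal usage sketch, assuming the model and test_dataset objects from the other examples: because each variable's columns are zeroed before re-scoring, the lowest post-ablation AUCs mark the strongest contributors.

# Hypothetical usage: rank variables by how much zeroing them hurts test AUC.
pairs = compute_contributing_variables(model, test_dataset)
for name, auc in sorted(pairs, key=lambda p: p[1])[:10]:
    print("%s: AUC with the variable zeroed = %.4f" % (name, auc))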
Example #4
def train_logisticregression(info: TrainInformation, split, fold):
    """Run training and testing for the given split."""
    bs = info.BS
    init_lr = info.INIT_LR
    lr_decay = info.LR_DECAY
    momentum = info.MOMENTUM
    weight_decay = info.WEIGHT_DECAY
    optimizer_method = info.OPTIMIZER_METHOD
    epoch = info.EPOCH
    nchs = info.NCHS
    filename = info.FILENAME
    model_name = info.MODEL_NAME
    exp_name = info.NAME

    print("Using File {}".format(filename))

    train_dataset = Dataset(split=split,
                            fold=fold,
                            phase="train",
                            filename=filename,
                            use_data_dropout=info.USE_DATA_DROPOUT)
    # val_dataset = Dataset(split=split, fold=fold, phase="val", filename=filename)
    test_dataset = Dataset(split=split,
                           fold=fold,
                           phase="test",
                           filename=filename,
                           use_data_dropout=False)

    import sklearn.linear_model
    from imblearn.over_sampling import SMOTE

    # Balance the classes of the training split with SMOTE before fitting.
    smote = SMOTE(random_state=101)
    features, label = smote.fit_resample(train_dataset.train_data[:, 1:],
                                         train_dataset.train_data[:, :1].ravel())

    regressor = sklearn.linear_model.LogisticRegression()
    regressor.fit(features, label)
    preds = regressor.predict_proba(test_dataset.data[:, 1:])[:, 1]
    auc = train_utils.compute_AUC(test_dataset.data[:, :1], preds)
    print(auc)
    savepath = "/content/drive/My Drive/research/frontiers/checkpoints/logistic_regression/split_%02d.png" % split
    os.makedirs(os.path.dirname(savepath), exist_ok=True)
    # train_utils.plot_AUC_v2(preds, test_dataset.data[:, :1], savepath=savepath)

    model = get_classifier_model(model_name, train_dataset.feature_size, nchs,
                                 info.ACTIVATION)
    savedir = "/content/drive/My Drive/research/frontiers/checkpoints/%s" % exp_name
    best_test_epoch = 25
    loadpath = "%s/epoch_%04d_fold_%02d.pt" % (savedir, best_test_epoch,
                                               train_dataset.split)
    # model.load_state_dict(torch.load(savepath))
    model = torch.load(loadpath)
    model.eval()

    test_preds = train_utils.get_preds(test_dataset.data[:, 1:], model)
    train_utils.plot_AUC_v2([('Deep Neural Network', test_preds),
                             ('Logistic Regression', preds)],
                            test_dataset.data[:, :1],
                            savepath=savepath)
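plot_AUC_v2 is defined elsewhere in the repository; below is a minimal sketch with the same call shape, assuming numpy inputs and that it overlays one ROC curve per (name, predictions) pair against a shared 0/1 label column.

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

def plot_AUC_v2(named_preds, target, savepath=None):
    # Hypothetical stand-in: draw one ROC curve per (name, preds) pair.
    fig, ax = plt.subplots(figsize=(8, 8))
    y_true = target.ravel()
    for name, preds in named_preds:
        fpr, tpr, _ = roc_curve(y_true, preds)
        ax.plot(fpr, tpr, label="%s (AUC %.3f)" % (name, roc_auc_score(y_true, preds)))
    ax.plot([0, 1], [0, 1], linestyle="--", color="k")
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.legend(loc="lower right")
    if savepath is not None:
        fig.savefig(savepath, dpi=300)
    plt.show()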
Example #5
def train(info: TrainInformation, split, fold):
    """주어진 split에 대한 학습과 테스트를 진행한다."""
    bs = info.BS
    init_lr = info.INIT_LR
    lr_decay = info.LR_DECAY
    momentum = info.MOMENTUM
    weight_decay = info.WEIGHT_DECAY
    optimizer_method = info.OPTIMIZER_METHOD
    epoch = info.EPOCH
    nchs = info.NCHS
    filename = info.FILENAME
    model_name = info.MODEL_NAME
    exp_name = info.NAME

    print("Using File {}".format(filename))

    train_dataset = Dataset(split=split, fold=fold, phase="train", filename=filename, use_data_dropout=info.USE_DATA_DROPOUT)
    #val_dataset = Dataset(split=split, fold=fold, phase="val", filename=filename)
    test_dataset = Dataset(split=split, fold=fold, phase="test", filename=filename, use_data_dropout=False)

    model = get_classifier_model(model_name, train_dataset.feature_size, nchs, info.ACTIVATION)
    

    print(model)

    # Configure the optimizer
    optimizer = set_optimizer(
        optimizer_method, model, init_lr, weight_decay, momentum=momentum
    )

    data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=bs, shuffle=True, num_workers=0, drop_last=True
    )

    bce_loss = torch.nn.BCEWithLogitsLoss().cuda()
    train_result = TrainResult()
    train_result.set_sizes(
        len(train_dataset.data), 0, len(test_dataset.data)
    )

    for ep in range(epoch):
        global prev_plot
        prev_plot = 0  # reset the module-level state shared with train_step each epoch
        train_step(
            exp_name,
            ep,
            model,
            train_dataset,
            test_dataset,
            optimizer,
            init_lr,
            lr_decay,
            data_loader,
            bce_loss,
            train_result,
        )

    savedir = "/content/drive/My Drive/research/frontiers/checkpoints/%s" % exp_name
    best_test_epoch = train_result.best_test_epoch #25
    savepath = "%s/epoch_%04d_fold_%02d.pt" % (savedir, best_test_epoch, train_dataset.split)
    #model.load_state_dict(torch.load(savepath))
    model = torch.load(savepath)
    model.eval()

    test_preds = train_utils.get_preds(test_dataset.data[:, 1:], model)
    test_AUC = train_utils.compute_AUC(test_dataset.data[:, :1], test_preds)
    test_PRAUC = train_utils.compute_PRAUC(test_dataset.data[:, :1], test_preds)

    train_utils.plot_AUC(test_dataset, test_preds, test_AUC, savepath=savepath.replace(".pt", "_AUC.tiff"))

    contributing_variables = compute_contributing_variables(model, test_dataset)
    with open(os.path.join(savedir, "contributing_variables_epoch_%04d_fold_%02d.txt" % (best_test_epoch, train_dataset.split)), "w") as f:
        for (v, auc) in contributing_variables:
            f.write("%s %f\n" % (v, auc))

    
    info.split_index = split
    info.result_dict = train_result
    info.save_result()
    return train_result
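train_utils.compute_AUC and compute_PRAUC are used throughout but not shown on this page; a minimal sketch, assuming they wrap sklearn's rank-based scorers:

from sklearn.metrics import average_precision_score, roc_auc_score

def compute_AUC(target, preds):
    # Hypothetical stand-in: ravel the (N, 1) label column, score with ROC AUC.
    return roc_auc_score(target.ravel(), preds)

def compute_PRAUC(target, preds):
    # Hypothetical stand-in: average precision is a common PR-AUC estimate.
    return average_precision_score(target.ravel(), preds)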
Example #6
def train_ml_compare(info: TrainInformation, split, fold):
    """Run training and testing for the given split."""
    bs = info.BS
    init_lr = info.INIT_LR
    lr_decay = info.LR_DECAY
    momentum = info.MOMENTUM
    weight_decay = info.WEIGHT_DECAY
    optimizer_method = info.OPTIMIZER_METHOD
    epoch = info.EPOCH
    nchs = info.NCHS
    filename = info.FILENAME
    model_name = info.MODEL_NAME
    exp_name = info.NAME

    print("Using File {}".format(filename))

    train_dataset = Dataset(split=split, fold=fold, phase="train", filename=filename, use_data_dropout=info.USE_DATA_DROPOUT)
    #val_dataset = Dataset(split=split, fold=fold, phase="val", filename=filename)
    test_dataset = Dataset(split=split, fold=fold, phase="test", filename=filename, use_data_dropout=False)

    train_input = train_dataset.train_data[:, 1:]
    train_label = train_dataset.train_data[:, :1].ravel()

    # Logistic regression #####################

    import sklearn.linear_model
    from sklearn.metrics import confusion_matrix

    regressor = sklearn.linear_model.LogisticRegression()
    regressor.fit(train_input, train_label)
    preds_regressor = regressor.predict_proba(test_dataset.data[:, 1:])[:, 1]
    auc_regressor = train_utils.compute_AUC(test_dataset.data[:, :1], preds_regressor)
    # sklearn's raveled binary confusion matrix is ordered (TN, FP, FN, TP).
    TN, FP, FN, TP = confusion_matrix(test_dataset.data[:, :1], regressor.predict(test_dataset.data[:, 1:])).ravel()
    print(f'auc_regressor is {auc_regressor}')
    print("logistic regression TP, TN, FN, FP : {}, {}, {}, {}".format( TP, TN, FN, FP))
    ###########################################


    # Random forest ###########################

    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier()
    forest.fit(train_input, train_label)
    preds_forest = forest.predict_proba(test_dataset.data[:, 1:])[:, 1]
    auc_forest = train_utils.compute_AUC(test_dataset.data[:, :1], preds_forest)
    TN, FP, FN, TP = confusion_matrix(test_dataset.data[:, :1], forest.predict(test_dataset.data[:, 1:])).ravel()
    print(f'auc_forest is {auc_forest}')
    print("random forest TP, TN, FN, FP : {}, {}, {}, {}".format(TP, TN, FN, FP))

    ###########################################


    # Linear SVC ##############################

    from sklearn.svm import LinearSVC

    svc = LinearSVC()
    svc.fit(train_input, train_label)
    # Min-max scale the decision scores to [0, 1]; AUC is rank-based, so this
    # monotonic rescaling does not change it.
    Y = svc.decision_function(test_dataset.data[:, 1:])
    preds_svc = (Y - Y.min()) / (Y.max() - Y.min())
    TN, FP, FN, TP = confusion_matrix(test_dataset.data[:, :1], svc.predict(test_dataset.data[:, 1:])).ravel()
    auc_svc = train_utils.compute_AUC(test_dataset.data[:, :1], preds_svc)
    print(f'auc_svc is {auc_svc}')
    print("svc TP, TN, FN, FP : {}, {}, {}, {}".format(TP, TN, FN, FP))

    ###########################################

    # K-nearest neighbors #####################

    from sklearn.neighbors import KNeighborsClassifier

    kneighbors = KNeighborsClassifier()
    kneighbors.fit(train_input, train_label)
    preds_kneighbors = kneighbors.predict_proba(test_dataset.data[:, 1:])[:, 1]
    auc_kneighbors = train_utils.compute_AUC(test_dataset.data[:, :1], preds_kneighbors)
    TN, FP, FN, TP = confusion_matrix(test_dataset.data[:, :1], kneighbors.predict(test_dataset.data[:, 1:])).ravel()
    print(f'auc_kneighbors is {auc_kneighbors}')
    print("kneighbors TP, TN, FN, FP : {}, {}, {}, {}".format(TP, TN, FN, FP))

    ###########################################

    savepath = "/content/drive/My Drive/research/frontiers/checkpoints/ml_compare/split_%02d.tiff" % split
    os.makedirs(os.path.dirname(savepath), exist_ok=True)

    model = get_classifier_model(model_name, train_dataset.feature_size, nchs, info.ACTIVATION)
    savedir = "/content/drive/My Drive/research/frontiers/checkpoints/%s" % exp_name
    best_test_epoch = 25 # train_result.best_test_epoch
    loadpath = "%s/epoch_%04d_fold_%02d.pt" % (savedir, best_test_epoch, train_dataset.split)
    #model.load_state_dict(torch.load(savepath))
    model = torch.load(loadpath)
    model.eval()

    test_preds = train_utils.get_preds(test_dataset.data[:, 1:], model)
    train_utils.plot_AUC_v2(
        [('Deep learning (AUC 0.870)', test_preds),
         ('Logistic regression (AUC 0.858)', preds_regressor),
         ('Linear SVM (AUC 0.849)', preds_svc),
         ('Random forest classifier (AUC 0.810)', preds_forest),
         ('K-nearest neighbors (AUC 0.740)', preds_kneighbors)],
        test_dataset.data[:, :1],
        savepath=savepath)
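A minimal driver sketch, assuming a 10-fold protocol like the split=9, fold=10 datasets in Example #2 and that TrainInformation can be constructed with the fields read at the top of each function:

# Hypothetical driver: compare the baselines on every split of a 10-fold run.
info = TrainInformation()  # assumed to expose BS, INIT_LR, FILENAME, NAME, ...
for split in range(10):
    train_ml_compare(info, split=split, fold=10)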