Example #1
0
def main():
    """Train AdaBoost on upsampled data and print confusion matrices.

    Options come from ``util.parse_args()`` and the data from
    ``util.data_load(opts.dataset)``.  For every upsample level ``t`` in
    ``[start, n]`` the data is upsampled, split 70/30 (fixed seed),
    normalized and fitted with a 100-estimator ``AdaBoostClassifier``.

    * Without a usable threshold (``opts.threshold`` missing or below 0.40)
      the confusion matrix of ``clf.predict`` is printed.
    * Otherwise decision thresholds from 0.4 up to ``opts.threshold`` are
      swept in steps of 0.005 over the positive-class probability; each
      threshold and its confusion matrix are printed and the collected
      matrices are handed to ``util.get_roc_curve`` before the plot is shown.

    Raises:
        SystemExit: if the requested upsample start exceeds the upsample end.
    """
    opts = util.parse_args()
    X, y = util.data_load(opts.dataset)

    n = opts.upsamplen if opts.upsamplen is not None else 1
    # With no explicit start only the final level ``n`` is evaluated;
    # otherwise honour the requested start (it used to be forced to 1).
    start = n if opts.upsamplestart is None else opts.upsamplestart
    if start > n:
        # Exit with a message so the failure yields a non-zero status.
        sys.exit("Upsample start should not be larger than end")

    # Thresholds below 0.40 are treated the same as "no threshold given".
    thresh = None
    if opts.threshold is not None and opts.threshold >= 0.40:
        thresh = opts.threshold

    for t in np.arange(start, n + 1):
        needed = util.needed_n(X, y, t)
        temp_X, temp_y = util.upsample(X, y, needed)
        X_train, X_test, y_train, y_test = train_test_split(temp_X,
                                                            temp_y,
                                                            test_size=0.3,
                                                            random_state=42)
        X_train, X_test = util.normalize(X_train, X_test)
        clf = AdaBoostClassifier(n_estimators=100, random_state=0)
        clf.fit(X_train, y_train)

        if thresh is None:
            predictions = clf.predict(X_test)
            print(confusion_matrix(y_test, predictions))
            continue

        # The probabilities do not depend on the threshold: compute them once.
        positive_proba = clf.predict_proba(X_test)[:, 1]
        conf_thresh = []
        for i in np.arange(0.4, thresh + 0.01, 0.005):
            predictions = (positive_proba >= i).astype(int)
            conf_mat = confusion_matrix(y_test, predictions)
            conf_thresh.append(conf_mat)
            print(i)
            print(conf_mat)
        util.get_roc_curve(conf_thresh, "Adaboost", "threshold")
        plt.show()
                                         steps=len(test_generator))

# <a name='5-1'></a>
# ### 5.1 ROC Curve and AUROC
# We'll cover the topic of model evaluation in much more detail in later weeks, but for now we'll walk through computing a metric called the AUC (Area Under the Curve) from the ROC ([Receiver Operating Characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)) curve. This is also referred to as the AUROC value; you will see all three terms used in reference to the technique, often almost interchangeably.
#
# For now, what you need to know in order to interpret the plot is that a curve that is more to the left and the top has more "area" under it, and indicates that the model is performing better.
#
# We will use the `util.get_roc_curve()` function which has been provided for you in `util.py`. Look through this function and note the use of the `sklearn` library functions to generate the ROC curves and AUROC values for our model.
#
# - [roc_curve](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html)
# - [roc_auc_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)

# In[27]:

# Call `util.get_roc_curve` (provided in `util.py`) with the labels, the
# predicted values and the test generator; its return value is kept in
# `auc_rocs`.
auc_rocs = util.get_roc_curve(labels, predicted_vals, test_generator)

# You can compare the performance to the AUCs reported in the original ChexNeXt paper in the table below:

# For reference, here's the AUC figure from the ChexNeXt paper which includes AUC values for their model as well as radiologists on this dataset:
#
# <img src="https://journals.plos.org/plosmedicine/article/figure/image?size=large&id=10.1371/journal.pmed.1002686.t001" width="80%">
#
# Note that the ChexNeXt method also takes advantage of a few other tricks, such as self-training and ensembling, which can give a significant boost to the performance.

# For details about the best performing methods and their performance on this dataset, we encourage you to read the following papers:
# - [CheXNet](https://arxiv.org/abs/1711.05225)
# - [CheXpert](https://arxiv.org/pdf/1901.07031.pdf)
# - [ChexNeXt](https://journals.plos.org/plosmedicine/article?id=10.1371/journal.pmed.1002686)

# <a name='5-2'></a>
Example #3
0
def main(weights_path: str = 'D:/material_science/pretrained_model.h5') -> None:
    """Run the evaluation pipeline: inspect data, plot class balance, predict.

    Steps, in order:

    1. Wrap the train/validation/test tables in ``Data_analysis``, run the
       leakage check of train against validation, and obtain the data frames
       from ``data_insight()``.
    2. Build the train/validation/test generators via ``image_preprocessing``
       (batch size 10, 320x320 target size).
    3. Plot the per-class mean of the training labels and a bar chart of the
       positive/negative/total frequencies returned by
       ``class_frequency_prediction``.
    4. Build the model with the class weights, load weights from
       ``weights_path``, predict on the test generator, write the predictions
       to ``predections.csv`` and compute the ROC curves.

    Args:
        weights_path: File holding the trained model weights.  Defaults to
            the path that was previously hard-coded.
    """
    train_data = Data_analysis(train, image_dir=train_image_dir)
    train_data.data_leakage(validation)
    train_df = train_data.data_insight()
    # Every column except 'Image' is a class label.  `columns=` replaces the
    # positional axis argument, which pandas 2.0 no longer accepts.
    label = train_df.drop(columns=['Image']).columns

    val_data = Data_analysis(validation, image_dir=val_image_dir)
    val_df = val_data.data_insight()

    test_data = Data_analysis(test, image_dir=test_image_dir)
    test_df = test_data.data_insight()

    X_2 = image_preprocessing(original_example=original_example,
                              image_dir=train_image_dir,
                              train_df=train_df,
                              valid_df=val_df,
                              test_df=test_df,
                              labels=label,
                              batch_size=10,
                              val_dir=val_image_dir,
                              test_dir=test_image_dir,
                              target_w=320,
                              target_h=320)
    train_generator = X_2.get_train_generator()
    y_true_train = train_generator.labels

    val_generator, test_generator = X_2.get_test_val_generator()
    y_true_val = val_generator.labels
    y_true_test = test_generator.labels

    positive_frequencies, negative_frequencies, w_p, w_n = class_frequency_prediction(
        y_true_train)

    # Mean of the training labels per class, drawn as horizontal bars.
    values = np.mean(y_true_train, axis=0)
    # x/y are passed by keyword: seaborn 0.12+ rejects them positionally.
    sn.barplot(x=values, y=label, order=label)
    plt.yticks(fontsize=13)
    plt.title("Frequency of Each Class", fontsize=14)
    plt.show()

    data = pd.DataFrame({
        'Class': label,
        "Positive_freq": positive_frequencies,
        "Negative_freq": negative_frequencies,
        "Total_freq": positive_frequencies + negative_frequencies,
    })
    data.plot.bar(x="Class",
                  y=["Positive_freq", "Negative_freq", "Total_freq"],
                  figsize=(15, 15),
                  color=['Blue', 'Red', 'Yellow'])
    plt.yticks(fontsize=16)
    plt.xticks(fontsize=16, rotation=20)
    plt.legend(fontsize=16)
    plt.show()

    model = pretrained_model(labels=label, pos_weights=w_p, neg_weights=w_n)
    # No training happens here: previously trained weights are loaded instead.
    model.load_weights(weights_path)

    # NOTE(review): `predict_generator` is deprecated in recent TensorFlow in
    # favour of `predict`; kept as-is because the Keras version is not visible
    # from this file.
    predicted_vals = model.predict_generator(test_generator,
                                             steps=len(test_generator))
    df = pd.DataFrame(data=predicted_vals)
    # File name (including its spelling) is kept so existing consumers of the
    # CSV keep working.
    df.to_csv('predections.csv')
    auc_rocs = get_roc_curve(label, predicted_vals, test_generator)