import sys

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import util


def main():
    opts = util.parse_args()
    X, y = util.data_load(opts.dataset)

    # Upsampling sweep runs from `start` up to `n` (inclusive); if no start is
    # given, only the single ratio `n` is evaluated.
    n = opts.upsamplen if opts.upsamplen is not None else 1
    start = opts.upsamplestart if opts.upsamplestart is not None else n
    if start > n:
        print("Upsample start should not be larger than upsample end")
        sys.exit()

    # Only sweep decision thresholds when a threshold of at least 0.40 is given.
    thresh = opts.threshold if opts.threshold is not None and opts.threshold >= 0.40 else None

    conf_upsample = []  # one confusion matrix per upsampling ratio
    for t in np.arange(start, n + 1):
        needed = util.needed_n(X, y, t)
        temp_X, temp_y = util.upsample(X, y, needed)
        X_train, X_test, y_train, y_test = train_test_split(
            temp_X, temp_y, test_size=0.3, random_state=42)
        X_train, X_test = util.normalize(X_train, X_test)

        clf = AdaBoostClassifier(n_estimators=100, random_state=0)
        clf.fit(X_train, y_train)

        if thresh is None:
            # Default 0.5 decision threshold.
            predictions = clf.predict(X_test)
            conf_mat = confusion_matrix(y_test, predictions)
            conf_upsample.append(conf_mat)
            print(conf_mat)
        else:
            # Sweep decision thresholds from 0.40 up to `thresh` in steps of 0.005.
            conf_thresh = []
            for i in np.arange(0.4, thresh + 0.01, 0.005):
                predictions = (clf.predict_proba(X_test)[:, 1] >= i).astype(int)
                conf_mat = confusion_matrix(y_test, predictions)
                conf_thresh.append(conf_mat)
                print(i)
                print(conf_mat)
            util.get_roc_curve(conf_thresh, "Adaboost", "threshold")
    plt.show()
predicted_vals = model.predict_generator(test_generator,
                                         steps=len(test_generator))

# <a name='5-1'></a>
# ### 5.1 ROC Curve and AUROC
# We'll cover the topic of model evaluation in much more detail in later weeks, but for now we'll walk through computing a metric called the AUC (Area Under the Curve) from the ROC ([Receiver Operating Characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)) curve. This is also referred to as the AUROC value; you will see all three terms used in reference to the technique, often almost interchangeably.
#
# For now, what you need to know in order to interpret the plot is that a curve that sits further toward the top and left has more "area" under it, which indicates that the model is performing better.
#
# We will use the `util.get_roc_curve()` function which has been provided for you in `util.py`. Look through this function and note the use of the `sklearn` library functions to generate the ROC curves and AUROC values for our model.
#
# - [roc_curve](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html)
# - [roc_auc_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)

# In[27]:

auc_rocs = util.get_roc_curve(labels, predicted_vals, test_generator)

# You can compare the performance to the AUCs reported in the original ChexNeXt paper. For reference, here is the AUC table from that paper, which includes AUC values for their model as well as for radiologists on this dataset:
#
# <img src="https://journals.plos.org/plosmedicine/article/figure/image?size=large&id=10.1371/journal.pmed.1002686.t001" width="80%">
#
# Their method also takes advantage of a few other tricks, such as self-training and ensembling, which can give a significant boost to performance.
# For details about the best-performing methods and their performance on this dataset, we encourage you to read the following papers:
# - [CheXNet](https://arxiv.org/abs/1711.05225)
# - [CheXpert](https://arxiv.org/pdf/1901.07031.pdf)
# - [ChexNeXt](https://journals.plos.org/plosmedicine/article?id=10.1371/journal.pmed.1002686)

# <a name='5-2'></a>
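# For intuition, here is a minimal sketch of how a helper like
# `util.get_roc_curve()` could compute per-class ROC curves and AUROC values
# with `sklearn`. This is an illustration, not the actual implementation in
# `util.py`; it assumes `labels` is a list of class names, `predicted_vals` is
# an (N, num_classes) array of predicted probabilities, and the generator
# exposes a ground-truth matrix via `generator.labels` (as the generators in
# this notebook do).

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve


def get_roc_curve_sketch(labels, predicted_vals, generator):
    """Plot one ROC curve per class and return the per-class AUROC values."""
    y_true = np.asarray(generator.labels)  # shape (N, num_classes)
    auc_values = []
    for i, class_name in enumerate(labels):
        # roc_curve sweeps all decision thresholds and returns the resulting
        # false-positive and true-positive rates.
        fpr, tpr, _ = roc_curve(y_true[:, i], predicted_vals[:, i])
        auc = roc_auc_score(y_true[:, i], predicted_vals[:, i])
        auc_values.append(auc)
        plt.plot(fpr, tpr, label=f"{class_name} (AUC {auc:.3f})")
    plt.plot([0, 1], [0, 1], "k--")  # chance diagonal for reference
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.legend(loc="lower right")
    plt.show()
    return auc_values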
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

# Data_analysis, image_preprocessing, class_frequency_prediction,
# pretrained_model, and get_roc_curve are assumed to be defined earlier in
# this module, along with the train/validation/test dataframes, the image
# directories, and original_example.


def main():
    # Check for patient overlap (data leakage) between train and validation,
    # then build the per-split dataframes.
    train_data = Data_analysis(train, image_dir=train_image_dir)
    train_data.data_leakage(validation)
    train_df = train_data.data_insight()
    label = train_df.drop(columns=['Image']).columns

    val_data = Data_analysis(validation, image_dir=val_image_dir)
    val_df = val_data.data_insight()
    test_data = Data_analysis(test, image_dir=test_image_dir)
    test_df = test_data.data_insight()

    # Build the image generators for each split.
    X_2 = image_preprocessing(original_example=original_example,
                              image_dir=train_image_dir,
                              train_df=train_df,
                              valid_df=val_df,
                              test_df=test_df,
                              labels=label,
                              batch_size=10,
                              val_dir=val_image_dir,
                              test_dir=test_image_dir,
                              target_w=320,
                              target_h=320)
    train_generator = X_2.get_train_generator()
    y_true_train = train_generator.labels
    val_generator, test_generator = X_2.get_test_val_generator()
    y_true_val = val_generator.labels
    y_true_test = test_generator.labels

    # Per-class frequencies and the positive/negative weights used to balance
    # the loss.
    positive_frequencies, negative_frequencies, w_p, w_n = class_frequency_prediction(
        y_true_train)

    values = np.mean(y_true_train, axis=0)
    sn.barplot(x=values, y=label, order=label)
    plt.yticks(fontsize=13)
    plt.title("Frequency of Each Class", fontsize=14)
    plt.show()

    data = pd.DataFrame({
        'Class': label,
        "Positive_freq": positive_frequencies,
        "Negative_freq": negative_frequencies,
        "Total_freq": positive_frequencies + negative_frequencies
    })
    data.plot.bar(x="Class",
                  y=["Positive_freq", "Negative_freq", "Total_freq"],
                  figsize=(15, 15),
                  color=['Blue', 'Red', 'Yellow'])
    plt.yticks(fontsize=16)
    plt.xticks(fontsize=16, rotation=20)
    plt.legend(fontsize=16)
    plt.show()

    model = pretrained_model(labels=label, pos_weights=w_p, neg_weights=w_n)
    #model.load_weights('efficent_net_b1_trained_weights.h5')
    model.load_weights('D:/material_science/pretrained_model.h5')
    '''history = model.fit_generator(train_generator,
                                  validation_data=val_generator,
                                  steps_per_epoch=1,
                                  validation_steps=1,
                                  epochs=20,
                                  callbacks=[lr_schedule, checkpoint, EarlyStopping])'''
    #print(len(y_true_val), len(y_true_train), len(y_true_test))

    # summarize history for loss
    #plt.plot(history.history['loss'])
    #plt.plot(history.history['val_loss'])
    #plt.title('model loss')
    #plt.ylabel('loss')
    #plt.xlabel('epoch')
    #plt.legend(['train', 'test'], loc='upper left')
    #plt.savefig('plot.png')
    #plt.show()
    #model.save_weights("model.h5")

    predicted_vals = model.predict_generator(test_generator,
                                             steps=len(test_generator))
    df = pd.DataFrame(data=predicted_vals)
    df.to_csv('predictions.csv')
    auc_rocs = get_roc_curve(label, predicted_vals, test_generator)
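# For context, here is a minimal sketch of what a helper like
# `class_frequency_prediction` might compute. This is a hypothetical
# implementation, not the one used above: for a multi-label ground-truth
# matrix, each class's positive frequency is its mean over examples, and the
# loss weights are swapped so that rare positives are weighted up and common
# negatives are weighted down.

import numpy as np


def class_frequency_prediction_sketch(y_true):
    """y_true: (N, num_classes) binary matrix of ground-truth labels."""
    y_true = np.asarray(y_true)
    positive_frequencies = np.mean(y_true, axis=0)
    negative_frequencies = 1.0 - positive_frequencies
    # Swapping the frequencies makes each class's expected positive and
    # negative loss contributions equal: w_p * freq_pos == w_n * freq_neg.
    w_p = negative_frequencies
    w_n = positive_frequencies
    return positive_frequencies, negative_frequencies, w_p, w_n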