Example #1
    def evaluate(y_true, y_pred, img_filenames):
        y_true = y_true.detach()
        y_pred = y_pred.detach()
        y_true_set.append(y_true.cpu())
        y_pred_set.append(y_pred.cpu())

        measurer.add_sample(y_true, y_pred)

        # ==== Track a measurer per disaster type (to find a per-type threshold)
        for idx, img_filename in enumerate(img_filenames):
            disaster_type = img_filename.split('_')[0]
            if disaster_type not in diaster_type_measurers.keys():
                diaster_type_measurers[disaster_type] = MultiThresholdMetric(
                    F1_THRESH)

            diaster_type_measurers[disaster_type].add_sample(
                y_true[[idx], ...], y_pred[[idx], ...])
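
These snippets rely on a project-specific MultiThresholdMetric helper that is not shown. A minimal sketch of the interface used across the examples (add_sample, compute_f1, compute_basic_metrics) could look roughly like the following; the class name and internals are assumptions, not the project's actual implementation.

# Hypothetical stand-in for the MultiThresholdMetric used in these examples (interface sketch only).
import torch

class SimpleMultiThresholdMetric:
    def __init__(self, thresholds):
        self.thresholds = thresholds                 # shape (T,), e.g. torch.linspace(0, 1, 100)
        self.TP = torch.zeros_like(thresholds)
        self.FP = torch.zeros_like(thresholds)
        self.FN = torch.zeros_like(thresholds)
        self.TN = torch.zeros_like(thresholds)

    def add_sample(self, y_true, y_pred):
        # y_true: binary mask, y_pred: probabilities in [0, 1]
        y_true = y_true.bool().flatten()
        y_pred = y_pred.flatten()
        for i, t in enumerate(self.thresholds):
            pred = y_pred > t
            self.TP[i] += (pred & y_true).sum()
            self.FP[i] += (pred & ~y_true).sum()
            self.FN[i] += (~pred & y_true).sum()
            self.TN[i] += (~pred & ~y_true).sum()

    def compute_f1(self):
        precision = self.TP / (self.TP + self.FP + 1e-8)
        recall = self.TP / (self.TP + self.FN + 1e-8)
        return 2 * precision * recall / (precision + recall + 1e-8)

    def compute_basic_metrics(self):
        false_positive_rate = self.FP / (self.FP + self.TN + 1e-8)
        false_negative_rate = self.FN / (self.FN + self.TP + 1e-8)
        return false_positive_rate, false_negative_rate
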
Example #2
def model_eval(net,
               cfg,
               device,
               run_type='TEST',
               max_samples=1000,
               step=0,
               epoch=0):
    '''
    Evaluation runner used during training to track how the model changes.
    :param run_type: 'TRAIN' or 'TEST'
    :return:
    '''

    F1_THRESH = torch.linspace(0, 1, 100).to(device)
    y_true_set = []
    y_pred_set = []

    measurer = MultiThresholdMetric(F1_THRESH)

    def evaluate(y_true, y_pred, img_filename):
        y_true = y_true.detach()
        y_pred = y_pred.detach()
        y_true_set.append(y_true.cpu())
        y_pred_set.append(y_pred.cpu())

        measurer.add_sample(y_true, y_pred)

    # transformations
    trfm = []
    trfm.append(Npy2Torch())
    trfm = transforms.Compose(trfm)

    if run_type == 'TRAIN':
        dataset = UrbanExtractionDatasetAugmentedLabels(
            cfg=cfg,
            root_dir=cfg.DATASETS.TRAIN[0],
            include_index=True,
            transform=trfm)
        inference_loop(net,
                       cfg,
                       device,
                       evaluate,
                       run_type='TRAIN',
                       max_samples=max_samples,
                       dataset=dataset)
    elif run_type == 'TEST':
        dataset = UrbanExtractionDatasetAugmentedLabels(
            cfg=cfg,
            root_dir=cfg.DATASETS.TEST[0],
            include_index=True,
            transform=trfm)
        inference_loop(net,
                       cfg,
                       device,
                       evaluate,
                       max_samples=max_samples,
                       dataset=dataset)

    # Summary gathering ===

    print('Computing F1 score ', end=' ', flush=True)
    # Max of the mean F1 score

    # measurer = MultiThresholdMetric(y_true_set, y_pred_set, F1_THRESH)
    # Max F1

    f1 = measurer.compute_f1()
    fpr, fnr = measurer.compute_basic_metrics()
    maxF1 = f1.max()
    argmaxF1 = f1.argmax()
    best_fpr = fpr[argmaxF1]
    best_fnr = fnr[argmaxF1]
    print(maxF1.item(), flush=True)

    set_name = 'test_set' if run_type == 'TEST' else 'training_set'
    wandb.log({
        f'{set_name} max F1': maxF1,
        f'{set_name} argmax F1': argmaxF1,
        # f'{set_name} Average Precision': ap,
        f'{set_name} false positive rate': best_fpr,
        f'{set_name} false negative rate': best_fnr,
        'step': step,
        'epoch': epoch,
    })
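
inference_loop is also project code that is not shown here. A rough, hypothetical sketch of the driver these runners assume (batch the dataset, run the network without gradients, hand each result to the callback) might look like the code below; the sample layout and the sigmoid on the network output are assumptions.

# Hypothetical sketch of the inference_loop driver assumed above; the real one may differ.
import torch
from torch.utils.data import DataLoader

def inference_loop_sketch(net, device, callback, dataset, max_samples=1000):
    net.to(device).eval()
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    with torch.no_grad():
        for n, (x, y_true, img_filenames) in enumerate(loader):   # assumed sample layout
            if n >= max_samples:
                break
            x, y_true = x.to(device), y_true.to(device)
            y_pred = torch.sigmoid(net(x))                         # assuming the net returns logits
            callback(y_true, y_pred, img_filenames)
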
Example #3
def model_eval(net,
               cfg,
               device,
               run_type='TEST',
               max_samples=1000,
               step=0,
               epoch=0):
    '''
    Evaluation runner used during training to track how the model changes.
    :param run_type: 'TRAIN' or 'TEST'
    :return:
    '''

    F1_THRESH = torch.linspace(0, 1, 100).to(device)
    y_true_set = []
    y_pred_set = []
    cc_difference_ratio_abs = []
    kernel = np.ones((5, 5), np.uint8)

    measurer = MultiThresholdMetric(F1_THRESH)

    def evaluate(y_true, y_pred, img_filename):

        if cfg.MODEL.OUT_CHANNELS == 4:
            # y_true = y_true[:, 2:, :, :].sum(1)
            # y_true is already binary because in eval the label is not transformed to 4 classes
            y_pred = y_pred[:, 2:, :, :].sum(1)

        y_true = y_true.detach()
        y_pred = y_pred.detach()
        y_true_set.append(y_true.cpu())
        y_pred_set.append(y_pred.cpu())

        measurer.add_sample(y_true, y_pred)

        y_pred_thresh = (y_pred > 0.52).type(
            torch.int8)  # 0.52 == best threshold

        # ==== Measuring connected components
        # Reference count: connected components in the binary ground-truth mask
        # (cv2.connectedComponentsWithStats expects an 8-bit single-channel image)
        y_true_numpy = y_true.cpu().squeeze().numpy().astype(np.uint8)
        true_cc = cv2.connectedComponentsWithStats(y_true_numpy,
                                                   connectivity=8)[0]

        # Predicted count: components of the thresholded prediction after
        # morphological opening, which removes small speckle noise
        y_pred_thresh_numpy = y_pred_thresh.cpu().squeeze().numpy().astype(
            np.uint8)
        y_pred_denoised = cv2.morphologyEx(y_pred_thresh_numpy, cv2.MORPH_OPEN,
                                           kernel)
        pred_cc = cv2.connectedComponentsWithStats(y_pred_denoised,
                                                   connectivity=8)[0]

        cc_ratio = np.abs(true_cc - pred_cc) / true_cc
        cc_difference_ratio_abs.append(cc_ratio)

    if run_type == 'TRAIN':
        inference_loop(net,
                       cfg,
                       device,
                       evaluate,
                       run_type='TRAIN',
                       max_samples=max_samples)
    elif run_type == 'TEST':
        inference_loop(net, cfg, device, evaluate, max_samples=max_samples)

    # Summary gathering ===

    print('Computing F1 score ', end=' ', flush=True)
    # Max of the mean F1 score

    # measurer = MultiThresholdMetric(y_true_set, y_pred_set, F1_THRESH)
    # Max F1

    f1 = measurer.compute_f1()
    fpr, fnr = measurer.compute_basic_metrics()
    maxF1 = f1.max()
    argmaxF1 = f1.argmax()
    best_fpr = fpr[argmaxF1]
    best_fnr = fnr[argmaxF1]
    print(maxF1.item(), flush=True)

    avg_cc_difference = np.mean(cc_difference_ratio_abs)
    print(avg_cc_difference)
    set_name = 'test_set' if run_type == 'TEST' else 'training_set'
    wandb.log({
        f'{set_name} max F1': maxF1,
        f'{set_name} argmax F1': argmaxF1,
        # f'{set_name} Average Precision': ap,
        f'{set_name} false positive rate': best_fpr,
        f'{set_name} false negative rate': best_fnr,
        'avg abs connected components ratio': avg_cc_difference,
        'step': step,
        'epoch': epoch,
    })
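
The connected-component comparison in Example #3 can be exercised on its own with a synthetic mask. The sketch below only illustrates the OpenCV calls involved; the mask sizes and kernel are made up for illustration (connectedComponentsWithStats needs an 8-bit single-channel image, and opening removes single-pixel speckle).

# Standalone illustration of the component-count comparison, on synthetic data.
import numpy as np
import cv2

mask_true = np.zeros((64, 64), np.uint8)
mask_true[10:20, 10:20] = 1                 # one ground-truth footprint
mask_pred = mask_true.copy()
mask_pred[40, 40] = 1                       # plus a single-pixel false positive

kernel = np.ones((5, 5), np.uint8)
mask_pred_denoised = cv2.morphologyEx(mask_pred, cv2.MORPH_OPEN, kernel)   # opening removes the speckle

true_cc = cv2.connectedComponentsWithStats(mask_true, connectivity=8)[0]
pred_cc = cv2.connectedComponentsWithStats(mask_pred_denoised, connectivity=8)[0]
print('components (incl. background):', true_cc, pred_cc)
print('abs component-count ratio:', np.abs(true_cc - pred_cc) / true_cc)
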
Example #4
def final_model_evaluation_runner(net, cfg):
    '''
    Runner that evaluates only a single, final model.
    :return:
    '''
    import matplotlib.pyplot as plt
    print('=== Evaluating final model ===')
    # Setup

    F1_THRESH = torch.linspace(0, 1, 100).to(device)
    y_true_set = []
    y_pred_set = []
    measurer = MultiThresholdMetric(F1_THRESH)
    diaster_type_measurers = {}

    def evaluate(y_true, y_pred, img_filenames):
        y_true = y_true.detach()
        y_pred = y_pred.detach()
        y_true_set.append(y_true.cpu())
        y_pred_set.append(y_pred.cpu())

        measurer.add_sample(y_true, y_pred)

        # ==== Track a measurer per disaster type (to find a per-type threshold)
        for idx, img_filename in enumerate(img_filenames):
            disaster_type = img_filename.split('_')[0]
            if disaster_type not in diaster_type_measurers.keys():
                diaster_type_measurers[disaster_type] = MultiThresholdMetric(
                    F1_THRESH)

            diaster_type_measurers[disaster_type].add_sample(
                y_true[[idx], ...], y_pred[[idx], ...])

    inference_loop(net, cfg, device, evaluate)

    # ===
    # Collect for summary

    y_true_set = torch.cat(y_true_set, dim=0).round()
    y_pred_set = torch.cat(y_pred_set, dim=0)

    y_true_set, y_pred_set = downsample_dataset_for_eval(
        y_true_set, y_pred_set)

    y_true_np = to_numpy(y_true_set.flatten())
    y_pred_np = to_numpy(y_pred_set.flatten())

    # F1 score
    print('Computing F1 vs thresholds', flush=True)
    f1 = measurer.compute_f1().cpu().numpy()
    plt.plot(np.linspace(0, 1, len(f1)), f1)
    plt.ylabel('f1 score')
    plt.xlabel('threshold')
    plt.title('F1 vs threshold curve')

    wandb.log({'f1 vs threshold': plt})
    wandb.log({
        'total False Negative': measurer.FN.cpu(),
        'total False Positive': measurer.FP.cpu(),
    })

    for disaster_type, m in diaster_type_measurers.items():
        f1 = m.compute_f1().cpu().numpy()

        plt.plot(np.linspace(0, 1, len(f1)), f1)
        plt.ylabel('f1 score')
        plt.xlabel('threshold')
        plt.title('F1 vs threshold curve')

        print(f'{disaster_type} max F1', f1.max())

        wandb.log({f'disaster-{disaster_type}': plt})

    print('computing ROC curve', flush=True)
    # ROC curve
    fpr, tpr, thresh = roc_curve(y_true_np, y_pred_np)

    # Down sample roc curve or matplotlib won't like it
    num_ele = len(fpr)
    downsample_idx = np.linspace(0,
                                 num_ele,
                                 1000,
                                 dtype=np.int32,
                                 endpoint=False)
    fpr_downsampled = fpr[downsample_idx]
    tpr_downsampled = tpr[downsample_idx]

    plt.plot(fpr_downsampled, tpr_downsampled)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.ylabel('true_positive rate')
    plt.xlabel('false_positive rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('ROC curve')

    wandb.log({'roc': plt})

    print('done')
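
final_model_evaluation_runner also leans on two small helpers, to_numpy and downsample_dataset_for_eval, that are not part of the excerpt (roc_curve is presumably sklearn.metrics.roc_curve). Minimal hypothetical versions are sketched below; the subsampling size and random strategy are assumptions.

# Hypothetical minimal versions of the helpers used above; the project's real ones may differ.
import torch

def to_numpy(tensor: torch.Tensor):
    return tensor.detach().cpu().numpy()

def downsample_dataset_for_eval(y_true, y_pred, max_pixels=10_000_000):
    # Randomly subsample pixels so the pixel-wise ROC computation stays tractable.
    flat_true, flat_pred = y_true.flatten(), y_pred.flatten()
    if flat_true.numel() > max_pixels:
        idx = torch.randperm(flat_true.numel())[:max_pixels]
        flat_true, flat_pred = flat_true[idx], flat_pred[idx]
    return flat_true, flat_pred
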
def model_eval(net,
               cfg,
               device,
               run_type='TEST',
               max_samples=1000,
               step=0,
               epoch=0):
    '''
    Evaluation runner used during training to track how the model changes.
    :param run_type: 'TRAIN' or 'TEST'
    :return:
    '''

    F1_THRESH = torch.linspace(0, 1, 100).to(device)
    y_true_set = []
    y_pred_set = []
    cc_difference_ratio_abs = []
    kernel = np.ones((5, 5), np.uint8)

    measurer = MultiThresholdMetric(F1_THRESH)

    def evaluate(y_true, y_pred, img_filename):
        y_true = y_true.detach()
        y_pred = y_pred.detach()
        y_true_set.append(y_true.cpu())
        y_pred_set.append(y_pred.cpu())

        measurer.add_sample(y_true, y_pred)

        y_pred_thresh = (y_pred > 0.52).type(
            torch.int8)  # 0.52 == best threshold (unused beyond this point in this variant)

    if run_type == 'TRAIN':
        inference_loop(net,
                       cfg,
                       device,
                       evaluate,
                       run_type='TRAIN',
                       max_samples=max_samples)
    elif run_type == 'TEST':
        inference_loop(net, cfg, device, evaluate, max_samples=max_samples)

    # Summary gathering ===

    print('Computing F1 score ', end=' ', flush=True)
    # Max of the mean F1 score

    # measurer = MultiThresholdMetric(y_true_set, y_pred_set, F1_THRESH)
    # Max F1

    f1 = measurer.compute_f1()
    fpr, fnr = measurer.compute_basic_metrics()
    maxF1 = f1.max()
    argmaxF1 = f1.argmax()
    best_fpr = fpr[argmaxF1]
    best_fnr = fnr[argmaxF1]
    print(maxF1.item(), flush=True)

    set_name = 'test_set' if run_type == 'TEST' else 'training_set'
    wandb.log({
        f'{set_name} max F1': maxF1,
        f'{set_name} argmax F1': argmaxF1,
        # f'{set_name} Average Precision': ap,
        f'{set_name} false positive rate': best_fpr,
        f'{set_name} false negative rate': best_fnr,
        'step': step,
        'epoch': epoch,
    })
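
As a sanity check of the max-F1-over-thresholds idea these runners implement, here is a tiny self-contained demo on random data; the synthetic labels and scores below are made up purely for illustration.

# Self-contained demo: sweep thresholds and report the best F1, on synthetic data.
import torch

torch.manual_seed(0)
y_true = (torch.rand(10000) > 0.7).float()            # synthetic binary labels
y_pred = 0.6 * y_true + 0.4 * torch.rand(10000)       # synthetic scores correlated with the labels

thresholds = torch.linspace(0, 1, 100)
f1_scores = []
for t in thresholds:
    pred = (y_pred > t).float()
    tp = (pred * y_true).sum()
    precision = tp / (pred.sum() + 1e-8)
    recall = tp / (y_true.sum() + 1e-8)
    f1_scores.append((2 * precision * recall / (precision + recall + 1e-8)).item())

f1_scores = torch.tensor(f1_scores)
best = f1_scores.argmax()
print(f'max F1 {f1_scores[best].item():.3f} at threshold {thresholds[best].item():.2f}')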