Ejemplo n.º 1
0
def prepare_data_siamese(data,
                         objective="malignancy",
                         rating_distance='mean',
                         balanced=False,
                         verbose=0):
    """Build shuffled sample pairs and their labels for siamese training.

    The entries are loaded through ``prepare_data`` (raw ratings,
    reshuffled) and split into class 0 (benign) and class 1 (malignant)
    subsets.  Pairs are then produced by ``select_same_pairs`` and
    ``select_different_pair``; the same selection is repeated for images,
    masks, meta data and (for the rating objective) ratings, and the
    reported group sizes are cross-checked between the repetitions.

    Args:
        data: Dataset entries in the layout expected by ``prepare_data``.
        objective: ``"malignancy"`` labels every same-class pair 0 and every
            different-class pair 1.  ``"rating"`` labels each pair with
            ``sqrt((a - b) . (a - b))`` of its two rating entries.
        rating_distance: Only ``'mean'`` is implemented, and it is only
            consulted when ``objective == "rating"``.
        balanced: When true, both class subsets are truncated to the size of
            the smaller one before pairing.
        verbose: Truthy to print progress information.

    Returns:
        A 5-tuple ``((images_1, images_2), similarity_labels,
        (masks_1, masks_2), confidence, (meta_1, meta_2))`` where every
        component has been put through the same random permutation.
        ``confidence`` holds one of the tags ``'SB'``, ``'SM'`` or ``'D'``
        per pair.

    Raises:
        AssertionError: If ``objective`` or ``rating_distance`` is not
            supported, or if the pair-selection helpers report inconsistent
            group sizes.
    """
    if verbose:
        print('prepare_data_siamese:')
    images, ratings, classes, masks, meta, _, _, _, _ = \
        prepare_data(data, rating_format='raw', reshuffle=True, verbose=verbose)

    benign_filter = np.where(classes == 0)[0]
    malign_filter = np.where(classes == 1)[0]
    M = min(benign_filter.shape[0], malign_filter.shape[0])

    if balanced:
        malign_filter = malign_filter[:M]
        benign_filter = benign_filter[:M]

    #   Handle Patches
    # =========================

    imgs_benign = [images[x] for x in benign_filter]
    imgs_malign = [images[x] for x in malign_filter]
    different, d_size = select_different_pair(imgs_benign, imgs_malign, n=M)
    same, sb_size, sm_size = select_same_pairs(imgs_benign, imgs_malign)

    # "same" pairs always precede "different" pairs; the labels and the
    # confidence tags built below rely on this ordering.
    image_pairs = same + different
    image_sub1 = [pair[0] for pair in image_pairs]
    image_sub2 = [pair[1] for pair in image_pairs]

    #   Handle Labels
    # =========================

    if objective == "malignancy":
        similarity_labels = np.concatenate(
            [np.repeat(0, len(same)),
             np.repeat(1, len(different))])
    elif objective == "rating":
        lbls_benign = ratings[benign_filter]
        lbls_malign = ratings[malign_filter]
        diff_lbls, d_size = select_different_pair(lbls_benign,
                                                  lbls_malign,
                                                  n=M)
        same_lbls, sb_size, sm_size = select_same_pairs(
            lbls_benign, lbls_malign)

        label_pairs = same_lbls + diff_lbls
        if rating_distance == 'mean':
            similarity_labels = np.array(
                [np.sqrt((a - b).dot(a - b)) for a, b in label_pairs])
        elif rating_distance == 'clusters':
            # Explicit raise instead of `assert False`: asserts are removed
            # under `python -O`, which would let execution continue without
            # any labels.  The exception type is unchanged.
            raise AssertionError(
                "rating_distance 'clusters' is not supported by "
                "prepare_data_siamese")
        else:
            raise AssertionError(
                "{} is not a valid rating_distance".format(rating_distance))
    else:
        print("ERR: {} is not a valid objective".format(objective))
        raise AssertionError(
            "{} is not a valid objective".format(objective))

    #   Handle Masks
    # =========================

    mask_benign = [masks[x] for x in benign_filter]
    mask_malign = [masks[x] for x in malign_filter]
    different_mask, d = select_different_pair(mask_benign, mask_malign, n=M)
    same_mask, sb, sm = select_same_pairs(mask_benign, mask_malign)
    # Internal invariant: every repetition of the selection must report the
    # same group sizes, otherwise images/masks/meta would not line up.
    assert (d == d_size)
    assert ((sb == sb_size) and (sm == sm_size))

    mask_pairs = same_mask + different_mask
    mask_sub1 = [pair[0] for pair in mask_pairs]
    mask_sub2 = [pair[1] for pair in mask_pairs]

    #   Handle Meta
    # =========================
    meta_benign = reorder(meta, benign_filter)
    meta_malign = reorder(meta, malign_filter)
    different_meta, d = select_different_pair(meta_benign, meta_malign, n=M)
    same_meta, sb, sm = select_same_pairs(meta_benign, meta_malign)
    assert (d == d_size)
    assert ((sb == sb_size) and (sm == sm_size))

    meta_pairs = same_meta + different_meta
    meta_sub1, meta_sub2 = zip(*meta_pairs)

    #   Final touch
    # =========================

    size = similarity_labels.shape[0]
    assert size == len(image_sub1)
    assert size == len(image_sub2)

    # assign confidence classes (weights are resolved online per batch)
    confidence = np.concatenate([
        np.repeat('SB', sb_size),
        np.repeat('SM', sm_size),
        np.repeat('D', d_size)
    ])

    if verbose:
        print(
            "{} pairs of same / {} pairs of different. {} total number of pairs"
            .format(len(same), len(different), size))

    # One permutation shared by every output keeps the pairs aligned.
    new_order = np.random.permutation(size)

    return ((reorder(image_sub1, new_order), reorder(image_sub2, new_order)),
            similarity_labels[new_order], (reorder(mask_sub1, new_order),
                                           reorder(mask_sub2, new_order)),
            confidence[new_order], (reorder(meta_sub1, new_order),
                                    reorder(meta_sub2, new_order)))
Ejemplo n.º 2
0
def prepare_data(data,
                 rating_format='raw',
                 reshuffle=False,
                 verbose=0,
                 scaling="none"):
    """Unpack dataset entries into parallel collections of per-sample fields.

    Every entry of ``data`` is indexed positionally:

        0 'patch', 1 'mask', 2 'class', 3 'info', 4 'size',
        5 'rating', 6 'rating_weights', 7 'z'

    Args:
        data: Non-empty sequence of entries laid out as above.  The first
            entry is inspected to decide whether a trailing channel axis has
            to be added, so an empty sequence fails on ``data[0]``.
        rating_format: ``'raw'`` keeps each entry's ratings as they are (and
            also returns the rating weights), ``'mean'`` averages entry[5]
            over axis 0, ``'w_mean'`` averages it weighted by entry[6].  The
            two averaged formats are reshaped to ``(N, 9)``.
        reshuffle: When true, all outputs are permuted with one shared random
            permutation.
        verbose: Truthy to print summary information.
        scaling: Forwarded unchanged to ``rating_normalize``.

    Returns:
        ``(images, ratings, classes, masks, meta, conf, nodule_size,
        rating_weights, z)``.  ``rating_weights`` is ``None`` unless
        ``rating_format == 'raw'``.

    Raises:
        AssertionError: If ``rating_format`` is not one of the values above.
    """
    N = len(data)
    old_size = data[0][0].shape

    # ============================
    #   data: images and masks
    # ============================
    if data[0][0].ndim == 2:
        # 2-D patches get a trailing singleton axis appended.
        images = [np.expand_dims(entry[0], axis=-1) for entry in data]
        masks = [np.expand_dims(entry[1], axis=-1) for entry in data]
    else:
        images = [np.array(entry[0]) for entry in data]
        masks = [np.array(entry[1]) for entry in data]

    if verbose:
        print('prepare_data:')
        print("\tImage size changed from {} to {}".format(
            old_size, images[0].shape))
        # Ranges are reported as [min, max]; the arguments used to be passed
        # in the opposite order, so the printed interval was reversed.
        print("\tImage Range = [{:.1f}, {:.1f}]".format(
            np.min(images[0]), np.max(images[0])))
        print("\tMasks Range = [{}, {}]".format(np.min(masks[0]),
                                                np.max(masks[0])))

    # ============================
    #   labels: classes and ratings
    # ============================

    classes = np.array([entry[2] for entry in data]).reshape(N, 1)

    def weighted_mean(R, W):
        # Scale row i of R by W[i] / sum(W), then sum the rows.
        return np.sum(np.diag(W).dot(R) / np.sum(W), axis=0)

    rating_weights = None
    if rating_format == 'raw':
        ratings = np.array(
            [rating_normalize(entry[5], scaling) for entry in data])
        rating_weights = np.array([entry[6] for entry in data])
    elif rating_format == 'mean':
        ratings = np.array([
            rating_normalize(np.mean(entry[5], axis=0), scaling)
            for entry in data
        ]).reshape(N, 9)
    elif rating_format == 'w_mean':
        ratings = np.array([
            rating_normalize(weighted_mean(entry[5], entry[6]), scaling)
            for entry in data
        ]).reshape(N, 9)
    else:
        print("ERR: Illegual rating_format given ({})".format(rating_format))
        # Explicit raise instead of `assert (False)`: asserts are removed
        # under `python -O`, which would leave `ratings` undefined below.
        # The exception type is unchanged.
        raise AssertionError(
            "unsupported rating_format: {!r}".format(rating_format))

    if verbose:
        print("benign:{}, malignant: {}, unknown: {}".format(
            np.count_nonzero(classes == 0), np.count_nonzero(classes == 1),
            np.count_nonzero(classes == 2)))

    # ============================
    #   meta: meta, nodule-size, slice confidence and z-value
    # ============================

    # Nodule size is taken from the rescaled mask area (non-zero mask
    # elements times 0.5 * 0.5) rather than from entry[4], and then binned
    # with fixed thresholds.
    # NOTE(review): 0.5 * 0.5 presumably is the per-pixel area; confirm.
    nodule_size = np.array([np.count_nonzero(q)
                            for q in masks]).reshape(N, 1) * 0.5 * 0.5
    tresh = [0, 15, 30, 60, 120]
    nodule_size = np.digitize(nodule_size, tresh)

    z = np.array([entry[7] for entry in data]).reshape(N, 1)

    # Confidence for the mean-rating based objective: one minus half of the
    # normalized mean (over columns) of the per-column std of entry[5].
    # Only relevant for the full dataset and should first be reconsidered.
    conf = 1 - .5 * np.array([
        rating_normalize(np.std(entry[5], axis=0).mean(), scaling)
        for entry in data
    ])

    meta = [entry[3] for entry in data]

    if reshuffle:
        # One permutation applied to every output keeps them aligned.
        new_order = np.random.permutation(N)
        images = reorder(images, new_order)
        masks = reorder(masks, new_order)
        classes = classes[new_order]
        ratings = ratings[new_order]
        if rating_weights is not None:
            rating_weights = rating_weights[new_order]
        meta = reorder(meta, new_order)
        nodule_size = nodule_size[new_order]
        z = z[new_order]
        conf = conf[new_order]

    return images, ratings, classes, masks, meta, conf, nodule_size, rating_weights, z
Ejemplo n.º 3
0
def prepare_data_siamese_simple(data,
                                siamese_rating_factor,
                                objective="malignancy",
                                rating_distance='mean',
                                verbose=0):
    """Build shuffled sample pairs with rating and/or size distance labels.

    The entries are loaded through ``prepare_data`` (raw ratings, no
    scaling, reshuffled) and paired with ``select_pairs``; the same pairing
    is applied to images, ratings, rating weights, nodule sizes, masks and
    meta data.

    Args:
        data: Dataset entries in the layout expected by ``prepare_data``.
        siamese_rating_factor: Multiplier applied to the rating distances.
        objective: ``"rating"``, ``"size"`` or ``"rating_size"``.  Note that
            the default ``"malignancy"`` is NOT supported by this function
            and is rejected.
        rating_distance: How rating distances are computed when the objective
            involves ratings: ``'mean'`` uses ``sqrt((a - b) . (a - b))``;
            ``'clusters'`` and ``'weighted_clusters'`` use
            ``rating_clusters_distance_and_std`` (the latter with the
            ``'euclidean'`` argument and the per-sample rating weights) and
            also derive the pair confidence from the returned std.
        verbose: Truthy to print progress information.

    Returns:
        A 5-tuple ``((images_1, images_2), similarity_labels,
        (masks_1, masks_2), confidence, (meta_1, meta_2))`` where every
        component has been put through the same random permutation.
        ``similarity_labels`` is a tuple with one array per requested target
        (ratings first, then size); 1-D label arrays are expanded to column
        vectors.  ``confidence`` is all ones unless a cluster based rating
        distance was used.

    Raises:
        AssertionError: If ``objective`` is not supported, or if ratings are
            part of the objective and ``rating_distance`` is not supported.
    """
    if verbose:
        print('prepare_data_siamese_simple:')

    # Fail fast on bad arguments.  Previously an invalid objective crashed
    # later with an unbound-variable error, and an unknown rating_distance
    # silently fell into the 'weighted_clusters' branch because its
    # condition was the always-true string literal.
    if objective not in ('rating', 'size', 'rating_size'):
        print("ERR: {} is not a valid objective".format(objective))
        raise AssertionError(
            "{} is not a valid objective".format(objective))
    use_ratings = objective in ('rating', 'rating_size')
    use_size = objective in ('size', 'rating_size')
    if use_ratings and rating_distance not in ('mean', 'clusters',
                                               'weighted_clusters'):
        raise AssertionError(
            "{} is not a valid rating_distance".format(rating_distance))

    images, ratings, _, masks, meta, _, nod_size, rating_weights, _ = \
        prepare_data(data, rating_format='raw', scaling="none", reshuffle=True, verbose=verbose)

    #   Handle Patches
    # =========================

    image_pairs = select_pairs(images)
    image_sub1 = [pair[0] for pair in image_pairs]
    image_sub2 = [pair[1] for pair in image_pairs]

    #   Handle Labels
    # =========================

    rating_pairs = select_pairs(ratings)
    rating_weight_pairs = select_pairs(rating_weights)

    confidence = np.ones(len(image_pairs))
    similarity_ratings = None
    similarity_size = None

    if use_ratings:
        if rating_distance == 'mean':
            similarity_ratings = np.array(
                [np.sqrt((a - b).dot(a - b)) for a, b in rating_pairs])
        else:
            if rating_distance == 'clusters':
                stats = [
                    rating_clusters_distance_and_std(r1, r2)
                    for r1, r2 in rating_pairs
                ]
            else:  # 'weighted_clusters'
                stats = [
                    rating_clusters_distance_and_std(r[0],
                                                     r[1],
                                                     'euclidean',
                                                     weight_a=w[0],
                                                     weight_b=w[1])
                    for r, w in zip(rating_pairs, rating_weight_pairs)
                ]
            similarity_ratings = np.array([dist for dist, _ in stats])
            spread = np.array([std for _, std in stats])
            # Maps a std of 0 to confidence 1; confidence decreases towards
            # 0.5 as the std grows.
            confidence = 1 - .5 * spread / (spread + .5)
        # Not in-place, so an integer distance array cannot reject a float
        # factor.
        similarity_ratings = similarity_ratings * siamese_rating_factor
        if similarity_ratings.ndim == 1:
            similarity_ratings = np.expand_dims(similarity_ratings, axis=1)

    if use_size:
        size_pairs = select_pairs(nod_size)
        similarity_size = np.array(
            [np.sqrt((a - b).dot(a - b)) for a, b in size_pairs])
        if similarity_size.ndim == 1:
            # The expanded array used to be stored in `similarity_ratings`,
            # which replaced the rating labels with size labels for the
            # 'rating_size' objective and left the size labels 1-D.
            similarity_size = np.expand_dims(similarity_size, axis=1)

    if objective == "rating":
        similarity_labels = similarity_ratings,
    elif objective == 'size':
        similarity_labels = similarity_size,
    else:  # 'rating_size'
        similarity_labels = similarity_ratings, similarity_size

    #   Handle Masks
    # =========================

    mask_pairs = select_pairs(masks)
    mask_sub1 = [pair[0] for pair in mask_pairs]
    mask_sub2 = [pair[1] for pair in mask_pairs]

    #   Handle Meta
    # =========================
    meta_pairs = select_pairs(meta)
    meta_sub1 = [pair[0] for pair in meta_pairs]
    meta_sub2 = [pair[1] for pair in meta_pairs]

    #   Final touch
    # =========================

    size = similarity_labels[0].shape[0]
    assert size == len(image_sub1)
    assert size == len(image_sub2)

    # One permutation shared by every output keeps the pairs aligned.
    new_order = np.random.permutation(size)

    return ((reorder(image_sub1, new_order), reorder(image_sub2, new_order)),
            tuple(s[new_order] for s in similarity_labels),
            (reorder(mask_sub1, new_order), reorder(mask_sub2, new_order)),
            confidence[new_order],
            (reorder(meta_sub1, new_order), reorder(meta_sub2, new_order)))
Ejemplo n.º 4
0
def prepare_data_triplet(data,
                         objective="malignancy",
                         rating_distance="mean",
                         balanced=False,
                         return_confidence=False,
                         return_meta=False,
                         verbose=0):
    """Build shuffled sample triplets for triplet training.

    The entries are loaded through ``prepare_data`` (raw ratings, no
    scaling, reshuffled).  In balanced mode triplets are produced by
    ``make_balanced_trip`` from two halves of the class 0 and class 1
    index sets.  Otherwise ``select_triplets`` is used and every triplet is
    rearranged by ``arrange_triplet`` according to the rank status that
    ``check_triplet_order`` computes from the rating triplets.

    Args:
        data: Dataset entries in the layout expected by ``prepare_data``.
        objective: Only consulted for the confidence output; per-triplet
            confidences are computed only when it equals ``"rating"``.
        rating_distance: ``'mean'``, ``'clusters'`` or
            ``'weighted_clusters'``; selects the distance function handed to
            ``check_triplet_order`` (unbalanced mode only).
        balanced: When true, build class-balanced triplets instead of
            rating-ranked ones.
        return_confidence: ``"rating"`` or ``"rating_distance"`` to compute
            per-triplet confidences (needs ``objective == "rating"`` and
            ``balanced=False``); any other value keeps the default
            ``'SB'`` tag for every triplet.
        return_meta: When true, the meta triplets are appended to the result.
        verbose: Truthy to print progress information.

    Returns:
        ``((images_1, images_2, images_3), similarity_labels,
        (masks_1, masks_2, masks_3), confidence)``, followed by
        ``(meta_1, meta_2, meta_3)`` when ``return_meta`` is true.  All
        components share one random permutation.  ``similarity_labels`` is
        ``[0, 1]`` for every triplet.

    Raises:
        ValueError: If ``rating_distance`` is unknown (unbalanced mode), or
            if a rating based confidence is requested in balanced mode.
        AssertionError: If the number of produced triplets differs from the
            expected one.
    """
    if verbose:
        print('prepare_data_triplet:')
    images, ratings, classes, masks, meta, conf, _, _, _ \
        = prepare_data(data, rating_format="raw", scaling="none", reshuffle=True, verbose=verbose)

    N = len(images)

    # Only the unbalanced path ranks triplets; None marks "not available".
    trip_rank_status = None

    if balanced:
        print('Create a balanced split')
        benign_filter = np.where(classes == 0)[0]
        malign_filter = np.where(classes == 1)[0]
        M = min(benign_filter.shape[0], malign_filter.shape[0])
        M12 = M // 2
        M = M12 * 2
        malign_filter_a = malign_filter[:M12]
        malign_filter_b = malign_filter[M12:]
        benign_filter_a = benign_filter[:M12]
        benign_filter_b = benign_filter[M12:]
        expected_size = 2 * M
    else:
        rating_trips = select_triplets(ratings)
        if rating_distance == 'mean':
            distance = l2_distance
        elif rating_distance in ('clusters', 'weighted_clusters'):
            # NOTE(review): 'weighted_clusters' used to gather triplets of
            # the rating weights as well, but they were never passed on to
            # check_triplet_order, so both modes rank identically.  Wire the
            # weights through once the helper's signature is confirmed.
            distance = rating_clusters_distance
        else:
            # Previously an unknown value left `distance` unbound.
            raise ValueError(
                "{} is not a valid rating_distance".format(rating_distance))

        trip_rank_status = check_triplet_order(rating_trips,
                                               rating_distance=distance)
        expected_size = N

    def split_triplets(source):
        # Build the triplets for one field and return its three members as
        # separate arrays.
        if balanced:
            trips = make_balanced_trip(source, benign_filter_a,
                                       benign_filter_b, malign_filter_a,
                                       malign_filter_b)
        else:
            trips = arrange_triplet(select_triplets(source),
                                    trip_rank_status)
        return tuple(np.array([trip[k] for trip in trips]) for k in range(3))

    #   Handle Patches
    # =========================
    image_sub1, image_sub2, image_sub3 = split_triplets(images)

    #   Handle Masks
    # =========================
    mask_sub1, mask_sub2, mask_sub3 = split_triplets(masks)

    #   Handle Meta
    # =========================
    if return_meta:
        meta_sub1, meta_sub2, meta_sub3 = split_triplets(meta)

    #   Final touch
    # =========================

    size = image_sub1.shape[0]
    assert expected_size == size

    # One [0, 1] label per produced triplet (sized by the actual triplet
    # count, which differs from N in balanced mode).
    similarity_labels = np.tile([0, 1], (size, 1))

    confidence = np.repeat('SB', size)
    if objective == 'rating' and return_confidence in ("rating",
                                                       "rating_distance"):
        if trip_rank_status is None:
            # Previously this combination failed on an undefined name.
            raise ValueError(
                "return_confidence={!r} requires balanced=False".format(
                    return_confidence))
        if return_confidence == "rating":
            conf_trips = select_triplets(conf)
            conf_trips = arrange_triplet(conf_trips, trip_rank_status)
            confidence = np.array(get_triplet_confidence(conf_trips))
        else:
            confidence = np.array(
                calc_rating_distance_confidence(trip_rank_status))

    # One permutation shared by every output keeps the triplets aligned.
    new_order = np.random.permutation(size)

    # The third image/mask used to be dropped when meta data was requested;
    # both return shapes now carry full triplets.
    result = ((image_sub1[new_order], image_sub2[new_order],
               image_sub3[new_order]), similarity_labels[new_order],
              (mask_sub1[new_order], mask_sub2[new_order],
               mask_sub3[new_order]), confidence[new_order])
    if return_meta:
        result += ((reorder(meta_sub1, new_order),
                    reorder(meta_sub2, new_order),
                    reorder(meta_sub3, new_order)), )
    return result