Example no. 1
0
    def loss(self, batch, model, dataset):
        """Facility-location clustering loss for a batch.

        Selects cluster medoids greedily via augmented facility location,
        refines them with PAM, then penalizes the margin-adjusted gap
        between the predicted facility energy and the ground-truth
        cluster score. Returns a scalar tensor.
        """
        gamma = self.conf['loss']['gamma']
        score_type = 'nmi'

        embeddings = dataset.get_embeddings(batch, model, None)
        distances = compute_pairwise_distances(
            embeddings, embeddings, DistanceFunction.EUCLIDEAN_DISTANCE)
        _, labels = batch
        candidate_ids = tf.range(tf.shape(embeddings)[0])

        # Greedy medoid selection followed by PAM refinement.
        medoids = compute_augmented_facility_locations(
            distances, labels, candidate_ids, gamma, score_type)
        medoids = compute_augmented_facility_locations_pam(
            distances, labels, gamma, score_type, medoids)
        energy_pred = compute_facility_energy(distances, medoids)

        assignments = get_cluster_assignment(distances, medoids)
        score_pred = compute_clustering_score(labels, assignments, score_type)
        energy_gt = compute_gt_cluster_score(distances, labels)

        # Hinge with structured margin gamma * (1 - clustering score).
        result = tf.maximum(
            energy_pred + gamma * (1.0 - score_pred) - energy_gt, 0.0)
        result.set_shape([])
        return result
Example no. 2
0
    def get_pairwise_distances(self,
                               batch,
                               model,
                               distance_function,
                               training=True):
        """Return flattened (distances, match_matrix, weights) for a batch.

        With an 'npair' batch design, n-pair block distances are used;
        otherwise the upper-triangular part of the full pairwise matrix.
        Weights are inverted and raised to the configured q_bias power.
        """
        images, labels = batch
        embeddings = model(images, training=training)
        q_bias = self.conf['batch_design'].get('q_bias', 1.0)

        if self.conf['batch_design'].get('npair'):
            npair = self.conf['batch_design']['npair']
            distances, match_matrix = get_npair_distances(
                embeddings, npair, distance_function)
            weights = self.get_npair_pairwise_weights(
                labels, npair, model.extra_info)
            return (
                tf.reshape(distances, [-1]),
                tf.reshape(match_matrix, [-1]),
                tf.reshape(1 / weights, [-1])**q_bias,
            )

        group_size = self.conf['batch_design']['group_size']
        distances = compute_pairwise_distances(
            embeddings, embeddings, distance_function)
        match_matrix = pairwise_matching_matrix(labels, labels)
        weights = self.get_pairwise_weights(labels, group_size,
                                            model.extra_info)
        return (
            upper_triangular_part(distances),
            upper_triangular_part(match_matrix),
            upper_triangular_part(1 / weights)**q_bias,
        )
Example no. 3
0
 def testPairwiseCosineSimilarity(self):
     """Negated cosine distance of orthonormal rows is the identity."""
     basis = tf.constant([
         [0., 1.],
         [1., 0.],
     ])
     similarity = -compute_pairwise_distances(
         basis, basis, DistanceFunction.COSINE_SIMILARITY)
     expected = [
         [1., 0.],
         [0., 1.],
     ]
     self.assertAllEqual(similarity, expected)
Example no. 4
0
 def testPairwiseDotProduct(self):
     """Negated dot-product distance recovers the Gram matrix."""
     vectors = tf.constant([
         [0, 1],
         [0, 2],
         [0, 3],
     ])
     gram = -compute_pairwise_distances(vectors, vectors,
                                        DistanceFunction.DOT_PRODUCT)
     expected = [
         [1, 2, 3],
         [2, 4, 6],
         [3, 6, 9],
     ]
     self.assertAllEqual(gram, expected)
Example no. 5
0
 def testPairwiseEuclideanDifference(self):
     """Squared Euclidean distances between collinear points."""
     points = tf.constant([
         [0, 1],
         [0, 2],
         [0, 3],
     ])
     squared = compute_pairwise_distances(
         points, points, DistanceFunction.EUCLIDEAN_DISTANCE_SQUARED)
     expected = [
         [0, 1, 4],
         [1, 0, 1],
         [4, 1, 0],
     ]
     self.assertAllEqual(squared, expected)
Example no. 6
0
def get_npair_distances(embeddings, n, distance_function, transpose=False):
    """Compute n-pair distance blocks between interleaved embedding rows.

    Rows are treated as interleaved (anchor, positive) pairs: even row
    indices are anchors, odd indices positives. Returns the n-block view
    of (a) the anchor-vs-positive distance matrix and (b) the boolean
    identity matrix marking which entries are matching pairs.
    """
    num_pairs = int(embeddings.shape[0]) // 2
    anchor_idx = 2 * tf.range(num_pairs, dtype=tf.int64)
    positive_idx = anchor_idx + 1

    anchors = tf.gather(embeddings, anchor_idx)
    positives = tf.gather(embeddings, positive_idx)
    distances = compute_pairwise_distances(anchors, positives,
                                           distance_function)

    matches = tf.cast(tf.eye(num_pairs), tf.bool)
    return (get_n_blocks(distances, n, transpose=transpose),
            get_n_blocks(matches, n, transpose=transpose))
Example no. 7
0
 def testPairwiseDifference2(self):
     """Squared Euclidean distance works for rectangular (3x1) pairings."""
     lhs = tf.constant([
         [0, 1],
         [0, 2],
         [0, 3],
     ])
     rhs = tf.constant([
         [0, 1],
     ])
     distances = compute_pairwise_distances(
         lhs, rhs, DistanceFunction.EUCLIDEAN_DISTANCE_SQUARED)
     self.assertAllEqual(distances, [
         [0],
         [1],
         [4],
     ])
Example no. 8
0
 def get_raw_pairwise_distances(self,
                                batch,
                                model,
                                distance_function,
                                training=True):
     """Return the full (distances, match_matrix, weights) matrices.

     Matrices are returned whole, without flattening or reduction to the
     upper-triangular part. Weights are inverted and raised to the
     configured q_bias power.
     """
     images, labels = batch
     embeddings = model(images, training=training)

     distances = compute_pairwise_distances(
         embeddings, embeddings, distance_function)
     match_matrix = pairwise_matching_matrix(labels, labels)

     group_size = self.conf['batch_design']['group_size']
     weights = self.get_pairwise_weights(labels, group_size,
                                         model.extra_info)
     q_bias = self.conf['batch_design'].get('q_bias', 1.0)
     return (
         distances,
         match_matrix,
         (1 / weights)**q_bias,
     )
Example no. 9
0
def compute_recall(embeddings_list, labels_list, k_list, distance_function):
    """Compute recall@k for each k in k_list over batched embeddings.

    For each query embedding, checks whether any of its k nearest
    neighbours (self excluded) shares its label. Singleton classes are
    removed from the denominator since they can never be retrieved.
    Returns a dict mapping k -> recall.
    """
    successes = defaultdict(float)
    total = 0.
    num_singletons = 0
    data = list(zip(embeddings_list, labels_list))
    progress = tqdm(data,
                    total=len(embeddings_list),
                    desc='recall',
                    dynamic_ncols=True)
    max_k = max(k_list)
    for i, (embeddings, labels) in enumerate(progress):
        all_labels = []
        blocks = []
        for j, (other_embeddings, other_labels) in enumerate(data):
            all_labels.extend(other_labels.numpy())
            block = compute_pairwise_distances(
                embeddings, other_embeddings, distance_function)
            if i == j:
                # Push self-distances to ~1e6 so a point never
                # retrieves itself.
                block = block + tf.eye(int(block.shape[0])) * 1e6
            blocks.append(block)

        # Negate so top_k finds the smallest distances.
        _, neighbor_indices = tf.nn.top_k(-tf.concat(blocks, axis=1), max_k)
        top_labels = tf.gather(tf.constant(all_labels, tf.int64),
                               neighbor_indices)
        for k in k_list:
            hits_per_query = tf.reduce_sum(
                tf.cast(
                    tf.equal(tf.transpose(labels[None]), top_labels[:, 0:k]),
                    tf.int32),
                axis=1)
            successes[k] += int(sum(tf.cast(hits_per_query >= 1, tf.int32)))
        total += int(embeddings.shape[0])
        # all_labels spans the whole dataset each iteration, so this is
        # the full singleton count (recomputed, not accumulated).
        num_singletons = count_singletons(all_labels)
    return {
        k: hit_count / float(total - num_singletons)
        for k, hit_count in successes.items()
    }
Example no. 10
0
    def create_dataset(self,
                       model,
                       image_files,
                       labels,
                       batch_conf,
                       testing=False):
        """Build a tf.data dataset of pre-sampled batch elements.

        Filters out classes with too few images, optionally precomputes
        per-class weights for negative class mining (stored in
        ``self.cache['class_weights']``), then materializes
        ``num_batches * combine_batches`` batches worth of elements.

        Returns a ``(dataset, num_elements)`` tuple.
        """
        # Count how many images each label has.
        data_map = defaultdict(int)
        for image_file, label in zip(image_files, labels):
            data_map[label] += 1
        # A class must contribute at least group_size images (or the
        # configured floor, whichever is larger) to be usable.
        min_images_per_class = max(
            self.conf['dataset'].get('min_images_per_class', 1),
            self.conf['batch_design']['group_size'])
        data_map = dict(
            filter(lambda x: x[1] >= min_images_per_class, data_map.items()))
        model.extra_info['num_images'] = sum([y for x, y in data_map.items()])
        model.extra_info['num_labels'] = len(data_map)
        if batch_conf.get('negative_class_mining'):
            # Embed the entire dataset with a plain 'vanilla' batch design
            # so per-class embedding statistics can be computed.
            data_loader = DataLoader.create(self.conf['dataset']['name'],
                                            self.conf)
            batch_design = BatchDesign.create('vanilla', self.conf,
                                              {'data_loader': data_loader})
            # NOTE(review): hard-coded embedding batch size — consider
            # making this configurable.
            batch_size = 48
            test_dataset, num_testcases = batch_design.create_dataset(
                model,
                image_files,
                labels, {
                    'name': 'vanilla',
                    'batch_size': batch_size
                },
                testing=True)
            test_dataset = test_dataset.batch(batch_size)
            batches = tqdm(test_dataset,
                           total=math.ceil(num_testcases / batch_size),
                           desc='embedding',
                           dynamic_ncols=True)
            embeddings_list = []
            labels_list = []
            for mini_images, mini_labels in batches:
                embeddings = model(mini_images, training=False)
                embeddings_list.append(embeddings)
                labels_list.append(mini_labels)
            full_embeddings = tf.concat(embeddings_list, axis=0)
            full_labels = tf.concat(labels_list, axis=0)
            # Per class: centroid and mean squared distance of its
            # embeddings to that centroid (within-class spread).
            # NOTE(review): assumes labels are dense ints 0..num_labels-1
            # after filtering — confirm against get_next_batch.
            means = []
            mean_distances = []
            num_labels = model.extra_info['num_labels']
            for label in range(num_labels):
                label_embeddings = tf.boolean_mask(
                    full_embeddings, tf.equal(full_labels, label))
                label_embeddings_mean = tf.reduce_mean(label_embeddings,
                                                       axis=0)
                means.append(label_embeddings_mean)
                mean_distances.append(
                    float(
                        tf.reduce_mean(
                            tf.reduce_sum(tf.square(label_embeddings -
                                                    label_embeddings_mean),
                                          axis=1))))
            # Distance from each class centroid to the nearest *other*
            # centroid; the class's own entry is masked out with +1e6.
            closest_inter_distances = []
            for index, mean in enumerate(means):
                same_labels = tf.eye(num_labels)[index:index + 1]
                closest_inter_distances.append(
                    float(
                        tf.reduce_min(compute_pairwise_distances(
                            mean[None], tf.stack(means),
                            DistanceFunction.EUCLIDEAN_DISTANCE) +
                                      same_labels * 1e6,
                                      axis=1)))
            # Weight = within-class spread / nearest-class gap, clipped to
            # [0.2, 5]: harder (overlapping) classes get larger weights.
            within_mean_distances = stable_sqrt(tf.constant(mean_distances))
            self.cache['class_weights'] = tf.clip_by_value(
                within_mean_distances / tf.constant(closest_inter_distances),
                0.2, 5)

        # Materialize all batch elements up front.
        data = []
        for _ in range(batch_conf['num_batches'] *
                       batch_conf.get('combine_batches', 1)):
            elements = self.get_next_batch(image_files, labels, batch_conf)
            data += elements

        return tf.data.Dataset.zip(
            self._create_datasets_from_elements(data, testing), ), len(data)