Example no. 1
def vectorized_negatives_loss(image_outputs, audio_outputs, negatives_output,
                              nframes, margin, symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair using the specific negative, in a
    vectorized way
    """
    # I = image_outputs.view(image_outputs.size(0), embedding_dim)
    # A = audio_outputs.view(audio_outputs.size(0), embedding_dim)
    # num_negatives = len(negatives_output)
    num_units = image_outputs.size(1)
    output_loss = []

    n = image_outputs.size(0)
    loss = torch.zeros(1, requires_grad=True).type(image_outputs.data.type())
    first = 0
    last = num_units
    # first = (j*num_units)//num_negatives
    # last = ((j+1)*num_units)//num_negatives
    for i in range(n):
        nF = nframes[i]
        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i][first:last],
                                   audio_outputs[i][first:last, :,
                                                    0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(negatives_output[i][first:last],
                                   audio_outputs[i][first:last, :,
                                                    0:nF], symfun))
        I2A_simdif = margin + Aimpsim - anchorsim
        output_loss.append(I2A_simdif)

    return output_loss
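
Throughout these examples, `utils.compute_matchmap` and `utils.matchmap_sim` are the two central helpers. A minimal sketch of what they are assumed to compute (the repository's versions may differ in normalization and in how `symfun` is applied):

import torch

def compute_matchmap(image_feat, audio_feat, symfun=None):
    # image_feat: (D, H, W) spatial image embeddings
    # audio_feat: (D, 1, T) temporal audio embeddings
    # Result: (H, W, T) tensor of dot products between every image location
    # and every audio frame.
    D = image_feat.size(0)
    T = audio_feat.size(-1)
    return torch.einsum('dhw,dt->hwt', image_feat, audio_feat.view(D, T))

def matchmap_sim(matchmap):
    # Collapse the matchmap into one scalar similarity; the mean over all
    # cells is one common choice in audio-visual grounding work.
    return matchmap.mean()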
Example no. 2
def combined_random_sampled_margin_rank_loss(image_outputs, audio_outputs,
                                             negatives_output, nframes, margin,
                                             symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair using both a random negative from the
    batch of positive images and the specific negative for the image. The returned loss for each sample is the
    higher of the two
    """
    # I = image_outputs.view(image_outputs.size(0), embedding_dim)
    # A = audio_outputs.view(audio_outputs.size(0), embedding_dim)
    n = image_outputs.size(0)

    loss = torch.zeros(1, requires_grad=True).type(image_outputs.data.type())

    for i in range(n):
        I_imp_ind = i
        A_imp_ind = i
        while I_imp_ind == i:
            I_imp_ind = np.random.randint(0, n)
        while A_imp_ind == i:
            A_imp_ind = np.random.randint(0, n)
        nF = nframes[i]
        nFimp = nframes[A_imp_ind]

        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Iimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[I_imp_ind],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[A_imp_ind][:, :,
                                                            0:nFimp], symfun))

        anchorsim_neg = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim_neg = utils.matchmap_sim(
            utils.compute_matchmap(negatives_output[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        I2A_simdif_neg = margin + Aimpsim_neg - anchorsim_neg

        A2I_simdif = margin + Iimpsim - anchorsim
        if (A2I_simdif.data > 0).all():
            loss = loss + A2I_simdif

        I2A_simdif = margin + Aimpsim - anchorsim
        if (I2A_simdif.data > 0).all():
            loss = loss + torch.max(I2A_simdif, I2A_simdif_neg)

    loss = loss / n
    return loss
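
The expected input shapes, inferred from how the tensors are indexed above. A toy call, assuming the repository's `utils` helpers are importable; the shapes and values below are made up:

import torch

# image_outputs:    (B, D, H, W) spatial image embeddings
# audio_outputs:    (B, D, 1, T) temporal audio embeddings
# negatives_output: (B, D, H, W) embeddings of the edited negative images
# nframes:          (B,) number of valid audio frames per sample
B, D, H, W, T = 4, 512, 7, 7, 128
image_outputs = torch.randn(B, D, H, W)
audio_outputs = torch.randn(B, D, 1, T)
negatives_output = torch.randn(B, D, H, W)
nframes = torch.full((B,), T, dtype=torch.long)
loss = combined_random_sampled_margin_rank_loss(image_outputs, audio_outputs,
                                                negatives_output, nframes,
                                                margin=1.0, symfun=None)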
Example no. 3
    def segment_images_iter(self):
        images = {}
        audios = {}
        counter_images = 0
        for batch_id, (image_input, audio_input, _, nframes, path, image_raw) in enumerate(self.dataloader):

            v_init = self.z[int(path[0])]
            z_img = torch.FloatTensor(audio_input.size(0), v_init.shape[0])

            for k in range(audio_input.size(0)):
                z_img[k, :] = self.z[int(path[k])]

            image_input = self.generator.generate_images(z_img, intervention=None)
            image_input = utils.transform(image_input)

            audio_input = audio_input.cuda(non_blocking=True)  # async is a reserved word in Python 3.7+

            model_output = self.model(image_input, audio_input, [])
            image_output = model_output[0]
            audio_output = model_output[1]

            pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
            nframes = nframes.div(pooling_ratio)

            # Compute the matchmap to find the locations of the important concepts we want to cluster (this time in the image)
            for i in range(image_input.shape[0]):
                nF = nframes[i]
                matchmap_i = utils.compute_matchmap(image_output[i], audio_output[i][:, :, 0:nF])
                matchmap_i_mean = matchmap_i.mean(2).view(-1)
                indexes = np.where((matchmap_i_mean > 0.9 * matchmap_i_mean.max()).cpu())[0]  # move to CPU before numpy
                features_im = image_output[i].view(image_output.shape[1], -1)[..., indexes].cpu().numpy()

                product = np.matmul(self.centroids, features_im)

                # For each selected superpixel in the image, find top 5 concepts
                seg_image = {}
                for j, index in enumerate(indexes):
                    clust = np.argsort(-product[:, j])[:5]
                    seg_image[index] = clust
                images[path[i]] = seg_image

                # Also for the audio, for testing purposes
                matchmap_i_max = matchmap_i.max(1)[0].max(0)[0]
                indexes = np.where((matchmap_i_max > 0.9 * matchmap_i_max.max()).cpu())[0]  # move to CPU before numpy
                features_au = audio_output[i].view(audio_output.shape[1], -1)[..., indexes].cpu().numpy()

                product = np.matmul(self.centroids, features_au)

                # For each selected time frame in the audio, find top 5 concepts
                seg_audio = {}
                for j, index in enumerate(indexes):
                    clust = np.argsort(-product[:, j])[:5]
                    seg_audio[index + 20] = clust
                audios[path[i]] = seg_audio

                counter_images += 1
                if counter_images >= self.num_images_segment:
                    return images, audios

        return images, audios
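
The concept assignment inside the loop is a dot product between each selected feature vector and every cluster centroid, followed by a descending sort. Standalone, with hypothetical shapes:

import numpy as np

centroids = np.random.randn(50, 512)   # (n_concepts, D) cluster centroids (hypothetical)
feature = np.random.randn(512)         # feature of one selected image location
scores = centroids @ feature           # similarity of the location to every concept
top5 = np.argsort(-scores)[:5]         # the 5 best-matching concept ids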
Example no. 4
def sampled_margin_rank_loss(image_outputs, audio_outputs, nframes, margin,
                             symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair
    The impostor image/caption is randomly sampled from the minibatch
    """
    # I = image_outputs.view(image_outputs.size(0), embedding_dim)
    # A = audio_outputs.view(audio_outputs.size(0), embedding_dim)
    n = image_outputs.size(0)

    loss = torch.zeros(1, requires_grad=True).type(image_outputs.data.type())

    for i in range(n):
        I_imp_ind = i
        A_imp_ind = i
        while I_imp_ind == i:
            I_imp_ind = np.random.randint(0, n)
        while A_imp_ind == i:
            A_imp_ind = np.random.randint(0, n)
        nF = nframes[i]
        nFimp = nframes[A_imp_ind]

        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Iimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[I_imp_ind],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[A_imp_ind][:, :,
                                                            0:nFimp], symfun))

        A2I_simdif = margin + Iimpsim - anchorsim
        if (A2I_simdif.data > 0).all():
            loss = loss + A2I_simdif

        I2A_simdif = margin + Aimpsim - anchorsim
        if (I2A_simdif.data > 0).all():
            loss = loss + I2A_simdif

    loss = loss / n
    return loss
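
The conditional accumulation above is the usual hinge of the margin ranking loss; for this function, where the similarity differences are scalars, the two `if` branches are exactly equivalent to clamping at zero. A sketch using the per-sample variables from the loop above:

# Per-sample terms written with clamp instead of the conditional add:
A2I_term = torch.clamp(margin + Iimpsim - anchorsim, min=0)
I2A_term = torch.clamp(margin + Aimpsim - anchorsim, min=0)
loss = loss + A2I_term + I2A_term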
    def segment_batch(self, tensor_images, downsample=1):
        '''
        Returns a multilabel segmentation for the given batch of (RGB [-1...1])
        images.  Each pixel of the result is a torch.long indicating a
        predicted class number.  Multiple classes can be predicted for
        the same pixel: output shape is (n, multipred, y, x), where
        multipred is 3, 5, or 6, for how many different predicted labels can
        be given for each pixel (depending on whether subdivision is being
        used).  If downsample is specified, then the output y and x dimensions
        are downsampled from the original image.
        '''
        output_images, _, _ = self.model(tensor_images, None,
                                         [])  # N x 512 x W x H
        output_seg = []
        t = 1.5

        for i in range(len(output_images)):
            c_trans = torch.transpose(self.clusters, 0, 1)
            c_trans = c_trans[:, None, :]  # 512 x T
            clust_mean = self.mean_clust.view(-1, 1, 1)
            clust_mean = clust_mean.expand(output_images[i].size(0),
                                           output_images[i].size(1),
                                           output_images[i].size(2))
            std_clust = self.std_clust.view(-1, 1, 1)
            std_clust = std_clust.expand(output_images[i].size(0),
                                         output_images[i].size(1),
                                         output_images[i].size(2))

            im_normalized = (output_images[i] - clust_mean) / (std_clust +
                                                               1e-8)

            matchmap = utils.compute_matchmap(im_normalized,
                                              c_trans)  # H x W x N_clusters

            matchmap = matchmap.permute(2, 0, 1)  # N_c x H x W

            matchmap = torch.nn.functional.interpolate(matchmap[None, :, :, :],
                                                       size=(64, 64),
                                                       mode='bilinear')[0]

            matchmap = nn.Threshold(self.threshold, 0)(matchmap)
            matchmap = -nn.Threshold(-0.1, -1)(-matchmap)

            seg = torch.zeros(self.clusters.size(0), matchmap.size(1),
                              matchmap.size(2)).long().cuda()
            for c in range(self.clusters.size(0)):
                seg[c, :, :] = ((c + 1) * matchmap[c, :, :]).long()
            output_seg.append(seg)

        output_seg = torch.stack(output_seg)
        return output_seg
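
The two chained `nn.Threshold` calls binarize each cluster's matchmap: the first zeroes everything below `self.threshold`, and the negated second one maps whatever survived to 1. A small sketch of the same effect (equivalent whenever `self.threshold >= 0.1`):

import torch
import torch.nn as nn

x = torch.tensor([0.05, 0.30, 0.80])
threshold = 0.25

y = nn.Threshold(threshold, 0)(x)   # values <= threshold become 0
y = -nn.Threshold(-0.1, -1)(-y)     # values that survived (>= 0.1) become 1
# y is numerically [0., 1., 1.] -- the same mask as:
y_alt = (x > threshold).float()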
    def predict_single_class(self, tensor_images, classnum, downsample=1):
        '''
        Given a batch of images (RGB, normalized to [-1...1]) and
        a specific segmentation class number, returns a tuple with
           (1) a differentiable ([0..1]) prediction score for the class
               at every pixel of the input image.
           (2) a binary mask showing where in the input image the
               specified class is the best-predicted label for the pixel.
        Does not work on subdivided labels.
        '''
        output_images, _, _ = self.model(tensor_images, None,
                                         [])  # N x 512 x W x H
        output_seg = []
        for i in range(len(output_images)):
            c_trans = torch.transpose(self.clusters, 0, 1)
            c_trans = c_trans[:, None, :]  # 512 x T
            clust_mean = self.mean_clust.view(-1, 1, 1)
            clust_mean = clust_mean.expand(output_images[i].size(0),
                                           output_images[i].size(1),
                                           output_images[i].size(2))
            std_clust = self.std_clust.view(-1, 1, 1)
            std_clust = std_clust.expand(output_images[i].size(0),
                                         output_images[i].size(1),
                                         output_images[i].size(2))
            im_normalized = (output_images[i] - clust_mean) / (std_clust +
                                                               1e-8)

            matchmap = utils.compute_matchmap(im_normalized,
                                              c_trans[:, :, classnum:classnum +
                                                      1])  # H x W x N_clusters

            matchmap = matchmap[:, :, classnum - 1]
            matchmap = nn.Threshold(self.threshold, 0)(matchmap)
            matchmap = matchmap / (torch.sum(matchmap[:]) + 1e-8)
            output_seg.append(matchmap)

        output_seg = torch.stack(output_seg)
        return output_seg
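
The per-channel standardization done with `view`/`expand` in both methods can also be written with broadcasting; a short sketch with toy values, giving the same result:

import torch

im = torch.randn(512, 7, 7)     # (D, H, W) feature map (toy values)
mean_clust = torch.randn(512)   # per-channel cluster mean
std_clust = torch.rand(512)     # per-channel cluster std
# Broadcasting performs the expand implicitly:
im_normalized = (im - mean_clust.view(-1, 1, 1)) / (std_clust.view(-1, 1, 1) + 1e-8)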
Example no. 7
def hard_negative_loss(image_outputs, audio_outputs, nframes, margin, symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair using the hardest sample from the
    batch of positive images
    """
    # I = image_outputs.view(image_outputs.size(0), embedding_dim)
    # A = audio_outputs.view(audio_outputs.size(0), embedding_dim)
    n = image_outputs.size(0)

    with torch.no_grad():
        N = image_outputs.size(0)
        similarity_loss = torch.zeros(N, N, requires_grad=False).type(
            image_outputs.data.type())
        D = image_outputs.size(1)
        H = image_outputs.size(2)
        W = image_outputs.size(3)
        T = audio_outputs.size(3)
        image_outputs_hard = image_outputs.detach()
        audio_outputs_hard = audio_outputs.detach()
        image_outputs_hard = image_outputs_hard.view(N, 1, D, H, W).expand(
            N, N, D, H, W).contiguous().view(-1, D, H, W)
        audio_outputs_hard = audio_outputs_hard.view(1, N, D, 1, T).expand(
            N, N, D, 1, T).contiguous().view(-1, D, 1, T)
        match_hard = utils.compute_matchmap_vectorized(
            image_outputs_hard, audio_outputs_hard).view(N, N, H, W, T)
        match_hard, _ = match_hard.max(3)
        match_hard, _ = match_hard.max(2)
        for i in range(N):
            similarity_loss[:, i] = match_hard[:, i, 0:nframes[i]].mean(1)

    loss = torch.zeros(1, requires_grad=True).type(image_outputs.data.type())
    for i in range(n):
        _, rank_image = torch.topk(similarity_loss[:, i], 3)
        _, rank_audio = torch.topk(similarity_loss[i, :], 3)

        I_imp_ind = rank_image[0]
        A_imp_ind = rank_audio[0]
        if I_imp_ind == i:
            I_imp_ind = rank_image[1]

        if A_imp_ind == i:
            A_imp_ind = rank_audio[1]

        nF = nframes[i]
        if A_imp_ind < nframes.size(0):
            nFimp = nframes[A_imp_ind]
        else:
            nFimp = 16

        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Iimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[I_imp_ind],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[A_imp_ind][:, :,
                                                            0:nFimp], symfun))
        A2I_simdif = margin + Iimpsim - anchorsim

        if (A2I_simdif.data > 0).all():
            loss = loss + A2I_simdif
        I2A_simdif = margin + Aimpsim - anchorsim

        if (I2A_simdif.data > 0).all():
            loss = loss + I2A_simdif

    loss = loss / n
    return loss
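
`utils.compute_matchmap_vectorized` is assumed to be the batched counterpart of `compute_matchmap`: given flattened batches of image and audio embeddings, it returns one (H, W, T) matchmap per pair. A possible einsum-based sketch, not necessarily the repository's implementation:

import torch

def compute_matchmap_vectorized(image_feats, audio_feats):
    # image_feats: (B, D, H, W), audio_feats: (B, D, 1, T)
    B, D, H, W = image_feats.shape
    T = audio_feats.shape[-1]
    # one (H, W, T) matchmap per element of the batch -> (B, H, W, T)
    return torch.einsum('bdhw,bdt->bhwt', image_feats, audio_feats.view(B, D, T))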
Example no. 8
def combined_random_hard_negative_loss(image_outputs, audio_outputs,
                                       negatives_output, nframes, margin,
                                       symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair using both the hardest negative in the
    batch of positive images and the specific negative for the image. Returns the higher of the two losses for each
    sample
    """
    # I = image_outputs.view(image_outputs.size(0), embedding_dim)
    # A = audio_outputs.view(audio_outputs.size(0), embedding_dim)
    n = image_outputs.size(0)

    with torch.no_grad():
        N = image_outputs.size(0)
        similarity_loss = torch.zeros(N, N, requires_grad=False).type(
            image_outputs.data.type())
        D = image_outputs.size(1)
        H = image_outputs.size(2)
        W = image_outputs.size(3)
        T = audio_outputs.size(3)
        image_outputs_hard = image_outputs.detach()
        audio_outputs_hard = audio_outputs.detach()
        image_outputs_hard = image_outputs_hard.view(N, 1, D, H, W).expand(
            N, N, D, H, W).contiguous().view(-1, D, H, W)
        audio_outputs_hard = audio_outputs_hard.view(1, N, D, 1, T).expand(
            N, N, D, 1, T).contiguous().view(-1, D, 1, T)
        match_hard = utils.compute_matchmap_vectorized(
            image_outputs_hard, audio_outputs_hard).view(N, N, H, W, T)
        match_hard, _ = match_hard.max(3)
        match_hard, _ = match_hard.max(2)
        for i in range(N):
            similarity_loss[:, i] = match_hard[:, i, 0:nframes[i]].mean(1)

    loss = torch.zeros(1, requires_grad=True).type(image_outputs.data.type())
    for i in range(n):
        if n >= 2:
            _, rank_image = torch.topk(similarity_loss[:, i], 2)
            _, rank_audio = torch.topk(similarity_loss[i, :], 2)
            I_imp_ind = rank_image[0]
            A_imp_ind = rank_audio[0]
            if I_imp_ind == i:
                I_imp_ind = rank_image[1]

            if A_imp_ind == i:
                A_imp_ind = rank_audio[1]

            I_imp_ind = max(min(image_outputs.size(0) - 1, I_imp_ind), 0)
            A_imp_ind = max(min(image_outputs.size(0) - 1, A_imp_ind), 0)
        else:
            I_imp_ind = 0
            A_imp_ind = 0

            # I_imp_ind = max(min(image_outputs.size(0)-1,I_imp_ind),0)
            # A_imp_ind =max(min(image_outputs.size(0)-1,A_imp_ind),0)

        nF = nframes[i]
        if A_imp_ind < nframes.size(0):
            nFimp = nframes[A_imp_ind]
        else:
            nFimp = 16

        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Iimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[I_imp_ind],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[A_imp_ind][:, :,
                                                            0:nFimp], symfun))
        A2I_simdif = margin + Iimpsim - anchorsim

        anchorsim_neg = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim_neg = utils.matchmap_sim(
            utils.compute_matchmap(negatives_output[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        I2A_simdif_neg = margin + Aimpsim_neg - anchorsim_neg

        if (A2I_simdif.data > 0).all():
            loss = loss + A2I_simdif

        I2A_simdif = margin + Aimpsim - anchorsim

        if (I2A_simdif.data > 0).all():
            loss = loss + torch.max(I2A_simdif, I2A_simdif_neg)

    loss = loss / n
    return loss
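
Taking the top-2 similarities and skipping the anchor's own index is one way to pick the hardest non-anchor impostor; masking the diagonal first is an alternative that avoids the special case (a sketch over the `similarity_loss` matrix from the loop above, not the repository's code):

sim = similarity_loss.clone()
sim.fill_diagonal_(float('-inf'))   # the anchor can never be its own impostor
I_imp_ind = sim[:, i].argmax()      # hardest image for audio i
A_imp_ind = sim[i, :].argmax()      # hardest audio for image i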
Example no. 9
    def train_epoch(self, epoch):
        """
        Train one epoch. It consists of five steps:
        Step 1: Compute the output of the positive image
        Step 2: Compute the mask for the positive image features
        Step 3: Generate the negative image from this mask
        Step 4: Compute the output of this negative
        Step 5: Compute all the losses
        After these steps, backpropagate and update the weights.
        """
        if not self.args.use_cpu:
            torch.cuda.synchronize()
        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()
        losses_meter = utils.AverageMeter()

        # Switch to train mode
        self.model.train()

        end = time.time()
        N_examples = self.loaders['train'].dataset.__len__()

        loss_list_total = {
            'loss_regular': 0,
            'loss_neg': 0,
            'loss_hardneg': 0,
            'loss_total': 0
        }
        for batch_id, (image_input, audio_input, neg_images, nframes, path,
                       image_raw) in enumerate(self.loaders['train']):
            loss_list = {
                'loss_regular': 0,
                'loss_neg': 0,
                'loss_hardneg': 0,
                'loss_total': 0
            }

            # Measure data loading time
            data_time.update(time.time() - end)

            if not self.args.use_cpu:
                audio_input = audio_input.cuda(non_blocking=True)  # async is a reserved word in Python 3.7+

            if not self.args.loading_image:
                path_ints = [p.split('/')[-1] for p in path
                             ]  # in case the audio is inside a subfolder

                v_init = self.z[int(path_ints[0])]
                z_img = torch.FloatTensor(image_input.size(0), v_init.shape[0])

                for k in range(image_input.size(0)):
                    z_img[k, :] = self.z[int(path_ints[k])]

                image_input = self.generator.generate_images(z_img,
                                                             intervention=None)
                image_input = utils.transform(image_input).detach()

            else:
                image_input = image_input.cuda()
                neg_images = neg_images.cuda()

            # STEP 1: Compute output positive
            model_output = self.model(image_input, audio_input, [])
            image_output = model_output[0]
            audio_output = model_output[1]

            neg_images = []

            pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
            nframes.div_(pooling_ratio)

            binary_mask_0 = None

            # Only do steps 2-4 if we want to train with semantic negatives
            if self.loss_type == 'negatives_edited' or self.loss_type == 'negatives_both':
                # STEP 2: Compute mask from image features
                limits = np.zeros((image_input.size(0), 2))

                for i in range(image_input.size(0)):
                    pos_image = image_input[i, :, :, :]

                    nF = nframes[i]

                    matchmap = utils.compute_matchmap(
                        image_output[i], audio_output[i][:, :, :nF])

                    matchmap = matchmap.data.cpu().numpy().copy()

                    matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                    matchmap = matchmap / (matchmap.max() + 1e-10)
                    matchmap_image = matchmap.max(axis=0)
                    threshold = 0.95

                    # ind_max = np.argmax(matchmap_image)
                    ind_max = np.argmax(matchmap)
                    ind_t = ind_max // (matchmap.shape[2] * matchmap.shape[1])
                    ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1])
                             ) // matchmap.shape[1]
                    ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1])
                             ) % matchmap.shape[1]

                    limits[i, 0] = ind_t
                    limits[i, 1] = ind_t + 1

                    if self.clustering:
                        if self.args.active_learning and 'active' in path[i]:
                            neg_img = active_learning.get_negatives(
                                self, path_ints[i])

                        else:
                            v = (image_output[i][:, ind_h, ind_w] -
                                 self.mean_clust.cuda()) / (
                                     self.std_clust.cuda() + 1e-8)

                            normalized_clusters = np.matmul(
                                self.clusters.cpu(),
                                v.detach().cpu().numpy().transpose())
                            sorted_val = -np.sort(-normalized_clusters[:])
                            sorted_val = np.clip(sorted_val, 0, 4)
                            if np.sum(sorted_val) <= 0:
                                print(
                                    "None of the clusters was close to the image feature. If this happens regularly, "
                                    "it probably means they were low quality clusters. Did you pretrain with a "
                                    "regular loss before clustering?")
                            prob_samples = sorted_val / np.sum(sorted_val)
                            sorted_id = np.argsort(-normalized_clusters[:])
                            cluster_id = sorted_id[0]

                            norm = 0
                            threshold_random = 0.95

                            # The number of units to be ablated grows if we cannot generate a good (changed) negative
                            # The following numbers are the starting number of units to change
                            num_units_dict = {
                                'layer2': 30,
                                'layer3': 30,
                                'layer4': 140,
                                'layer5': 30,
                                'layer6': 30
                            }
                            thresold_heatmap = threshold

                            count = 0
                            binary_mask_eval = matchmap_image > (
                                thresold_heatmap * matchmap_image.max())
                            binary_mask_eval = utils.geodesic_dilation(
                                binary_mask_eval, (ind_h, ind_w))
                            binary_mask_eval = cv2.resize(
                                binary_mask_eval, (128, 128))
                            bmask = torch.Tensor(binary_mask_eval).cuda()
                            bmask = bmask.view(1, 128, 128).expand(3, 128, 128)

                            while norm < threshold_random:
                                with torch.no_grad():
                                    binary_mask = matchmap_image > (
                                        thresold_heatmap *
                                        matchmap_image.max())
                                    binary_mask = utils.geodesic_dilation(
                                        binary_mask, (ind_h, ind_w))

                                    if binary_mask_0 is None:
                                        binary_mask_0 = cv2.resize(
                                            binary_mask, (224, 224))

                                    # STEP 3: Generate new image
                                    z_img = self.z[int(path_ints[i])]
                                    z_img = z_img[np.newaxis, :]

                                    _ = self.generator.generate_images(z_img)
                                    intervention = {}
                                    for layer_n in self.layer_list_all:
                                        units_ids = self.layers_units[layer_n][
                                            cluster_id][:num_units_dict[
                                                layer_n]]
                                        layer_size = self.layers_dict[layer_n][
                                            'size']
                                        layer_dim = self.layers_dict[layer_n][
                                            'depth']

                                        ablation, replacement = self.get_ablation_replacement(
                                            params=[layer_dim, units_ids],
                                            option='specific')
                                        ablation_final = cv2.resize(
                                            binary_mask,
                                            (layer_size, layer_size))
                                        ablation_final = np.tile(
                                            ablation_final,
                                            (layer_dim, 1, 1)).astype(
                                                np.float32)
                                        ablation_final = torch.cuda.FloatTensor(
                                            ablation_final)
                                        ablation_final = ablation.view(
                                            layer_dim, 1,
                                            1).expand_as(ablation_final
                                                         ) * ablation_final
                                        intervention[layer_n] = (
                                            ablation_final, replacement)

                                    neg_img = self.generator.generate_images(
                                        z_img,
                                        intervention=intervention).detach()
                                    neg_img_t = utils.transform(
                                        neg_img).detach()

                                    norm = (neg_img_t[0, :, :, :] -
                                            pos_image.detach())
                                    norm = norm * bmask
                                    norm = torch.norm(torch.norm(torch.norm(
                                        norm, dim=2),
                                                                 dim=1),
                                                      dim=0)
                                    norm_normalized = norm / torch.norm(
                                        torch.norm(torch.norm(
                                            pos_image.detach() * bmask, dim=2),
                                                   dim=1),
                                        dim=0)
                                    norm = norm_normalized.item()
                                    for layer_n in self.layer_list_all:
                                        num_units_dict[layer_n] = num_units_dict[
                                            layer_n] + 40  # increase units to change
                                    thresold_heatmap = thresold_heatmap - 0.1
                                    threshold_random = threshold_random - 0.05

                                    cluster_id = np.random.choice(
                                        sorted_id, size=1, p=prob_samples)[0]

                                    count = count + 1

                    else:  # random edited negatives
                        binary_mask = matchmap_image > (threshold *
                                                        matchmap_image.max())
                        binary_mask = utils.geodesic_dilation(
                            binary_mask, (ind_h, ind_w))
                        if binary_mask_0 is None:
                            binary_mask_0 = cv2.resize(binary_mask, (224, 224))
                        norm = 0
                        threshold_random = 0.95
                        p = 0.4

                        while norm < threshold_random:
                            with torch.no_grad():
                                intervention = {}

                                for layer_n in self.layer_list_all:
                                    layer_size = self.layers_dict[layer_n][
                                        'size']
                                    layer_dim = self.layers_dict[layer_n][
                                        'depth']

                                    ablation, replacement = self.get_ablation_replacement(
                                        params=[layer_dim, True, 0.5],
                                        option='random')
                                    ablation_final = cv2.resize(
                                        binary_mask, (layer_size, layer_size))
                                    ablation_final = np.tile(
                                        ablation_final,
                                        (layer_dim, 1, 1)).astype(np.float32)
                                    ablation_final = torch.cuda.FloatTensor(
                                        ablation_final)
                                    ablation_final = ablation.view(
                                        layer_dim, 1, 1).expand_as(
                                            ablation_final) * ablation_final
                                    intervention[layer_n] = (ablation_final,
                                                             replacement)

                                # STEP 3: Generate new image
                                z_img = self.z[int(path_ints[i])]
                                z_img = z_img[np.newaxis, :].detach()
                                neg_img = self.generator.generate_images(
                                    z_img, intervention=intervention).detach()
                                neg_img_t = utils.transform(neg_img).detach()

                                binary_mask = cv2.resize(
                                    binary_mask, (128, 128))

                                bmask = torch.Tensor(binary_mask).cuda()

                                bmask = bmask.view(1, 128,
                                                   128).expand(3, 128, 128)
                                norm = (neg_img_t[0, :, :, :] -
                                        pos_image.detach())

                                norm = norm * bmask
                                norm = torch.norm(torch.norm(torch.norm(norm,
                                                                        dim=2),
                                                             dim=1),
                                                  dim=0)
                                norm_normalized = norm / torch.norm(torch.norm(
                                    torch.norm(pos_image.detach() * bmask,
                                               dim=2),
                                    dim=1),
                                                                    dim=0)
                                norm = norm_normalized.item()

                                if random.random() > 0.2:
                                    p = p + 0.05
                                else:
                                    threshold_random = threshold_random - 0.01

                    neg_images.append(neg_img)

                neg_images = torch.cat(neg_images)
                neg_images_t = utils.transform(neg_images)
                # print(neg_images_t.size())

                # STEP 4: Compute output negative
                image_output_neg, _, _ = self.model(neg_images_t, None, [])

            # STEP 5: Compute losses
            if self.args.active_learning:
                image_output, image_output_neg = active_learning.switch_pos_neg(
                    self, image_input, image_output, image_output_neg, path)

            if self.loss_type == 'regular':
                loss = losses.sampled_margin_rank_loss(image_output,
                                                       audio_output, nframes,
                                                       self.margin,
                                                       self.args.symfun)
                loss_list['loss_regular'] = loss.item()
                loss_list['loss_total'] = loss.item()

            elif self.loss_type == 'negatives_edited':  # train with semantic negatives
                loss_regular = losses.sampled_margin_rank_loss(
                    image_output, audio_output, nframes, self.margin,
                    self.args.symfun)
                loss_neg = losses.negatives_loss(image_output, audio_output,
                                                 image_output_neg, nframes,
                                                 self.margin, self.args.symfun)
                loss = loss_regular + loss_neg
                loss_list['loss_regular'] = loss_regular.item()
                loss_list['loss_neg'] = loss_neg.item()
                loss_list['loss_total'] = loss.item()

            elif self.loss_type == 'negatives_hard':  # train with hard negatives
                loss_regular = losses.sampled_margin_rank_loss(
                    image_output, audio_output, nframes, self.margin,
                    self.args.symfun)
                loss_neg = losses.hard_negative_loss(image_output,
                                                     audio_output, nframes,
                                                     self.margin,
                                                     self.args.symfun)
                loss = loss_regular + loss_neg
                loss_list['loss_regular'] = loss_regular.item()
                loss_list['loss_neg'] = loss_neg.item()
                loss_list['loss_total'] = loss.item()

            elif self.loss_type == 'negatives_both':  # combine hard negatives with semantic negatives
                loss_hardneg = losses.combined_random_hard_negative_loss(
                    image_output, audio_output, image_output_neg, nframes,
                    self.margin, self.args.symfun)
                loss_regular = losses.sampled_margin_rank_loss(
                    image_output, audio_output, nframes, self.margin,
                    self.args.symfun)
                loss_regular = torch.clamp(loss_regular, min=0, max=5)
                loss_hardneg = torch.clamp(loss_hardneg, min=0, max=5)
                loss = loss_regular + loss_hardneg
                loss_list['loss_regular'] = loss_regular.item()
                loss_list['loss_hardneg'] = loss_hardneg.item()
                loss_list['loss_total'] = loss.item()

            else:
                raise Exception(
                    f'The loss function {self.loss_type} is not implemented.')

            last_sample = N_examples * epoch + batch_id * self.args.batch_size + image_input.size(
                0)

            # Record loss
            losses_meter.update(loss.item(), image_input.size(0))

            # Backward pass and update
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # Print results
            if (batch_id + 1) % self.args.print_freq == 0:
                for name in loss_list:
                    loss_list_total[name] += loss_list[name]
                for name in loss_list:
                    loss_list_total[
                        name] = loss_list_total[name] / self.args.print_freq

                for loss_name in loss_list:
                    self.args.writer.add_scalar(f'losses/{loss_name}',
                                                loss_list_total[loss_name],
                                                last_sample)

                print(
                    f'Epoch: [{epoch}][{batch_id+1}/{len(self.loaders["train"])}]\t'
                    f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    f'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    f'Loss {losses_meter.val:.4f} ({losses_meter.avg:.4f})\t',
                    flush=True)

                image_raw = self.unorm(image_input[0].data.cpu())
                self.args.writer.add_image('positive', image_raw, last_sample)
                if self.loss_type == 'negatives_edited' or self.loss_type == 'negatives_both':
                    image_raw_neg = self.unorm(neg_images[0].data.cpu())
                    image_neg = image_raw_neg / torch.max(image_raw_neg)
                    self.args.writer.add_image('negative', image_neg,
                                               last_sample)
                    self.args.writer.add_image(
                        'Images/region', 255 *
                        np.array([binary_mask_0, binary_mask_0, binary_mask_0
                                  ]).swapaxes(0, 1).swapaxes(1, 2),
                        last_sample)
                loss_list_total = {k: 0 for k, v in loss_list_total.items()}

            else:
                for loss_name in loss_list:
                    loss_list_total[loss_name] += loss_list[loss_name]
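
In step 2, the flat `np.argmax` index is decomposed into (t, h, w) coordinates by hand; `np.unravel_index` expresses the same intent in one call and does not rely on the spatial map being square. A sketch with toy values:

import numpy as np

matchmap = np.random.rand(16, 7, 7)   # (T, H, W), as after the transpose above (toy values)
ind_t, ind_h, ind_w = np.unravel_index(np.argmax(matchmap), matchmap.shape)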
Example no. 10
    def get_datapoints(self):
        """
        Compute datapoints
        :return: datapoints and path names identifying all the datapoints
        """
        names_audio = []
        names_image = []
        finish = False

        dim = self.model_dim
        datapoints_image = np.zeros((self.max_datapoints, dim))
        datapoints_audio = np.zeros((self.max_datapoints, dim))
        datapoints_mul = np.zeros((self.max_datapoints, dim))
        current_datapoints_image = 0
        current_datapoints_audio = 0
        current_datapoints_mul = 0
        finish_image = False
        finish_audio = False

        for batch_id, (image_input, audio_input, _, nframes, path, image_raw) in enumerate(self.dataloader):
            # print(f'Current datapoints: ({current_datapoints_image}, '
            #       f'{current_datapoints_audio})/{self.max_datapoints}')
            if finish:
                break

            path_ints = [p.split('/')[-1] for p in path]  # in case the audio is inside a subfolder

            v_init = self.z[int(path_ints[0])]
            z_img = torch.FloatTensor(audio_input.size(0), v_init.shape[0])

            for k in range(audio_input.size(0)):
                z_img[k, :] = self.z[int(path_ints[k])]

            image_input = self.generator.generate_images(z_img, intervention=None)
            image_input = utils.transform(image_input)

            image_input = image_input.cuda(non_blocking=True)  # async is a reserved word in Python 3.7+
            audio_input = audio_input.cuda(non_blocking=True)

            model_output = self.model(image_input, audio_input, [])
            image_output = model_output[0]
            audio_output = model_output[1]

            pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
            nframes.div_(pooling_ratio)

            # Compute the matchmap to find the locations of the important concepts we want to cluster
            for i in range(image_input.shape[0]):
                nF = nframes[i]
                matchmap_i = utils.compute_matchmap(image_output[i], audio_output[i][:, :, 0:nF])

                matchmap = matchmap_i.data.cpu().numpy().copy()
                matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                matchmap = matchmap / matchmap.max()
                ind_max = np.argmax(matchmap)
                ind_t = ind_max // (matchmap.shape[2] * matchmap.shape[1])
                ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) // matchmap.shape[1]
                ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) % matchmap.shape[1]

                d_audio = audio_output[i][:, 0, ind_t].view(-1)
                d_image = image_output[i][:, ind_h, ind_w].view(-1)

                d_all = d_audio * d_image
                datapoints_mul[current_datapoints_mul:current_datapoints_mul + 1] = d_all.cpu().numpy()
                current_datapoints_mul = current_datapoints_mul + 1

                # Computing image
                matchmap_i_max = matchmap_i.mean(2).view(-1)
                structure = np.ones(3, dtype=int)  # np.int is removed in recent NumPy
                labeled, ncomponents = label(matchmap_i_max.cpu() > 0.5 * matchmap_i_max.max().cpu(), structure)
                indexes = np.zeros(ncomponents, dtype=int)  # integer dtype, later used for indexing
                for n in range(ncomponents):
                    indexes[n] = np.array(np.where(labeled == n + 1)).mean().round().astype(int)
                num_datapoints = len(indexes)
                if current_datapoints_image + num_datapoints > self.max_datapoints:
                    num_datapoints = self.max_datapoints - current_datapoints_image
                datapoints_i = image_output[i].view(image_output.shape[1], -1)[:, indexes[:num_datapoints]]

                if num_datapoints > 0:
                    datapoints_image[current_datapoints_image:current_datapoints_image + num_datapoints] = \
                        datapoints_i.transpose(1, 0).cpu().numpy()

                names_i = []
                for index in indexes[:num_datapoints]:
                    names_i.append((path[i], index))
                names_image[current_datapoints_image:current_datapoints_image + num_datapoints] = names_i
                current_datapoints_image += num_datapoints
                if current_datapoints_image >= self.max_datapoints:
                    finish_image = True

                matchmap_i_max, _ = matchmap_i.max(1)
                matchmap_i_max, _ = matchmap_i_max.max(0)

                structure = np.ones(3, dtype=int)  # np.int is removed in recent NumPy
                labeled, ncomponents = label(matchmap_i_max.cpu() > 0.5 * matchmap_i_max.max().cpu(), structure)
                indexes = np.zeros(ncomponents, dtype=int)  # integer dtype, later used for indexing
                for n in range(ncomponents):
                    indexes[n] = np.array(np.where(labeled == n + 1)).mean().round().astype(int)

                num_datapoints = len(indexes)

                if current_datapoints_audio + num_datapoints > self.max_datapoints:
                    num_datapoints = self.max_datapoints - current_datapoints_audio
                if num_datapoints > 0:
                    datapoints_i = audio_output[i][..., indexes[:num_datapoints]]. \
                        view(audio_output.shape[1], num_datapoints)
                    datapoints_audio[current_datapoints_audio:current_datapoints_audio + num_datapoints] = \
                        datapoints_i.transpose(1, 0).cpu().numpy()

                names_i = []
                for index in indexes[:num_datapoints]:
                    names_i.append((path[i], index))
                names_audio[current_datapoints_audio:current_datapoints_audio + num_datapoints] = names_i

                current_datapoints_audio += num_datapoints

                if current_datapoints_audio >= self.max_datapoints:
                    finish_audio = True

                if finish_image and finish_audio:
                    finish = True

        if current_datapoints_image < self.max_datapoints:
            datapoints_image = datapoints_image[:current_datapoints_image]

        if current_datapoints_audio < self.max_datapoints:
            datapoints_audio = datapoints_audio[:current_datapoints_audio]

        if current_datapoints_mul < self.max_datapoints:
            datapoints_mul = datapoints_mul[:current_datapoints_mul]

        self.datapoints_audio = datapoints_audio
        self.datapoints_image = datapoints_image
        self.datapoints_mul = datapoints_mul
        self.names_im = names_image
        self.names = names_audio

        return datapoints_image, names_image
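
The per-component index selection relies on `scipy.ndimage.label` over a 1-D boolean mask: each contiguous run of high activations contributes one representative datapoint. A compact standalone version of that idea, with toy data:

import numpy as np
from scipy.ndimage import label

activation = np.array([0.1, 0.9, 0.8, 0.1, 0.1, 0.7, 0.9, 0.2])
mask = activation > 0.5 * activation.max()
labeled, ncomponents = label(mask, structure=np.ones(3, dtype=int))
# one representative (rounded mean) index per connected run above the threshold
centers = [int(round(np.where(labeled == n + 1)[0].mean())) for n in range(ncomponents)]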
Example no. 11
def repeated_attributes(trainer):
    """
    Here we check whether the model is able to determine which images contain a specific attribute mentioned in the audio.
    It is the experiment reported in Table 1 in the paper.
    For each attribute, find 500 images with the attribute and 500 without (using the segmenter).
    We use the same list of images (for each attribute) for all the compared checkpoints.
    The audios with the repeated attributes are also always the same.
    """

    if not os.path.isdir(os.path.join(trainer.args.path_repeated_attributes, 'repetition_audios')):
        path_tar = os.path.join(trainer.args.path_repeated_attributes, 'repeated_attributes.tar.gz')
        wget.download('http://wednesday.csail.mit.edu/gaze/ganclevr/files/repetition_audios.tar.gz', out=path_tar)
        tf = tarfile.open(path_tar)
        tf.extractall(trainer.args.path_repeated_attributes)
        os.remove(path_tar)

    num_elements_each = 500
    path_paths = os.path.join(trainer.args.results, 'repeated_attributes', f'paths_{trainer.args.name_dataset}.pkl')
    list_attributes = ['RUBBER', 'METAL', 'CUBE', 'SPHERE', 'CYLINDER', 'LARGE', 'SMALL', 'GRAY', 'RED', 'BLUE',
                       'GREEN', 'BROWN', 'PURPLE', 'CYAN', 'YELLOW']

    # First step: get paths of images to test for the specific dataset
    print('Obtaining samples to compare')
    if not os.path.isfile(path_paths):
        j = 0
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        segment = segmenter.GroundTruthSegmenter(trainer.args.path_model_segmenter)

        counter_no_attribute = {word_attribute: 0 for word_attribute in list_attributes}
        counter_attribute = {word_attribute: 0 for word_attribute in list_attributes}

        # The key is the path, and the values are the attributes. This way, if a path (image) contains more than one
        # attribute, we can share the forward pass (much faster)
        paths_attributes = {}

        while ((np.array(list(counter_no_attribute.values())) < num_elements_each).any() or (
                np.array(list(counter_attribute.values())) < num_elements_each).any()) and \
                j < len(trainer.loaders['test'].dataset):
            p = trainer.loaders['test'].dataset.paths[j]
            j += 1

            raw_image = trainer.loaders['test'].dataset.load_image_raw(path=f'{p}')

            L = segment.get_pred(normalize(torch.tensor(raw_image).cuda().permute(2, 0, 1).float()/255), return_L=True)
            B = L >> 16
            G = (L - (B << 16)) >> 8
            R = (L - (B << 16) - (G << 8))
            pred_size = B >> 4  # - ids
            pred_shape = G >> 4
            pred_material = G - (pred_shape << 4)
            pred_color = R

            segmentation_keys = {'CUBE': [pred_shape, 1], 'SPHERE': [pred_shape, 2], 'CYLINDER': [pred_shape, 3],
                                 'RUBBER': [pred_material, 1], 'METAL': [pred_material, 2], 'LARGE': [pred_size, 1],
                                 'SMALL': [pred_size, 2], 'GRAY': [pred_color, 1], 'RED': [pred_color, 2],
                                 'BLUE': [pred_color, 3], 'GREEN': [pred_color, 4], 'BROWN': [pred_color, 5],
                                 'PURPLE': [pred_color, 6], 'CYAN': [pred_color, 7], 'YELLOW': [pred_color, 8]}

            exists = {}
            for word_attribute in list_attributes:
                no_size = word_attribute not in ['LARGE', 'SMALL']
                prob_exists = (segmentation_keys[word_attribute][0] == segmentation_keys[word_attribute][1]).sum()
                if prob_exists > 100 and prob_exists < (700 if no_size else 20000):  # otherwise can be noise
                    exists[word_attribute] = 1
                elif prob_exists < 10:  # to make sure it is not there (10 pixels is almost nothing)
                    exists[word_attribute] = -1
                else:
                    exists[word_attribute] = 0
                # Check if attribute in the image
                if exists[word_attribute] == 1 and counter_attribute[word_attribute] < num_elements_each:
                    counter_attribute[word_attribute] += 1
                    if p in paths_attributes:
                        paths_attributes[p].append([word_attribute, exists[word_attribute]])
                    else:
                        paths_attributes[p] = [[word_attribute, exists[word_attribute]]]
                elif exists[word_attribute] == -1 and counter_no_attribute[word_attribute] < num_elements_each:
                    counter_no_attribute[word_attribute] += 1
                    if p in paths_attributes:
                        paths_attributes[p].append([word_attribute, exists[word_attribute]])
                    else:
                        paths_attributes[p] = [[word_attribute, exists[word_attribute]]]

        os.makedirs(os.path.join(trainer.args.results, 'repeated_attributes'), exist_ok=True)
        with open(path_paths, 'wb') as f:
            pickle.dump(paths_attributes, f, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(path_paths, 'rb') as f:
            paths_attributes = pickle.load(f)

    # Second step: compute matching values for each image and audio in the list
    print('Computing matching values')
    synthetic = 'synth' in trainer.args.name_dataset

    results_checkpoint = {word_attribute: [] for word_attribute in list_attributes}

    # Load audio of all attributes and store in dict
    audio_features = {}
    for word_attribute in list_attributes:
        # Load audio of the word
        path_audio = os.path.join(trainer.args.path_repeated_attributes, 'repetition_audios',
                                  f'{word_attribute}_{"synthetic" if synthetic else "amt"}.wav')
        audio, nframes = trainer.loaders['test'].dataset.load_mel_spectrogram(path='', path_audio=path_audio)
        audio = audio.unsqueeze(0).unsqueeze(0).cuda()
        with torch.no_grad():
            audio_feat = trainer.model._modules['module'].model_audio.audio_model(audio)
        audio_features[word_attribute] = audio_feat

    # Load images and compute matchmaps
    for path, attributes in paths_attributes.items():
        with torch.no_grad():
            image = trainer.loaders['test'].dataset.load_image(path=path).unsqueeze(0).cuda()
            image_output = trainer.model._modules['module'].model_image.image_model(image)

        for word_attribute, ex in attributes:
            matchmap = utils.compute_matchmap(image_output[0], audio_features[word_attribute][0])  # all frames
            matchmap_max_h, _ = matchmap.max(0)
            matchmap_max_hw, _ = matchmap_max_h.max(0)
            matchmap_max_hw = matchmap_max_hw[4:-4]  # cut beginning and end
            value1 = matchmap_max_hw.mean()
            value2, _ = matchmap_max_hw.max(0)

            results_checkpoint[word_attribute].append([ex == 1, value1.cpu().numpy(), value2.cpu().numpy()])

    # Third step: compute final experiment value
    print('Computing final experiment value')
    diff_ = 0
    n_pairs_total = 0

    shape = [0, 0]
    color = [0, 0]
    size = [0, 0]
    material = [0, 0]

    for attribute, values in results_checkpoint.items():
        a = np.array(values)
        pos = a[np.where(a[:, 0] == 1)][:, 1]
        neg = a[np.where(a[:, 0] == 0)][:, 1]
        l = np.minimum(pos.shape[0], neg.shape[0])
        diff = (pos[:l] > neg[:l]).sum()
        n_pairs_total += l
        diff_ += diff

        if attribute in ['CUBE', 'SPHERE', 'CYLINDER']:
            shape[0] += diff
            shape[1] += l
        elif attribute in ['RUBBER', 'METAL']:
            material[0] += diff
            material[1] += l
        elif attribute in ['LARGE', 'SMALL']:
            size[0] += diff
            size[1] += l
        elif attribute in ['GRAY', 'RED', 'BLUE', 'GREEN', 'BROWN', 'CYAN', 'PURPLE', 'YELLOW']:
            color[0] += diff
            color[1] += l

    print('')
    print(f'Color: {color[0]/color[1]:0.03f}')
    print(f'Material: {material[0]/material[1]:0.03f}')
    print(f'Size: {size[0]/size[1]:0.03f}')
    print(f'Shape: {shape[0]/shape[1]:0.03f}')
    print(f'Mean: {(shape[0]/shape[1] + color[0]/color[1] + size[0]/size[1] + material[0]/material[1])/4:0.03f}')
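
The printed numbers are pairwise accuracies: the fraction of (image with attribute, image without attribute) pairs in which the attribute-containing image gets the higher matching score. A compact version of that metric (sketch):

import numpy as np

def pairwise_accuracy(pos_scores, neg_scores):
    # Fraction of matched pairs where the positive image outranks the negative one.
    l = min(len(pos_scores), len(neg_scores))
    return float((np.asarray(pos_scores)[:l] > np.asarray(neg_scores)[:l]).mean())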
Example no. 12
def test_recall(trainer):
    """
    This experiment generates one positive and num_fakes corresponding negatives with the GAN, and reports the recall
    of the system in distinguishing the positive from the negatives.
    Similar to test_recall_selected, but the negatives are selected online rather than using the pre-selected (and
    better) negatives.
    """

    number_recall = 200
    num_fakes = 9
    if not trainer.args.use_cpu:
        torch.cuda.synchronize()

    recall1_meter = utils.AverageMeter()
    recall5_meter = utils.AverageMeter()

    # Switch to evaluate mode
    trainer.model.eval()

    with torch.no_grad():
        for i, (image_input, audio_input, negatives, nframes, path, _) in enumerate(trainer.loaders['test']):
            if i % 50 == 0:
                print(f'Starting batch {i}')
            if i * image_input.size(0) > number_recall:
                break

            for j in range(image_input.size(0)):

                score_vector = torch.FloatTensor(num_fakes + 1)

                if not trainer.args.loading_image:
                    v_init = trainer.z[0]
                    z_img = torch.FloatTensor(1, v_init.shape[0])
                    z_img[0, :] = trainer.z[int(path[j])]
                    image_input = trainer.generator.generate_images(z_img, intervention=None)
                    image_input = utils.transform(image_input)
                else:
                    image_input = image_input.cuda()

                pos_image = image_input[0, :, :, :]
                model_output = trainer.model(image_input, audio_input, [])
                image_output = model_output[0]
                audio_output = model_output[1]
                nF = nframes[j]

                matchmap = utils.compute_matchmap(image_output[0], audio_output[0][:, :, :nF])

                real_score = utils.matchmap_sim(matchmap)
                score_vector[0] = real_score
                matchmap = matchmap.data.cpu().numpy().copy()
                matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                matchmap = matchmap / matchmap.max()
                matchmap_image = matchmap.max(axis=0)

                threshold = 0.95

                ind_max = np.argmax(matchmap)
                ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) // matchmap.shape[1]
                ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) % matchmap.shape[1]
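                # Decompose the flat argmax into the spatial coordinates (ind_h, ind_w) of the
                # strongest matchmap response; this indexing assumes a square spatial map (h == w).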

                for fake_id in range(num_fakes):
                    binary_mask = matchmap_image > (threshold * matchmap_image.max())
                    binary_mask = utils.geodesic_dilation(binary_mask, (ind_h, ind_w))
                    norm = 0
                    threshold_random = 0.95
                    p = 0.4
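                    # Keep regenerating this negative with stronger interventions until the masked,
                    # normalized difference from the positive image exceeds threshold_random: p (the
                    # fraction of randomly ablated units) usually grows, and occasionally the
                    # acceptance threshold is relaxed instead.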

                    while norm < threshold_random:
                        with torch.no_grad():
                            intervention = {}
                            for layer_n in trainer.layer_list_all:
                                layer_size = trainer.layers_dict[layer_n]['size']
                                layer_dim = trainer.layers_dict[layer_n]['depth']

                                ablation, replacement = trainer.get_ablation_replacement(params=[layer_dim, True, p],
                                                                                         option='random')
                                ablation_final = cv2.resize(binary_mask, (layer_size, layer_size))
                                ablation_final = np.tile(ablation_final, (layer_dim, 1, 1)).astype(np.float32)
                                ablation_final = torch.cuda.FloatTensor(ablation_final)
                                ablation_final = ablation.view(layer_dim, 1, 1).expand_as(
                                    ablation_final) * ablation_final
                                intervention[layer_n] = (ablation_final, replacement)

                            z_img = trainer.z[int(path[j])]
                            z_img = z_img[np.newaxis, :].detach()
                            neg_img = trainer.generator.generate_images(z_img, intervention=intervention).detach()
                            neg_img_t = utils.transform(neg_img).detach()

                            binary_mask = cv2.resize(binary_mask, (128, 128))

                            bmask = torch.Tensor(binary_mask).cuda()

                            bmask = bmask.view(1, 128, 128).expand(3, 128, 128)
                            norm = (neg_img_t[0, :, :, :] - pos_image[:, :, :].detach())

                            norm = norm * bmask
                            norm = torch.norm(torch.norm(torch.norm(norm, dim=2), dim=1), dim=0)
                            norm_normalized = norm / torch.norm(
                                torch.norm(torch.norm(pos_image[:, :, :].detach() * bmask, dim=2), dim=1), dim=0)
                            norm = norm_normalized.item()

                            if random.random() > 0.2:
                                p = p + 0.05
                            else:
                                threshold_random = threshold_random - 0.01
                    model_output = trainer.model(neg_img_t, audio_input, [])
                    image_output = model_output[0]
                    audio_output = model_output[1]
                    score_vector[1 + fake_id] = utils.matchmap_sim(
                        utils.compute_matchmap(image_output[0], audio_output[0][:, :, :nF]))

                _, ids = score_vector.topk(num_fakes + 1)  # rank the positive and all generated negatives
                ids = ids.cpu().numpy()
                ids = np.where(ids == 0)[0]

                A_foundind = ids[0]

                if A_foundind == 0:
                    recall1_meter.update(1)
                else:
                    recall1_meter.update(0)

                if A_foundind < 5:
                    recall5_meter.update(1)
                else:
                    recall5_meter.update(0)
                # print('Recall 1: {0}'.format(recall1_meter.avg))
                # print('Recall 5: {0}'.format(recall5_meter.avg))

    print('Recall 1: {0}'.format(recall1_meter.avg))
    print('Recall 5: {0}'.format(recall5_meter.avg))

    return recall1_meter.avg
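
# A minimal, self-contained sketch (not part of the original experiment) of the recall@k
# computation used in test_recall above: index 0 of the score vector holds the positive pair and
# the remaining entries hold the GAN-generated negatives. The helper name recall_at_k is
# hypothetical; torch is assumed to be imported, as in the rest of this file.
def recall_at_k(score_vector, k):
    # Rank all candidates by similarity (highest first) and check whether the positive
    # (index 0) appears within the top-k entries.
    _, ranked = score_vector.topk(score_vector.numel())
    return 1 if (ranked[:k] == 0).any().item() else 0
# Usage mirrors the meter updates above, e.g. recall1_meter.update(recall_at_k(score_vector, 1)).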
Esempio n. 13
0
def create_videos(trainer):
    """
    Create videos for visualization. A video is generated for every sample, so cancel the run when you have
    enough videos. For this experiment, the images need to have been downloaded. A sketch of the per-frame
    overlay follows this function.
    """
    if not trainer.args.use_cpu:
        torch.cuda.synchronize()

    # Switch to evaluate mode
    trainer.model.eval()

    len_audio = 20.48  # Only if target_spec_length = 2048
    folder_name = os.path.join(trainer.args.results, 'results_video', trainer.args.name_checkpoint)
    os.makedirs(folder_name, exist_ok=True)

    with torch.no_grad():
        for i, (image_input, audio_input, negatives, nframes, path, _) in enumerate(trainer.loaders['test']):

            v_init = trainer.z[int(path[0])]
            z_img = torch.FloatTensor(audio_input.size(0), v_init.shape[0])

            for k in range(audio_input.size(0)):
                z_img[k, :] = trainer.z[int(path[k])]

            if not trainer.args.loading_image:
                image_input = trainer.generator.generate_images(z_img, intervention=None)
                image_input = utils.transform(image_input).detach()

            # compute output
            model_output = trainer.model(image_input, audio_input, [])
            image_output = model_output[0]
            audio_output = model_output[1]

            pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
            nframes.div_(pooling_ratio)
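            # The audio encoder downsamples the spectrogram in time; converting nframes from
            # spectrogram frames to feature-map frames keeps the matchmap cropping consistent.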

            fps = audio_output.size(3) / len_audio

            for bs in range(image_output.size(0)):

                try:
                    target_writer = imageio.get_writer(folder_name + f'/output_video_{path[bs]}.mp4', fps=fps)
                    matchmap = utils.compute_matchmap(image_output[bs],
                                                      audio_output[bs][:, :, 0:nframes[bs]]).data.cpu().numpy().copy()
                    wav = trainer.loaders['test'].dataset.load_audio_raw(path=path[bs])
                    scipy.io.wavfile.write(folder_name + f'/output_audio_{path[bs]}.mp3', 44100,
                                           wav.astype(np.int16))

                    matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                    matchmap = matchmap / matchmap.sum()
                    matchmap_l, matchmap_h, matchmap_w = matchmap.shape
                    k_ranges = utils.frange(np.max(matchmap) / 100, np.max(matchmap), np.max(matchmap) / 100)

                    for k in k_ranges:
                        binary_mask = matchmap > k
                        map_temp = np.multiply(matchmap, binary_mask)
                        if np.sum(map_temp) < 0.1:
                            break

                    smoothing_factor = 1
                    struct_element = [[[True]]] * smoothing_factor
                    binary_mask = morph.binary_dilation(binary_mask, struct_element)  # Temporal smoothing
                    matchmap = np.multiply(matchmap, binary_mask)
                    matchmap = (matchmap - np.min(matchmap)) / (np.max(matchmap) - np.min(matchmap))

                    image = trainer.loaders['test'].dataset.load_image_raw(path=path[bs])

                    for t in range(matchmap_l):
                        mask_resize = np.array([cv2.resize(binary_mask[t, :, :].astype(float),
                                                           (image.shape[1], image.shape[0]))] * 3).transpose(1, 2, 0)
                        map_t = cv2.resize(matchmap[t, :, :], (image.shape[1], image.shape[0]))
                        map_t = 1 - map_t
                        map_t = 255 * map_t
                        map_t = map_t.astype(np.uint8)
                        map_t = cv2.applyColorMap(map_t, cv2.COLORMAP_JET)

                        im_final = np.multiply((0.3 * image + 0.7 * map_t), mask_resize) + np.multiply(image,
                                                                                                       1 - mask_resize)

                        target_writer.append_data(im_final)

                    target_writer.close()
                    # -y means overwrite

                    os.system('ffmpeg -y -i ' +
                              folder_name + f'/output_video_{path[bs]}.mp4 -i ' +
                              folder_name + f'/output_audio_{path[bs]}.mp3 -vf scale=1200:1200 -shortest -strict -2 '
                                            '-c:v libx264 ' +
                              folder_name + f'/video_{path[bs]}.mp4')
                except KeyboardInterrupt as e:
                    print('you decided to finish!')

                finally:
                    # Remove temporary files
                    try:
                        os.remove(folder_name + f'/output_video_{path[bs]}.mp4')
                    except OSError:
                        pass
                    try:
                        os.remove(folder_name + f'/output_audio_{path[bs]}.mp3')
                    except OSError:
                        pass

    return False
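
# A minimal sketch (not from the original code) of the per-frame overlay written out by
# create_videos above: the matchmap slice is colour-mapped with OpenCV and alpha-blended with the
# raw image only inside the binary mask. The function and argument names are illustrative; cv2
# and numpy are assumed to be imported, as in the rest of this file.
def overlay_frame(image, matchmap_t, binary_mask_t, alpha=0.7):
    h, w = image.shape[0], image.shape[1]
    # Resize the (h, w) matchmap slice and its mask to the image resolution.
    mask = cv2.resize(binary_mask_t.astype(float), (w, h))
    mask = np.repeat(mask[:, :, np.newaxis], 3, axis=2)
    heat = cv2.resize(matchmap_t, (w, h))
    heat = cv2.applyColorMap((255 * (1 - heat)).astype(np.uint8), cv2.COLORMAP_JET)
    # Blend image and heatmap inside the mask; keep the raw image outside it.
    return np.multiply(alpha * heat + (1 - alpha) * image, mask) + np.multiply(image, 1 - mask)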
def generate_active_learning(trainer):
    """
    Generate active learning samples, selecting the positive/negative pairs in which the model has the highest error
    The saved information is:
    - Current clusters. They are necessary to generate the same negative (as they select the mask)
    - jpg negative images. Only needed to get the captions. Not needed for training (as they will be GAN-generated)
    - Information to generate the negative images (masks and units, and the associated paths)

    This information is saved in {args.active_learning_path}/{trainer.args.name_checkpoint}_{str(time.time())}/

    When training with these images, it is recommended to start from the same checkpoint used for obtaining them.

    After this, the next steps before running again the system with the new samples are:
    - Collecting captions of the active learning samples (the negatives; the positives already have captions)
    - Adding the new collected samples to the dataset. The can be added in a separate folder, and only modifying the
    name_list_{}.txt files is enough. Note that the noise ID (and thus the name of the file) will already exist (for the
    positive one), so save the new ones in an "active" subfolder.
    """

    assert len(
        trainer.layer_list_all
    ) == 1, 'Active learning is only implemented for single-layer ablations'

    trainer.clusterer.save_results = True
    clus, mean_clust, std_clust, _ = trainer.clusterer.create_clusters(
        iteration=0)
    trainer.clusters = torch.FloatTensor(clus).cuda()
    trainer.mean_clust = torch.FloatTensor(mean_clust)
    trainer.std_clust = torch.FloatTensor(std_clust)
    trainer.cluster_counts = 1 / trainer.clusters.max(1)[0]
    trainer.clusters_unit = trainer.cluster_counts.view(trainer.clusters.size(0), 1).expand_as(trainer.clusters) * \
                            trainer.clusters

    trainer.clusterer.name_with_images_clusters()
    trainer.clusterer.name_clusters()

    trainer.optimize_neurons()

    if not trainer.args.use_cpu:
        torch.cuda.synchronize()
    data_time = utils.AverageMeter()

    # Switch to evaluate mode
    trainer.model.eval()

    active_learning_name = os.path.join(
        trainer.args.active_learning_path,
        f'{trainer.args.name_checkpoint}_{str(time.time())}')

    end = time.time()

    all_loss = []
    all_hmap = []
    all_hmap_eval = []
    all_units = []
    all_paths = []

    for batch_id, (image_input, audio_input, neg_images, nframes, path, image_raw) in \
            enumerate(trainer.loaders['train']):
        print(batch_id)

        # Measure data loading time
        data_time.update(time.time() - end)

        if not trainer.args.use_cpu:
            audio_input = audio_input.cuda(non_blocking=True)

        if not trainer.args.loading_image:
            if trainer.args.active_learning:
                path_ints = [p.split('/')[-1] for p in path]
            else:
                path_ints = path

            v_init = trainer.z[int(path_ints[0])]
            z_img = torch.FloatTensor(image_input.size(0), v_init.shape[0])

            for k in range(image_input.size(0)):
                z_img[k, :] = trainer.z[int(path_ints[k])]

            image_input = trainer.generator.generate_images(z_img,
                                                            intervention=None)
            image_input = utils.transform(image_input).detach()

        else:
            image_input = image_input.cuda()
            neg_images = neg_images.cuda()

        model_output = trainer.model(image_input, audio_input, [])
        image_output = model_output[0]
        audio_output = model_output[1]

        neg_images = []

        pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
        nframes.div_(pooling_ratio)

        binary_mask_0 = None

        if trainer.loss_type == 'negatives_edited' or trainer.loss_type == 'negatives_both':
            limits = np.zeros((image_input.size(0), 2))

            for i in range(image_input.size(0)):
                pos_image = image_input[i, :, :, :]

                nF = nframes[i]

                matchmap = utils.compute_matchmap(image_output[i],
                                                  audio_output[i][:, :, :nF])

                positive_score = utils.matchmap_sim(matchmap).detach()
                matchmap = matchmap.data.cpu().numpy().copy()

                matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                matchmap = matchmap / (matchmap.max() + 1e-10)
                matchmap_image = matchmap.max(axis=0)
                threshold = 0.95

                # ind_max = np.argmax(matchmap_image)
                ind_max = np.argmax(matchmap)
                ind_t = ind_max // (matchmap.shape[2] * matchmap.shape[1])
                ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1])
                         ) // matchmap.shape[1]
                ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1])
                         ) % matchmap.shape[1]

                limits[i, 0] = ind_t
                limits[i, 1] = ind_t + 1

                v = (image_output[i][:, ind_h, ind_w] -
                     trainer.mean_clust.cuda()) / (trainer.std_clust.cuda() +
                                                   1e-8)

                normalized_clusters = np.matmul(
                    trainer.clusters.cpu(),
                    v.detach().cpu().numpy().transpose())
                sorted_val = -np.sort(-normalized_clusters[:])
                sorted_val = np.clip(sorted_val, 0, 4)
                prob_samples = sorted_val / np.sum(sorted_val)
                sorted_id = np.argsort(-normalized_clusters[:])
                cluster_id = sorted_id[0]
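                # Project the normalized image feature at the matchmap peak onto the cluster
                # centroids; the clipped, normalized scores define a sampling distribution over
                # clusters, starting from the best-matching one.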

                norm = 0
                threshold_random = 0.95

                # The number of units to be ablated grows if we cannot generate a good (changed) negative
                # The following numbers are the starting number of units to change
                num_units_dict = {
                    'layer2': 30,
                    'layer3': 30,
                    'layer4': 140,
                    'layer5': 30,
                    'layer6': 30
                }
                threshold_heatmap = threshold

                count = 0
                binary_mask_eval = matchmap_image > (threshold_heatmap *
                                                     matchmap_image.max())
                binary_mask_eval = utils.geodesic_dilation(
                    binary_mask_eval, (ind_h, ind_w))
                binary_mask_eval = cv2.resize(binary_mask_eval, (128, 128))
                all_hmap_eval.append(binary_mask_eval)
                bmask = torch.Tensor(binary_mask_eval).cuda()
                bmask = bmask.view(1, 128, 128).expand(3, 128, 128)
                all_paths.append(path_ints[i])

                while norm < threshold_random:
                    with torch.no_grad():
                        binary_mask = matchmap_image > (threshold_heatmap *
                                                        matchmap_image.max())
                        binary_mask = utils.geodesic_dilation(
                            binary_mask, (ind_h, ind_w))

                        if binary_mask_0 is None:
                            binary_mask_0 = cv2.resize(binary_mask, (224, 224))

                        z_img = trainer.z[int(path_ints[i])]
                        z_img = z_img[np.newaxis, :]

                        _ = trainer.generator.generate_images(z_img)
                        intervention = {}
                        for layer_n in trainer.layer_list_all:  # This will only be one layer
                            units_ids = trainer.layers_units[layer_n][
                                cluster_id][:num_units_dict[layer_n]]
                            layer_size = trainer.layers_dict[layer_n]['size']
                            layer_dim = trainer.layers_dict[layer_n]['depth']

                            ablation, replacement = trainer.get_ablation_replacement(
                                params=[layer_dim, units_ids],
                                option='specific')
                            ablation_final = cv2.resize(
                                binary_mask, (layer_size, layer_size))
                            ablation_final = np.tile(ablation_final,
                                                     (layer_dim, 1, 1)).astype(
                                                         np.float32)
                            ablation_final = torch.cuda.FloatTensor(
                                ablation_final)
                            ablation_final = ablation.view(
                                layer_dim, 1,
                                1).expand_as(ablation_final) * ablation_final
                            intervention[layer_n] = (ablation_final,
                                                     replacement)

                        neg_img = trainer.generator.generate_images(
                            z_img, intervention=intervention).detach()
                        neg_img_t = utils.transform(neg_img).detach()

                        binary_mask = cv2.resize(binary_mask, (128, 128))
                        norm = (neg_img_t[0, :, :, :] - pos_image.detach())
                        norm_im = torch.norm(norm, dim=0)
                        norm = norm * bmask
                        im_dif = norm
                        norm = torch.norm(torch.norm(torch.norm(norm, dim=2),
                                                     dim=1),
                                          dim=0)
                        norm_normalized = norm / torch.norm(torch.norm(
                            torch.norm(pos_image.detach() * bmask, dim=2),
                            dim=1),
                                                            dim=0)
                        norm = norm_normalized.item()
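                        # If the change is still too small, escalate: ablate more units, lower the
                        # heatmap threshold (larger mask), relax the acceptance threshold and
                        # resample the target cluster.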
                        for layer_n in trainer.layer_list_all:
                            num_units_dict[layer_n] = num_units_dict[
                                layer_n] + 40  # increase units to change
                        threshold_heatmap = threshold_heatmap - 0.1

                        threshold_random = threshold_random - 0.05

                        cluster_id = np.random.choice(sorted_id,
                                                      size=(1),
                                                      p=prob_samples)[0]

                        count = count + 1

                neg_images.append(neg_img.detach())
                all_hmap.append(binary_mask)
                all_units.append(units_ids)

            neg_images = torch.cat(neg_images)
            neg_images_t = utils.transform(neg_images)

            image_output_neg, _, _ = trainer.model(neg_images_t, None, [])

            loss_list = trainer.vectorized_negatives_loss(
                image_output, audio_output, image_output_neg, nframes)
            loss_list = [x.detach() for x in loss_list]
            all_loss.extend(loss_list)

    all_loss = [x.view(1, -1) for x in all_loss]
    all_loss = torch.cat(all_loss).view(-1, 1)
    _, ind = all_loss.topk(3000, 0)
    ind = [x.item() for x in ind]

    a_units = [all_units[i] for i in ind]
    a_paths = [all_paths[i] for i in ind]
    a_hmaps = [all_hmap[i] for i in ind]
    a_hmaps_eval = [all_hmap_eval[i] for i in ind]

    os.makedirs(active_learning_name, exist_ok=True)  # the timestamped output folder does not exist yet
    torch.save(a_units, os.path.join(active_learning_name, 'units.pth'))
    torch.save(a_paths, os.path.join(active_learning_name, 'a_paths.pth'))
    torch.save(a_hmaps, os.path.join(active_learning_name, 'a_hmaps.pth'))
    torch.save(a_hmaps_eval,
               os.path.join(active_learning_name, 'a_hmaps_eval.pth'))

    os.makedirs(os.path.join(active_learning_name, 'images'), exist_ok=True)
    os.makedirs(os.path.join(active_learning_name, 'hm'), exist_ok=True)

    for j in range(len(a_units)):
        path = a_paths[j]
        units_ids = a_units[j]
        binary_mask = a_hmaps[j]
        layer_n = trainer.layer_list_all[0]
        layer_size = trainer.layers_dict[layer_n]['size']
        layer_dim = trainer.layers_dict[layer_n]['depth']
        intervention = {}
        ablation, replacement = trainer.get_ablation_replacement(
            params=[layer_dim, units_ids], option='specific')
        ablation_final = cv2.resize(binary_mask, (layer_size, layer_size))
        ablation_final = np.tile(ablation_final,
                                 (layer_dim, 1, 1)).astype(np.float32)
        ablation_final = torch.cuda.FloatTensor(ablation_final)
        ablation_final = ablation.view(
            layer_dim, 1, 1).expand_as(ablation_final) * ablation_final
        intervention[layer_n] = (ablation_final, replacement)
        z_img = trainer.z[int(path)]
        z_img = z_img[np.newaxis, :]
        neg_img = trainer.generator.generate_images(
            z_img, intervention=intervention).detach()
        neg_im = neg_img[0, :, :, :].cpu().numpy().transpose(1, 2, 0)
        neg_im = neg_im.astype(np.uint8)
        neg_im = Image.fromarray(neg_im.astype('uint8'), 'RGB')

        draw = ImageDraw.Draw(neg_im)
        hm = a_hmaps_eval[j]
        rows = np.any(hm, axis=1)
        cols = np.any(hm, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        draw.rectangle(((cmin, rmin), (cmax, rmax)), outline='red')

        neg_im.save(os.path.join(active_learning_name, 'images',
                                 f'{j}_hn.jpg'))
        binary_mask_eval = cv2.resize(a_hmaps_eval[j], (128, 128))
        mask_im = binary_mask_eval * 255
        mask_im = mask_im.astype(np.uint8)
        mask_im = mask_im.reshape((128, 128, 1))
        mask_im = np.concatenate((mask_im, mask_im, mask_im), axis=2)

        mask_im = Image.fromarray(mask_im.astype('uint8'), 'RGB')
        mask_im.save(os.path.join(active_learning_name, 'hm', f'{j}_hn.jpg'))
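
# A minimal sketch (not in the original code) of how the artifacts saved by generate_active_learning
# above could be read back before re-training. The file names match the torch.save calls in that
# function; the helper name load_active_learning_samples is hypothetical, and torch/os are assumed
# to be imported as in the rest of this file.
def load_active_learning_samples(active_learning_name):
    units = torch.load(os.path.join(active_learning_name, 'units.pth'))  # ablated unit ids
    paths = torch.load(os.path.join(active_learning_name, 'a_paths.pth'))  # noise-vector ids
    hmaps = torch.load(os.path.join(active_learning_name, 'a_hmaps.pth'))  # generation masks
    hmaps_eval = torch.load(os.path.join(active_learning_name, 'a_hmaps_eval.pth'))  # evaluation masks
    return units, paths, hmaps, hmaps_eval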
Esempio n. 15
0
    def name_with_images_clusters(self):
        """
        Find representatives for the clusters, now with images. Simply for visualization purposes.
        A sketch of the mask-selection rule used here follows this method.
        :return: list of dicts, one per cluster, mapping each representative image path to its binary mask
        """

        if not self.load['name_final']:  # We use the same flag
            assert self.centroids is not None
            assert self.names_im is not None
            assert self.datapoints_image is not None

            datapoints = self.datapoints_image
            values = np.matmul(self.centroids.astype(float),
                               datapoints.transpose(1, 0))

            num_images_per_cluster = 100

            return_vector = []
            audio_output = torch.FloatTensor(self.centroids).cuda()
            audio_output = audio_output.transpose(1, 0).view(
                self.centroids.shape[1], 1, self.centroids.shape[0])

            with torch.no_grad():
                for c in range(self.num_clusters):
                    dict_c = {}
                    max_images_indexes = np.argsort(-values[c])
                    for i in range(num_images_per_cluster):
                        count = 0
                        im_index = max_images_indexes[i]
                        path, index = self.names_im[im_index]

                        while path in dict_c:
                            count = count + 1
                            im_index = max_images_indexes[i + count]
                            path, index = self.names_im[im_index]

                        path_ints = path.split('/')[
                            -1]  # in case the audio is inside a subfolder

                        v_init = self.z[int(path_ints)]
                        z_img = torch.FloatTensor(1, v_init.shape[0])
                        z_img[0, :] = v_init

                        image_input = self.generator.generate_images(
                            z_img, intervention=None)
                        image_input = utils.transform(image_input)

                        model_output = self.model(image_input, None, [])
                        image_output = model_output[0]
                        mask = utils.compute_matchmap(
                            image_output[0], audio_output).cpu().numpy()[:, :,
                                                                         c]
                        th = 0.64
                        binary_mask = mask > mask.max() * th
                        per = binary_mask.astype(float).sum() / 64.0
                        while per < 0.2:
                            binary_mask = mask > mask.max() * th
                            per = binary_mask.astype(float).sum() / 64.0
                            th = th - 0.02

                        dict_c[path] = binary_mask

                    return_vector.append(dict_c)

            if self.save_results:
                torch.save(
                    return_vector,
                    os.path.join(self.path_store, f'names_images.pth.tar'))

        else:
            return_vector = torch.load(
                os.path.join(self.path_store, f'names_images.pth.tar'))

        self.names_images = return_vector
        return return_vector
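
    # A minimal sketch (not part of the original class) of the mask-selection rule used in
    # name_with_images_clusters above: lower the threshold until the binary mask covers at least
    # 20% of the spatial map. The helper name is hypothetical; coverage uses mean(), which is
    # equivalent to the hard-coded sum()/64.0 of the original for an 8x8 matchmap.
    @staticmethod
    def _select_mask_sketch(mask, min_coverage=0.2, start_th=0.64, step=0.02):
        th = start_th
        binary_mask = mask > mask.max() * th
        while binary_mask.astype(float).mean() < min_coverage:
            th -= step
            binary_mask = mask > mask.max() * th
        return binary_mask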