Python draw_labelmap Exemples, utils.imutils.draw_labelmap Python Exemples

Exemple #1

0

Afficher le fichier

    def __getitem__(self, index):
        sf = self.scale_factor
        rf = self.rot_factor
        if self.is_train:
            a = self.anno[self.train_list[index]]
        else:
            a = self.anno[self.valid_list[index]]

        img_path = os.path.join(self.img_folder, a['img_paths'])
        pts = torch.Tensor(a['joint_self'])
        # pts[:, 0:2] -= 1  # Convert pts to zero based

        # c = torch.Tensor(a['objpos']) - 1
        c = torch.Tensor(a['objpos'])
        s = a['scale_provided']

        # Adjust center/scale slightly to avoid cropping limbs
        if c[0] != -1:
            c[1] = c[1] + 15 * s
            s = s * 1.25

        # For single-person pose estimation with a centered/scaled figure
        nparts = pts.size(0)
        img = load_image(img_path)  # CxHxW

        r = 0
        if self.is_train:
            s = s * torch.randn(1).mul_(sf).add_(1).clamp(1 - sf, 1 + sf)[0]
            r = torch.randn(1).mul_(rf).clamp(
                -2 * rf, 2 * rf)[0] if random.random() <= 0.6 else 0

            # Flip
            if random.random() <= 0.5:
                img = fliplr(img)
                pts = shufflelr(pts, img.size(2), self.DATA_INFO.hflip_indices)
                c[0] = img.size(2) - c[0]

            # Color
            img[0, :, :].mul_(random.uniform(0.8, 1.2)).clamp_(0, 1)
            img[1, :, :].mul_(random.uniform(0.8, 1.2)).clamp_(0, 1)
            img[2, :, :].mul_(random.uniform(0.8, 1.2)).clamp_(0, 1)

        # Prepare image and groundtruth map
        inp = crop(img, c, s, self.inp_res, rot=r)
        inp = color_normalize(inp, self.DATA_INFO.rgb_mean,
                              self.DATA_INFO.rgb_stddev)

        # Generate ground truth
        tpts = pts.clone()
        target = torch.zeros(nparts, *self.out_res)
        target_weight = tpts[:, 2].clone().view(nparts, 1)

        for i in range(nparts):
            # if tpts[i, 2] > 0: # This is evil!!
            if tpts[i, 1] > 0:
                tpts[i, 0:2] = to_torch(
                    transform(tpts[i, 0:2] + 1, c, s, self.out_res, rot=r))
                target[i], vis = draw_labelmap(target[i],
                                               tpts[i] - 1,
                                               self.sigma,
                                               type=self.label_type)
                target_weight[i, 0] *= vis

        # Meta info
        if not isinstance(s, torch.Tensor):
            s = torch.Tensor(s)

        meta = {
            'index': index,
            'center': c,
            'scale': s,
            'pts': pts,
            'tpts': tpts,
            'target_weight': target_weight
        }

        return inp, target, meta

Exemple #2

0

Afficher le fichier

Fichier : dataset.py Projet : darylperalta/attention-target-detection

    def __getitem__(self, index):
        sequence_path = self.all_sequence_paths[index]
        df = pd.read_csv(
            sequence_path,
            header=None,
            index_col=False,
            names=['path', 'xmin', 'ymin', 'xmax', 'ymax', 'gazex', 'gazey'])
        show_name = sequence_path.split('/')[-3]
        clip = sequence_path.split('/')[-2]
        seq_len = len(df.index)

        # moving-avg smoothing
        window_size = 11  # should be odd number
        df['xmin'] = myutils.smooth_by_conv(window_size, df, 'xmin')
        df['ymin'] = myutils.smooth_by_conv(window_size, df, 'ymin')
        df['xmax'] = myutils.smooth_by_conv(window_size, df, 'xmax')
        df['ymax'] = myutils.smooth_by_conv(window_size, df, 'ymax')

        if not self.test:
            # cond for data augmentation
            cond_jitter = np.random.random_sample()
            cond_flip = np.random.random_sample()
            cond_color = np.random.random_sample()
            if cond_color < 0.5:
                n1 = np.random.uniform(0.5, 1.5)
                n2 = np.random.uniform(0.5, 1.5)
                n3 = np.random.uniform(0.5, 1.5)
            cond_crop = np.random.random_sample()

            # if longer than seq_len_limit, cut it down to the limit with the init index randomly sampled
            if seq_len > self.seq_len_limit:
                sampled_ind = np.random.randint(0,
                                                seq_len - self.seq_len_limit)
                seq_len = self.seq_len_limit
            else:
                sampled_ind = 0

            if cond_crop < 0.5:
                sliced_x_min = df['xmin'].iloc[sampled_ind:sampled_ind +
                                               seq_len]
                sliced_x_max = df['xmax'].iloc[sampled_ind:sampled_ind +
                                               seq_len]
                sliced_y_min = df['ymin'].iloc[sampled_ind:sampled_ind +
                                               seq_len]
                sliced_y_max = df['ymax'].iloc[sampled_ind:sampled_ind +
                                               seq_len]

                sliced_gaze_x = df['gazex'].iloc[sampled_ind:sampled_ind +
                                                 seq_len]
                sliced_gaze_y = df['gazey'].iloc[sampled_ind:sampled_ind +
                                                 seq_len]

                check_sum = sliced_gaze_x.sum() + sliced_gaze_y.sum()
                all_outside = check_sum == -2 * seq_len

                # Calculate the minimum valid range of the crop that doesn't exclude the face and the gaze target
                if all_outside:
                    crop_x_min = np.min(
                        [sliced_x_min.min(),
                         sliced_x_max.min()])
                    crop_y_min = np.min(
                        [sliced_y_min.min(),
                         sliced_y_max.min()])
                    crop_x_max = np.max(
                        [sliced_x_min.max(),
                         sliced_x_max.max()])
                    crop_y_max = np.max(
                        [sliced_y_min.max(),
                         sliced_y_max.max()])
                else:
                    crop_x_min = np.min([
                        sliced_gaze_x.min(),
                        sliced_x_min.min(),
                        sliced_x_max.min()
                    ])
                    crop_y_min = np.min([
                        sliced_gaze_y.min(),
                        sliced_y_min.min(),
                        sliced_y_max.min()
                    ])
                    crop_x_max = np.max([
                        sliced_gaze_x.max(),
                        sliced_x_min.max(),
                        sliced_x_max.max()
                    ])
                    crop_y_max = np.max([
                        sliced_gaze_y.max(),
                        sliced_y_min.max(),
                        sliced_y_max.max()
                    ])

                # Randomly select a random top left corner
                if crop_x_min >= 0:
                    crop_x_min = np.random.uniform(0, crop_x_min)
                if crop_y_min >= 0:
                    crop_y_min = np.random.uniform(0, crop_y_min)

                # Get image size
                path = os.path.join(self.data_dir, show_name, clip,
                                    df['path'].iloc[0])
                img = Image.open(path)
                img = img.convert('RGB')
                width, height = img.size

                # Find the range of valid crop width and height starting from the (crop_x_min, crop_y_min)
                crop_width_min = crop_x_max - crop_x_min
                crop_height_min = crop_y_max - crop_y_min
                crop_width_max = width - crop_x_min
                crop_height_max = height - crop_y_min
                # Randomly select a width and a height
                crop_width = np.random.uniform(crop_width_min, crop_width_max)
                crop_height = np.random.uniform(crop_height_min,
                                                crop_height_max)
        else:
            sampled_ind = 0


        faces, images, head_channels, heatmaps, paths, gazes, imsizes, gaze_inouts = [], [], [], [], [], [], [], []
        index_tracker = -1
        for i, row in df.iterrows():
            index_tracker = index_tracker + 1
            if not self.test:
                if index_tracker < sampled_ind or index_tracker >= (
                        sampled_ind + self.seq_len_limit):
                    continue

            face_x1 = row['xmin']  # note: Already in image coordinates
            face_y1 = row['ymin']  # note: Already in image coordinates
            face_x2 = row['xmax']  # note: Already in image coordinates
            face_y2 = row['ymax']  # note: Already in image coordinates
            gaze_x = row['gazex']  # note: Already in image coordinates
            gaze_y = row['gazey']  # note: Already in image coordinates

            impath = os.path.join(self.data_dir, show_name, clip, row['path'])
            img = Image.open(impath)
            img = img.convert('RGB')

            width, height = img.size
            imsize = torch.FloatTensor([width, height])
            # imsizes.append(imsize)

            face_x1, face_y1, face_x2, face_y2 = map(
                float, [face_x1, face_y1, face_x2, face_y2])
            gaze_x, gaze_y = map(float, [gaze_x, gaze_y])
            if gaze_x == -1 and gaze_y == -1:
                gaze_inside = False
            else:
                if gaze_x < 0:  # move gaze point that was sliglty outside the image back in
                    gaze_x = 0
                if gaze_y < 0:
                    gaze_y = 0
                gaze_inside = True

            if not self.test:
                ## data augmentation
                # Jitter (expansion-only) bounding box size.
                if cond_jitter < 0.5:
                    k = cond_jitter * 0.1
                    face_x1 -= k * abs(face_x2 - face_x1)
                    face_y1 -= k * abs(face_y2 - face_y1)
                    face_x2 += k * abs(face_x2 - face_x1)
                    face_y2 += k * abs(face_y2 - face_y1)
                    face_x1 = np.clip(face_x1, 0, width)
                    face_x2 = np.clip(face_x2, 0, width)
                    face_y1 = np.clip(face_y1, 0, height)
                    face_y2 = np.clip(face_y2, 0, height)

                # Random Crop
                if cond_crop < 0.5:
                    # Crop it
                    img = TF.crop(img, crop_y_min, crop_x_min, crop_height,
                                  crop_width)

                    # Record the crop's (x, y) offset
                    offset_x, offset_y = crop_x_min, crop_y_min

                    # convert coordinates into the cropped frame
                    face_x1, face_y1, face_x2, face_y2 = face_x1 - offset_x, face_y1 - offset_y, face_x2 - offset_x, face_y2 - offset_y
                    if gaze_inside:
                        gaze_x, gaze_y = (gaze_x- offset_x), \
                                         (gaze_y - offset_y)
                    else:
                        gaze_x = -1
                        gaze_y = -1

                    width, height = crop_width, crop_height

                # Flip?
                if cond_flip < 0.5:
                    img = img.transpose(Image.FLIP_LEFT_RIGHT)
                    x_max_2 = width - face_x1
                    x_min_2 = width - face_x2
                    face_x2 = x_max_2
                    face_x1 = x_min_2
                    if gaze_x != -1 and gaze_y != -1:
                        gaze_x = width - gaze_x

                # Random color change
                if cond_color < 0.5:
                    img = TF.adjust_brightness(img, brightness_factor=n1)
                    img = TF.adjust_contrast(img, contrast_factor=n2)
                    img = TF.adjust_saturation(img, saturation_factor=n3)

            # Face crop
            face = img.copy().crop(
                (int(face_x1), int(face_y1), int(face_x2), int(face_y2)))

            # Head channel image
            head_channel = imutils.get_head_box_channel(
                face_x1,
                face_y1,
                face_x2,
                face_y2,
                width,
                height,
                resolution=self.input_size,
                coordconv=False).unsqueeze(0)
            if self.transform is not None:
                img = self.transform(img)
                face = self.transform(face)

            # Deconv output
            if gaze_inside:
                gaze_x /= float(width)  # fractional gaze
                gaze_y /= float(height)
                gaze_heatmap = torch.zeros(
                    self.output_size,
                    self.output_size)  # set the size of the output
                gaze_map = imutils.draw_labelmap(
                    gaze_heatmap,
                    [gaze_x * self.output_size, gaze_y * self.output_size],
                    3,
                    type='Gaussian')
                gazes.append(torch.FloatTensor([gaze_x, gaze_y]))
            else:
                gaze_map = torch.zeros(self.output_size, self.output_size)
                gazes.append(torch.FloatTensor([-1, -1]))
            faces.append(face)
            images.append(img)
            head_channels.append(head_channel)
            heatmaps.append(gaze_map)
            gaze_inouts.append(torch.FloatTensor([int(gaze_inside)]))

        if self.imshow:
            for i in range(len(faces)):
                fig = plt.figure(111)
                img = 255 - imutils.unnorm(images[i].numpy()) * 255
                img = np.clip(img, 0, 255)
                plt.imshow(np.transpose(img, (1, 2, 0)))
                plt.imshow(imresize(heatmaps[i],
                                    (self.input_size, self.input_size)),
                           cmap='jet',
                           alpha=0.3)
                plt.imshow(imresize(1 - head_channels[i].squeeze(0),
                                    (self.input_size, self.input_size)),
                           alpha=0.2)
                plt.savefig(
                    os.path.join('debug',
                                 'viz_%d_inout=%d.png' % (i, gaze_inouts[i])))
                plt.close('all')

        faces = torch.stack(faces)
        images = torch.stack(images)
        head_channels = torch.stack(head_channels)
        heatmaps = torch.stack(heatmaps)
        gazes = torch.stack(gazes)
        gaze_inouts = torch.stack(gaze_inouts)
        # imsizes = torch.stack(imsizes)
        # print(faces.shape, images.shape, head_channels.shape, heatmaps.shape)

        if self.test:
            return images, faces, head_channels, heatmaps, gazes, gaze_inouts
        else:  # train
            return images, faces, head_channels, heatmaps, gaze_inouts

Exemple #3

0

Afficher le fichier

Fichier : dataset.py Projet : darylperalta/attention-target-detection

    def __getitem__(self, index):
        if self.test:
            g = self.X_test.get_group(self.keys[index])
            cont_gaze = []
            for i, row in g.iterrows():
                path = row['path']
                x_min = row['bbox_x_min']
                y_min = row['bbox_y_min']
                x_max = row['bbox_x_max']
                y_max = row['bbox_y_max']
                eye_x = row['eye_x']
                eye_y = row['eye_y']
                gaze_x = row['gaze_x']
                gaze_y = row['gaze_y']
                cont_gaze.append([gaze_x, gaze_y
                                  ])  # all ground truth gaze are stacked up
            for j in range(len(cont_gaze), 20):
                cont_gaze.append(
                    [-1,
                     -1])  # pad dummy gaze to match size for batch processing
            cont_gaze = torch.FloatTensor(cont_gaze)
            gaze_inside = True  # always consider test samples as inside

        else:
            path = self.X_train.iloc[index]
            eye_x, eye_y, gaze_x, gaze_y = self.y_train.iloc[index]
            gaze_inside = True  # bool(inout)

        img = Image.open(os.path.join(self.data_dir, path))
        img = img.convert('RGB')
        width, height = img.size
        # print('gaze coords: ', type(gaze_x), type(gaze_y), gaze_x, gaze_y)
        # print('eye coords: ', type(eye_x), type(eye_y), eye_x, eye_y)
        # expand face bbox a bit
        k = 0.1
        x_min = (eye_x - 0.15) * width
        y_min = (eye_y - 0.15) * height
        x_max = (eye_x + 0.15) * width
        y_max = (eye_y + 0.15) * height
        if x_min < 0:
            x_min = 0
        if y_min < 0:
            y_min = 0
        if x_max < 0:
            x_max = 0
        if y_max < 0:
            y_max = 0
        x_min -= k * abs(x_max - x_min)
        y_min -= k * abs(y_max - y_min)
        x_max += k * abs(x_max - x_min)
        y_max += k * abs(y_max - y_min)

        # x_min = eye_x - 0.15
        # y_min = eye_y - 0.15
        # x_max = eye_x + 0.15
        # y_max = eye_y + 0.15
        # if x_min < 0:
        #     x_min = 0
        # if y_min < 0:
        #     y_min = 0
        # if x_max < 0:
        #     x_max = 0
        # if y_max < 0:
        #     y_max = 0

        # print('bbx',  [x_min, y_min, x_max, y_max])

        x_min, y_min, x_max, y_max = map(float, [x_min, y_min, x_max, y_max])
        # print(x_min, y_min, x_max, y_max)
        if self.imshow:
            img.save("origin_img.jpg")

        if self.test:
            imsize = torch.IntTensor([width, height])
        else:
            ## data augmentation

            # Jitter (expansion-only) bounding box size
            if np.random.random_sample() <= 0.5:
                k = np.random.random_sample() * 0.2
                x_min -= k * abs(x_max - x_min)
                y_min -= k * abs(y_max - y_min)
                x_max += k * abs(x_max - x_min)
                y_max += k * abs(y_max - y_min)

            # Random Crop
            if np.random.random_sample() <= 0.5:
                # Calculate the minimum valid range of the crop that doesn't exclude the face and the gaze target
                crop_x_min = np.min([gaze_x * width, x_min, x_max])
                crop_y_min = np.min([gaze_y * height, y_min, y_max])
                crop_x_max = np.max([gaze_x * width, x_min, x_max])
                crop_y_max = np.max([gaze_y * height, y_min, y_max])

                # Randomly select a random top left corner
                if crop_x_min >= 0:
                    crop_x_min = np.random.uniform(0, crop_x_min)
                if crop_y_min >= 0:
                    crop_y_min = np.random.uniform(0, crop_y_min)

                # Find the range of valid crop width and height starting from the (crop_x_min, crop_y_min)
                crop_width_min = crop_x_max - crop_x_min
                crop_height_min = crop_y_max - crop_y_min
                crop_width_max = width - crop_x_min
                crop_height_max = height - crop_y_min
                # Randomly select a width and a height
                crop_width = np.random.uniform(crop_width_min, crop_width_max)
                crop_height = np.random.uniform(crop_height_min,
                                                crop_height_max)

                # Crop it
                img = TF.crop(img, crop_y_min, crop_x_min, crop_height,
                              crop_width)

                # Record the crop's (x, y) offset
                offset_x, offset_y = crop_x_min, crop_y_min

                # convert coordinates into the cropped frame
                x_min, y_min, x_max, y_max = x_min - offset_x, y_min - offset_y, x_max - offset_x, y_max - offset_y
                # if gaze_inside:
                gaze_x, gaze_y = (gaze_x * width - offset_x) / float(crop_width), \
                                 (gaze_y * height - offset_y) / float(crop_height)
                # else:
                #     gaze_x = -1; gaze_y = -1

                width, height = crop_width, crop_height

            # Random flip
            if np.random.random_sample() <= 0.5:
                img = img.transpose(Image.FLIP_LEFT_RIGHT)
                x_max_2 = width - x_min
                x_min_2 = width - x_max
                x_max = x_max_2
                x_min = x_min_2
                gaze_x = 1 - gaze_x

            # Random color change
            if np.random.random_sample() <= 0.5:
                img = TF.adjust_brightness(img,
                                           brightness_factor=np.random.uniform(
                                               0.5, 1.5))
                img = TF.adjust_contrast(img,
                                         contrast_factor=np.random.uniform(
                                             0.5, 1.5))
                img = TF.adjust_saturation(img,
                                           saturation_factor=np.random.uniform(
                                               0, 1.5))
        # print('bbx2',  [x_min, y_min, x_max, y_max])

        head_channel = imutils.get_head_box_channel(
            x_min,
            y_min,
            x_max,
            y_max,
            width,
            height,
            resolution=self.input_size,
            coordconv=False).unsqueeze(0)

        # Crop the face
        face = img.crop((int(x_min), int(y_min), int(x_max), int(y_max)))

        if self.imshow:
            img.save("img_aug.jpg")
            face.save('face_aug.jpg')

        if self.transform is not None:
            img = self.transform(img)
            face = self.transform(face)
        # print('imsize2', img.size())

        # generate the heat map used for deconv prediction
        gaze_heatmap = torch.zeros(
            self.output_size, self.output_size)  # set the size of the output
        # print([gaze_x * self.output_size, gaze_y * self.output_size])
        # print(self.output_size)
        if self.test:  # aggregated heatmap
            num_valid = 0
            for gaze_x, gaze_y in cont_gaze:
                if gaze_x != -1:
                    num_valid += 1
                    gaze_heatmap = imutils.draw_labelmap(
                        gaze_heatmap,
                        [gaze_x * self.output_size, gaze_y * self.output_size],
                        3,
                        type='Gaussian')
            gaze_heatmap /= num_valid
        else:
            # if gaze_inside:
            gaze_heatmap = imutils.draw_labelmap(
                gaze_heatmap,
                [gaze_x * self.output_size, gaze_y * self.output_size],
                3,
                type='Gaussian')

        if self.imshow:
            fig = plt.figure(111)
            img = 255 - imutils.unnorm(img.numpy()) * 255
            img = np.clip(img, 0, 255)
            plt.imshow(np.transpose(img, (1, 2, 0)))
            plt.imshow(imresize(gaze_heatmap,
                                (self.input_size, self.input_size)),
                       cmap='jet',
                       alpha=0.3)
            plt.imshow(imresize(1 - head_channel.squeeze(0),
                                (self.input_size, self.input_size)),
                       alpha=0.2)
            plt.savefig('viz_aug.png')

        if self.test:
            return img, face, head_channel, gaze_heatmap, cont_gaze, imsize, path
        else:
            return img, face, head_channel, gaze_heatmap, path, gaze_inside

Exemple #4

0

Afficher le fichier

Fichier : eval_on_videoatttarget.py Projet : nicole7han/attention-target-detection

def test():
    transform = _get_transform()

    # Prepare data
    print("Loading Data")
    val_dataset = VideoAttTarget_video(videoattentiontarget_val_data,
                                       videoattentiontarget_val_label,
                                       transform=transform,
                                       test=True,
                                       seq_len_limit=50)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=0,
                                             collate_fn=video_pack_sequences)

    # Define device
    device = torch.device('cuda', args.device)

    # Load model
    num_lstm_layers = 2
    print("Constructing model")
    model = ModelSpatioTemporal(num_lstm_layers=num_lstm_layers)
    model.cuda(device)

    print("Loading weights")
    model_dict = model.state_dict()
    snapshot = torch.load(args.model_weights)
    snapshot = snapshot['model']
    model_dict.update(snapshot)
    model.load_state_dict(model_dict)

    print('Evaluation in progress ...')
    model.train(False)
    AUC = []
    in_vs_out_groundtruth = []
    in_vs_out_pred = []
    distance = []
    chunk_size = 3
    with torch.no_grad():
        for batch_val, (img_val, face_val, head_channel_val, gaze_heatmap_val,
                        cont_gaze, inout_label_val,
                        lengths_val) in enumerate(val_loader):
            print('\tprogress = ', batch_val + 1, '/', len(val_loader))
            X_pad_data_img, X_pad_sizes = pack_padded_sequence(
                img_val, lengths_val, batch_first=True)
            X_pad_data_head, _ = pack_padded_sequence(head_channel_val,
                                                      lengths_val,
                                                      batch_first=True)
            X_pad_data_face, _ = pack_padded_sequence(face_val,
                                                      lengths_val,
                                                      batch_first=True)
            Y_pad_data_cont_gaze, _ = pack_padded_sequence(cont_gaze,
                                                           lengths_val,
                                                           batch_first=True)
            Y_pad_data_heatmap, _ = pack_padded_sequence(gaze_heatmap_val,
                                                         lengths_val,
                                                         batch_first=True)
            Y_pad_data_inout, _ = pack_padded_sequence(inout_label_val,
                                                       lengths_val,
                                                       batch_first=True)

            hx = (torch.zeros(
                (num_lstm_layers, args.batch_size, 512, 7, 7)).cuda(device),
                  torch.zeros((num_lstm_layers, args.batch_size, 512, 7,
                               7)).cuda(device)
                  )  # (num_layers, batch_size, feature dims)
            last_index = 0
            previous_hx_size = args.batch_size

            for i in range(0, lengths_val[0], chunk_size):
                X_pad_sizes_slice = X_pad_sizes[i:i + chunk_size].cuda(device)
                curr_length = np.sum(X_pad_sizes_slice.cpu().detach().numpy())
                # slice padded data
                X_pad_data_slice_img = X_pad_data_img[last_index:last_index +
                                                      curr_length].cuda(device)
                X_pad_data_slice_head = X_pad_data_head[last_index:last_index +
                                                        curr_length].cuda(
                                                            device)
                X_pad_data_slice_face = X_pad_data_face[last_index:last_index +
                                                        curr_length].cuda(
                                                            device)
                Y_pad_data_slice_cont_gaze = Y_pad_data_cont_gaze[
                    last_index:last_index + curr_length].cuda(device)
                Y_pad_data_slice_heatmap = Y_pad_data_heatmap[
                    last_index:last_index + curr_length].cuda(device)
                Y_pad_data_slice_inout = Y_pad_data_inout[
                    last_index:last_index + curr_length].cuda(device)
                last_index += curr_length

                # detach previous hidden states to stop gradient flow
                prev_hx = (hx[0][:, :min(X_pad_sizes_slice[0], previous_hx_size
                                         ), :, :, :].detach(),
                           hx[1][:, :min(X_pad_sizes_slice[0], previous_hx_size
                                         ), :, :, :].detach())

                # forward pass
                deconv, inout_val, hx = model(X_pad_data_slice_img, X_pad_data_slice_head, X_pad_data_slice_face, \
                                                         hidden_scene=prev_hx, batch_sizes=X_pad_sizes_slice)

                for b_i in range(len(Y_pad_data_slice_cont_gaze)):
                    if Y_pad_data_slice_inout[b_i]:  # ONLY for 'inside' cases
                        # AUC: area under curve of ROC
                        multi_hot = torch.zeros(
                            output_resolution,
                            output_resolution)  # set the size of the output
                        gaze_x = Y_pad_data_slice_cont_gaze[b_i, 0]
                        gaze_y = Y_pad_data_slice_cont_gaze[b_i, 1]
                        multi_hot = imutils.draw_labelmap(multi_hot, [
                            gaze_x * output_resolution,
                            gaze_y * output_resolution
                        ],
                                                          3,
                                                          type='Gaussian')
                        multi_hot = (multi_hot > 0).float(
                        ) * 1  # make GT heatmap as binary labels
                        multi_hot = misc.to_numpy(multi_hot)

                        scaled_heatmap = imresize(
                            deconv[b_i].squeeze(),
                            (output_resolution, output_resolution),
                            interp='bilinear')
                        auc_score = evaluation.auc(scaled_heatmap, multi_hot)
                        AUC.append(auc_score)

                        # distance: L2 distance between ground truth and argmax point
                        pred_x, pred_y = evaluation.argmax_pts(
                            deconv[b_i].squeeze())
                        norm_p = [
                            pred_x / output_resolution,
                            pred_y / output_resolution
                        ]
                        dist_score = evaluation.L2_dist(
                            Y_pad_data_slice_cont_gaze[b_i], norm_p).item()
                        distance.append(dist_score)

                # in vs out classification
                in_vs_out_groundtruth.extend(
                    Y_pad_data_slice_inout.cpu().numpy())
                in_vs_out_pred.extend(inout_val.cpu().numpy())

                previous_hx_size = X_pad_sizes_slice[-1]

            try:
                print("\tAUC:{:.4f}"
                      "\tdist:{:.4f}"
                      "\tin vs out AP:{:.4f}".format(
                          torch.mean(torch.tensor(AUC)),
                          torch.mean(torch.tensor(distance)),
                          evaluation.ap(in_vs_out_groundtruth,
                                        in_vs_out_pred)))
            except:
                pass

    print("Summary ")
    print("\tAUC:{:.4f}"
          "\tdist:{:.4f}"
          "\tin vs out AP:{:.4f}".format(
              torch.mean(torch.tensor(AUC)),
              torch.mean(torch.tensor(distance)),
              evaluation.ap(in_vs_out_groundtruth, in_vs_out_pred)))