def process_frame(self, frame_raw, box, canvas=None):
        """
        Predict where the person inside `box` is looking and draw the result

        Parameters
        ==========
            frame_raw: (np.ndarray | PIL.Image) - the frame to analyse; an
                array is wrapped with Image.fromarray
            box: (tuple[Int]) - head bounding box. It is handed to Image.crop
                and drawn with (box[0], box[1]) and (box[2], box[3]) as
                opposite corners, i.e. (left, top, right, bottom)
            canvas: (np.ndarray | PIL.Image) - an image to draw to instead of
                frame; an array is wrapped with Image.fromarray
        Returns
        =======
            canvas: (PIL.Image) - the image that was drawn on. Array inputs
                are converted, so the result is a PIL image, not an array.
                Nothing is drawn when the in/out score is not below
                self.out_threshold.
        Raises
        ======
            ValueError - when self.vis_mode is anything other than 'arrow'
        """
        # Reject unsupported modes up front instead of after a full forward
        # pass through the model.
        if self.vis_mode != 'arrow':
            raise ValueError(f"vis_mode {self.vis_mode} is not supported")

        if isinstance(frame_raw, np.ndarray):
            frame_raw = Image.fromarray(frame_raw)

        # Without an explicit canvas the (pre-RGB-conversion) frame itself is
        # drawn on.
        if canvas is None:
            canvas = frame_raw
        elif isinstance(canvas, np.ndarray):
            canvas = Image.fromarray(canvas)

        frame_raw = frame_raw.convert('RGB')
        width, height = frame_raw.size

        with torch.no_grad():
            # Model inputs: head crop, head-position channel and full frame.
            head = self.test_transforms(frame_raw.crop(box))
            head_channel = imutils.get_head_box_channel(
                box[0],
                box[1],
                box[2],
                box[3],
                width,
                height,
                resolution=input_resolution).unsqueeze(0)
            frame = self.test_transforms(frame_raw)

            # Add the batch dimension and move everything to the model device.
            head = head.unsqueeze(0).to(self.device)
            frame = frame.unsqueeze(0).to(self.device)
            head_channel = head_channel.unsqueeze(0).to(self.device)

            raw_hm, _, inout = self.model(frame, head_channel, head)

            raw_hm = (raw_hm.cpu().detach().numpy() * 255).squeeze()
            # Sigmoid of the in/out logit, inverted and scaled by 255; it is
            # compared against self.out_threshold below.
            inout = inout.cpu().detach().numpy()
            inout = 1 / (1 + np.exp(-inout))
            inout = (1 - inout) * 255

            if inout < self.out_threshold:  # in-frame gaze
                pred_x, pred_y = evaluation.argmax_pts(raw_hm)
                # Heatmap peak as a fraction of the heatmap size, then mapped
                # to frame pixels.
                norm_p = [
                    pred_x / output_resolution, pred_y / output_resolution
                ]
                heatmap_center = (norm_p[0] * width, norm_p[1] * height)
                box_center = (box[0] + (box[2] - box[0]) // 2,
                              box[1] + (box[3] - box[1]) // 2)

                draw = ImageDraw.Draw(canvas)
                draw.rectangle([(box[0], box[1]), (box[2], box[3])],
                               outline="green",
                               width=3)
                draw.line([heatmap_center, box_center],
                          fill="green",
                          width=3)
                draw.ellipse(
                    [(heatmap_center[0] - 10, heatmap_center[1] - 10),
                     (heatmap_center[0] + 10, heatmap_center[1] + 10)],
                    fill="green")

            return canvas
    def __getitem__(self, index):
        """
        Load one sample: scene image, head crop, head-position channel and
        gaze heatmap.

        In test mode `index` selects a group of annotation rows from
        self.X_test (via self.keys); all gaze points of the group are returned
        and averaged into one heatmap, and no augmentation is applied.
        In train mode `index` selects one row of self.X_train / self.y_train
        and random box jitter, crop, horizontal flip and colour changes are
        applied.

        Returns
        =======
            test:  (img, face, head_channel, gaze_heatmap, cont_gaze, imsize,
                    path)
            train: (img, face, head_channel, gaze_heatmap, path, gaze_inside)
        """
        if self.test:
            g = self.X_test.get_group(self.keys[index])
            cont_gaze = []
            # path / eye position keep the values of the last row of the
            # group. The bbox_* columns are not read: the head box is rebuilt
            # from the eye position below.
            for _, row in g.iterrows():
                path = row['path']
                eye_x = row['eye_x']
                eye_y = row['eye_y']
                # all ground truth gaze are stacked up
                cont_gaze.append([row['gaze_x'], row['gaze_y']])
            # pad dummy gaze to match size for batch processing
            cont_gaze.extend([[-1, -1]] * (20 - len(cont_gaze)))
            cont_gaze = torch.FloatTensor(cont_gaze)
            gaze_inside = True  # always consider test samples as inside

        else:
            path = self.X_train.iloc[index]
            eye_x, eye_y, gaze_x, gaze_y = self.y_train.iloc[index]
            gaze_inside = True  # bool(inout)

        img = Image.open(os.path.join(self.data_dir, path))
        img = img.convert('RGB')
        width, height = img.size

        # Head box: +/-0.15 (in normalised units) around the eye position,
        # converted to pixels. All four edges are clamped at 0 only.
        k = 0.1
        x_min = max((eye_x - 0.15) * width, 0)
        y_min = max((eye_y - 0.15) * height, 0)
        x_max = max((eye_x + 0.15) * width, 0)
        y_max = max((eye_y + 0.15) * height, 0)
        # expand face bbox a bit
        # NOTE(review): x_max / y_max are grown using the already-updated
        # x_min / y_min, so the box grows slightly more on the max side.
        # Kept as-is to preserve the existing box geometry.
        x_min -= k * abs(x_max - x_min)
        y_min -= k * abs(y_max - y_min)
        x_max += k * abs(x_max - x_min)
        y_max += k * abs(y_max - y_min)

        x_min, y_min, x_max, y_max = map(float, [x_min, y_min, x_max, y_max])
        if self.imshow:
            img.save("origin_img.jpg")

        if self.test:
            imsize = torch.IntTensor([width, height])
        else:
            ## data augmentation

            # Jitter (expansion-only) bounding box size
            if np.random.random_sample() <= 0.5:
                k = np.random.random_sample() * 0.2
                x_min -= k * abs(x_max - x_min)
                y_min -= k * abs(y_max - y_min)
                x_max += k * abs(x_max - x_min)
                y_max += k * abs(y_max - y_min)

            # Random Crop
            if np.random.random_sample() <= 0.5:
                # Calculate the minimum valid range of the crop that doesn't
                # exclude the face and the gaze target
                crop_x_min = np.min([gaze_x * width, x_min, x_max])
                crop_y_min = np.min([gaze_y * height, y_min, y_max])
                crop_x_max = np.max([gaze_x * width, x_min, x_max])
                crop_y_max = np.max([gaze_y * height, y_min, y_max])

                # Randomly select a random top left corner
                if crop_x_min >= 0:
                    crop_x_min = np.random.uniform(0, crop_x_min)
                if crop_y_min >= 0:
                    crop_y_min = np.random.uniform(0, crop_y_min)

                # Find the range of valid crop width and height starting from
                # the (crop_x_min, crop_y_min)
                crop_width_min = crop_x_max - crop_x_min
                crop_height_min = crop_y_max - crop_y_min
                crop_width_max = width - crop_x_min
                crop_height_max = height - crop_y_min
                # Randomly select a width and a height
                crop_width = np.random.uniform(crop_width_min, crop_width_max)
                crop_height = np.random.uniform(crop_height_min,
                                                crop_height_max)

                # Crop it
                img = TF.crop(img, crop_y_min, crop_x_min, crop_height,
                              crop_width)

                # Record the crop's (x, y) offset
                offset_x, offset_y = crop_x_min, crop_y_min

                # convert coordinates into the cropped frame
                x_min, x_max = x_min - offset_x, x_max - offset_x
                y_min, y_max = y_min - offset_y, y_max - offset_y
                # gaze stays normalised, now relative to the crop
                gaze_x = (gaze_x * width - offset_x) / float(crop_width)
                gaze_y = (gaze_y * height - offset_y) / float(crop_height)

                width, height = crop_width, crop_height

            # Random flip
            if np.random.random_sample() <= 0.5:
                img = img.transpose(Image.FLIP_LEFT_RIGHT)
                # mirror the box; left and right edges swap roles
                x_min, x_max = width - x_max, width - x_min
                gaze_x = 1 - gaze_x

            # Random color change
            if np.random.random_sample() <= 0.5:
                img = TF.adjust_brightness(img,
                                           brightness_factor=np.random.uniform(
                                               0.5, 1.5))
                img = TF.adjust_contrast(img,
                                         contrast_factor=np.random.uniform(
                                             0.5, 1.5))
                img = TF.adjust_saturation(img,
                                           saturation_factor=np.random.uniform(
                                               0, 1.5))

        head_channel = imutils.get_head_box_channel(
            x_min,
            y_min,
            x_max,
            y_max,
            width,
            height,
            resolution=self.input_size,
            coordconv=False).unsqueeze(0)

        # Crop the face
        face = img.crop((int(x_min), int(y_min), int(x_max), int(y_max)))

        if self.imshow:
            img.save("img_aug.jpg")
            face.save('face_aug.jpg')

        if self.transform is not None:
            img = self.transform(img)
            face = self.transform(face)

        # generate the heat map used for deconv prediction
        gaze_heatmap = torch.zeros(
            self.output_size, self.output_size)  # set the size of the output
        if self.test:  # aggregated heatmap
            num_valid = 0
            for gaze_x, gaze_y in cont_gaze:
                if gaze_x != -1:  # skip the [-1, -1] padding entries
                    num_valid += 1
                    gaze_heatmap = imutils.draw_labelmap(
                        gaze_heatmap,
                        [gaze_x * self.output_size, gaze_y * self.output_size],
                        3,
                        type='Gaussian')
            # Average over the valid points; with none, keep the all-zero map
            # instead of dividing by zero (which would fill it with NaN).
            if num_valid > 0:
                gaze_heatmap /= num_valid
        else:
            gaze_heatmap = imutils.draw_labelmap(
                gaze_heatmap,
                [gaze_x * self.output_size, gaze_y * self.output_size],
                3,
                type='Gaussian')

        if self.imshow:
            plt.figure(111)
            # Plot from a separate array so that the `img` returned below is
            # still the transformed image, not this visualisation.
            vis = 255 - imutils.unnorm(img.numpy()) * 255
            vis = np.clip(vis, 0, 255)
            plt.imshow(np.transpose(vis, (1, 2, 0)))
            plt.imshow(imresize(gaze_heatmap,
                                (self.input_size, self.input_size)),
                       cmap='jet',
                       alpha=0.3)
            plt.imshow(imresize(1 - head_channel.squeeze(0),
                                (self.input_size, self.input_size)),
                       alpha=0.2)
            plt.savefig('viz_aug.png')

        if self.test:
            return img, face, head_channel, gaze_heatmap, cont_gaze, imsize, path
        else:
            return img, face, head_channel, gaze_heatmap, path, gaze_inside
# Example no. 3
# 0
def run(args):
    """
    Run gaze-target inference for every head box listed in a CSV file and
    display each result with matplotlib.

    Attributes read from `args`:
        head: path of a header-less CSV with the columns
            frame, left, top, right, bottom (frame is used as the index and
            as the image file name)
        image_dir: directory the frame names are resolved against
        model_weights: checkpoint path; its 'model' entry is loaded
        vis_mode: 'arrow' draws the predicted gaze point and a line from the
            head-box centre; any other value overlays the heatmap
        out_threshold: the arrow is only drawn when the in/out score is
            below this value

    Each figure is shown with a blocking plt.show and closed afterwards.
    """
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    column_names = ['frame', 'left', 'top', 'right', 'bottom']
    df = pd.read_csv(args.head, names=column_names, index_col=0)
    # Expand the head boxes by 10%.
    # NOTE(review): 'right' / 'bottom' are grown using the already-updated
    # 'left' / 'top', so the box grows slightly more on that side. Kept
    # as-is to preserve the existing box geometry.
    df['left'] -= (df['right'] - df['left']) * 0.1
    df['right'] += (df['right'] - df['left']) * 0.1
    df['top'] -= (df['bottom'] - df['top']) * 0.1
    df['bottom'] += (df['bottom'] - df['top']) * 0.1

    # set up data transformation
    test_transforms = _get_transform()

    # Overlay the checkpoint weights on the model's own state dict, so keys
    # missing from the checkpoint keep their initial values.
    model = ModelSpatial()
    model_dict = model.state_dict()
    pretrained_dict = torch.load(args.model_weights, map_location=device)
    pretrained_dict = pretrained_dict['model']
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)

    model.to(device)
    model.train(False)

    with torch.no_grad():
        # Iterate rows rather than index labels so that a frame listed more
        # than once (several heads) yields one scalar box per row.
        for row in df.itertuples():
            frame_raw = Image.open(os.path.join(args.image_dir, row.Index))
            frame_raw = frame_raw.convert('RGB')
            width, height = frame_raw.size

            head_box = [row.left, row.top, row.right, row.bottom]

            head = frame_raw.crop(head_box)  # head crop

            head = test_transforms(head)  # transform inputs
            frame = test_transforms(frame_raw)
            head_channel = imutils.get_head_box_channel(
                head_box[0],
                head_box[1],
                head_box[2],
                head_box[3],
                width,
                height,
                resolution=input_resolution).unsqueeze(0)

            head = head.unsqueeze(0).to(device)
            frame = frame.unsqueeze(0).to(device)
            head_channel = head_channel.unsqueeze(0).to(device)

            # forward pass
            raw_hm, _, inout = model(frame, head_channel, head)

            # heatmap modulation
            raw_hm = raw_hm.cpu().detach().numpy() * 255
            raw_hm = raw_hm.squeeze()
            # sigmoid of the in/out logit, inverted and scaled by 255
            inout = inout.cpu().detach().numpy()
            inout = 1 / (1 + np.exp(-inout))
            inout = (1 - inout) * 255

            # vis
            fig = plt.figure()
            plt.axis('off')
            plt.imshow(frame_raw)

            ax = plt.gca()
            rect = patches.Rectangle((head_box[0], head_box[1]),
                                     head_box[2] - head_box[0],
                                     head_box[3] - head_box[1],
                                     linewidth=2,
                                     edgecolor=(0, 1, 0),
                                     facecolor='none')
            ax.add_patch(rect)

            if args.vis_mode == 'arrow':
                if inout < args.out_threshold:  # in-frame gaze
                    pred_x, pred_y = evaluation.argmax_pts(raw_hm)
                    norm_p = [
                        pred_x / output_resolution, pred_y / output_resolution
                    ]
                    target = (norm_p[0] * width, norm_p[1] * height)
                    circ = patches.Circle(target,
                                          height / 50.0,
                                          facecolor=(0, 1, 0),
                                          edgecolor='none')
                    ax.add_patch(circ)
                    plt.plot(
                        (target[0], (head_box[0] + head_box[2]) / 2),
                        (target[1], (head_box[1] + head_box[3]) / 2),
                        '-',
                        color=(0, 1, 0, 1))
            else:
                # The resized heatmap is only needed for the overlay, so it
                # is computed here rather than for every frame.
                norm_map = imresize(raw_hm, (height, width)) - inout
                plt.imshow(norm_map, cmap='jet', alpha=0.2, vmin=0, vmax=255)

            # One blocking show per frame, then release the figure so they
            # do not pile up over a long frame list.
            plt.show(block=True)
            plt.close(fig)

        print('DONE!')
    def __getitem__(self, index):
        """
        Load one annotated frame sequence as stacked tensors.

        The annotation CSV at self.all_sequence_paths[index] has no header
        and the columns path, xmin, ymin, xmax, ymax, gazex, gazey. Box and
        gaze values are in image pixel coordinates; a gaze of (-1, -1) marks
        an out-of-frame target. The show and clip names are taken from the
        third- and second-to-last '/'-separated components of the CSV path.

        In train mode at most self.seq_len_limit consecutive frames are used
        (random start) and one set of augmentation decisions (jitter, crop,
        flip, colour) is drawn per sequence and applied to every frame.
        In test mode the whole sequence is used without augmentation.

        Returns
        =======
            test:  (images, faces, head_channels, heatmaps, gazes,
                    gaze_inouts)
            train: (images, faces, head_channels, heatmaps, gaze_inouts)
        """
        sequence_path = self.all_sequence_paths[index]
        df = pd.read_csv(
            sequence_path,
            header=None,
            index_col=False,
            names=['path', 'xmin', 'ymin', 'xmax', 'ymax', 'gazex', 'gazey'])
        path_parts = sequence_path.split('/')
        show_name = path_parts[-3]
        clip = path_parts[-2]
        seq_len = len(df.index)

        # moving-avg smoothing
        window_size = 11  # should be odd number
        for column in ('xmin', 'ymin', 'xmax', 'ymax'):
            df[column] = myutils.smooth_by_conv(window_size, df, column)

        if not self.test:
            # cond for data augmentation; drawn once so that every frame of
            # the sequence gets the same treatment
            cond_jitter = np.random.random_sample()
            cond_flip = np.random.random_sample()
            cond_color = np.random.random_sample()
            if cond_color < 0.5:
                n1 = np.random.uniform(0.5, 1.5)
                n2 = np.random.uniform(0.5, 1.5)
                n3 = np.random.uniform(0.5, 1.5)
            cond_crop = np.random.random_sample()

            # if longer than seq_len_limit, cut it down to the limit with the
            # init index randomly sampled
            if seq_len > self.seq_len_limit:
                # randint's upper bound is exclusive; the +1 makes the window
                # that ends on the last frame selectable as well.
                sampled_ind = np.random.randint(
                    0, seq_len - self.seq_len_limit + 1)
                seq_len = self.seq_len_limit
            else:
                sampled_ind = 0

            if cond_crop < 0.5:
                window = df.iloc[sampled_ind:sampled_ind + seq_len]
                sliced_x_min = window['xmin']
                sliced_x_max = window['xmax']
                sliced_y_min = window['ymin']
                sliced_y_max = window['ymax']
                sliced_gaze_x = window['gazex']
                sliced_gaze_y = window['gazey']

                # every gaze coordinate being -1 sums to -2 per frame
                check_sum = sliced_gaze_x.sum() + sliced_gaze_y.sum()
                all_outside = check_sum == -2 * seq_len

                # Calculate the minimum valid range of the crop that doesn't
                # exclude the face and the gaze target
                if all_outside:
                    crop_x_min = np.min(
                        [sliced_x_min.min(),
                         sliced_x_max.min()])
                    crop_y_min = np.min(
                        [sliced_y_min.min(),
                         sliced_y_max.min()])
                    crop_x_max = np.max(
                        [sliced_x_min.max(),
                         sliced_x_max.max()])
                    crop_y_max = np.max(
                        [sliced_y_min.max(),
                         sliced_y_max.max()])
                else:
                    # NOTE(review): when only some frames have gaze == -1,
                    # those -1 values enter the min() below and leave the
                    # crop origin negative - confirm this is intended.
                    crop_x_min = np.min([
                        sliced_gaze_x.min(),
                        sliced_x_min.min(),
                        sliced_x_max.min()
                    ])
                    crop_y_min = np.min([
                        sliced_gaze_y.min(),
                        sliced_y_min.min(),
                        sliced_y_max.min()
                    ])
                    crop_x_max = np.max([
                        sliced_gaze_x.max(),
                        sliced_x_min.max(),
                        sliced_x_max.max()
                    ])
                    crop_y_max = np.max([
                        sliced_gaze_y.max(),
                        sliced_y_min.max(),
                        sliced_y_max.max()
                    ])

                # Randomly select a random top left corner
                if crop_x_min >= 0:
                    crop_x_min = np.random.uniform(0, crop_x_min)
                if crop_y_min >= 0:
                    crop_y_min = np.random.uniform(0, crop_y_min)

                # Get image size from the first frame of the sequence; only
                # the size is needed, so the file is closed right away.
                first_frame = os.path.join(self.data_dir, show_name, clip,
                                           df['path'].iloc[0])
                with Image.open(first_frame) as probe:
                    width, height = probe.size

                # Find the range of valid crop width and height starting from
                # the (crop_x_min, crop_y_min)
                crop_width_min = crop_x_max - crop_x_min
                crop_height_min = crop_y_max - crop_y_min
                crop_width_max = width - crop_x_min
                crop_height_max = height - crop_y_min
                # Randomly select a width and a height
                crop_width = np.random.uniform(crop_width_min, crop_width_max)
                crop_height = np.random.uniform(crop_height_min,
                                                crop_height_max)
            rows = df.iloc[sampled_ind:sampled_ind + self.seq_len_limit]
        else:
            sampled_ind = 0
            rows = df

        faces, images, head_channels, heatmaps = [], [], [], []
        gazes, gaze_inouts = [], []
        for _, row in rows.iterrows():
            # note: all of these are already in image coordinates
            face_x1, face_y1, face_x2, face_y2 = map(
                float, [row['xmin'], row['ymin'], row['xmax'], row['ymax']])
            gaze_x, gaze_y = map(float, [row['gazex'], row['gazey']])

            impath = os.path.join(self.data_dir, show_name, clip, row['path'])
            img = Image.open(impath)
            img = img.convert('RGB')

            width, height = img.size

            if gaze_x == -1 and gaze_y == -1:
                gaze_inside = False
            else:
                # move gaze point that was slightly outside the image back in
                if gaze_x < 0:
                    gaze_x = 0
                if gaze_y < 0:
                    gaze_y = 0
                gaze_inside = True

            if not self.test:
                ## data augmentation
                # Jitter (expansion-only) bounding box size.
                if cond_jitter < 0.5:
                    k = cond_jitter * 0.1
                    face_x1 -= k * abs(face_x2 - face_x1)
                    face_y1 -= k * abs(face_y2 - face_y1)
                    face_x2 += k * abs(face_x2 - face_x1)
                    face_y2 += k * abs(face_y2 - face_y1)
                    face_x1 = np.clip(face_x1, 0, width)
                    face_x2 = np.clip(face_x2, 0, width)
                    face_y1 = np.clip(face_y1, 0, height)
                    face_y2 = np.clip(face_y2, 0, height)

                # Random Crop
                if cond_crop < 0.5:
                    # Crop it
                    img = TF.crop(img, crop_y_min, crop_x_min, crop_height,
                                  crop_width)

                    # Record the crop's (x, y) offset
                    offset_x, offset_y = crop_x_min, crop_y_min

                    # convert coordinates into the cropped frame
                    face_x1, face_x2 = face_x1 - offset_x, face_x2 - offset_x
                    face_y1, face_y2 = face_y1 - offset_y, face_y2 - offset_y
                    if gaze_inside:
                        gaze_x = gaze_x - offset_x
                        gaze_y = gaze_y - offset_y
                    else:
                        gaze_x = -1
                        gaze_y = -1

                    width, height = crop_width, crop_height

                # Flip?
                if cond_flip < 0.5:
                    img = img.transpose(Image.FLIP_LEFT_RIGHT)
                    # mirror the box; left and right edges swap roles
                    face_x1, face_x2 = width - face_x2, width - face_x1
                    if gaze_x != -1 and gaze_y != -1:
                        gaze_x = width - gaze_x

                # Random color change
                if cond_color < 0.5:
                    img = TF.adjust_brightness(img, brightness_factor=n1)
                    img = TF.adjust_contrast(img, contrast_factor=n2)
                    img = TF.adjust_saturation(img, saturation_factor=n3)

            # Face crop
            face = img.copy().crop(
                (int(face_x1), int(face_y1), int(face_x2), int(face_y2)))

            # Head channel image
            head_channel = imutils.get_head_box_channel(
                face_x1,
                face_y1,
                face_x2,
                face_y2,
                width,
                height,
                resolution=self.input_size,
                coordconv=False).unsqueeze(0)
            if self.transform is not None:
                img = self.transform(img)
                face = self.transform(face)

            # Deconv output
            if gaze_inside:
                gaze_x /= float(width)  # fractional gaze
                gaze_y /= float(height)
                gaze_heatmap = torch.zeros(
                    self.output_size,
                    self.output_size)  # set the size of the output
                gaze_map = imutils.draw_labelmap(
                    gaze_heatmap,
                    [gaze_x * self.output_size, gaze_y * self.output_size],
                    3,
                    type='Gaussian')
                gazes.append(torch.FloatTensor([gaze_x, gaze_y]))
            else:
                # out-of-frame target: empty heatmap and a (-1, -1) gaze
                gaze_map = torch.zeros(self.output_size, self.output_size)
                gazes.append(torch.FloatTensor([-1, -1]))
            faces.append(face)
            images.append(img)
            head_channels.append(head_channel)
            heatmaps.append(gaze_map)
            gaze_inouts.append(torch.FloatTensor([int(gaze_inside)]))

        if self.imshow:
            # Debug output: one overlay image per frame under ./debug
            for i, (frame_t, heatmap, channel) in enumerate(
                    zip(images, heatmaps, head_channels)):
                plt.figure(111)
                vis = 255 - imutils.unnorm(frame_t.numpy()) * 255
                vis = np.clip(vis, 0, 255)
                plt.imshow(np.transpose(vis, (1, 2, 0)))
                plt.imshow(imresize(heatmap,
                                    (self.input_size, self.input_size)),
                           cmap='jet',
                           alpha=0.3)
                plt.imshow(imresize(1 - channel.squeeze(0),
                                    (self.input_size, self.input_size)),
                           alpha=0.2)
                plt.savefig(
                    os.path.join('debug',
                                 'viz_%d_inout=%d.png' % (i, gaze_inouts[i])))
                plt.close('all')

        faces = torch.stack(faces)
        images = torch.stack(images)
        head_channels = torch.stack(head_channels)
        heatmaps = torch.stack(heatmaps)
        gazes = torch.stack(gazes)
        gaze_inouts = torch.stack(gaze_inouts)

        if self.test:
            return images, faces, head_channels, heatmaps, gazes, gaze_inouts
        else:  # train
            return images, faces, head_channels, heatmaps, gaze_inouts