Code Example #1: pytest tests for the YOLOv4 SceneSensor, covering instance detection, optional annotated-video export, and ROI feature shapes.
class TestYOLOv4SceneSensor(object):
    def setup_class(self):
        self.roi_feat_resolution = 5
        self.scene_sensor = SceneSensor(
            YOLOv4_MODEL,
            gpu=0,
            img_shape=[3, 416, 416],
            roi_feat_resolution=self.roi_feat_resolution,
            algorithm='yolov4')
        self.frames = clip_video_to_frames(R2_VIDEO, 0., None)

    def test_get_instances(self, export=True):
        instances_lst = self.scene_sensor.get_instances(self.frames)
        assert len(instances_lst) == len(self.frames)

        if export:
            h, w, fps = 720, 1280, 24.  # read from VIDEO
            video_writer = VideoWriter('data/scene_yolo4_demo.mp4', (w, h),
                                       fps)
            for frame, instances in zip(self.frames, instances_lst):
                bboxes = np.array([i['bbox'] for i in instances])
                labels = [i['category'] for i in instances]

                frame_draw = draw_bboxes(frame, bboxes, labels=labels)
                video_writer.add_frame(frame_draw)
            video_writer.close()

    def test_get_instances_with_feats(self):
        instances_lst, fm_lst = self.scene_sensor.get_instances_with_feats(
            self.frames, get_full_fm=True)
        _, h, w = instances_lst[0][0]['fm'].shape
        assert h == w == self.roi_feat_resolution
        assert len(instances_lst) == len(fm_lst) == len(self.frames)
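
For reference, the assertions above imply that get_instances returns one list per frame, where each instance is a dict with at least 'bbox' and 'category' keys ('fm' appears when features are requested). A minimal, hypothetical consumer of that structure:

# Sketch only: iterates the structure exercised by the tests above.
for frame_idx, instances in enumerate(instances_lst):
    for inst in instances:
        x1, y1, x2, y2 = inst['bbox']
        print(frame_idx, inst['category'], (x1, y1, x2, y2))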
Code Example #2: pytest tests for the YOLOv3 SceneSensor, covering instance detection, optional export, and whole-frame feature maps.
class TestYOLOv3SceneSensor(object):
    def setup_class(self):
        self.scene_sensor = SceneSensor(YOLOv3_MODEL,
                                        gpu=0,
                                        algorithm='yolov3')
        self.frames = clip_video_to_frames(VIDEO, 3001., 4000.)

    def test_get_instances(self, export=True):
        instances_lst = self.scene_sensor.get_instances(self.frames)
        assert len(instances_lst) == len(self.frames)

        if export:
            h, w, fps = 480, 640, 24.  # read from VIDEO
            video_writer = VideoWriter('data/scene_yolo_demo.mp4', (w, h), fps)
            for frame, instances in zip(self.frames, instances_lst):
                bboxes = np.array([i['bbox'] for i in instances])
                labels = [i['category'] for i in instances]

                frame_draw = draw_bboxes(frame, bboxes, labels=labels)
                video_writer.add_frame(frame_draw)
            video_writer.close()

    def test_get_feature_map(self):
        feature_maps = self.scene_sensor.get_feature_map(self.frames)
        assert len(feature_maps) == len(self.frames)
Code Example #3: building a standalone YOLOv4 detector program with the PaddlePaddle fluid API.
 def _build_detector_program(self):
     self.detector_prog = fluid.Program()
     self.detector_startup_prog = fluid.Program()
     with fluid.program_guard(self.detector_prog,
                              self.detector_startup_prog):
         yolov4_detector = SceneSensor.network(
             self.input_shape, 'yolov4',
             get_roi_feat=False,
             roi_feat_resolution=self.roi_feat_resolution)
         feed_list, fetch_list = yolov4_detector.build()
         self.detector_feeds = [i.name for i in feed_list]
         self.detector_fetch = fetch_list
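
The two fluid.Program objects built above still need an executor to run. A minimal sketch using the standard PaddlePaddle 1.x Executor API, assuming it runs as a method of the same class after _build_detector_program(); the dummy input and the single-feed assumption are placeholders, not from the source:

import numpy as np
import paddle.fluid as fluid

place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(self.detector_startup_prog)  # initialize parameters once

img = np.zeros([1, 3, 416, 416], dtype='float32')  # dummy input
outputs = exe.run(self.detector_prog,
                  feed={self.detector_feeds[0]: img},  # real feed list may hold more entries
                  fetch_list=self.detector_fetch)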
Code Example #4: a queue-driven detection worker that attaches instance features to incoming annotations.
    def worker_func(in_queue, out_queue, msg_queue, conf_dict):
        scene_sensor = SceneSensor(
            conf_dict['yolov4_model_dir'],
            gpu=conf_dict['gpu'],
            img_shape=[3, 416, 416],
            roi_feat_resolution=conf_dict['roi_feat_resolution'],
            algorithm='yolov4')

        while True:
            try:
                msg = msg_queue.get_nowait()
            except Empty:
                msg = ''

            if msg == 'stop':
                break

            try:
                anno = in_queue.get(timeout=5)
            except Empty:
                anno = None

            if anno is not None:
                if Enable_Time_Log:
                    t1 = time.time()

                if 'Cache' not in anno:
                    frames = [pickle.loads(i) for i in anno['Frames']]
                    anno['Instances'] = scene_sensor.get_instances_with_feats(
                        frames, get_full_fm=False)
                    del anno['Frames']  # to save memory!

                out_queue.put(anno)
                if Enable_Time_Log:
                    t2 = time.time()
                    print('Detector takes {:.3f}s'.format(t2 - t1))
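
A minimal sketch of launching this worker with multiprocessing; the queue and configuration names mirror the function signature, and the model path is a placeholder:

from multiprocessing import Process, Queue

in_queue, out_queue, msg_queue = Queue(), Queue(), Queue()
conf_dict = {'yolov4_model_dir': 'models/yolov4',  # placeholder
             'gpu': 0,
             'roi_feat_resolution': 5}

p = Process(target=worker_func,
            args=(in_queue, out_queue, msg_queue, conf_dict))
p.start()
# ... put annotation dicts on in_queue, read results from out_queue ...
msg_queue.put('stop')  # the worker checks this on every loop iteration
p.join()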
Code Example #5: a per-GPU worker that combines YOLOv4 detection, re-ID features, and multi-object tracking, and renders the result to video.
def run_worker(tasks, gpu_id, encoder_model, yolov4_model, output_dir,
               max_cosine_distance, resume):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)  # env values must be str
    from perception.tracker.re_id import create_box_encoder, \
        NearestNeighborDistanceMetric
    from perception.tracker.tracker import Tracker, Detection
    from perception.scene.eval import SceneSensor

    encoder = create_box_encoder(encoder_model, batch_size=8)
    metric = NearestNeighborDistanceMetric('cosine', max_cosine_distance, None)
    tracker = Tracker(metric)
    detector = SceneSensor(yolov4_model,
                           gpu=0,
                           img_shape=[3, 416, 416],
                           algorithm='yolov4')

    for video_file in tasks:
        task_id = os.path.basename(video_file)[:-len('.mp4')]
        if resume is not None:
            if resume != task_id:
                continue
            else:
                resume = None

        clip = VideoFileClip(video_file)
        track_video = os.path.join(output_dir, '{}_track.mp4'.format(task_id))
        video_writer = VideoWriter(track_video, (clip.w, clip.h), clip.fps)

        tracker_logs = []
        for frame in clip.iter_frames():
            # moviepy yields RGB frames; flip to BGR for OpenCV drawing
            frame = frame[:, :, ::-1]
            instances = detector.get_instances(frame)[0]
            boxes = [ins['bbox'] for ins in instances]
            features = encoder(frame, boxes)

            detections = [
                Detection(ins, feat) for ins, feat in zip(instances, features)
            ]

            tracker.predict()
            tracker.update(detections)

            track_log = dict()
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 1:
                    continue

                bbox = track.to_tlbr()
                track_log[str(track.track_id)] = bbox

                # NOTE: https://github.com/opencv/opencv/issues/14866
                # cv2 drawing functions reject non-contiguous views (the
                # [:, :, ::-1] slice above), so force a contiguous copy.
                frame = np.array(frame)
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), (255, 255, 255), 2)
                cv2.putText(frame, str(track.track_id),
                            (int(bbox[0]), int(bbox[1] + 23)), 0, 5e-3 * 100,
                            (0, 255, 0), 2)

            det_log = []
            for det in detections:
                if str(det.cls) != 'person':
                    continue

                bbox = det.to_tlbr()
                score = "%.2f" % round(det.confidence * 100, 2) + "%"
                det_log.append(bbox)

                # NOTE: https://github.com/opencv/opencv/issues/14866
                # Same contiguity fix as above (the track loop may not
                # have copied the frame if it had no confirmed tracks).
                frame = np.array(frame)

                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)
                cv2.putText(frame, score, (int(bbox[0]), int(bbox[3])), 0,
                            5e-3 * 100, (0, 255, 0), 2)

            tracker_logs.append((track_log, det_log))
            video_writer.add_frame(frame)

        video_writer.close()
        convert_to_h264(track_video)
        print('Saved {}'.format(track_video))

        tracker_logs_file = os.path.join(output_dir,
                                         '{}_states.pkl'.format(task_id))
        with open(tracker_logs_file, 'wb') as f:
            pickle.dump(tracker_logs, f)
        print('Saved {}'.format(tracker_logs_file))

        tracker.reset()
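
Since run_worker pins CUDA_VISIBLE_DEVICES itself, a natural driver shards the videos across one process per GPU. A hypothetical sketch; the model paths and GPU ids are placeholders:

from multiprocessing import Process
import glob

videos = sorted(glob.glob('videos/*.mp4'))
procs = []
for i, gpu_id in enumerate(['0', '1']):
    shard = videos[i::2]  # round-robin split across GPUs
    p = Process(target=run_worker,
                args=(shard, gpu_id, 'models/encoder', 'models/yolov4',
                      'output', 0.3, None))
    procs.append(p)
    p.start()
for p in procs:
    p.join()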
Code Example #6: building a salutation-classification dataset from tracked videos and YOLOv4 ROI features.
class SalutationClsDataset(object):
    def __init__(self, video_tracking_dir, anno_dir,
                 yolov4_model_dir, roi_feat_resolution=5, gpu=0):
        self.video_tracking_dir = video_tracking_dir
        self.anno_dir = anno_dir
        self.yolov4_model_dir = yolov4_model_dir
        self.roi_feat_resolution = roi_feat_resolution
        self.gpu = gpu

        self._collect_annotations()
        self._split_train_test_sets(test_percentage=0.2)

    def _collect_annotations(self):
        self.annos = []
        for anno_file in os.listdir(self.anno_dir):
            video_id = '_'.join(anno_file.split('_')[:2])
            print(video_id)

            with open(os.path.join(self.anno_dir, anno_file), 'r') as f:
                for line in f.readlines():
                    anno = json.loads(line)
                    anno['VideoID'] = video_id
                    if anno['Salutation'] != 'null':
                        self.annos.append(anno)

    def _split_train_test_sets(self, test_percentage=0.2):
        # Copied from XiaoduHiDataloaderv2
        videos = list(set(anno['VideoID'] for anno in self.annos))
        num_test = int(len(videos) * test_percentage)

        ids = np.arange(len(videos))
        np.random.shuffle(ids)
        videos = [videos[i] for i in ids]
        test_videos = set(videos[:num_test])

        self.test_annos, self.train_annos = [], []
        for anno in self.annos:
            if anno['VideoID'] in test_videos:
                self.test_annos.append(anno)
            else:
                self.train_annos.append(anno)

        ids = np.arange(len(self.train_annos))
        np.random.shuffle(ids)
        self.train_annos = [self.train_annos[i] for i in list(ids)]

    def _process_single_anno(self, idx, anno, txt, data_dir):
        if not hasattr(self, "scene_sensor"):
            self.scene_sensor = SceneSensor(
                self.yolov4_model_dir,
                gpu=self.gpu,
                img_shape=[3, 416, 416],
                roi_feat_resolution=self.roi_feat_resolution,
                algorithm='yolov4')

        # Read annos and data
        track_states_file = os.path.join(
            self.video_tracking_dir, '{}_states.pkl'.format(anno['VideoID']))
        with open(track_states_file, 'rb') as f:
            track_states = pickle.load(f)

        video_file = os.path.join(
            self.video_tracking_dir, '{}.mp4'.format(anno['VideoID']))
        frames = clip_video_to_frames(video_file, 0.0, None)

        # Extract frames
        related_frames, related_tracks = [], []
        for frame, (tracks, bboxes) in zip(frames, track_states):
            if anno['ID'] not in tracks:
                continue

            related_frames.append(frame)
            related_tracks.append(tracks[anno['ID']])

        instances_lst = self.scene_sensor.get_instances_with_feats(
            related_frames, get_full_fm=False)

        for frame, instances, track in zip(
                related_frames, instances_lst, related_tracks):
            _, inst_id = max_iou(track, instances, return_id=True)
            if inst_id == -1:
                warnings.warn(
                    'Cannot find corresponding instance for track in '
                    'anno: {}\n'.format(anno))
                continue

            x1, y1, x2, y2 = instances[inst_id]['bbox']
            cv2.imwrite(os.path.join(data_dir, '{}.jpg'.format(idx)),
                        frame[int(y1):int(y2), int(x1):int(x2)])
            np.save(os.path.join(data_dir, '{}.npy'.format(idx)),
                    instances[inst_id]['fm'])
            with open(txt, 'a') as f:
                # Hierarchical label scheme (inferred from the branches):
                # tree_mask flags the active classifier heads; cls0 is
                # gender, cls1 male age group, cls2 female age group.
                if anno['Salutation'] == 'man':
                    tree_mask, cls0, cls1, cls2 = '100', 0, -1, -1
                elif anno['Salutation'] == 'woman':
                    tree_mask, cls0, cls1, cls2 = '100', 1, -1, -1
                elif anno['Salutation'] == 'young_boy':
                    tree_mask, cls0, cls1, cls2 = '110', 0, 0, -1
                elif anno['Salutation'] == 'uncle':
                    tree_mask, cls0, cls1, cls2 = '110', 0, 1, -1
                elif anno['Salutation'] == 'young_girl':
                    tree_mask, cls0, cls1, cls2 = '101', 1, -1, 0
                elif anno['Salutation'] == 'aunt':
                    tree_mask, cls0, cls1, cls2 = '101', 1, -1, 1
                else:
                    continue  # unknown label: skip instead of raising NameError

                f.write('{} {} {} {} {}\n'.format(
                    idx, tree_mask, cls0, cls1, cls2))

            idx += 1

        return idx

    def build_dataset(self, output_dir):
        train_dir = os.path.join(output_dir, 'train')
        test_dir = os.path.join(output_dir, 'test')
        train_txt = os.path.join(output_dir, 'train.txt')
        test_txt = os.path.join(output_dir, 'test.txt')

        if not os.path.exists(train_dir):
            os.makedirs(train_dir)
        if not os.path.exists(test_dir):
            os.makedirs(test_dir)

        for txt, data_dir, annos in zip(
                [test_txt, train_txt], [test_dir, train_dir],
                [self.test_annos, self.train_annos]):
            print('Generating {}'.format(txt))
            idx = 0
            for anno in annos:
                idx = self._process_single_anno(idx, anno, txt, data_dir)
                print(idx)
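
Hypothetical end-to-end usage of the class above; all paths are placeholders:

dataset = SalutationClsDataset(
    video_tracking_dir='data/tracking',
    anno_dir='data/annos',
    yolov4_model_dir='models/yolov4',
    roi_feat_resolution=5,
    gpu=0)
dataset.build_dataset('data/salutation_cls')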
Code Example #7: a data-loading worker that optionally augments clips and converts detections into training examples.
    def worker_func(yolov4_model_dir, video_dir, anno_lst, data_queue,
                    msg_queue, conf_dict):
        video_aug = VideoAugmentorV2()
        scene_sensor = SceneSensor(
            yolov4_model_dir,
            gpu=conf_dict['gpu'],
            img_shape=[3, 416, 416],
            roi_feat_resolution=conf_dict['roi_feat_resolution'],
            algorithm='yolov4')

        def _process_neg_frames(anno):
            if check_passive_interaction(anno['Path']):
                # Ignore examples in which someone is interacting with the robot
                return

            try:
                frames = read_all_frames(anno['Path'])
            except Exception:
                warnings.warn('OpenCV IO error. Reading {}'.format(
                    anno['Path']))
                return

            frames = sample_frames(frames, conf_dict['ob_window_len'])
            h, w, _ = frames[0].shape
            # Resize to the canonical resolution for this aspect ratio
            if h / w == 480 / 640:
                frames = [cv2.resize(i, (640, 480)) for i in frames]
            elif h / w == 720 / 1280:
                frames = [cv2.resize(i, (1280, 720)) for i in frames]

            instances_lst = scene_sensor.get_instances_with_feats(
                frames, get_full_fm=False)
            success, data = convert_instances_lst_to_data(
                instances_lst, conf_dict['tokens_per_frame'],
                {}, [], anno['WAE_id'],
                conf_dict['inst_crop_shape'], conf_dict['inst_fm_shape'],
                conf_dict['inst_pos_dim'], conf_dict['inst_cls_dim'],
                conf_dict['visual_token_dim'])
            if success:
                data_queue.put(data)
            else:
                warnings.warn(
                    'Failed to process annotation: {}\n'.format(anno))

        def _process_single_anno(anno):
            if anno['VideoType'] == 'neg_frames':
                _process_neg_frames(anno)
                return

            te = timestamp_to_ms(anno['Time'])
            ts = te - conf_dict['ob_window_len'] * conf_dict['interval']
            frames_dir = os.path.join(video_dir, anno['VideoID'])
            if conf_dict['use_frames_first'] and os.path.isdir(frames_dir):
                # Read images
                try:
                    frames = read_frames_dir(frames_dir, max(0.0, ts), te)
                    frames = sample_frames(frames, conf_dict['ob_window_len'])
                    ctx_frames = read_frames_dir(frames_dir, 0.0, te)
                except Exception:
                    warnings.warn('OpenCV IO error. Reading {}'.format(
                        frames_dir))
                    return

                h, w, _ = frames[0].shape
                if h / w == 480 / 640:
                    frames = [cv2.resize(i, (640, 480)) for i in frames]
                elif h / w == 720 / 1280:
                    frames = [cv2.resize(i, (1280, 720)) for i in frames]
            else:
                # Read video
                video_file = os.path.join(
                    video_dir, '{}.mp4'.format(anno['VideoID']))
                frames = clip_video_to_frames(video_file, max(0.0, ts), te)
                frames = sample_frames(frames, conf_dict['ob_window_len'])
                ctx_frames = clip_video_to_frames(video_file, 0.0, te)

            track_states_file = os.path.join(
                video_dir, '{}_states.pkl'.format(anno['VideoID']))
            with open(track_states_file, 'rb') as f:
                track_states = pickle.load(f)
            last_frame_tracks = track_states[len(ctx_frames)-1][0]
            obj_ids = anno['ID'].split(',') if anno['ID'] != '' else []
            check_passed = True
            for idx in obj_ids:
                check_passed = check_passed and idx in last_frame_tracks
            if not check_passed:
                warnings.warn(
                    'Failed to process annotation: {}\n'.format(anno))
                return

            if conf_dict['augment']:
                # Retry augmentation until the detector still finds every
                # tracked object in the augmented last frame (IoU > 0.5).
                while True:
                    aug_frames = video_aug(frames)
                    instances = scene_sensor.get_instances(aug_frames[-1:])[0]
                    iou_lst = [max_iou(last_frame_tracks[idx], instances)
                               for idx in obj_ids]
                    if len(iou_lst) == 0 or min(iou_lst) > 0.5:
                        break
                frames = aug_frames

            instances_lst = scene_sensor.get_instances_with_feats(
                frames, get_full_fm=False)
            success, data = convert_instances_lst_to_data(
                instances_lst, conf_dict['tokens_per_frame'],
                last_frame_tracks, obj_ids, anno['WAE_id'],
                conf_dict['inst_crop_shape'], conf_dict['inst_fm_shape'],
                conf_dict['inst_pos_dim'], conf_dict['inst_cls_dim'],
                conf_dict['visual_token_dim'])
            if success:
                data_queue.put(data)
            else:
                warnings.warn(
                    'Failed to process annotation: {}\n'.format(anno))

        while True:
            msg = msg_queue.get()

            if msg == 'stop':
                break
            elif msg == 'new_epoch':
                for anno in anno_lst:
                    if conf_dict['read_cache'] and 'VideoID' in anno:
                        cache_file = '{}_{}_cache.pkl'.format(
                            anno['VideoID'], stable_anno_hash(anno))
                        cache_file = os.path.join(video_dir, cache_file)
                        if os.path.exists(cache_file):
                            with open(cache_file, 'rb') as f:
                                data = pickle.load(f)
                                data_queue.put(data)
                        else:
                            _process_single_anno(anno)
                    else:
                        _process_single_anno(anno)
            elif len(msg) == 2 and msg[0] == 'update':
                anno_lst = msg[1]
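
The message loop above implies a small control protocol. A sketch of driving it from the parent process (the queue and list names are assumptions):

msg_queue.put('new_epoch')               # process the current anno_lst once
msg_queue.put(('update', new_anno_lst))  # swap in a new annotation list
msg_queue.put('stop')                    # exit the worker loop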