def main():
    args = build_argparser().parse_args()

    log.info('Initializing Inference Engine...')
    ie = IECore()

    config_user_specified, config_min_latency = get_plugin_configs(
        args.device, args.num_streams, args.num_threads)

    labels_map = None
    if args.labels:
        with open(args.labels, 'r') as f:
            labels_map = [x.strip() for x in f]

    log.info('Loading network...')
    completed_request_results = {}
    modes = cycle(Modes)
    prev_mode = mode = next(modes)
    log.info('Using {} mode'.format(mode.name))
    mode_info = {mode: ModeInfo()}
    exceptions = []

    detectors = {
        Modes.USER_SPECIFIED:
        Detector(ie,
                 args.model,
                 device=args.device,
                 plugin_config=config_user_specified,
                 results=completed_request_results,
                 max_num_requests=args.num_infer_requests,
                 labels_map=labels_map,
                 keep_aspect_ratio_resize=args.keep_aspect_ratio,
                 caught_exceptions=exceptions),
        Modes.MIN_LATENCY:
        Detector(ie,
                 args.model,
                 device=args.device.split(':')[-1].split(',')[0],
                 plugin_config=config_min_latency,
                 results=completed_request_results,
                 max_num_requests=1,
                 labels_map=labels_map,
                 keep_aspect_ratio_resize=args.keep_aspect_ratio,
                 caught_exceptions=exceptions)
    }

    try:
        input_stream = int(args.input)
    except ValueError:
        input_stream = args.input
    cap = cv2.VideoCapture(input_stream)
    wait_key_time = 1

    next_frame_id = 0
    next_frame_id_to_show = 0
    input_repeats = 0

    log.info('Starting inference...')
    print(
        "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
    )
    print(
        "To switch between min_latency/user_specified modes, press TAB key in the output window"
    )

    palette = ColorPalette(len(labels_map) if labels_map is not None else 100)
    presenter = monitors.Presenter(
        args.utilization_monitors, 55,
        (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH) / 4),
         round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) / 8)))

    while (cap.isOpened() \
           or completed_request_results \
           or len(detectors[mode].empty_requests) < len(detectors[mode].requests)) \
          and not exceptions:
        if next_frame_id_to_show in completed_request_results:
            frame_meta, raw_outputs = completed_request_results.pop(
                next_frame_id_to_show)
            objects = detectors[mode].postprocess(raw_outputs, frame_meta)

            frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            if len(objects) and args.raw_output_message:
                log.info(' Class ID | Confidence | XMIN | YMIN | XMAX | YMAX ')

            origin_im_size = frame.shape[:-1]
            presenter.drawGraphs(frame)
            for obj in objects:
                if obj.score > args.prob_threshold:
                    xmin = max(int(obj.xmin), 0)
                    ymin = max(int(obj.ymin), 0)
                    xmax = min(int(obj.xmax), origin_im_size[1])
                    ymax = min(int(obj.ymax), origin_im_size[0])
                    class_id = int(obj.class_id)
                    color = palette[class_id]
                    det_label = (labels_map[class_id]
                                 if labels_map and len(labels_map) > class_id
                                 else str(class_id))

                    if args.raw_output_message:
                        log.info('{:^9} | {:10f} | {:4} | {:4} | {:4} | {:4} '.
                                 format(det_label, obj.score, xmin, ymin, xmax,
                                        ymax))

                    cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color, 2)
                    cv2.putText(frame,
                                '#{} {:.1%}'.format(det_label, obj.score),
                                (xmin, ymin - 7), cv2.FONT_HERSHEY_COMPLEX,
                                0.6, color, 1)

            mode_message = '{} mode'.format(mode.name)
            put_highlighted_text(frame, mode_message,
                                 (10, int(origin_im_size[0] - 20)),
                                 cv2.FONT_HERSHEY_COMPLEX, 0.75, (10, 10, 200),
                                 2)

            next_frame_id_to_show += 1
            if prev_mode == mode:
                mode_info[mode].frames_count += 1
            elif len(completed_request_results) == 0:
                mode_info[prev_mode].last_end_time = perf_counter()
                prev_mode = mode

            # Frames count is always zero if mode has just been switched (i.e. prev_mode != mode).
            if mode_info[mode].frames_count != 0:
                fps_message = 'FPS: {:.1f}'.format(mode_info[mode].frames_count / \
                                                   (perf_counter() - mode_info[mode].last_start_time))
                mode_info[mode].latency_sum += perf_counter() - start_time
                latency_message = 'Latency: {:.1f} ms'.format((mode_info[mode].latency_sum / \
                                                              mode_info[mode].frames_count) * 1e3)
                # Draw performance stats over frame.
                put_highlighted_text(frame, fps_message, (15, 20),
                                     cv2.FONT_HERSHEY_COMPLEX, 0.75,
                                     (200, 10, 10), 2)
                put_highlighted_text(frame, latency_message, (15, 50),
                                     cv2.FONT_HERSHEY_COMPLEX, 0.75,
                                     (200, 10, 10), 2)

            if not args.no_show:
                cv2.imshow('Detection Results', frame)
                key = cv2.waitKey(wait_key_time)

                ESC_KEY = 27
                TAB_KEY = 9
                # Quit.
                if key in {ord('q'), ord('Q'), ESC_KEY}:
                    break
                # Switch mode.
                # Disable mode switch if the previous switch has not been finished yet.
                if key == TAB_KEY and mode_info[mode].frames_count > 0:
                    mode = next(modes)
                    detectors[prev_mode].await_all()
                    mode_info[prev_mode].last_end_time = perf_counter()
                    mode_info[mode] = ModeInfo()
                    log.info('Using {} mode'.format(mode.name))
                else:
                    presenter.handleKey(key)

        elif detectors[mode].empty_requests and cap.isOpened():
            start_time = perf_counter()
            ret, frame = cap.read()
            if not ret:
                if input_repeats < args.loop or args.loop < 0:
                    cap.open(input_stream)
                    input_repeats += 1
                else:
                    cap.release()
                continue

            detectors[mode](frame, next_frame_id, {
                'frame': frame,
                'start_time': start_time
            })
            next_frame_id += 1

        else:
            detectors[mode].await_any()

    if exceptions:
        raise exceptions[0]

    for exec_net in detectors.values():
        exec_net.await_all()

    for mode_value, mode_stats in mode_info.items():
        log.info('')
        log.info('Mode: {}'.format(mode_value.name))

        # Skip modes that never processed a frame to avoid division by zero.
        if mode_stats.frames_count == 0:
            continue
        end_time = mode_stats.last_end_time if mode_stats.last_end_time is not None \
                                            else perf_counter()
        log.info('FPS: {:.1f}'.format(mode_stats.frames_count / \
                                      (end_time - mode_stats.last_start_time)))
        log.info('Latency: {:.1f} ms'.format((mode_stats.latency_sum / \
                                              mode_stats.frames_count) * 1e3))
    print(presenter.reportMeans())
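# Object detection demo built on the Model API: DetectionModel.create_model() with an
# OpenVINO or OVMS adapter, driven by an AsyncPipeline.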
def main():
    args = build_argparser().parse_args()
    if args.architecture_type != 'yolov4' and args.anchors:
        log.warning(
            'The "--anchors" option works only for "-at==yolov4". The option will be ignored.'
        )
    if args.architecture_type != 'yolov4' and args.masks:
        log.warning(
            'The "--masks" option works only for "-at==yolov4". The option will be ignored.'
        )
    if args.architecture_type not in ['nanodet', 'nanodet-plus'] and args.num_classes:
        log.warning(
            'The "--num_classes" option works only for "-at==nanodet" and "-at==nanodet-plus". The option will be ignored.'
        )

    cap = open_images_capture(args.input, args.loop)

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams,
                                        args.num_threads)
        model_adapter = OpenvinoAdapter(
            create_core(),
            args.model,
            device=args.device,
            plugin_config=plugin_config,
            max_num_requests=args.num_infer_requests,
            model_parameters={'input_layouts': args.layout})
    elif args.adapter == 'ovms':
        model_adapter = OVMSAdapter(args.model)

    configuration = {
        'resize_type': args.resize_type,
        'mean_values': args.mean_values,
        'scale_values': args.scale_values,
        'reverse_input_channels': args.reverse_input_channels,
        'path_to_labels': args.labels,
        'confidence_threshold': args.prob_threshold,
        'input_size': args.input_size,  # The CTPN specific
        'num_classes':
        args.num_classes,  # The NanoDet and NanoDetPlus specific
    }
    model = DetectionModel.create_model(args.architecture_type, model_adapter,
                                        configuration)
    model.log_layers_info()

    detector_pipeline = AsyncPipeline(model)

    next_frame_id = 0
    next_frame_id_to_show = 0

    palette = ColorPalette(len(model.labels) if model.labels else 100)
    metrics = PerformanceMetrics()
    render_metrics = PerformanceMetrics()
    presenter = None
    output_transform = None
    video_writer = cv2.VideoWriter()

    while True:
        if detector_pipeline.callback_exceptions:
            raise detector_pipeline.callback_exceptions[0]
        # Process all completed requests
        results = detector_pipeline.get_result(next_frame_id_to_show)
        if results:
            objects, frame_meta = results
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            if len(objects) and args.raw_output_message:
                print_raw_results(objects, model.labels, next_frame_id_to_show)

            presenter.drawGraphs(frame)
            rendering_start_time = perf_counter()
            frame = draw_detections(frame, objects, palette, model.labels,
                                    output_transform)
            render_metrics.update(rendering_start_time)
            metrics.update(start_time, frame)

            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(frame)
            next_frame_id_to_show += 1

            if not args.no_show:
                cv2.imshow('Detection Results', frame)
                key = cv2.waitKey(1)

                ESC_KEY = 27
                # Quit.
                if key in {ord('q'), ord('Q'), ESC_KEY}:
                    break
                presenter.handleKey(key)
            continue

        if detector_pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            if frame is None:
                if next_frame_id == 0:
                    raise ValueError("Can't read an image from the input")
                break
            if next_frame_id == 0:
                output_transform = OutputTransform(frame.shape[:2],
                                                   args.output_resolution)
                if args.output_resolution:
                    output_resolution = output_transform.new_resolution
                else:
                    output_resolution = (frame.shape[1], frame.shape[0])
                presenter = monitors.Presenter(
                    args.utilization_monitors, 55,
                    (round(output_resolution[0] / 4),
                     round(output_resolution[1] / 8)))
                if args.output and not video_writer.open(
                        args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                        cap.fps(), output_resolution):
                    raise RuntimeError("Can't open video writer")
            # Submit for inference
            detector_pipeline.submit_data(frame, next_frame_id, {
                'frame': frame,
                'start_time': start_time
            })
            next_frame_id += 1
        else:
            # Wait for empty request
            detector_pipeline.await_any()

    detector_pipeline.await_all()
    if detector_pipeline.callback_exceptions:
        raise detector_pipeline.callback_exceptions[0]
    # Process completed requests
    for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id):
        results = detector_pipeline.get_result(next_frame_id_to_show)
        objects, frame_meta = results
        frame = frame_meta['frame']
        start_time = frame_meta['start_time']

        if len(objects) and args.raw_output_message:
            print_raw_results(objects, model.labels, next_frame_id_to_show)

        presenter.drawGraphs(frame)
        rendering_start_time = perf_counter()
        frame = draw_detections(frame, objects, palette, model.labels,
                                output_transform)
        render_metrics.update(rendering_start_time)
        metrics.update(start_time, frame)

        if video_writer.isOpened() and (
                args.output_limit <= 0
                or next_frame_id_to_show <= args.output_limit - 1):
            video_writer.write(frame)

        if not args.no_show:
            cv2.imshow('Detection Results', frame)
            key = cv2.waitKey(1)

            ESC_KEY = 27
            # Quit.
            if key in {ord('q'), ord('Q'), ESC_KEY}:
                break
            presenter.handleKey(key)

    metrics.log_total()
    log_latency_per_stage(cap.reader_metrics.get_latency(),
                          detector_pipeline.preprocess_metrics.get_latency(),
                          detector_pipeline.inference_metrics.get_latency(),
                          detector_pipeline.postprocess_metrics.get_latency(),
                          render_metrics.get_latency())
    for rep in presenter.reportMeans():
        log.info(rep)
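# Background subtraction demo: segments people (the 'person' class) and replaces or blurs
# everything behind them, optionally compositing a target background.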
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    target_bgr = open_images_capture(args.target_bgr, loop=True) if args.target_bgr else None

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams, args.num_threads)
        model_adapter = OpenvinoAdapter(create_core(), args.model, device=args.device, plugin_config=plugin_config,
                                        max_num_requests=args.num_infer_requests, model_parameters = {'input_layouts': args.layout})
    elif args.adapter == 'ovms':
        model_adapter = OVMSAdapter(args.model)

    labels = ['__background__', 'person'] if args.labels is None else load_labels(args.labels)
    assert len(labels), 'The file with class labels is empty'

    configuration = {
        'confidence_threshold': args.prob_threshold,
        'resize_type': args.resize_type
    }

    model, need_bgr_input = get_model(model_adapter, configuration, args)

    input_bgr = open_images_capture(args.background, False).read() if need_bgr_input else None

    person_id = -1
    for i, label in enumerate(labels):
        if label == 'person':
            person_id = i
            break
    assert person_id >= 0, 'The "person" class was not found in the labels list.'

    model.log_layers_info()

    pipeline = AsyncPipeline(model)

    next_frame_id = 0
    next_frame_id_to_show = 0

    metrics = PerformanceMetrics()
    render_metrics = PerformanceMetrics()
    presenter = None
    output_transform = None
    video_writer = cv2.VideoWriter()
    while True:
        if pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            bgr = target_bgr.read() if target_bgr is not None else None
            if frame is None:
                if next_frame_id == 0:
                    raise ValueError("Can't read an image from the input")
                break
            if next_frame_id == 0:
                output_transform = OutputTransform(frame.shape[:2], args.output_resolution)
                if args.output_resolution:
                    output_resolution = output_transform.new_resolution
                else:
                    output_resolution = (frame.shape[1], frame.shape[0])
                presenter = monitors.Presenter(args.utilization_monitors, 55,
                                               (round(output_resolution[0] / 4), round(output_resolution[1] / 8)))
                if args.output and not video_writer.open(args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                                                         cap.fps(), tuple(output_resolution)):
                    raise RuntimeError("Can't open video writer")
            # Submit for inference
            data = {'src': frame, 'bgr': input_bgr} if input_bgr is not None else frame
            pipeline.submit_data(data, next_frame_id, {'frame': frame, 'start_time': start_time})
            next_frame_id += 1
        else:
            # Wait for empty request
            pipeline.await_any()

        if pipeline.callback_exceptions:
            raise pipeline.callback_exceptions[0]
        # Process all completed requests
        results = pipeline.get_result(next_frame_id_to_show)
        if results:
            objects, frame_meta = results
            if args.raw_output_message:
                print_raw_results(objects, next_frame_id_to_show)
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']
            rendering_start_time = perf_counter()
            frame = render_results(frame, objects, output_resolution, bgr, person_id,
                                   args.blur_bgr, args.show_with_original_frame)
            render_metrics.update(rendering_start_time)
            presenter.drawGraphs(frame)
            metrics.update(start_time, frame)

            if video_writer.isOpened() and (args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit-1):
                video_writer.write(frame)
            next_frame_id_to_show += 1

            if not args.no_show:
                cv2.imshow('Background subtraction results', frame)
                key = cv2.waitKey(1)
                if key in {ord('q'), ord('Q'), 27}:  # 27 is the ESC key
                    break
                presenter.handleKey(key)

    pipeline.await_all()
    if pipeline.callback_exceptions:
        raise pipeline.callback_exceptions[0]
    # Process completed requests
    for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id):
        results = pipeline.get_result(next_frame_id_to_show)
        objects, frame_meta = results
        if args.raw_output_message:
            print_raw_results(objects, next_frame_id_to_show)
        frame = frame_meta['frame']
        start_time = frame_meta['start_time']

        rendering_start_time = perf_counter()
        frame = render_results(frame, objects, output_resolution, bgr, person_id,
                               args.blur_bgr, args.show_with_original_frame)
        render_metrics.update(rendering_start_time)
        presenter.drawGraphs(frame)
        metrics.update(start_time, frame)

        if video_writer.isOpened() and (args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit-1):
            video_writer.write(frame)

        if not args.no_show:
            cv2.imshow('Background subtraction results', frame)
            cv2.waitKey(1)

    metrics.log_total()
    log_latency_per_stage(cap.reader_metrics.get_latency(),
                          pipeline.preprocess_metrics.get_latency(),
                          pipeline.inference_metrics.get_latency(),
                          pipeline.postprocess_metrics.get_latency(),
                          render_metrics.get_latency())
    for rep in presenter.reportMeans():
        log.info(rep)
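# Image deblurring demo: reshapes the model to the input resolution and shows the original
# and deblurred frames side by side.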
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)
    next_frame_id = 1
    next_frame_id_to_show = 0

    metrics = PerformanceMetrics()
    render_metrics = PerformanceMetrics()
    video_writer = cv2.VideoWriter()

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams,
                                        args.num_threads)
        model_adapter = OpenvinoAdapter(
            create_core(),
            args.model,
            device=args.device,
            plugin_config=plugin_config,
            max_num_requests=args.num_infer_requests,
            model_parameters={'input_layouts': args.layout})
    elif args.adapter == 'ovms':
        model_adapter = OVMSAdapter(args.model)

    start_time = perf_counter()
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")

    model = Deblurring(model_adapter, preload=False)
    model.reshape(frame.shape)
    model.log_layers_info()

    pipeline = AsyncPipeline(model)

    pipeline.submit_data(frame, 0, {'frame': frame, 'start_time': start_time})

    presenter = monitors.Presenter(
        args.utilization_monitors, 55,
        (round(frame.shape[1] / 4), round(frame.shape[0] / 8)))
    if args.output and not video_writer.open(
            args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
        (2 * frame.shape[1], frame.shape[0])):
        raise RuntimeError("Can't open video writer")

    while True:
        if pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            if frame is None:
                break

            # Submit for inference
            pipeline.submit_data(frame, next_frame_id, {
                'frame': frame,
                'start_time': start_time
            })
            next_frame_id += 1
        else:
            # Wait for empty request
            pipeline.await_any()

        if pipeline.callback_exceptions:
            raise pipeline.callback_exceptions[0]
        # Process all completed requests
        results = pipeline.get_result(next_frame_id_to_show)
        if results:
            result_frame, frame_meta = results
            input_frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            rendering_start_time = perf_counter()
            if input_frame.shape != result_frame.shape:
                input_frame = cv2.resize(
                    input_frame,
                    (result_frame.shape[1], result_frame.shape[0]))
            final_image = cv2.hconcat([input_frame, result_frame])
            render_metrics.update(rendering_start_time)

            presenter.drawGraphs(final_image)
            metrics.update(start_time, final_image)

            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(final_image)
            next_frame_id_to_show += 1

            if not args.no_show:
                cv2.imshow('Deblurring Results', final_image)
                key = cv2.waitKey(1)
                if key in {ord('q'), ord('Q'), 27}:  # 27 is the ESC key
                    break
                presenter.handleKey(key)

    pipeline.await_all()
    if pipeline.callback_exceptions:
        raise pipeline.callback_exceptions[0]
    # Process completed requests
    for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id):
        results = pipeline.get_result(next_frame_id_to_show)
        result_frame, frame_meta = results
        input_frame = frame_meta['frame']
        start_time = frame_meta['start_time']

        rendering_start_time = perf_counter()
        if input_frame.shape != result_frame.shape:
            input_frame = cv2.resize(
                input_frame, (result_frame.shape[1], result_frame.shape[0]))
        final_image = cv2.hconcat([input_frame, result_frame])
        render_metrics.update(rendering_start_time)

        presenter.drawGraphs(final_image)
        metrics.update(start_time, final_image)

        if video_writer.isOpened() and (
                args.output_limit <= 0
                or next_frame_id_to_show <= args.output_limit - 1):
            video_writer.write(final_image)

        if not args.no_show:
            cv2.imshow('Deblurring Results', final_image)
            key = cv2.waitKey(1)

    metrics.log_total()
    log_latency_per_stage(cap.reader_metrics.get_latency(),
                          pipeline.preprocess_metrics.get_latency(),
                          pipeline.inference_metrics.get_latency(),
                          pipeline.postprocess_metrics.get_latency(),
                          render_metrics.get_latency())
    for rep in presenter.reportMeans():
        log.info(rep)
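# Whiteboard inpainting demo: masks the presenter with an instance or semantic segmentation
# model so the board content stays visible; a region of the board can be cropped with the mouse.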
def main():
    parser = argparse.ArgumentParser(description='Whiteboard inpainting demo')
    parser.add_argument('-i', '--input', required=True,
                         help='Required. Path to a video file or a device node of a web-camera.')
    parser.add_argument('--loop', default=False, action='store_true',
                        help='Optional. Enable reading the input in a loop.')
    parser.add_argument('-o', '--output', required=False,
                        help='Optional. Name of the output file(s) to save.')
    parser.add_argument('-limit', '--output_limit', required=False, default=1000, type=int,
                        help='Optional. Number of frames to store in output. '
                             'If 0 is set, all frames are stored.')
    parser.add_argument('-m_i', '--m_instance_segmentation', type=str, required=False,
                        help='Path to the instance segmentation model. Exactly one of '
                             '--m_instance_segmentation or --m_semantic_segmentation must be set.')
    parser.add_argument('-m_s', '--m_semantic_segmentation', type=str, required=False,
                        help='Path to the semantic segmentation model. Exactly one of '
                             '--m_instance_segmentation or --m_semantic_segmentation must be set.')
    parser.add_argument('-t', '--threshold', type=float, default=0.6,
                        help='Optional. Threshold for person instance segmentation model.')
    parser.add_argument('--no_show', help="Optional. Don't show output.", action='store_true')
    parser.add_argument('-d', '--device', type=str, default='CPU',
                        help='Optional. Specify a target device to infer on. CPU, GPU, HDDL or MYRIAD is '
                             'acceptable. The demo will look for a suitable plugin for the device specified.')
    parser.add_argument('-l', '--cpu_extension', type=str, default=None,
                        help='MKLDNN (CPU)-targeted custom layers. Absolute '
                             'path to a shared library with the kernels impl.')
    parser.add_argument('-u', '--utilization_monitors', default='', type=str,
                        help='Optional. List of monitors to show initially.')
    args = parser.parse_args()

    cap = open_images_capture(args.input, args.loop)
    if cap.get_type() not in ('VIDEO', 'CAMERA'):
        raise RuntimeError("The input should be a video file or a numeric camera ID")

    if bool(args.m_instance_segmentation) == bool(args.m_semantic_segmentation):
        raise ValueError('Set up exactly one of segmentation models: '
                         '--m_instance_segmentation or --m_semantic_segmentation')

    labels_dir = Path(__file__).resolve().parents[3] / 'data/dataset_classes'
    mouse = MouseClick()
    if not args.no_show:
        cv2.namedWindow(WINNAME)
        cv2.setMouseCallback(WINNAME, mouse.get_points)

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    model_path = args.m_instance_segmentation if args.m_instance_segmentation else args.m_semantic_segmentation
    log.info('Reading model {}'.format(model_path))
    if args.m_instance_segmentation:
        labels_file = str(labels_dir / 'coco_80cl_bkgr.txt')
        segmentation = MaskRCNN(core, args.m_instance_segmentation, labels_file,
                                args.threshold, args.device, args.cpu_extension)
    elif args.m_semantic_segmentation:
        labels_file = str(labels_dir / 'cityscapes_19cl_bkgr.txt')
        segmentation = SemanticSegmentation(core, args.m_semantic_segmentation, labels_file,
                                            args.threshold, args.device, args.cpu_extension)
    log.info('The model {} is loaded to {}'.format(model_path, args.device))

    metrics = PerformanceMetrics()
    video_writer = cv2.VideoWriter()
    black_board = False
    frame_number = 0
    key = -1

    start_time = perf_counter()
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")

    out_frame_size = (frame.shape[1], frame.shape[0] * 2)
    output_frame = np.full((frame.shape[0], frame.shape[1], 3), 255, dtype='uint8')
    presenter = monitors.Presenter(args.utilization_monitors, 20,
                                   (out_frame_size[0] // 4, out_frame_size[1] // 16))
    if args.output and not video_writer.open(args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                                             cap.fps(), out_frame_size):
        raise RuntimeError("Can't open video writer")

    while frame is not None:
        mask = None
        detections = segmentation.get_detections([frame])
        expand_mask(detections, frame.shape[1] // 27)
        if len(detections[0]) > 0:
            mask = detections[0][0][2]
            for i in range(1, len(detections[0])):
                mask = cv2.bitwise_or(mask, detections[0][i][2])

        if mask is not None:
            mask = np.stack([mask, mask, mask], axis=-1)
        else:
            mask = np.zeros(frame.shape, dtype='uint8')

        clear_frame = remove_background(frame, invert_colors=not black_board)

        output_frame = np.where(mask, output_frame, clear_frame)
        merged_frame = np.vstack([frame, output_frame])
        merged_frame = cv2.resize(merged_frame, out_frame_size)

        metrics.update(start_time, merged_frame)

        if video_writer.isOpened() and (args.output_limit <= 0 or frame_number <= args.output_limit-1):
            video_writer.write(merged_frame)

        presenter.drawGraphs(merged_frame)
        if not args.no_show:
            cv2.imshow(WINNAME, merged_frame)
            key = check_pressed_keys(key)
            if key == 27:  # 'Esc'
                break
            if key == ord('i'):  # catch pressing of key 'i'
                black_board = not black_board
                output_frame = 255 - output_frame
            else:
                presenter.handleKey(key)

        if mouse.crop_available:
            x0, x1 = min(mouse.points[0][0], mouse.points[1][0]), \
                     max(mouse.points[0][0], mouse.points[1][0])
            y0, y1 = min(mouse.points[0][1], mouse.points[1][1]), \
                     max(mouse.points[0][1], mouse.points[1][1])
            x1, y1 = min(x1, output_frame.shape[1] - 1), min(y1, output_frame.shape[0] - 1)
            board = output_frame[y0: y1, x0: x1, :]
            if board.shape[0] > 0 and board.shape[1] > 0:
                cv2.namedWindow('Board', cv2.WINDOW_KEEPRATIO)
                cv2.imshow('Board', board)

        frame_number += 1
        start_time = perf_counter()
        frame = cap.read()

    metrics.log_total()
    for rep in presenter.reportMeans():
        log.info(rep)
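# Instance segmentation demo on the OpenVINO runtime API with manual pre/post-processing
# and optional mask tracking (StaticIOUTracker).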
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    with open(args.labels, 'rt') as labels_file:
        class_labels = labels_file.read().splitlines()
        assert len(class_labels), 'The file with class labels is empty'

    # Plugin initialization for specified device and load extensions library if specified.
    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()
    if args.cpu_extension and 'CPU' in args.device:
        core.add_extension(args.cpu_extension, 'CPU')

    # Read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)
    image_input, image_info_input, (n, c, h, w), model_type, \
        output_names, postprocessor = check_model(model)
    args.no_keep_aspect_ratio = model_type == 'yolact' or args.no_keep_aspect_ratio

    compiled_model = core.compile_model(model, args.device)
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.no_track:
        tracker = None
    else:
        tracker = StaticIOUTracker()

    if args.delay:
        delay = args.delay
    else:
        delay = int(cap.get_type() in ('VIDEO', 'CAMERA'))

    frames_processed = 0
    metrics = PerformanceMetrics()
    visualizer = Visualizer(class_labels,
                            show_boxes=args.show_boxes,
                            show_scores=args.show_scores)
    video_writer = cv2.VideoWriter()

    start_time = perf_counter()
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")

    out_frame_size = (frame.shape[1], frame.shape[0])
    presenter = monitors.Presenter(
        args.utilization_monitors, 45,
        (round(out_frame_size[0] / 4), round(out_frame_size[1] / 8)))
    if args.output and not video_writer.open(args.output,
                                             cv2.VideoWriter_fourcc(*'MJPG'),
                                             cap.fps(), out_frame_size):
        raise RuntimeError("Can't open video writer")

    while frame is not None:
        if args.no_keep_aspect_ratio:
            # Resize the image to a target size.
            scale_x = w / frame.shape[1]
            scale_y = h / frame.shape[0]
            input_image = cv2.resize(frame, (w, h))
        else:
            # Resize the image to keep the same aspect ratio and to fit it to a window of a target size.
            scale_x = scale_y = min(h / frame.shape[0], w / frame.shape[1])
            input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y)

        input_image_size = input_image.shape[:2]
        input_image = np.pad(input_image,
                             ((0, h - input_image_size[0]),
                              (0, w - input_image_size[1]), (0, 0)),
                             mode='constant',
                             constant_values=0)
        # Change data layout from HWC to CHW.
        input_image = input_image.transpose((2, 0, 1))
        input_image = input_image.reshape((n, c, h, w)).astype(np.float32)
        input_image_info = np.asarray(
            [[input_image_size[0], input_image_size[1], 1]], dtype=np.float32)

        # Run the model.
        feed_dict = {image_input: input_image}
        if image_info_input:
            feed_dict[image_info_input] = input_image_info

        infer_request.infer(feed_dict)
        outputs = {
            name: infer_request.get_tensor(name).data[:]
            for name in output_names
        }

        # Parse detection results of the current request
        scores, classes, boxes, masks = postprocessor(outputs, scale_x,
                                                      scale_y,
                                                      *frame.shape[:2], h, w,
                                                      args.prob_threshold)

        if len(boxes) and args.raw_output_message:
            log.debug(
                '  -------------------------- Frame # {} --------------------------  '
                .format(frames_processed))
            log.debug(
                '  Class ID | Confidence |     XMIN |     YMIN |     XMAX |     YMAX '
            )
            for box, cls, score in zip(boxes, classes, scores):
                log.debug(
                    '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '
                    .format(cls, score, *box))

        # Get instance track IDs.
        masks_tracks_ids = None
        if tracker is not None:
            masks_tracks_ids = tracker(masks, classes)

        # Visualize masks.
        frame = visualizer(frame, boxes, classes, scores, presenter, masks,
                           masks_tracks_ids)

        metrics.update(start_time, frame)

        frames_processed += 1
        if video_writer.isOpened() and (args.output_limit <= 0 or
                                        frames_processed <= args.output_limit):
            video_writer.write(frame)

        if not args.no_show:
            # Show resulting image.
            cv2.imshow('Results', frame)
            key = cv2.waitKey(delay)
            esc_code = 27
            if key == esc_code:
                break
            presenter.handleKey(key)
        start_time = perf_counter()
        frame = cap.read()

    metrics.log_total()
    for rep in presenter.reportMeans():
        log.info(rep)
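# Action recognition demo: an encoder model optionally followed by a decoder ('en-de'),
# a dummy averaging decoder ('en-mean'), or a single I3D-RGB model.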
def main():
    args = build_argparser().parse_args()

    if args.labels:
        with open(args.labels) as f:
            labels = [line.strip() for line in f]
    else:
        labels = None

    ie = IECore()

    if 'MYRIAD' in args.device:
        myriad_config = {'VPU_HW_STAGES_OPTIMIZATION': 'YES'}
        ie.set_config(myriad_config, 'MYRIAD')

    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')

    decoder_target_device = 'CPU'
    if args.device != 'CPU':
        encoder_target_device = args.device
    else:
        encoder_target_device = decoder_target_device

    encoder_xml = args.m_encoder
    encoder_bin = args.m_encoder.replace('.xml', '.bin')
    models = [
        IEModel(encoder_xml,
                encoder_bin,
                ie,
                encoder_target_device,
                num_requests=(3 if args.device == 'MYRIAD' else 1))
    ]

    if args.architecture_type == 'en-de':
        if args.m_decoder is None:
            raise RuntimeError(
                'No decoder for encoder-decoder model type (-m_de) provided')
        decoder_xml = args.m_decoder
        decoder_bin = args.m_decoder.replace('.xml', '.bin')
        models.append(
            IEModel(decoder_xml,
                    decoder_bin,
                    ie,
                    decoder_target_device,
                    num_requests=2))
        seq_size = models[1].input_size[1]
    elif args.architecture_type == 'en-mean':
        models.append(DummyDecoder(num_requests=2))
        seq_size = args.decoder_seq_size
    elif args.architecture_type == 'i3d-rgb':
        seq_size = models[0].input_size[2]

    presenter = monitors.Presenter(args.utilization_monitors, 70)
    result_presenter = ResultRenderer(
        no_show=args.no_show,
        presenter=presenter,
        output=args.output,
        limit=args.output_limit,
        labels=labels,
        label_smoothing_window=args.label_smoothing)
    cap = open_images_capture(args.input, args.loop)
    run_pipeline(cap,
                 args.architecture_type,
                 models,
                 result_presenter.render_frame,
                 seq_size=seq_size,
                 fps=cap.fps())
    print(presenter.reportMeans())
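# Face recognition demo: a FrameProcessor runs detection and identification on each
# (optionally center-cropped) frame and the results are drawn on top of it.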
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)
    frame_processor = FrameProcessor(args)

    frame_num = 0
    metrics = PerformanceMetrics()
    presenter = None
    output_transform = None
    input_crop = None
    if args.crop_size[0] > 0 and args.crop_size[1] > 0:
        input_crop = np.array(args.crop_size)
    elif not (args.crop_size[0] == 0 and args.crop_size[1] == 0):
        raise ValueError('Both crop height and width should be positive')
    video_writer = cv2.VideoWriter()

    while True:
        start_time = perf_counter()
        frame = cap.read()
        if frame is None:
            if frame_num == 0:
                raise ValueError("Can't read an image from the input")
            break
        if input_crop is not None:
            frame = center_crop(frame, input_crop)
        if frame_num == 0:
            output_transform = OutputTransform(frame.shape[:2],
                                               args.output_resolution)
            if args.output_resolution:
                output_resolution = output_transform.new_resolution
            else:
                output_resolution = (frame.shape[1], frame.shape[0])
            presenter = monitors.Presenter(args.utilization_monitors, 55,
                                           (round(output_resolution[0] / 4),
                                            round(output_resolution[1] / 8)))
            if args.output and not video_writer.open(
                    args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
                    output_resolution):
                raise RuntimeError("Can't open video writer")

        detections = frame_processor.process(frame)
        presenter.drawGraphs(frame)
        frame = draw_detections(frame, frame_processor, detections,
                                output_transform)
        metrics.update(start_time, frame)

        frame_num += 1
        if video_writer.isOpened() and (args.output_limit <= 0
                                        or frame_num <= args.output_limit):
            video_writer.write(frame)

        if not args.no_show:
            cv2.imshow('Face recognition demo', frame)
            key = cv2.waitKey(1)
            # Quit
            if key in {ord('q'), ord('Q'), 27}:
                break
            presenter.handleKey(key)

    metrics.log_total()
    for rep in presenter.reportMeans():
        log.info(rep)
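# Gesture recognition demo: a person detector plus tracker select an active person whose
# gestures are classified; number keys pick the person and an optional sample library can be shown.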
def main():
    """ Main function. """

    log.basicConfig(format='[ %(levelname)s ] %(message)s',
                    level=log.INFO,
                    stream=sys.stdout)
    args = build_argparser().parse_args()

    class_map = load_class_map(args.class_map)
    assert class_map is not None

    ie_core = load_ie_core(args.device, args.cpu_extension)

    person_detector = PersonDetector(args.detection_model,
                                     args.device,
                                     ie_core,
                                     num_requests=2,
                                     output_shape=DETECTOR_OUTPUT_SHAPE)
    action_recognizer = ActionRecognizer(args.action_model,
                                         args.device,
                                         ie_core,
                                         num_requests=2,
                                         img_scale=ACTION_IMAGE_SCALE,
                                         num_classes=len(class_map))
    person_tracker = Tracker(person_detector, TRACKER_SCORE_THRESHOLD,
                             TRACKER_IOU_THRESHOLD)

    video_stream = VideoStream(args.input, ACTION_NET_INPUT_FPS,
                               action_recognizer.input_length)
    video_stream.start()

    visualizer = Visualizer(VISUALIZER_TRG_FPS)
    visualizer.register_window('Demo')
    presenter = monitors.Presenter(args.utilization_monitors)

    samples_library = None
    if args.samples_dir is not None and os.path.exists(args.samples_dir):
        visualizer.register_window('Gesture library')
        visualizer.start()

        library_queue = visualizer.get_queue('Gesture library')
        samples_library = VideoLibrary(args.samples_dir,
                                       SAMPLES_MAX_WINDOW_SIZE,
                                       list(class_map.values()), library_queue,
                                       SAMPLES_TRG_FPS)
        samples_library.start()
    else:
        visualizer.start()

    last_caption = None
    active_object_id = -1
    tracker_labels_map = dict()
    tracker_labels = set()

    frames_processed = 0

    start_time = time.perf_counter()
    while True:
        frame = video_stream.get_live_frame()
        batch = video_stream.get_batch()
        if frame is None or batch is None:
            break
        if frames_processed == 0:
            video_writer = cv2.VideoWriter()
            if args.output and not video_writer.open(
                    args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                    video_stream.fps(), (frame.shape[1], frame.shape[0])):
                raise RuntimeError("Can't open video writer")

        detections, tracker_labels_map = person_tracker.add_frame(
            frame, len(OBJECT_IDS), tracker_labels_map)
        if detections is None:
            active_object_id = -1
            last_caption = None
        elif len(detections) == 1:
            active_object_id = 0

        if active_object_id >= 0:
            cur_det = [det for det in detections if det.id == active_object_id]
            if len(cur_det) != 1:
                active_object_id = -1
                last_caption = None
                continue

            recognizer_result = action_recognizer(batch,
                                                  cur_det[0].roi.reshape(-1))
            if recognizer_result is not None:
                action_class_id = np.argmax(recognizer_result)
                action_class_label = \
                    class_map[action_class_id] if class_map is not None else action_class_id

                action_class_score = np.max(recognizer_result)
                if action_class_score > args.action_threshold:
                    last_caption = 'Last gesture: {} '.format(
                        action_class_label)

        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        start_time = end_time
        presenter.drawGraphs(frame)
        if active_object_id >= 0:
            current_fps = 1.0 / elapsed_time
            cv2.putText(frame, 'FPS: {:.2f}'.format(current_fps), (10, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

        if detections is not None:
            tracker_labels = set(det.id for det in detections)

            for det in detections:
                roi_color = (0, 255, 0) if active_object_id == det.id else (128, 128, 128)
                border_width = 2 if active_object_id == det.id else 1
                person_roi = det.roi[0]
                cv2.rectangle(frame, (person_roi[0], person_roi[1]),
                              (person_roi[2], person_roi[3]), roi_color,
                              border_width)
                cv2.putText(frame, str(det.id),
                            (person_roi[0] + 10, person_roi[1] + 20),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, roi_color, 2)

        if last_caption is not None:
            cv2.putText(frame, last_caption, (10, frame.shape[0] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        frames_processed += 1
        if video_writer.isOpened() and (args.output_limit <= 0 or
                                        frames_processed <= args.output_limit):
            video_writer.write(frame)

        if args.no_show:
            continue

        visualizer.put_queue(frame, 'Demo')
        key = visualizer.get_key()

        if key == 27:  # esc
            break
        elif key == ord(' '):  # space
            active_object_id = -1
            last_caption = None
        elif key == 13:  # enter
            last_caption = None
        elif key in OBJECT_IDS:  # 0-9
            local_bbox_id = int(chr(key))
            if local_bbox_id in tracker_labels:
                active_object_id = local_bbox_id
        else:
            presenter.handleKey(key)

        if samples_library is not None:
            if key == ord('f'):  # forward
                samples_library.next()
            elif key == ord('b'):  # backward
                samples_library.prev()

    if samples_library is not None:
        samples_library.release()
    visualizer.release()
    video_stream.release()
    print(presenter.reportMeans())
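# Object detection demo on the legacy IECore API with manual sync/async infer request
# handling (the TAB key toggles the mode).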
def main():
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    log.info("Creating Inference Engine...")
    ie = IECore()
    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, "CPU")
    # Read IR
    log.info("Loading network")
    net = ie.read_network(args.model, os.path.splitext(args.model)[0] + ".bin")

    if "CPU" in args.device:
        supported_layers = ie.query_network(net, "CPU")
        not_supported_layers = [l for l in net.layers.keys() if l not in supported_layers]
        if len(not_supported_layers) != 0:
            log.error("Following layers are not supported by the plugin for specified device {}:\n {}".
                      format(args.device, ', '.join(not_supported_layers)))
            log.error("Please try to specify cpu extensions library path in sample's command line parameters using -l "
                      "or --cpu_extension command line argument")
            sys.exit(1)

    img_info_input_blob = None
    feed_dict = {}
    for blob_name in net.input_info:
        if len(net.input_info[blob_name].input_data.shape) == 4:
            input_blob = blob_name
        elif len(net.input_info[blob_name].input_data.shape) == 2:
            img_info_input_blob = blob_name
        else:
            raise RuntimeError("Unsupported {}D input layer '{}'. Only 2D and 4D input layers are supported"
                               .format(len(net.input_info[blob_name].input_data.shape), blob_name))

    output_postprocessor = get_output_postprocessor(net)

    log.info("Loading IR to the plugin...")
    exec_net = ie.load_network(network=net, num_requests=2, device_name=args.device)
    # Read and pre-process input image
    n, c, h, w = net.input_info[input_blob].input_data.shape
    if img_info_input_blob:
        feed_dict[img_info_input_blob] = [h, w, 1]

    if args.input == 'cam':
        input_stream = 0
    else:
        input_stream = args.input
    cap = cv2.VideoCapture(input_stream)
    assert cap.isOpened(), "Can't open " + str(input_stream)

    if args.labels:
        with open(args.labels, 'r') as f:
            labels_map = [x.strip() for x in f]
    else:
        labels_map = None

    cur_request_id = 0
    next_request_id = 1

    log.info("Starting inference in async mode...")
    is_async_mode = True
    render_time = 0
    if is_async_mode:
        ret, frame = cap.read()
        frame_h, frame_w = frame.shape[:2]

    presenter = monitors.Presenter(args.utilization_monitors, 45,
        (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH) / 4), round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) / 8)))

    print("To close the application, press 'CTRL+C' here or switch to the output window and press ESC key")
    print("To switch between sync/async modes, press TAB key in the output window")

    while cap.isOpened():
        if is_async_mode:
            ret, next_frame = cap.read()
        else:
            ret, frame = cap.read()
            if ret:
                frame_h, frame_w = frame.shape[:2]
        if not ret:
            break  # abandons the last frame in case of async_mode
        # Main sync point:
        # in the truly Async mode we start the NEXT infer request, while waiting for the CURRENT to complete
        # in the regular mode we start the CURRENT request and immediately wait for its completion
        inf_start = time.time()
        if is_async_mode:
            in_frame = cv2.resize(next_frame, (w, h))
            in_frame = in_frame.transpose((2, 0, 1))  # Change data layout from HWC to CHW
            in_frame = in_frame.reshape((n, c, h, w))
            feed_dict[input_blob] = in_frame
            exec_net.start_async(request_id=next_request_id, inputs=feed_dict)
        else:
            in_frame = cv2.resize(frame, (w, h))
            in_frame = in_frame.transpose((2, 0, 1))  # Change data layout from HWC to CHW
            in_frame = in_frame.reshape((n, c, h, w))
            feed_dict[input_blob] = in_frame
            exec_net.start_async(request_id=cur_request_id, inputs=feed_dict)
        if exec_net.requests[cur_request_id].wait(-1) == 0:
            inf_end = time.time()
            det_time = inf_end - inf_start

            # Parse detection results of the current request
            for obj in output_postprocessor(exec_net.requests[cur_request_id].output_blobs):
                # Draw only objects when probability more than specified threshold
                if obj[2] > args.prob_threshold:
                    xmin = int(obj[3] * frame_w)
                    ymin = int(obj[4] * frame_h)
                    xmax = int(obj[5] * frame_w)
                    ymax = int(obj[6] * frame_h)
                    class_id = int(obj[1])
                    # Draw box and label/class_id
                    color = (min(class_id * 12.5, 255), min(class_id * 7, 255), min(class_id * 5, 255))
                    cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color, 2)
                    det_label = labels_map[class_id] if labels_map else str(class_id)
                    cv2.putText(frame, det_label + ' ' + str(round(obj[2] * 100, 1)) + ' %', (xmin, ymin - 7),
                                cv2.FONT_HERSHEY_COMPLEX, 0.6, color, 1)

            # Draw performance stats
            inf_time_message = "Inference time: N\A for async mode" if is_async_mode else \
                "Inference time: {:.3f} ms".format(det_time * 1000)
            render_time_message = "OpenCV rendering time: {:.3f} ms".format(render_time * 1000)
            async_mode_message = "Async mode is on. Processing request {}".format(cur_request_id) if is_async_mode else \
                "Async mode is off. Processing request {}".format(cur_request_id)

            cv2.putText(frame, inf_time_message, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
            cv2.putText(frame, render_time_message, (15, 30), cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1)
            cv2.putText(frame, async_mode_message, (10, int(frame_h - 20)), cv2.FONT_HERSHEY_COMPLEX, 0.5,
                        (10, 10, 200), 1)

        presenter.drawGraphs(frame)
        render_start = time.time()
        if not args.no_show:
            cv2.imshow("Detection Results", frame)
        render_end = time.time()
        render_time = render_end - render_start

        if is_async_mode:
            cur_request_id, next_request_id = next_request_id, cur_request_id
            frame = next_frame
            frame_h, frame_w = frame.shape[:2]

        if not args.no_show:
            key = cv2.waitKey(1)
            if key == 27:
                break
            if key == 9:  # Tab key
                is_async_mode = not is_async_mode
                log.info("Switched to {} mode".format("async" if is_async_mode else "sync"))
            else:
                presenter.handleKey(key)
    print(presenter.reportMeans())
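# Place recognition demo: computes an embedding for every frame and retrieves the closest
# images from a gallery, reporting the timings of both steps.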
def main():
    """ Main function. """

    log.basicConfig(format='[ %(levelname)s ] %(message)s',
                    level=log.INFO,
                    stream=sys.stdout)
    args = build_argparser().parse_args()

    place_recognition = PlaceRecognition(args.model, args.device,
                                         args.gallery_folder,
                                         args.cpu_extension, args.gallery_size)

    cap = open_images_capture(args.input, args.loop)

    compute_embeddings_times = []
    search_in_gallery_times = []

    frames_processed = 0
    presenter = monitors.Presenter(args.utilization_monitors, 0)
    video_writer = cv2.VideoWriter()

    while True:
        frame = cap.read()

        if frame is None:
            if frames_processed == 0:
                raise ValueError("Can't read an image from the input")
            break

        elapsed, probe_embedding = time_elapsed(
            place_recognition.compute_embedding, frame)
        compute_embeddings_times.append(elapsed)

        elapsed, (sorted_indexes, distances) = time_elapsed(
            place_recognition.search_in_gallery, probe_embedding)
        search_in_gallery_times.append(elapsed)

        image, key = visualize(
            frame, [str(place_recognition.impaths[i]) for i in sorted_indexes],
            distances[sorted_indexes],
            place_recognition.input_size,
            np.mean(compute_embeddings_times),
            np.mean(search_in_gallery_times),
            imshow_delay=3,
            presenter=presenter,
            no_show=args.no_show)

        if frames_processed == 0:
            if args.output and not video_writer.open(
                    args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
                (image.shape[1], image.shape[0])):
                raise RuntimeError("Can't open video writer")

        frames_processed += 1
        if video_writer.isOpened() and (args.output_limit <= 0 or
                                        frames_processed <= args.output_limit):
            video_writer.write(image)

        if key == 27:
            break

    print(presenter.reportMeans())
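# `time_elapsed` is imported by the place-recognition and image-retrieval demos on this
# page and is not shown here. A minimal sketch, assuming it only times a callable and
# returns the elapsed seconds together with the result:
from time import perf_counter

def time_elapsed(func, *args):
    # Hypothetical sketch -- not the demos' own implementation.
    start = perf_counter()
    result = func(*args)
    return perf_counter() - start, result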
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    # Plugin initialization for specified device and load extensions library if specified
    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # Read IR
    log.info('Reading Proposal model {}'.format(args.model_pnet))
    p_net = core.read_model(args.model_pnet)
    if len(p_net.inputs) != 1:
        raise RuntimeError("Pnet supports only single input topologies")
    if len(p_net.outputs) != 2:
        raise RuntimeError("Pnet supports two output topologies")

    log.info('Reading Refine model {}'.format(args.model_rnet))
    r_net = core.read_model(args.model_rnet)
    if len(r_net.inputs) != 1:
        raise RuntimeError("Rnet supports only single input topologies")
    if len(r_net.outputs) != 2:
        raise RuntimeError("Rnet supports two output topologies")

    log.info('Reading Output model {}'.format(args.model_onet))
    o_net = core.read_model(args.model_onet)
    if len(o_net.inputs) != 1:
        raise RuntimeError("Onet supports only single input topologies")
    if len(o_net.outputs) != 3:
        raise RuntimeError("Onet supports three output topologies")

    pnet_input_tensor_name = p_net.inputs[0].get_any_name()
    rnet_input_tensor_name = r_net.inputs[0].get_any_name()
    onet_input_tensor_name = o_net.inputs[0].get_any_name()

    for node in p_net.outputs:
        if node.shape[1] == 2:
            pnet_cls_name = node.get_any_name()
        elif node.shape[1] == 4:
            pnet_roi_name = node.get_any_name()
        else:
            raise RuntimeError("Unsupported output layer for Pnet")

    for node in r_net.outputs:
        if node.shape[1] == 2:
            rnet_cls_name = node.get_any_name()
        elif node.shape[1] == 4:
            rnet_roi_name = node.get_any_name()
        else:
            raise RuntimeError("Unsupported output layer for Rnet")

    for node in o_net.outputs:
        if node.shape[1] == 2:
            onet_cls_name = node.get_any_name()
        elif node.shape[1] == 4:
            onet_roi_name = node.get_any_name()
        elif node.shape[1] == 10:
            onet_pts_name = node.get_any_name()
        else:
            raise RuntimeError("Unsupported output layer for Onet")

    next_frame_id = 0

    metrics = PerformanceMetrics()
    presenter = None
    video_writer = cv2.VideoWriter()
    is_loaded_before = False

    while True:
        start_time = perf_counter()
        origin_image = cap.read()
        if origin_image is None:
            if next_frame_id == 0:
                raise ValueError("Can't read an image from the input")
            break
        if next_frame_id == 0:
            presenter = monitors.Presenter(args.utilization_monitors, 55,
                                           (round(origin_image.shape[1] / 4),
                                            round(origin_image.shape[0] / 8)))
            if args.output and not video_writer.open(
                    args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
                (origin_image.shape[1], origin_image.shape[0])):
                raise RuntimeError("Can't open video writer")
        next_frame_id += 1

        rgb_image = cv2.cvtColor(origin_image, cv2.COLOR_BGR2RGB)
        oh, ow, _ = rgb_image.shape

        scales = utils.calculate_scales(rgb_image)

        # *************************************
        # Pnet stage
        # *************************************

        pnet_res = []
        for i, scale in enumerate(scales):
            hs = int(oh * scale)
            ws = int(ow * scale)
            image = preprocess_image(rgb_image, ws, hs)

            p_net.reshape({
                pnet_input_tensor_name: PartialShape([1, 3, ws, hs])
            })  # Change width and height of the input blob
            compiled_pnet = core.compile_model(p_net, args.device)
            infer_request_pnet = compiled_pnet.create_infer_request()
            if i == 0 and not is_loaded_before:
                log.info("The Proposal model {} is loaded to {}".format(
                    args.model_pnet, args.device))

            infer_request_pnet.infer(inputs={pnet_input_tensor_name: image})
            p_res = {
                name: infer_request_pnet.get_tensor(name).data[:]
                for name in {pnet_roi_name, pnet_cls_name}
            }
            pnet_res.append(p_res)

        image_num = len(scales)
        rectangles = []
        for i in range(image_num):
            roi = pnet_res[i][pnet_roi_name]
            cls = pnet_res[i][pnet_cls_name]
            _, _, out_h, out_w = cls.shape
            out_side = max(out_h, out_w)
            rectangle = utils.detect_face_12net(cls[0][1], roi[0], out_side,
                                                1 / scales[i], ow, oh,
                                                score_threshold[0],
                                                iou_threshold[0])
            rectangles.extend(rectangle)
        rectangles = utils.NMS(rectangles, iou_threshold[1], 'iou')

        # Rnet stage
        if len(rectangles) > 0:

            r_net.reshape({
                rnet_input_tensor_name:
                PartialShape([len(rectangles), 3, 24, 24])
            })  # Change batch size of input blob
            compiled_rnet = core.compile_model(r_net, args.device)
            infer_request_rnet = compiled_rnet.create_infer_request()
            if not is_loaded_before:
                log.info("The Refine model {} is loaded to {}".format(
                    args.model_rnet, args.device))

            rnet_input = []
            for rectangle in rectangles:
                crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]),
                                     int(rectangle[0]):int(rectangle[2])]
                crop_img = preprocess_image(crop_img, 24, 24)
                rnet_input.extend(crop_img)

            infer_request_rnet.infer(
                inputs={rnet_input_tensor_name: rnet_input})
            rnet_res = {
                name: infer_request_rnet.get_tensor(name).data[:]
                for name in {rnet_roi_name, rnet_cls_name}
            }

            roi = rnet_res[rnet_roi_name]
            cls = rnet_res[rnet_cls_name]
            rectangles = utils.filter_face_24net(cls, roi, rectangles, ow, oh,
                                                 score_threshold[1],
                                                 iou_threshold[2])

        # Onet stage
        if len(rectangles) > 0:

            o_net.reshape({
                onet_input_tensor_name:
                PartialShape([len(rectangles), 3, 48, 48])
            })  # Change batch size of input blob
            compiled_onet = core.compile_model(o_net, args.device)
            infer_request_onet = compiled_onet.create_infer_request()
            if not is_loaded_before:
                log.info("The Output model {} is loaded to {}".format(
                    args.model_onet, args.device))
                is_loaded_before = True

            onet_input = []
            for rectangle in rectangles:
                crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]),
                                     int(rectangle[0]):int(rectangle[2])]
                crop_img = preprocess_image(crop_img, 48, 48)
                onet_input.extend(crop_img)

            infer_request_onet.infer(
                inputs={onet_input_tensor_name: onet_input})
            onet_res = {
                name: infer_request_onet.get_tensor(name).data[:]
                for name in {onet_roi_name, onet_cls_name, onet_pts_name}
            }

            roi = onet_res[onet_roi_name]
            cls = onet_res[onet_cls_name]
            pts = onet_res[onet_pts_name]
            rectangles = utils.filter_face_48net(cls, roi, pts, rectangles, ow,
                                                 oh, score_threshold[2],
                                                 iou_threshold[3])

        # display results
        for rectangle in rectangles:
            # Draw detected boxes
            cv2.putText(origin_image,
                        'confidence: {:.2f}'.format(rectangle[4]),
                        (int(rectangle[0]), int(rectangle[1])),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0))
            cv2.rectangle(origin_image, (int(rectangle[0]), int(rectangle[1])),
                          (int(rectangle[2]), int(rectangle[3])), (255, 0, 0),
                          1)
            # Draw landmarks
            for i in range(5, 15, 2):
                cv2.circle(origin_image,
                           (int(rectangle[i + 0]), int(rectangle[i + 1])), 2,
                           (0, 255, 0))

        metrics.update(start_time, origin_image)

        if video_writer.isOpened() and (args.output_limit <= 0
                                        or next_frame_id <= args.output_limit):
            video_writer.write(origin_image)

        if not args.no_show:
            cv2.imshow('MTCNN Results', origin_image)
            key = cv2.waitKey(1)
            if key in {ord('q'), ord('Q'), 27}:
                break
            presenter.handleKey(key)

    metrics.log_total()
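# `preprocess_image` is imported by the MTCNN demo above and is not shown here. Since the
# networks are reshaped to [1, 3, width, height], a plausible sketch looks like the
# following; the exact pixel normalization is an assumption (the common MTCNN convention
# (x - 127.5) / 128 is used):
import cv2
import numpy as np

def preprocess_image(image, width, height):
    # Hypothetical sketch -- resize, normalize and move to a [1, C, W, H] layout.
    resized = cv2.resize(image, (width, height)).astype(np.float32)
    normalized = (resized - 127.5) / 128.0   # assumed MTCNN-style scaling
    cwh = normalized.transpose((2, 1, 0))    # HWC -> CWH to match [1, 3, W, H]
    return cwh[np.newaxis, ...]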
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams,
                                        args.num_threads)
        model_adapter = OpenvinoAdapter(
            create_core(),
            args.model,
            device=args.device,
            plugin_config=plugin_config,
            max_num_requests=args.num_infer_requests)
    elif args.adapter == 'remote':
        log.info('Reading model {}'.format(args.model))
        serving_config = {"address": "localhost", "port": 9000}
        model_adapter = RemoteAdapter(args.model, serving_config)

    model = SegmentationModel.create_model(args.architecture_type,
                                           model_adapter,
                                           {'path_to_labels': args.labels})
    if args.architecture_type == 'segmentation':
        visualizer = SegmentationVisualizer(args.colors)
    elif args.architecture_type == 'salient_object_detection':
        visualizer = SaliencyMapVisualizer()
    model.log_layers_info()

    pipeline = AsyncPipeline(model)

    next_frame_id = 0
    next_frame_id_to_show = 0

    metrics = PerformanceMetrics()
    render_metrics = PerformanceMetrics()
    presenter = None
    output_transform = None
    video_writer = cv2.VideoWriter()
    only_masks = args.only_masks
    while True:
        if pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            if frame is None:
                if next_frame_id == 0:
                    raise ValueError("Can't read an image from the input")
                break
            if next_frame_id == 0:
                output_transform = OutputTransform(frame.shape[:2],
                                                   args.output_resolution)
                if args.output_resolution:
                    output_resolution = output_transform.new_resolution
                else:
                    output_resolution = (frame.shape[1], frame.shape[0])
                presenter = monitors.Presenter(
                    args.utilization_monitors, 55,
                    (round(output_resolution[0] / 4),
                     round(output_resolution[1] / 8)))
                if args.output and not video_writer.open(
                        args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                        cap.fps(), output_resolution):
                    raise RuntimeError("Can't open video writer")
            # Submit for inference
            pipeline.submit_data(frame, next_frame_id, {
                'frame': frame,
                'start_time': start_time
            })
            next_frame_id += 1
        else:
            # Wait for empty request
            pipeline.await_any()

        if pipeline.callback_exceptions:
            raise pipeline.callback_exceptions[0]
        # Process all completed requests
        results = pipeline.get_result(next_frame_id_to_show)
        if results:
            objects, frame_meta = results
            if args.raw_output_message:
                print_raw_results(objects, next_frame_id_to_show, model.labels)
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']
            rendering_start_time = perf_counter()
            frame = render_segmentation(frame, objects, visualizer,
                                        output_transform, only_masks)
            render_metrics.update(rendering_start_time)
            presenter.drawGraphs(frame)
            metrics.update(start_time, frame)

            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(frame)
            next_frame_id_to_show += 1

            if not args.no_show:
                cv2.imshow('Segmentation Results', frame)
                key = cv2.waitKey(1)
                if key in {ord('q'), ord('Q'), 27}:
                    break
                if key == 9:
                    only_masks = not only_masks
                presenter.handleKey(key)

    pipeline.await_all()
    # Process completed requests
    for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id):
        results = pipeline.get_result(next_frame_id_to_show)
        while results is None:
            results = pipeline.get_result(next_frame_id_to_show)
        objects, frame_meta = results
        if args.raw_output_message:
            print_raw_results(objects, next_frame_id_to_show, model.labels)
        frame = frame_meta['frame']
        start_time = frame_meta['start_time']

        rendering_start_time = perf_counter()
        frame = render_segmentation(frame, objects, visualizer,
                                    output_transform, only_masks)
        render_metrics.update(rendering_start_time)
        presenter.drawGraphs(frame)
        metrics.update(start_time, frame)

        if video_writer.isOpened() and (
                args.output_limit <= 0
                or next_frame_id_to_show <= args.output_limit - 1):
            video_writer.write(frame)

        if not args.no_show:
            cv2.imshow('Segmentation Results', frame)
            key = cv2.waitKey(1)

    metrics.log_total()
    log_latency_per_stage(cap.reader_metrics.get_latency(),
                          pipeline.preprocess_metrics.get_latency(),
                          pipeline.inference_metrics.get_latency(),
                          pipeline.postprocess_metrics.get_latency(),
                          render_metrics.get_latency())
    for rep in presenter.reportMeans():
        log.info(rep)
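# `get_user_config` comes from the demos' shared helper module and is not shown on this
# page. A simplified sketch in the spirit of the explicit plugin configuration built in
# the YOLO example further below (legacy config keys, simple numeric num_streams only;
# the bundled helper may use newer OpenVINO property names and per-device syntax):
def get_user_config(device, num_streams, num_threads):
    # Hypothetical sketch -- build a plugin configuration dictionary for the device.
    config = {}
    if num_streams:
        if 'CPU' in device:
            config['CPU_THROUGHPUT_STREAMS'] = str(num_streams)
        if 'GPU' in device:
            config['GPU_THROUGHPUT_STREAMS'] = str(num_streams)
    if num_threads and 'CPU' in device:
        config['CPU_THREADS_NUM'] = str(num_threads)
    return config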
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams,
                                        args.num_threads)
        model_adapter = OpenvinoAdapter(
            create_core(),
            args.model,
            device=args.device,
            plugin_config=plugin_config,
            max_num_requests=args.num_infer_requests)
    elif args.adapter == 'remote':
        log.info('Reading model {}'.format(args.model))
        serving_config = {"address": "localhost", "port": 9000}
        model_adapter = RemoteAdapter(args.model, serving_config)

    model = MonoDepthModel(model_adapter)
    model.log_layers_info()

    pipeline = AsyncPipeline(model)

    next_frame_id = 0
    next_frame_id_to_show = 0

    metrics = PerformanceMetrics()
    presenter = None
    output_transform = None
    video_writer = cv2.VideoWriter()

    while True:
        if pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            if frame is None:
                if next_frame_id == 0:
                    raise ValueError("Can't read an image from the input")
                break
            if next_frame_id == 0:
                output_transform = OutputTransform(frame.shape[:2],
                                                   args.output_resolution)
                if args.output_resolution:
                    output_resolution = output_transform.new_resolution
                else:
                    output_resolution = (frame.shape[1], frame.shape[0])
                presenter = monitors.Presenter(
                    args.utilization_monitors, 55,
                    (round(output_resolution[0] / 4),
                     round(output_resolution[1] / 8)))
                if args.output and not video_writer.open(
                        args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                        cap.fps(), output_resolution):
                    raise RuntimeError("Can't open video writer")
            # Submit for inference
            pipeline.submit_data(frame, next_frame_id,
                                 {'start_time': start_time})
            next_frame_id += 1
        else:
            # Wait for empty request
            pipeline.await_any()

        if pipeline.callback_exceptions:
            raise pipeline.callback_exceptions[0]
        # Process all completed requests
        results = pipeline.get_result(next_frame_id_to_show)
        if results:
            depth_map, frame_meta = results
            depth_map = apply_color_map(depth_map, output_transform)

            start_time = frame_meta['start_time']
            presenter.drawGraphs(depth_map)
            metrics.update(start_time, depth_map)

            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(depth_map)
            next_frame_id_to_show += 1

            if not args.no_show:
                cv2.imshow(DEMO_NAME, depth_map)
                key = cv2.waitKey(1)
                if key in {ord('q'), ord('Q'), 27}:
                    break
                presenter.handleKey(key)

    pipeline.await_all()
    # Process completed requests
    for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id):
        results = pipeline.get_result(next_frame_id_to_show)
        while results is None:
            results = pipeline.get_result(next_frame_id_to_show)
        depth_map, frame_meta = results
        depth_map = apply_color_map(depth_map, output_transform)

        start_time = frame_meta['start_time']

        presenter.drawGraphs(depth_map)
        metrics.update(start_time, depth_map)

        if video_writer.isOpened() and (
                args.output_limit <= 0
                or next_frame_id_to_show <= args.output_limit - 1):
            video_writer.write(depth_map)

        if not args.no_show:
            cv2.imshow(DEMO_NAME, depth_map)
            key = cv2.waitKey(1)
            if key in {ord('q'), ord('Q'), 27}:
                break
            presenter.handleKey(key)

    metrics.log_total()
    for rep in presenter.reportMeans():
        log.info(rep)
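# `apply_color_map` is imported by the monodepth demo above. A hedged sketch of such a
# helper, assuming the raw depth map is normalized to 8 bit, colorized with an OpenCV
# colormap and resized through the demo's OutputTransform (its resize() method is an
# assumption here):
import cv2

def apply_color_map(depth_map, output_transform):
    # Hypothetical sketch -- normalize, colorize and rescale the depth map for display.
    normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
    colored = cv2.applyColorMap(normalized, cv2.COLORMAP_JET)
    return output_transform.resize(colored)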
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)
    delay = int(cap.get_type() in {'VIDEO', 'CAMERA'})

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams,
                                        args.num_threads)
        model_adapter = OpenvinoAdapter(
            create_core(),
            args.model,
            device=args.device,
            plugin_config=plugin_config,
            max_num_requests=args.num_infer_requests)
    elif args.adapter == 'ovms':
        model_adapter = OVMSAdapter(args.model)

    config = {
        'mean_values': args.mean_values,
        'scale_values': args.scale_values,
        'reverse_input_channels': args.reverse_input_channels,
        'topk': args.topk,
        'path_to_labels': args.labels
    }
    model = Classification(model_adapter, config)
    model.log_layers_info()

    async_pipeline = AsyncPipeline(model)

    next_frame_id = 0
    next_frame_id_to_show = 0

    metrics = PerformanceMetrics()
    render_metrics = PerformanceMetrics()
    presenter = None
    output_transform = None
    video_writer = cv2.VideoWriter()
    ESC_KEY = 27
    key = -1
    while True:
        if async_pipeline.callback_exceptions:
            raise async_pipeline.callback_exceptions[0]
        # Process all completed requests
        results = async_pipeline.get_result(next_frame_id_to_show)
        if results:
            classifications, frame_meta = results
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']
            if args.raw_output_message:
                print_raw_results(classifications, next_frame_id_to_show)

            presenter.drawGraphs(frame)
            rendering_start_time = perf_counter()
            frame = draw_labels(frame, classifications, output_transform)
            if delay or args.no_show:
                render_metrics.update(rendering_start_time)
                metrics.update(start_time, frame)

            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(frame)
            next_frame_id_to_show += 1

            if not args.no_show:
                cv2.imshow('Classification Results', frame)
                key = cv2.waitKey(delay)
                # Quit.
                if key in {ord('q'), ord('Q'), ESC_KEY}:
                    break
                presenter.handleKey(key)
            continue

        if async_pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            if frame is None:
                if next_frame_id == 0:
                    raise ValueError("Can't read an image from the input")
                break
            if next_frame_id == 0:
                output_transform = OutputTransform(frame.shape[:2],
                                                   args.output_resolution)
                if args.output_resolution:
                    output_resolution = output_transform.new_resolution
                else:
                    output_resolution = (frame.shape[1], frame.shape[0])
                presenter = monitors.Presenter(
                    args.utilization_monitors, 55,
                    (round(output_resolution[0] / 4),
                     round(output_resolution[1] / 8)))
                if args.output and not video_writer.open(
                        args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                        cap.fps(), output_resolution):
                    raise RuntimeError("Can't open video writer")
            # Submit for inference
            async_pipeline.submit_data(frame, next_frame_id, {
                'frame': frame,
                'start_time': start_time
            })
            next_frame_id += 1

        else:
            # Wait for empty request
            async_pipeline.await_any()

    async_pipeline.await_all()
    if key not in {ord('q'), ord('Q'), ESC_KEY}:
        # Process completed requests
        for next_frame_id_to_show in range(next_frame_id_to_show,
                                           next_frame_id):
            results = async_pipeline.get_result(next_frame_id_to_show)
            while results is None:
                results = async_pipeline.get_result(next_frame_id_to_show)
            classifications, frame_meta = results
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            if args.raw_output_message:
                print_raw_results(classifications, next_frame_id_to_show)

            presenter.drawGraphs(frame)
            rendering_start_time = perf_counter()
            frame = draw_labels(frame, classifications, output_transform)
            if delay or args.no_show:
                render_metrics.update(rendering_start_time)
                metrics.update(start_time, frame)

            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(frame)

            if not args.no_show:
                cv2.imshow('Classification Results', frame)
                key = cv2.waitKey(delay)

                # Quit.
                if key in {ord('q'), ord('Q'), ESC_KEY}:
                    break
                presenter.handleKey(key)

    if delay or args.no_show:
        metrics.log_total()
        log_latency_per_stage(cap.reader_metrics.get_latency(),
                              async_pipeline.preprocess_metrics.get_latency(),
                              async_pipeline.inference_metrics.get_latency(),
                              async_pipeline.postprocess_metrics.get_latency(),
                              render_metrics.get_latency())
    for rep in presenter.reportMeans():
        log.info(rep)
def main():
    """ Main function. """

    log.basicConfig(format='[ %(levelname)s ] %(message)s',
                    level=log.INFO,
                    stream=sys.stdout)
    args = build_argparser().parse_args()

    img_retrieval = ImageRetrieval(args.model, args.device, args.gallery,
                                   INPUT_SIZE, args.cpu_extension)

    frames = RoiDetectorOnVideo(args.i)

    compute_embeddings_times = []
    search_in_gallery_times = []

    positions = []

    presenter = monitors.Presenter(args.utilization_monitors, 0)

    for image, view_frame in frames:
        position = None
        sorted_indexes = []

        if image is not None:
            image = central_crop(image, divide_by=5, shift=1)

            elapsed, probe_embedding = time_elapsed(
                img_retrieval.compute_embedding, image)
            compute_embeddings_times.append(elapsed)

            elapsed, (sorted_indexes, distances) = time_elapsed(
                img_retrieval.search_in_gallery, probe_embedding)
            search_in_gallery_times.append(elapsed)

            sorted_classes = [
                img_retrieval.gallery_classes[i] for i in sorted_indexes
            ]

            if args.ground_truth is not None:
                position = sorted_classes.index(
                    img_retrieval.text_label_to_class_id[args.ground_truth])
                positions.append(position)
                log.info("ROI detected, found: %d, position of target: %d",
                         sorted_classes[0], position)
            else:
                log.info("ROI detected, found: %s", sorted_classes[0])

        key = visualize(
            view_frame,
            position, [img_retrieval.impaths[i] for i in sorted_indexes],
            distances[sorted_indexes] if position is not None else None,
            img_retrieval.input_size,
            np.mean(compute_embeddings_times),
            np.mean(search_in_gallery_times),
            imshow_delay=3,
            presenter=presenter,
            no_show=args.no_show)

        if key == 27:
            break
    print(presenter.reportMeans())

    if positions:
        compute_metrics(positions)
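# `compute_metrics` is imported by the image-retrieval demo above and is not shown here.
# Given its input (the 0-based rank of the ground-truth class for every processed ROI),
# a hypothetical sketch could report top-N accuracy and the mean rank of the target:
import numpy as np

def compute_metrics(positions):
    # Hypothetical sketch -- summarize how high the correct gallery item was ranked.
    positions = np.asarray(positions)
    for top_n in (1, 5, 10):
        print('top-{} accuracy: {:.2f}'.format(top_n, np.mean(positions < top_n)))
    print('mean position of the target: {:.2f}'.format(np.mean(positions) + 1))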
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)
    next_frame_id = 1
    next_frame_id_to_show = 0

    metrics = PerformanceMetrics()
    render_metrics = PerformanceMetrics()
    video_writer = cv2.VideoWriter()

    plugin_config = get_user_config(args.device, args.num_streams,
                                    args.num_threads)
    model_adapter = OpenvinoAdapter(create_core(),
                                    args.model,
                                    device=args.device,
                                    plugin_config=plugin_config,
                                    max_num_requests=args.num_infer_requests)

    start_time = perf_counter()
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")

    config = {
        'target_size': args.tsize,
        'aspect_ratio': frame.shape[1] / frame.shape[0],
        'prob_threshold': args.prob_threshold,
        'padding_mode': 'center' if args.architecture_type == 'higherhrnet'
        else None,  # specific to the 'higherhrnet' and 'ae' types
        'delta': 0.5 if args.architecture_type == 'higherhrnet'
        else None,  # specific to the 'higherhrnet' and 'ae' types
    }
    model = ImageModel.create_model(ARCHITECTURES[args.architecture_type],
                                    model_adapter, config)
    model.log_layers_info()

    hpe_pipeline = AsyncPipeline(model)
    hpe_pipeline.submit_data(frame, 0, {
        'frame': frame,
        'start_time': start_time
    })

    output_transform = OutputTransform(frame.shape[:2], args.output_resolution)
    if args.output_resolution:
        output_resolution = output_transform.new_resolution
    else:
        output_resolution = (frame.shape[1], frame.shape[0])
    presenter = monitors.Presenter(
        args.utilization_monitors, 55,
        (round(output_resolution[0] / 4), round(output_resolution[1] / 8)))
    if args.output and not video_writer.open(args.output,
                                             cv2.VideoWriter_fourcc(*'MJPG'),
                                             cap.fps(), output_resolution):
        raise RuntimeError("Can't open video writer")

    while True:
        if hpe_pipeline.callback_exceptions:
            raise hpe_pipeline.callback_exceptions[0]
        # Process all completed requests
        results = hpe_pipeline.get_result(next_frame_id_to_show)
        if results:
            (poses, scores), frame_meta = results
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            if len(poses) and args.raw_output_message:
                print_raw_results(poses, scores, next_frame_id_to_show)

            presenter.drawGraphs(frame)
            rendering_start_time = perf_counter()
            frame = draw_poses(frame, poses, args.prob_threshold,
                               output_transform)
            render_metrics.update(rendering_start_time)
            metrics.update(start_time, frame)
            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(frame)
            next_frame_id_to_show += 1
            if not args.no_show:
                cv2.imshow('Pose estimation results', frame)
                key = cv2.waitKey(1)

                ESC_KEY = 27
                # Quit.
                if key in {ord('q'), ord('Q'), ESC_KEY}:
                    break
                presenter.handleKey(key)
            continue

        if hpe_pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            if frame is None:
                break

            # Submit for inference
            hpe_pipeline.submit_data(frame, next_frame_id, {
                'frame': frame,
                'start_time': start_time
            })
            next_frame_id += 1

        else:
            # Wait for empty request
            hpe_pipeline.await_any()

    hpe_pipeline.await_all()
    # Process completed requests
    for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id):
        results = hpe_pipeline.get_result(next_frame_id_to_show)
        while results is None:
            results = hpe_pipeline.get_result(next_frame_id_to_show)
        (poses, scores), frame_meta = results
        frame = frame_meta['frame']
        start_time = frame_meta['start_time']

        if len(poses) and args.raw_output_message:
            print_raw_results(poses, scores, next_frame_id_to_show)

        presenter.drawGraphs(frame)
        rendering_start_time = perf_counter()
        frame = draw_poses(frame, poses, args.prob_threshold, output_transform)
        render_metrics.update(rendering_start_time)
        metrics.update(start_time, frame)
        if video_writer.isOpened() and (
                args.output_limit <= 0
                or next_frame_id_to_show <= args.output_limit - 1):
            video_writer.write(frame)
        if not args.no_show:
            cv2.imshow('Pose estimation results', frame)
            key = cv2.waitKey(1)

            ESC_KEY = 27
            # Quit.
            if key in {ord('q'), ord('Q'), ESC_KEY}:
                break
            presenter.handleKey(key)

    metrics.log_total()
    log_latency_per_stage(cap.reader_metrics.get_latency(),
                          hpe_pipeline.preprocess_metrics.get_latency(),
                          hpe_pipeline.inference_metrics.get_latency(),
                          hpe_pipeline.postprocess_metrics.get_latency(),
                          render_metrics.get_latency())
    for rep in presenter.reportMeans():
        log.info(rep)
def main():
    parser = argparse.ArgumentParser(description='Whiteboard inpainting demo')
    parser.add_argument('-i', type=str, required=True,
                        help='Input source (index of a camera or path to a video file)')
    parser.add_argument('-m_i', '--m_instance_segmentation', type=str, required=False,
                        help='Path to the instance segmentation model')
    parser.add_argument('-m_s', '--m_semantic_segmentation', type=str, required=False,
                        help='Path to the semantic segmentation model')
    parser.add_argument('-t', '--threshold', type=float, default=0.6,
                        help='Threshold for person instance segmentation model')
    parser.add_argument('--output_video', type=str, default='', required=False,
                        help='Optional. Path to output video')
    parser.add_argument("--no_show", help="Optional. Don't show output", action='store_true')

    parser.add_argument('-d', '--device', type=str, default='CPU',
                        help='Optional. Specify a target device to infer on. CPU, GPU, FPGA, HDDL or MYRIAD is '
                             'acceptable. The demo will look for a suitable plugin for the device specified')
    parser.add_argument('-l', '--cpu_extension', type=str, default=None,
                        help='MKLDNN (CPU)-targeted custom layers. Absolute path '
                             'to a shared library with the kernel implementations.')
    parser.add_argument('-u', '--utilization_monitors', default='', type=str,
                        help='Optional. List of monitors to show initially')
    args = parser.parse_args()

    capture = VideoCapture(args.i)

    if bool(args.m_instance_segmentation) == bool(args.m_semantic_segmentation):
        raise ValueError('Specify exactly one of the segmentation models: '
                         '--m_instance_segmentation or --m_semantic_segmentation')

    frame_size, fps = capture.get_source_parameters()
    out_frame_size = (int(frame_size[0]), int(frame_size[1] * 2))
    presenter = monitors.Presenter(args.utilization_monitors, 20,
                                   (out_frame_size[0] // 4, out_frame_size[1] // 16))

    root_dir = osp.dirname(osp.abspath(__file__))

    mouse = MouseClick()
    if not args.no_show:
        cv2.namedWindow(WINNAME)
        cv2.setMouseCallback(WINNAME, mouse.get_points)

    if args.output_video:
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        output_video = cv2.VideoWriter(args.output_video, fourcc, fps, out_frame_size)
    else:
        output_video = None

    log.info("Initializing Inference Engine")
    ie = IECore()
    if args.m_instance_segmentation:
        labels_file = osp.join(root_dir, 'coco_labels.txt')
        segmentation = MaskRCNN(ie, args.m_instance_segmentation, labels_file,
                                args.threshold, args.device, args.cpu_extension)
    elif args.m_semantic_segmentation:
        labels_file = osp.join(root_dir, 'cityscapes_labels.txt')
        segmentation = SemanticSegmentation(ie, args.m_semantic_segmentation, labels_file,
                                            args.threshold, args.device, args.cpu_extension)

    black_board = False
    output_frame = np.full((frame_size[1], frame_size[0], 3), 255, dtype='uint8')
    frame_number = 0
    key = -1

    while True:
        start = time.time()
        _, frame = capture.get_frame()

        mask = None
        if frame is not None:
            detections = segmentation.get_detections([frame])
            expand_mask(detections, frame_size[0] // 27)
            if len(detections[0]) > 0:
                mask = detections[0][0][2]
                for i in range(1, len(detections[0])):
                    mask = cv2.bitwise_or(mask, detections[0][i][2])
        else:
            break

        if mask is not None:
            mask = np.stack([mask, mask, mask], axis=-1)
        else:
            mask = np.zeros(frame.shape, dtype='uint8')

        clear_frame = remove_background(frame, invert_colors=not black_board)

        output_frame = np.where(mask, output_frame, clear_frame)
        merged_frame = np.vstack([frame, output_frame])
        merged_frame = cv2.resize(merged_frame, out_frame_size)

        if output_video is not None:
            output_video.write(merged_frame)

        presenter.drawGraphs(merged_frame)
        if not args.no_show:
            cv2.imshow(WINNAME, merged_frame)
            key = check_pressed_keys(key)
            if key == 27:  # 'Esc'
                break
            if key == ord('i'):  # catch pressing of key 'i'
                black_board = not black_board
                output_frame = 255 - output_frame
            else:
                presenter.handleKey(key)

        if mouse.crop_available:
            x0, x1 = min(mouse.points[0][0], mouse.points[1][0]), \
                     max(mouse.points[0][0], mouse.points[1][0])
            y0, y1 = min(mouse.points[0][1], mouse.points[1][1]), \
                     max(mouse.points[0][1], mouse.points[1][1])
            x1, y1 = min(x1, output_frame.shape[1] - 1), min(y1, output_frame.shape[0] - 1)
            board = output_frame[y0: y1, x0: x1, :]
            if board.shape[0] > 0 and board.shape[1] > 0:
                cv2.namedWindow('Board', cv2.WINDOW_KEEPRATIO)
                cv2.imshow('Board', board)

        end = time.time()
        print('\rProcessing frame: {}, fps = {:.3}' \
            .format(frame_number, 1. / (end - start)), end="")
        frame_number += 1
    print('')

    log.info(presenter.reportMeans())

    if output_video is not None:
        output_video.release()
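# `MouseClick` is defined elsewhere in the whiteboard-inpainting demo. Its usage above
# (two stored corner points and a `crop_available` flag driven by a cv2 mouse callback)
# suggests roughly the following sketch; rescaling the click coordinates from the
# displayed, resized window back to the whiteboard frame is omitted here:
import cv2

class MouseClick:
    # Hypothetical sketch -- record a rectangle dragged with the left mouse button.
    def __init__(self):
        self.points = [(0, 0), (0, 0)]
        self.crop_available = False

    def get_points(self, event, x, y, flags, param):
        if event == cv2.EVENT_LBUTTONDOWN:
            self.points[0] = (x, y)
            self.crop_available = False
        elif event == cv2.EVENT_LBUTTONUP:
            self.points[1] = (x, y)
            self.crop_available = True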
def main():
    log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    # Plugin initialization for specified device and load extensions library if specified.
    log.info('Creating Inference Engine...')
    ie = IECore()
    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')
    # Read IR
    log.info('Loading network')
    net = ie.read_network(args.model, os.path.splitext(args.model)[0] + '.bin')
    image_input, image_info_input, (n, c, h, w), postprocessor = check_model(net)

    log.info('Loading IR to the plugin...')
    exec_net = ie.load_network(network=net, device_name=args.device, num_requests=2)

    try:
        input_source = int(args.input_source)
    except ValueError:
        input_source = args.input_source
    cap = cv2.VideoCapture(input_source)
    if not cap.isOpened():
        log.error('Failed to open "{}"'.format(args.input_source))
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    if args.no_track:
        tracker = None
    else:
        tracker = StaticIOUTracker()

    with open(args.labels, 'rt') as labels_file:
        class_labels = labels_file.read().splitlines()

    presenter = monitors.Presenter(args.utilization_monitors, 45,
        (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH) / 4), round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) / 8)))
    visualizer = Visualizer(class_labels, show_boxes=args.show_boxes, show_scores=args.show_scores)

    render_time = 0

    log.info('Starting inference...')
    print("To close the application, press 'CTRL+C' here or switch to the output window and press ESC key")
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if args.no_keep_aspect_ratio:
            # Resize the image to a target size.
            scale_x = w / frame.shape[1]
            scale_y = h / frame.shape[0]
            input_image = cv2.resize(frame, (w, h))
        else:
            # Resize the image to keep the same aspect ratio and to fit it to a window of a target size.
            scale_x = scale_y = min(h / frame.shape[0], w / frame.shape[1])
            input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y)

        input_image_size = input_image.shape[:2]
        input_image = np.pad(input_image, ((0, h - input_image_size[0]),
                                           (0, w - input_image_size[1]),
                                           (0, 0)),
                             mode='constant', constant_values=0)
        # Change data layout from HWC to CHW.
        input_image = input_image.transpose((2, 0, 1))
        input_image = input_image.reshape((n, c, h, w)).astype(np.float32)
        input_image_info = np.asarray([[input_image_size[0], input_image_size[1], 1]], dtype=np.float32)

        # Run the net.
        inf_start = time.time()
        feed_dict = {image_input: input_image}
        if image_info_input:
            feed_dict[image_info_input] = input_image_info
        outputs = exec_net.infer(feed_dict)
        inf_end = time.time()
        det_time = inf_end - inf_start

        # Parse detection results of the current request
        scores, classes, boxes, masks = postprocessor(
            outputs, scale_x, scale_y, *frame.shape[:2], h, w, args.prob_threshold)

        render_start = time.time()

        if len(boxes) and args.raw_output_message:
            log.info('Detected boxes:')
            log.info('  Class ID | Confidence |     XMIN |     YMIN |     XMAX |     YMAX ')
            for box, cls, score, mask in zip(boxes, classes, scores, masks):
                log.info('{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '.format(cls, score, *box))

        # Get instance track IDs.
        masks_tracks_ids = None
        if tracker is not None:
            masks_tracks_ids = tracker(masks, classes)

        # Visualize masks.
        frame = visualizer(frame, boxes, classes, scores, presenter, masks, masks_tracks_ids)

        # Draw performance stats.
        inf_time_message = 'Inference time: {:.3f} ms'.format(det_time * 1000)
        render_time_message = 'OpenCV rendering time: {:.3f} ms'.format(render_time * 1000)
        cv2.putText(frame, inf_time_message, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
        cv2.putText(frame, render_time_message, (15, 30), cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1)

        # Print performance counters.
        if args.perf_counts:
            perf_counts = exec_net.requests[0].get_perf_counts()
            log.info('Performance counters:')
            print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format('name', 'layer_type', 'exec_type', 'status',
                                                              'real_time, us'))
            for layer, stats in perf_counts.items():
                print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(layer, stats['layer_type'], stats['exec_type'],
                                                                  stats['status'], stats['real_time']))

        if not args.no_show:
            # Show resulting image.
            cv2.imshow('Results', frame)
        render_end = time.time()
        render_time = render_end - render_start

        if not args.no_show:
            key = cv2.waitKey(args.delay)
            esc_code = 27
            if key == esc_code:
                break
            presenter.handleKey(key)

    print(presenter.reportMeans())
    cv2.destroyAllWindows()
    cap.release()
def main():
    args = build_argparser().parse_args()

    # ------------- 1. Plugin initialization for specified device and load extensions library if specified -------------
    log.info("Creating Inference Engine...")
    ie = IECore()

    config_user_specified = {}
    config_min_latency = {}

    devices_nstreams = {}
    if args.num_streams:
        devices_nstreams = {device: args.num_streams for device in ['CPU', 'GPU'] if device in args.device} \
                           if args.num_streams.isdigit() \
                           else dict([device.split(':') for device in args.num_streams.split(',')])

    if 'CPU' in args.device:
        if args.cpu_extension:
            ie.add_extension(args.cpu_extension, 'CPU')
        if args.number_threads is not None:
            config_user_specified['CPU_THREADS_NUM'] = str(args.number_threads)
        if 'CPU' in devices_nstreams:
            config_user_specified['CPU_THROUGHPUT_STREAMS'] = devices_nstreams['CPU'] \
                                                              if int(devices_nstreams['CPU']) > 0 \
                                                              else 'CPU_THROUGHPUT_AUTO'

        config_min_latency['CPU_THROUGHPUT_STREAMS'] = '1'

    if 'GPU' in args.device:
        if 'GPU' in devices_nstreams:
            config_user_specified['GPU_THROUGHPUT_STREAMS'] = devices_nstreams['GPU'] \
                                                              if int(devices_nstreams['GPU']) > 0 \
                                                              else 'GPU_THROUGHPUT_AUTO'

        config_min_latency['GPU_THROUGHPUT_STREAMS'] = '1'

    # -------------------- 2. Reading the IR generated by the Model Optimizer (.xml and .bin files) --------------------
    log.info("Loading network")
    net = ie.read_network(args.model, os.path.splitext(args.model)[0] + ".bin")

    # ---------------------------------- 3. Load CPU extension for support specific layer ------------------------------
    if "CPU" in args.device:
        supported_layers = ie.query_network(net, "CPU")
        not_supported_layers = [
            l for l in net.layers.keys() if l not in supported_layers
        ]
        if len(not_supported_layers) != 0:
            log.error(
                "Following layers are not supported by the plugin for specified device {}:\n {}"
                .format(args.device, ', '.join(not_supported_layers)))
            log.error(
                "Please try to specify cpu extensions library path in sample's command line parameters using -l "
                "or --cpu_extension command line argument")
            sys.exit(1)

    assert len(
        net.input_info
    ) == 1, "Sample supports only YOLO V3 based single input topologies"

    # ---------------------------------------------- 4. Preparing inputs -----------------------------------------------
    log.info("Preparing inputs")
    input_blob = next(iter(net.input_info))

    # Read and pre-process input images
    if net.input_info[input_blob].input_data.shape[1] == 3:
        input_height, input_width = net.input_info[
            input_blob].input_data.shape[2:]
        nchw_shape = True
    else:
        input_height, input_width = net.input_info[
            input_blob].input_data.shape[1:3]
        nchw_shape = False

    if args.labels:
        with open(args.labels, 'r') as f:
            labels_map = [x.strip() for x in f]
    else:
        labels_map = None

    input_stream = 0 if args.input == "cam" else args.input

    mode = Mode(Modes.USER_SPECIFIED)
    cap = cv2.VideoCapture(input_stream)
    wait_key_time = 1

    # ----------------------------------------- 5. Loading model to the plugin -----------------------------------------
    log.info("Loading model to the plugin")
    exec_nets = {}

    exec_nets[Modes.USER_SPECIFIED] = ie.load_network(
        network=net,
        device_name=args.device,
        config=config_user_specified,
        num_requests=args.num_infer_requests)
    exec_nets[Modes.MIN_LATENCY] = ie.load_network(
        network=net,
        device_name=args.device.split(":")[-1].split(",")[0],
        config=config_min_latency,
        num_requests=1)

    empty_requests = deque(exec_nets[mode.current].requests)
    completed_request_results = {}
    next_frame_id = 0
    next_frame_id_to_show = 0
    mode_info = {mode.current: ModeInfo()}
    event = threading.Event()
    callback_exceptions = []

    # ----------------------------------------------- 6. Doing inference -----------------------------------------------
    log.info("Starting inference...")
    print(
        "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
    )
    print(
        "To switch between min_latency/user_specified modes, press TAB key in the output window"
    )

    presenter = monitors.Presenter(
        args.utilization_monitors, 55,
        (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH) / 4),
         round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) / 8)))

    while (cap.isOpened() \
           or completed_request_results \
           or len(empty_requests) < len(exec_nets[mode.current].requests)) \
          and not callback_exceptions:
        if next_frame_id_to_show in completed_request_results:
            frame, output, start_time, is_same_mode = completed_request_results.pop(
                next_frame_id_to_show)

            next_frame_id_to_show += 1
            if is_same_mode:
                mode_info[mode.current].frames_count += 1

            objects = get_objects(output, net, (input_height, input_width),
                                  frame.shape[:-1], args.prob_threshold,
                                  args.keep_aspect_ratio)
            objects = filter_objects(objects, args.iou_threshold,
                                     args.prob_threshold)

            if len(objects) and args.raw_output_message:
                log.info(
                    " Class ID | Confidence | XMIN | YMIN | XMAX | YMAX | COLOR "
                )

            origin_im_size = frame.shape[:-1]
            presenter.drawGraphs(frame)
            for obj in objects:
                # Validation bbox of detected object
                obj['xmax'] = min(obj['xmax'], origin_im_size[1])
                obj['ymax'] = min(obj['ymax'], origin_im_size[0])
                obj['xmin'] = max(obj['xmin'], 0)
                obj['ymin'] = max(obj['ymin'], 0)
                color = (min(obj['class_id'] * 12.5,
                             255), min(obj['class_id'] * 7,
                                       255), min(obj['class_id'] * 5, 255))
                det_label = labels_map[obj['class_id']] if labels_map and len(labels_map) >= obj['class_id'] else \
                    str(obj['class_id'])

                if args.raw_output_message:
                    log.info(
                        "{:^9} | {:10f} | {:4} | {:4} | {:4} | {:4} | {} ".
                        format(det_label, obj['confidence'], obj['xmin'],
                               obj['ymin'], obj['xmax'], obj['ymax'], color))

                cv2.rectangle(frame, (obj['xmin'], obj['ymin']),
                              (obj['xmax'], obj['ymax']), color, 2)
                cv2.putText(
                    frame, "#" + det_label + ' ' +
                    str(round(obj['confidence'] * 100, 1)) + ' %',
                    (obj['xmin'], obj['ymin'] - 7), cv2.FONT_HERSHEY_COMPLEX,
                    0.6, color, 1)

            # Draw performance stats over frame
            if mode_info[mode.current].frames_count != 0:
                fps_message = "FPS: {:.1f}".format(mode_info[mode.current].frames_count / \
                                                   (perf_counter() - mode_info[mode.current].last_start_time))
                mode_info[
                    mode.current].latency_sum += perf_counter() - start_time
                latency_message = "Latency: {:.1f} ms".format((mode_info[mode.current].latency_sum / \
                                                              mode_info[mode.current].frames_count) * 1e3)

                put_highlighted_text(frame, fps_message, (15, 20),
                                     cv2.FONT_HERSHEY_COMPLEX, 0.75,
                                     (200, 10, 10), 2)
                put_highlighted_text(frame, latency_message, (15, 50),
                                     cv2.FONT_HERSHEY_COMPLEX, 0.75,
                                     (200, 10, 10), 2)

            mode_message = "{} mode".format(mode.current.name)
            put_highlighted_text(frame, mode_message,
                                 (10, int(origin_im_size[0] - 20)),
                                 cv2.FONT_HERSHEY_COMPLEX, 0.75, (10, 10, 200),
                                 2)

            if not args.no_show:
                cv2.imshow("Detection Results", frame)
                key = cv2.waitKey(wait_key_time)

                if key in {ord("q"), ord("Q"), 27}:  # ESC key
                    break
                if key == 9:  # Tab key
                    prev_mode = mode.current
                    mode.next()

                    await_requests_completion(exec_nets[prev_mode].requests)
                    empty_requests.clear()
                    empty_requests.extend(exec_nets[mode.current].requests)

                    mode_info[prev_mode].last_end_time = perf_counter()
                    mode_info[mode.current] = ModeInfo()
                else:
                    presenter.handleKey(key)

        elif empty_requests and cap.isOpened():
            start_time = perf_counter()
            ret, frame = cap.read()
            if not ret:
                if args.loop_input:
                    cap.open(input_stream)
                else:
                    cap.release()
                continue

            request = empty_requests.popleft()

            # resize input_frame to network size
            in_frame = preprocess_frame(frame, input_height, input_width,
                                        nchw_shape, args.keep_aspect_ratio)

            # Start inference
            request.set_completion_callback(
                py_callback=async_callback,
                py_data=(request, next_frame_id, mode.current, frame,
                         start_time, completed_request_results, empty_requests,
                         mode, event, callback_exceptions))
            request.async_infer(inputs={input_blob: in_frame})
            next_frame_id += 1

        else:
            event.wait()

    if callback_exceptions:
        raise callback_exceptions[0]

    for mode_value in mode_info.keys():
        log.info("")
        log.info("Mode: {}".format(mode_value.name))

        end_time = (mode_info[mode_value].last_end_time
                    if mode_info[mode_value].last_end_time is not None
                    else perf_counter())
        log.info("FPS: {:.1f}".format(
            mode_info[mode_value].frames_count /
            (end_time - mode_info[mode_value].last_start_time)))
        log.info("Latency: {:.1f} ms".format(
            (mode_info[mode_value].latency_sum /
             mode_info[mode_value].frames_count) * 1e3))
    print(presenter.reportMeans())

    for exec_net in exec_nets.values():
        await_requests_completion(exec_net.requests)
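
The example above relies on two small helpers that are not shown here, put_highlighted_text and await_requests_completion. A minimal sketch of what such helpers might look like (an assumption, not necessarily the demo's exact implementation):

import cv2


def put_highlighted_text(frame, message, position, font_face, font_scale, color, thickness):
    # Draw a thicker white outline first, then the colored text on top,
    # so the label stays readable on both light and dark backgrounds.
    cv2.putText(frame, message, position, font_face, font_scale, (255, 255, 255), thickness + 1)
    cv2.putText(frame, message, position, font_face, font_scale, color, thickness)


def await_requests_completion(requests):
    # Block until every in-flight inference request has finished.
    for request in requests:
        request.wait()
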
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    place_recognition = PlaceRecognition(args.model, args.device,
                                         args.gallery_folder,
                                         args.gallery_size)

    compute_embeddings_times = []
    search_in_gallery_times = []

    frames_processed = 0
    presenter = monitors.Presenter(args.utilization_monitors, 0)
    video_writer = cv2.VideoWriter()
    metrics = PerformanceMetrics()

    while True:
        start_time = perf_counter()
        frame = cap.read()

        if frame is None:
            if frames_processed == 0:
                raise ValueError("Can't read an image from the input")
            break

        elapsed, probe_embedding = time_elapsed(
            place_recognition.compute_embedding, frame)
        compute_embeddings_times.append(elapsed)

        elapsed, (sorted_indexes, distances) = time_elapsed(
            place_recognition.search_in_gallery, probe_embedding)
        search_in_gallery_times.append(elapsed)

        image, key = visualize(
            frame, [str(place_recognition.impaths[i]) for i in sorted_indexes],
            distances[sorted_indexes],
            place_recognition.input_size,
            np.mean(compute_embeddings_times),
            np.mean(search_in_gallery_times),
            imshow_delay=3,
            presenter=presenter,
            no_show=args.no_show)

        metrics.update(start_time)
        if frames_processed == 0:
            if args.output and not video_writer.open(
                    args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
                (image.shape[1], image.shape[0])):
                raise RuntimeError("Can't open video writer")

        frames_processed += 1
        if video_writer.isOpened() and (args.output_limit <= 0 or
                                        frames_processed <= args.output_limit):
            video_writer.write(image)

        if key == 27:
            break

    metrics.log_total()
    for rep in presenter.reportMeans():
        log.info(rep)
    video_writer = cv2.VideoWriter()
    if args.output and not video_writer.open(
            args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
        (frame.shape[1], frame.shape[0])):
        raise RuntimeError("Can't open video writer")

    base_height = args.height_size
    fx = args.fx

    frames_processed = 0
    delay = 1
    esc_code = 27
    p_code = 112
    space_code = 32
    mean_time = 0
    presenter = monitors.Presenter(args.utilization_monitors, 0)

    while frame is not None:
        current_time = cv2.getTickCount()
        input_scale = base_height / frame.shape[0]
        scaled_img = cv2.resize(frame,
                                dsize=None,
                                fx=input_scale,
                                fy=input_scale)
        if fx < 0:  # Focal length is unknown
            fx = np.float32(0.8 * frame.shape[1])

        inference_result = inference_engine.infer(scaled_img)
        poses_3d, poses_2d = parse_poses(inference_result, input_scale, stride,
                                         fx, is_video)
        edges = []
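
The place-recognition example above times each stage with a time_elapsed helper. A minimal sketch of such a timing wrapper, assuming only the call signature visible above:

from time import perf_counter


def time_elapsed(func, *args):
    # Run func(*args) and return (elapsed_seconds, result) so callers can
    # accumulate per-stage timings alongside the actual outputs.
    start_time = perf_counter()
    result = func(*args)
    return perf_counter() - start_time, result
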
def main():
    metrics = PerformanceMetrics()

    args = build_argparser().parse_args()

    # Plugin initialization for specified device and load extensions library if specified
    log.info("Creating Inference Engine")

    ie = IECore()

    # Read IR
    log.info("Loading network files:\n\t{}".format(args.model_pnet))
    p_net = ie.read_network(args.model_pnet)
    assert len(p_net.input_info.keys()
               ) == 1, "Pnet supports only single input topologies"
    assert len(p_net.outputs) == 2, "Pnet supports two output topologies"

    log.info("Loading network files:\n\t{}".format(args.model_rnet))
    r_net = ie.read_network(args.model_rnet)
    assert len(r_net.input_info.keys()
               ) == 1, "Rnet supports only single input topologies"
    assert len(r_net.outputs) == 2, "Rnet supports two output topologies"

    log.info("Loading network files:\n\t{}".format(args.model_onet))
    o_net = ie.read_network(args.model_onet)
    assert len(o_net.input_info.keys()
               ) == 1, "Onet supports only single input topologies"
    assert len(o_net.outputs) == 3, "Onet supports three output topologies"

    log.info("Preparing input blobs")
    pnet_input_blob = next(iter(p_net.input_info))
    rnet_input_blob = next(iter(r_net.input_info))
    onet_input_blob = next(iter(o_net.input_info))

    log.info("Preparing output blobs")
    for name, blob in p_net.outputs.items():
        if blob.shape[1] == 2:
            pnet_cls_name = name
        elif blob.shape[1] == 4:
            pnet_roi_name = name
        else:
            raise RuntimeError("Unsupported output layer for Pnet")

    for name, blob in r_net.outputs.items():
        if blob.shape[1] == 2:
            rnet_cls_name = name
        elif blob.shape[1] == 4:
            rnet_roi_name = name
        else:
            raise RuntimeError("Unsupported output layer for Rnet")

    for name, blob in o_net.outputs.items():
        if blob.shape[1] == 2:
            onet_cls_name = name
        elif blob.shape[1] == 4:
            onet_roi_name = name
        elif blob.shape[1] == 10:
            onet_pts_name = name
        else:
            raise RuntimeError("Unsupported output layer for Onet")

    cap = open_images_capture(args.input, args.loop)

    next_frame_id = 0

    log.info('Starting inference...')
    print(
        "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
    )

    presenter = None
    video_writer = cv2.VideoWriter()

    while True:
        start_time = perf_counter()
        origin_image = cap.read()
        if origin_image is None:
            if next_frame_id == 0:
                raise ValueError("Can't read an image from the input")
            break
        if next_frame_id == 0:
            presenter = monitors.Presenter(args.utilization_monitors, 55,
                                           (round(origin_image.shape[1] / 4),
                                            round(origin_image.shape[0] / 8)))
            if args.output and not video_writer.open(
                    args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
                (origin_image.shape[1], origin_image.shape[0])):
                raise RuntimeError("Can't open video writer")
        next_frame_id += 1

        rgb_image = cv2.cvtColor(origin_image, cv2.COLOR_BGR2RGB)
        oh, ow, _ = rgb_image.shape

        scales = utils.calculate_scales(rgb_image)

        # *************************************
        # Pnet stage
        # *************************************
        log.info("Loading Pnet model to the plugin")

        t0 = cv2.getTickCount()
        pnet_res = []
        for scale in scales:
            hs = int(oh * scale)
            ws = int(ow * scale)
            image = preprocess_image(rgb_image, ws, hs)

            # Change width and height of the Pnet input blob for this pyramid scale
            p_net.reshape({pnet_input_blob: [1, 3, ws, hs]})
            exec_pnet = ie.load_network(network=p_net, device_name=args.device)

            p_res = exec_pnet.infer(inputs={pnet_input_blob: image})
            pnet_res.append(p_res)

        image_num = len(scales)
        rectangles = []
        for i in range(image_num):
            roi = pnet_res[i][pnet_roi_name]
            cls = pnet_res[i][pnet_cls_name]
            _, _, out_h, out_w = cls.shape
            out_side = max(out_h, out_w)
            rectangle = utils.detect_face_12net(cls[0][1], roi[0], out_side,
                                                1 / scales[i], ow, oh,
                                                score_threshold[0],
                                                iou_threshold[0])
            rectangles.extend(rectangle)
        rectangles = utils.NMS(rectangles, iou_threshold[1], 'iou')

        # Rnet stage
        if len(rectangles) > 0:
            log.info("Loading Rnet model to the plugin")

            r_net.reshape({rnet_input_blob:
                           [len(rectangles), 3, 24,
                            24]})  # Change batch size of input blob
            exec_rnet = ie.load_network(network=r_net, device_name=args.device)

            rnet_input = []
            for rectangle in rectangles:
                crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]),
                                     int(rectangle[0]):int(rectangle[2])]
                crop_img = preprocess_image(crop_img, 24, 24)
                rnet_input.extend(crop_img)

            rnet_res = exec_rnet.infer(inputs={rnet_input_blob: rnet_input})

            roi = rnet_res[rnet_roi_name]
            cls = rnet_res[rnet_cls_name]
            rectangles = utils.filter_face_24net(cls, roi, rectangles, ow, oh,
                                                 score_threshold[1],
                                                 iou_threshold[2])

        # Onet stage
        if len(rectangles) > 0:
            log.info("Loading Onet model to the plugin")

            o_net.reshape({onet_input_blob:
                           [len(rectangles), 3, 48,
                            48]})  # Change batch size of input blob
            exec_onet = ie.load_network(network=o_net, device_name=args.device)

            onet_input = []
            for rectangle in rectangles:
                crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]),
                                     int(rectangle[0]):int(rectangle[2])]
                crop_img = preprocess_image(crop_img, 48, 48)
                onet_input.extend(crop_img)

            onet_res = exec_onet.infer(inputs={onet_input_blob: onet_input})

            roi = onet_res[onet_roi_name]
            cls = onet_res[onet_cls_name]
            pts = onet_res[onet_pts_name]
            rectangles = utils.filter_face_48net(cls, roi, pts, rectangles, ow,
                                                 oh, score_threshold[2],
                                                 iou_threshold[3])

        # display results
        for rectangle in rectangles:
            # Draw detected boxes
            cv2.putText(origin_image,
                        'confidence: {:.2f}'.format(rectangle[4]),
                        (int(rectangle[0]), int(rectangle[1])),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0))
            cv2.rectangle(origin_image, (int(rectangle[0]), int(rectangle[1])),
                          (int(rectangle[2]), int(rectangle[3])), (255, 0, 0),
                          1)
            # Draw landmarks
            for i in range(5, 15, 2):
                cv2.circle(origin_image,
                           (int(rectangle[i + 0]), int(rectangle[i + 1])), 2,
                           (0, 255, 0))

        infer_time = (cv2.getTickCount() -
                      t0) / cv2.getTickFrequency()  # Record infer time
        cv2.putText(origin_image,
                    'summary: {:.1f} FPS'.format(1.0 / infer_time), (5, 15),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 200))

        if video_writer.isOpened() and (args.output_limit <= 0 or next_frame_id
                                        <= args.output_limit - 1):
            video_writer.write(origin_image)

        if not args.no_show:
            cv2.imshow('MTCNN Results', origin_image)
            key = cv2.waitKey(1)
            if key in {ord('q'), ord('Q'), 27}:
                break
            presenter.handleKey(key)

        metrics.update(start_time, origin_image)

    metrics.print_total()
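
The three MTCNN stages above push image crops through a preprocess_image helper. A rough sketch of what it plausibly does; the exact normalization and resize/transpose order are assumptions, not the demo's verified code:

import cv2
import numpy as np


def preprocess_image(image, width, height):
    # Resize to the network input size, apply the classic MTCNN normalization
    # to roughly [-1, 1], and switch from HWC to CHW layout with a batch axis.
    image = cv2.resize(image, (width, height))
    image = (image.astype(np.float32) - 127.5) * 0.0078125
    image = image.transpose((2, 0, 1))
    return image[np.newaxis, ...]
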
Example No. 24
def main():
    args = build_argparser().parse_args()
    metrics = PerformanceMetrics()

    log.info('Initializing Inference Engine...')
    ie = IECore()

    plugin_config = get_plugin_configs(args.device, args.num_streams,
                                       args.num_threads)

    cap = open_images_capture(args.input, args.loop)

    start_time = perf_counter()
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")

    log.info('Loading network...')
    model = get_model(ie, args, frame.shape[1] / frame.shape[0])
    hpe_pipeline = AsyncPipeline(ie,
                                 model,
                                 plugin_config,
                                 device=args.device,
                                 max_num_requests=args.num_infer_requests)

    log.info('Starting inference...')
    hpe_pipeline.submit_data(frame, 0, {
        'frame': frame,
        'start_time': start_time
    })
    next_frame_id = 1
    next_frame_id_to_show = 0

    presenter = monitors.Presenter(
        args.utilization_monitors, 55,
        (round(frame.shape[1] / 4), round(frame.shape[0] / 8)))
    video_writer = cv2.VideoWriter()
    if args.output and not video_writer.open(
            args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
        (frame.shape[1], frame.shape[0])):
        raise RuntimeError("Can't open video writer")

    print(
        "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
    )
    while True:
        if hpe_pipeline.callback_exceptions:
            raise hpe_pipeline.callback_exceptions[0]
        # Process all completed requests
        results = hpe_pipeline.get_result(next_frame_id_to_show)
        if results:
            (poses, scores), frame_meta = results
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            if len(poses) and args.raw_output_message:
                print_raw_results(poses, scores)

            presenter.drawGraphs(frame)
            frame = draw_poses(frame, poses, args.prob_threshold)
            metrics.update(start_time, frame)
            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(frame)
            if not args.no_show:
                cv2.imshow('Pose estimation results', frame)
                key = cv2.waitKey(1)

                ESC_KEY = 27
                # Quit.
                if key in {ord('q'), ord('Q'), ESC_KEY}:
                    break
                presenter.handleKey(key)
            next_frame_id_to_show += 1
            continue

        if hpe_pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            if frame is None:
                break

            # Submit for inference
            hpe_pipeline.submit_data(frame, next_frame_id, {
                'frame': frame,
                'start_time': start_time
            })
            next_frame_id += 1

        else:
            # Wait for empty request
            hpe_pipeline.await_any()

    hpe_pipeline.await_all()
    # Process completed requests
    while hpe_pipeline.has_completed_request():
        results = hpe_pipeline.get_result(next_frame_id_to_show)
        if results:
            (poses, scores), frame_meta = results
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            if len(poses) and args.raw_output_message:
                print_raw_results(poses, scores)

            presenter.drawGraphs(frame)
            frame = draw_poses(frame, poses, args.prob_threshold)
            metrics.update(start_time, frame)
            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(frame)
            if not args.no_show:
                cv2.imshow('Pose estimation results', frame)
                key = cv2.waitKey(1)

                ESC_KEY = 27
                # Quit.
                if key in {ord('q'), ord('Q'), ESC_KEY}:
                    break
                presenter.handleKey(key)
            next_frame_id_to_show += 1
        else:
            break

    metrics.print_total()
    print(presenter.reportMeans())
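
Several of these examples build a device configuration with get_plugin_configs from the -nstreams/-nthreads arguments. A simplified sketch of such a helper, assuming the pre-2022 Inference Engine configuration keys (the real helper also parses per-device stream strings such as 'CPU:4,GPU:2'):

def get_plugin_configs(device, num_streams, num_threads):
    config = {}
    if 'CPU' in device:
        if num_threads is not None:
            config['CPU_THREADS_NUM'] = str(num_threads)
        # 'CPU_THROUGHPUT_AUTO' lets the plugin choose a stream count for throughput mode.
        config['CPU_THROUGHPUT_STREAMS'] = str(num_streams) if num_streams else 'CPU_THROUGHPUT_AUTO'
    if 'GPU' in device:
        config['GPU_THROUGHPUT_STREAMS'] = str(num_streams) if num_streams else 'GPU_THROUGHPUT_AUTO'
    return config
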
Example No. 25
    try:
        input_source = int(args.input)
    except ValueError:
        input_source = args.input

    cap = cv.VideoCapture(input_source)
    if not cap.isOpened():
        raise RuntimeError('Failed to open "{}"'.format(input_source))

    color_coeff = np.load(coeffs).astype(np.float32)
    assert color_coeff.shape == (
        313,
        2), "Current shape of color coefficients does not match required shape"

    imshowSize = (640, 480)
    graphSize = (imshowSize[0] // 2, imshowSize[1] // 4)
    presenter = monitors.Presenter(args.utilization_monitors,
                                   imshowSize[1] * 2 - graphSize[1], graphSize)

    while True:
        log.debug("#############################")
        hasFrame, original_frame = cap.read()
        if not hasFrame:
            break
        (h_orig, w_orig) = original_frame.shape[:2]

        log.debug("Preprocessing frame")
        if original_frame.shape[2] > 1:
            frame = cv.cvtColor(cv.cvtColor(original_frame, cv.COLOR_BGR2GRAY),
                                cv.COLOR_GRAY2RGB)
        else:
            frame = cv.cvtColor(original_frame, cv.COLOR_GRAY2RGB)
Example No. 26
def run_demo(args):
    ie = IECore()
    detector_person = Detector(ie,
                               path_to_model_xml=args.model_od,
                               device=args.device,
                               label_class=args.person_label)

    single_human_pose_estimator = HumanPoseEstimator(
        ie, path_to_model_xml=args.model_hpe, device=args.device)
    cap = open_images_capture(args.input, args.loop)
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")
    # waitKey delay: 0 (block until a key press) for single images, 1 ms for video/camera input
    delay = int(cap.get_type() in ('VIDEO', 'CAMERA'))

    video_writer = cv2.VideoWriter()
    if args.output and not video_writer.open(
            args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
        (frame.shape[1], frame.shape[0])):
        raise RuntimeError("Can't open video writer")

    frames_processed = 0
    presenter = monitors.Presenter(args.utilization_monitors, 25)
    while frame is not None:
        bboxes = detector_person.detect(frame)
        human_poses = [
            single_human_pose_estimator.estimate(frame, bbox)
            for bbox in bboxes
        ]

        presenter.drawGraphs(frame)

        colors = [(0, 0, 255), (255, 0, 0), (0, 255, 0), (255, 0, 0),
                  (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0),
                  (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0),
                  (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0),
                  (0, 255, 0)]

        for pose, bbox in zip(human_poses, bboxes):
            cv2.rectangle(frame, (bbox[0], bbox[1]),
                          (bbox[0] + bbox[2], bbox[1] + bbox[3]), (255, 0, 0),
                          2)
            for id_kpt, kpt in enumerate(pose):
                cv2.circle(frame, (int(kpt[0]), int(kpt[1])), 3,
                           colors[id_kpt], -1)

        cv2.putText(
            frame,
            'summary: {:.1f} FPS (estimation: {:.1f} FPS / detection: {:.1f} FPS)'
            .format(
                float(1 / (detector_person.infer_time +
                           single_human_pose_estimator.infer_time *
                           len(human_poses))),
                float(1 / single_human_pose_estimator.infer_time),
                float(1 / detector_person.infer_time)), (5, 15),
            cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 200))

        frames_processed += 1
        if video_writer.isOpened() and (args.output_limit <= 0 or
                                        frames_processed <= args.output_limit):
            video_writer.write(frame)

        if not args.no_show:
            cv2.imshow('Human Pose Estimation Demo', frame)
            key = cv2.waitKey(delay)
            if key == 27:
                break
            presenter.handleKey(key)
        frame = cap.read()
    print(presenter.reportMeans())
def main():
    args = build_argparser().parse_args()

    if args.labels:
        with open(args.labels) as f:
            labels = [line.strip() for line in f]
    else:
        labels = None

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    if 'MYRIAD' in args.device:
        myriad_config = {'MYRIAD_ENABLE_HW_ACCELERATION': 'YES'}
        core.set_property('MYRIAD', myriad_config)

    decoder_target_device = 'CPU'
    if args.device != 'CPU':
        encoder_target_device = args.device
    else:
        encoder_target_device = decoder_target_device

    models = [
        IEModel(args.m_encoder,
                core,
                encoder_target_device,
                model_type='Action Recognition Encoder',
                num_requests=(3 if args.device == 'MYRIAD' else 1))
    ]

    if args.architecture_type == 'en-de':
        if args.m_decoder is None:
            raise RuntimeError(
                'No decoder for encoder-decoder model type (-m_de) provided')
        models.append(
            IEModel(args.m_decoder,
                    core,
                    decoder_target_device,
                    model_type='Action Recognition Decoder',
                    num_requests=2))
        seq_size = models[1].input_shape[1]
    elif args.architecture_type == 'en-mean':
        models.append(DummyDecoder(num_requests=2))
        seq_size = args.decoder_seq_size
    elif args.architecture_type == 'i3d-rgb':
        seq_size = models[0].input_shape[1]

    presenter = monitors.Presenter(args.utilization_monitors, 70)
    result_presenter = ResultRenderer(
        no_show=args.no_show,
        presenter=presenter,
        output=args.output,
        limit=args.output_limit,
        labels=labels,
        label_smoothing_window=args.label_smoothing)
    cap = open_images_capture(args.input, args.loop)
    run_pipeline(cap,
                 args.architecture_type,
                 models,
                 result_presenter.render_frame,
                 args.raw_output_message,
                 seq_size=seq_size,
                 fps=cap.fps())

    for rep in presenter.reportMeans():
        log.info(rep)
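
For the 'en-mean' architecture the pipeline gets a DummyDecoder instead of a second network; the name suggests it simply averages the encoder embeddings over the temporal window. A hypothetical sketch of that idea (not the demo's actual class, which also mimics the asynchronous request interface):

import numpy as np


class MeanDecoder:
    """Average a sequence of per-frame encoder embeddings into one clip descriptor."""

    def __call__(self, embeddings):
        # embeddings: iterable of (embedding_dim,) vectors, one per frame
        return np.mean(np.asarray(embeddings), axis=0)
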
Example No. 28
def main():
    metrics = PerformanceMetrics()
    args = build_argparser().parse_args()

    log.info('Initializing Inference Engine...')
    ie = IECore()

    plugin_config = get_plugin_configs(args.device, args.num_streams,
                                       args.num_threads)

    log.info('Loading network...')

    model = SegmentationModel(ie, args.model)

    pipeline = AsyncPipeline(ie,
                             model,
                             plugin_config,
                             device=args.device,
                             max_num_requests=args.num_infer_requests)

    cap = open_images_capture(args.input, args.loop)

    next_frame_id = 0
    next_frame_id_to_show = 0

    log.info('Starting inference...')
    print(
        "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
    )

    visualizer = Visualizer(args.colors)
    presenter = None
    video_writer = cv2.VideoWriter()

    while True:
        if pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            if frame is None:
                if next_frame_id == 0:
                    raise ValueError("Can't read an image from the input")
                break
            if next_frame_id == 0:
                presenter = monitors.Presenter(
                    args.utilization_monitors, 55,
                    (round(frame.shape[1] / 4), round(frame.shape[0] / 8)))
                if args.output and not video_writer.open(
                        args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                        cap.fps(), (frame.shape[1], frame.shape[0])):
                    raise RuntimeError("Can't open video writer")
            # Submit for inference
            pipeline.submit_data(frame, next_frame_id, {
                'frame': frame,
                'start_time': start_time
            })
            next_frame_id += 1
        else:
            # Wait for empty request
            pipeline.await_any()

        if pipeline.callback_exceptions:
            raise pipeline.callback_exceptions[0]
        # Process all completed requests
        results = pipeline.get_result(next_frame_id_to_show)
        if results:
            objects, frame_meta = results
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            frame = visualizer.overlay_masks(frame, objects)
            presenter.drawGraphs(frame)
            metrics.update(start_time, frame)

            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(frame)

            if not args.no_show:
                cv2.imshow('Segmentation Results', frame)
                key = cv2.waitKey(1)
                if key in {27, ord('q'), ord('Q')}:
                    break
                presenter.handleKey(key)
            next_frame_id_to_show += 1

    pipeline.await_all()
    # Process completed requests
    while pipeline.has_completed_request():
        results = pipeline.get_result(next_frame_id_to_show)
        if results:
            objects, frame_meta = results
            frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            frame = visualizer.overlay_masks(frame, objects)
            presenter.drawGraphs(frame)
            metrics.update(start_time, frame)

            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(frame)

            if not args.no_show:
                cv2.imshow('Segmentation Results', frame)
                key = cv2.waitKey(1)
            next_frame_id_to_show += 1
        else:
            break

    metrics.print_total()
    print(presenter.reportMeans())
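
The segmentation example delegates drawing to Visualizer.overlay_masks. A minimal sketch of the usual approach, assuming the model output is a per-pixel class map (an illustration, not the demo's actual Visualizer):

import cv2
import numpy as np


def overlay_masks_sketch(frame, class_map, palette, alpha=0.5):
    # class_map: HxW integer array of class IDs; palette: (num_classes, 3) uint8 BGR colors.
    colored = palette[class_map]  # look up a color for every pixel
    return cv2.addWeighted(frame, 1.0 - alpha, colored.astype(frame.dtype), alpha, 0)
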
Example No. 29
def main():
    log.basicConfig(format='[ %(levelname)s ] %(message)s',
                    level=log.INFO,
                    stream=sys.stdout)
    args = build_argparser().parse_args()

    # Plugin initialization for specified device and load extensions library if specified.
    log.info('Creating Inference Engine...')
    ie = IECore()
    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')
    # Read IR
    log.info('Loading Mask-RCNN network')
    mask_rcnn_net = ie.read_network(
        args.mask_rcnn_model,
        os.path.splitext(args.mask_rcnn_model)[0] + '.bin')

    log.info('Loading encoder part of text recognition network')
    text_enc_net = ie.read_network(
        args.text_enc_model,
        os.path.splitext(args.text_enc_model)[0] + '.bin')

    log.info('Loading decoder part of text recognition network')
    text_dec_net = ie.read_network(
        args.text_dec_model,
        os.path.splitext(args.text_dec_model)[0] + '.bin')

    model_required_inputs = {'image'}
    old_model_required_inputs = {'im_data', 'im_info'}
    if set(mask_rcnn_net.input_info) == model_required_inputs:
        old_model = False
        required_output_keys = {'boxes', 'labels', 'masks', 'text_features.0'}
        n, c, h, w = mask_rcnn_net.input_info['image'].input_data.shape
    elif set(mask_rcnn_net.input_info) == old_model_required_inputs:
        old_model = True
        required_output_keys = {
            'boxes', 'scores', 'classes', 'raw_masks', 'text_features'
        }
        n, c, h, w = mask_rcnn_net.input_info['im_data'].input_data.shape
        args.alphabet = '  0123456789abcdefghijklmnopqrstuvwxyz'
        args.tr_threshold = 0
    else:
        raise RuntimeError(
            'Demo supports only topologies with the following input keys: '
            f'{model_required_inputs} or {old_model_required_inputs}.')

    assert required_output_keys.issubset(mask_rcnn_net.outputs.keys()), \
        f'Demo supports only topologies with the following output keys: {required_output_keys}. ' \
        f'Found: {mask_rcnn_net.outputs.keys()}.'

    assert n == 1, 'Only batch 1 is supported by the demo application'

    log.info('Loading IR to the plugin...')
    mask_rcnn_exec_net = ie.load_network(network=mask_rcnn_net,
                                         device_name=args.device,
                                         num_requests=2)
    text_enc_exec_net = ie.load_network(network=text_enc_net,
                                        device_name=args.device)
    text_dec_exec_net = ie.load_network(network=text_dec_net,
                                        device_name=args.device)

    hidden_shape = text_dec_net.input_info[
        args.trd_input_prev_hidden].input_data.shape

    del mask_rcnn_net
    del text_enc_net
    del text_dec_net

    input_source = args.input_source
    if os.path.isdir(input_source):
        cap = FolderCapture(input_source)
    else:
        try:
            input_source = int(args.input_source)
            cap = cv2.VideoCapture(input_source)
            cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
        except ValueError:
            cap = cv2.VideoCapture(input_source)

    if not cap.isOpened():
        raise RuntimeError('Failed to open "{}"'.format(input_source))

    ret, frame = cap.read()
    if not ret:
        raise RuntimeError("Can't read an image from the input")

    if args.no_track:
        tracker = None
    else:
        tracker = StaticIOUTracker()

    visualizer = Visualizer(['__background__', 'text'],
                            show_boxes=args.show_boxes,
                            show_scores=args.show_scores)

    render_time = 0

    presenter = monitors.Presenter(args.utilization_monitors, 45,
                                   (frame.shape[1] // 4, frame.shape[0] // 8))
    log.info('Starting inference...')
    print(
        "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
    )
    while ret:
        if not args.keep_aspect_ratio:
            # Resize the image to a target size.
            scale_x = w / frame.shape[1]
            scale_y = h / frame.shape[0]
            input_image = cv2.resize(frame, (w, h))
        else:
            # Resize the image to keep the same aspect ratio and to fit it to a window of a target size.
            scale_x = scale_y = min(h / frame.shape[0], w / frame.shape[1])
            input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y)

        input_image_size = input_image.shape[:2]
        input_image = np.pad(input_image,
                             ((0, h - input_image_size[0]),
                              (0, w - input_image_size[1]), (0, 0)),
                             mode='constant',
                             constant_values=0)
        # Change data layout from HWC to CHW.
        input_image = input_image.transpose((2, 0, 1))
        input_image = input_image.reshape((n, c, h, w)).astype(np.float32)
        input_image_info = np.asarray(
            [[input_image_size[0], input_image_size[1], 1]], dtype=np.float32)

        # Run the net.
        inf_start = time.time()
        if old_model:
            outputs = mask_rcnn_exec_net.infer({
                'im_data': input_image,
                'im_info': input_image_info
            })
        else:
            outputs = mask_rcnn_exec_net.infer({'image': input_image})

        # Parse detection results of the current request
        if old_model:
            boxes = outputs['boxes']
            scores = outputs['scores']
            classes = outputs['classes'].astype(np.uint32)
            raw_masks = outputs['raw_masks']
            text_features = outputs['text_features']
        else:
            boxes = outputs['boxes'][:, :4]
            scores = outputs['boxes'][:, 4]
            classes = outputs['labels'].astype(np.uint32)
            raw_masks = outputs['masks']
            text_features = outputs['text_features.0']

        # Filter out detections with low confidence.
        detections_filter = scores > args.prob_threshold
        scores = scores[detections_filter]
        classes = classes[detections_filter]
        boxes = boxes[detections_filter]
        raw_masks = raw_masks[detections_filter]
        text_features = text_features[detections_filter]

        # Rescale box coordinates back to the original frame resolution.
        boxes[:, 0::2] /= scale_x
        boxes[:, 1::2] /= scale_y
        masks = []
        for box, cls, raw_mask in zip(boxes, classes, raw_masks):
            if old_model:
                raw_mask = raw_mask[cls, ...]
            mask = segm_postprocess(box, raw_mask, frame.shape[0],
                                    frame.shape[1])
            masks.append(mask)

        texts = []
        for feature in text_features:
            feature = text_enc_exec_net.infer({'input': feature})['output']
            feature = np.reshape(feature,
                                 (feature.shape[0], feature.shape[1], -1))
            feature = np.transpose(feature, (0, 2, 1))

            hidden = np.zeros(hidden_shape)
            prev_symbol_index = np.ones((1, )) * SOS_INDEX

            text = ''
            text_confidence = 1.0
            for i in range(MAX_SEQ_LEN):
                decoder_output = text_dec_exec_net.infer({
                    args.trd_input_prev_symbol:
                    prev_symbol_index,
                    args.trd_input_prev_hidden:
                    hidden,
                    args.trd_input_encoder_outputs:
                    feature
                })
                symbols_distr = decoder_output[args.trd_output_symbols_distr]
                symbols_distr_softmaxed = softmax(symbols_distr, axis=1)[0]
                prev_symbol_index = int(np.argmax(symbols_distr, axis=1))
                text_confidence *= symbols_distr_softmaxed[prev_symbol_index]
                if prev_symbol_index == EOS_INDEX:
                    break
                text += args.alphabet[prev_symbol_index]
                hidden = decoder_output[args.trd_output_cur_hidden]

            texts.append(text if text_confidence >= args.tr_threshold else '')

        inf_end = time.time()
        inf_time = inf_end - inf_start

        render_start = time.time()

        if len(boxes) and args.raw_output_message:
            log.info('Detected boxes:')
            log.info(
                '  Class ID | Confidence |     XMIN |     YMIN |     XMAX |     YMAX '
            )
            for box, cls, score, mask in zip(boxes, classes, scores, masks):
                log.info(
                    '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '
                    .format(cls, score, *box))

        # Get instance track IDs.
        masks_tracks_ids = None
        if tracker is not None:
            masks_tracks_ids = tracker(masks, classes)

        presenter.drawGraphs(frame)

        # Visualize masks.
        frame = visualizer(frame, boxes, classes, scores, masks, texts,
                           masks_tracks_ids)

        # Draw performance stats.
        inf_time_message = 'Inference and post-processing time: {:.3f} ms'.format(
            inf_time * 1000)
        render_time_message = 'OpenCV rendering time: {:.3f} ms'.format(
            render_time * 1000)
        cv2.putText(frame, inf_time_message, (15, 15),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
        cv2.putText(frame, render_time_message, (15, 30),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1)

        # Print performance counters.
        if args.perf_counts:
            perf_counts = mask_rcnn_exec_net.requests[0].get_perf_counts()
            log.info('Performance counters:')
            print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                'name', 'layer_type', 'exec_type', 'status', 'real_time, us'))
            for layer, stats in perf_counts.items():
                print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                    layer, stats['layer_type'], stats['exec_type'],
                    stats['status'], stats['real_time']))

        if not args.no_show:
            # Show resulting image.
            cv2.imshow('Results', frame)
        render_end = time.time()
        render_time = render_end - render_start

        if not args.no_show:
            key = cv2.waitKey(args.delay)
            esc_code = 27
            if key == esc_code:
                break
            presenter.handleKey(key)

        ret, frame = cap.read()

    print(presenter.reportMeans())
    cv2.destroyAllWindows()
    cap.release()
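
The text-decoder loop above applies softmax to the symbol distribution before accumulating the text confidence. The helper may come from scipy.special; a minimal NumPy equivalent with the same axis semantics:

import numpy as np


def softmax(x, axis=None):
    # Numerically stable softmax: subtract the per-axis maximum before exponentiating.
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)
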
def run_demo(args):
    cap = open_images_capture(args.input, args.loop)

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    log.info('Reading Object Detection model {}'.format(args.model_od))
    detector_person = Detector(ie,
                               args.model_od,
                               device=args.device,
                               label_class=args.person_label)
    log.info('The Object Detection model {} is loaded to {}'.format(
        args.model_od, args.device))

    log.info('Reading Human Pose Estimation model {}'.format(args.model_hpe))
    single_human_pose_estimator = HumanPoseEstimator(ie,
                                                     args.model_hpe,
                                                     device=args.device)
    log.info('The Human Pose Estimation model {} is loaded to {}'.format(
        args.model_hpe, args.device))

    delay = int(cap.get_type() in ('VIDEO', 'CAMERA'))
    video_writer = cv2.VideoWriter()

    frames_processed = 0
    presenter = monitors.Presenter(args.utilization_monitors, 25)
    metrics = PerformanceMetrics()

    start_time = perf_counter()
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")

    if args.output and not video_writer.open(
            args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
        (frame.shape[1], frame.shape[0])):
        raise RuntimeError("Can't open video writer")

    while frame is not None:
        bboxes = detector_person.detect(frame)
        human_poses = [
            single_human_pose_estimator.estimate(frame, bbox)
            for bbox in bboxes
        ]

        presenter.drawGraphs(frame)

        colors = [(0, 0, 255), (255, 0, 0), (0, 255, 0), (255, 0, 0),
                  (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0),
                  (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0),
                  (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0),
                  (0, 255, 0)]

        for pose, bbox in zip(human_poses, bboxes):
            cv2.rectangle(frame, (bbox[0], bbox[1]),
                          (bbox[0] + bbox[2], bbox[1] + bbox[3]), (255, 0, 0),
                          2)
            for id_kpt, kpt in enumerate(pose):
                cv2.circle(frame, (int(kpt[0]), int(kpt[1])), 3,
                           colors[id_kpt], -1)

        metrics.update(start_time, frame)

        frames_processed += 1
        if video_writer.isOpened() and (args.output_limit <= 0 or
                                        frames_processed <= args.output_limit):
            video_writer.write(frame)

        if not args.no_show:
            cv2.imshow('Human Pose Estimation Demo', frame)
            key = cv2.waitKey(delay)
            if key == 27:
                break
            presenter.handleKey(key)

        start_time = perf_counter()
        frame = cap.read()

    metrics.log_total()
    for rep in presenter.reportMeans():
        log.info(rep)
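
Most of these demos report timing through a PerformanceMetrics object: update(start_time, frame) once per frame and log_total() or print_total() at the end. A much-simplified sketch of that kind of bookkeeping (an assumption; the real helper also keeps a sliding window and draws FPS on the frame):

import logging as log
from time import perf_counter

import cv2


class SimplePerformanceMetrics:
    def __init__(self):
        self.latency_sum = 0.0
        self.frame_count = 0

    def update(self, start_time, frame=None):
        # Accumulate the end-to-end latency of one frame and optionally draw it on the frame.
        latency = perf_counter() - start_time
        self.latency_sum += latency
        self.frame_count += 1
        if frame is not None:
            cv2.putText(frame, 'Latency: {:.1f} ms'.format(latency * 1000), (15, 30),
                        cv2.FONT_HERSHEY_COMPLEX, 0.75, (200, 10, 10), 2)

    def log_total(self):
        if self.frame_count:
            log.info('Mean latency: {:.1f} ms over {} frames'.format(
                self.latency_sum / self.frame_count * 1000, self.frame_count))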