def main():
    """Run the text spotting demo on the configured input.

    Parses the command line, reads the Mask-RCNN detector and the text
    recognition encoder/decoder networks, loads them to ``args.device`` and
    then, for every frame returned by the capture:

    1. resizes/pads the frame to the detector input and runs detection;
    2. drops detections whose score is not above ``args.prob_threshold``;
    3. post-processes instance masks to the frame size;
    4. greedily decodes a text string for every remaining detection;
    5. optionally tracks instances, draws the result, writes it to the
       output video and shows it on screen.

    Raises:
        RuntimeError: If the Mask-RCNN inputs are not exactly ``{'image'}``,
            the first frame cannot be read, or the video writer cannot be
            opened.
        AssertionError: If the network batch size is not 1 or required
            outputs are missing.
    """
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    # Plugin initialization for specified device and load extensions library if specified.
    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()
    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')

    # Read IR
    log.info('Reading Mask-RCNN model {}'.format(args.mask_rcnn_model))
    mask_rcnn_net = ie.read_network(args.mask_rcnn_model)

    model_required_inputs = {'image'}
    if set(mask_rcnn_net.input_info) == model_required_inputs:
        required_output_keys = {'boxes', 'labels', 'masks', 'text_features.0'}
        n, c, h, w = mask_rcnn_net.input_info['image'].input_data.shape
        assert n == 1, 'Only batch 1 is supported by the demo application'
    else:
        raise RuntimeError(
            'Demo supports only topologies with the following input keys: '
            f'{model_required_inputs}.')
    # The two message parts are separated explicitly so they do not run together.
    assert required_output_keys.issubset(mask_rcnn_net.outputs.keys()), \
        f'Demo supports only topologies with the following output keys: {required_output_keys}. ' \
        f'Found: {mask_rcnn_net.outputs.keys()}.'

    log.info('Reading Text Recognition Encoder model {}'.format(
        args.text_enc_model))
    text_enc_net = ie.read_network(args.text_enc_model)

    log.info('Reading Text Recognition Decoder model {}'.format(
        args.text_dec_model))
    text_dec_net = ie.read_network(args.text_dec_model)

    mask_rcnn_exec_net = ie.load_network(network=mask_rcnn_net,
                                         device_name=args.device,
                                         num_requests=2)
    log.info('The Mask-RCNN model {} is loaded to {}'.format(
        args.mask_rcnn_model, args.device))

    text_enc_exec_net = ie.load_network(network=text_enc_net,
                                        device_name=args.device)
    log.info('The Text Recognition Encoder model {} is loaded to {}'.format(
        args.text_enc_model, args.device))

    text_dec_exec_net = ie.load_network(network=text_dec_net,
                                        device_name=args.device)
    log.info('The Text Recognition Decoder model {} is loaded to {}'.format(
        args.text_dec_model, args.device))

    # Shape of the decoder hidden-state input; used to build the initial state.
    hidden_shape = text_dec_net.input_info[
        args.trd_input_prev_hidden].input_data.shape

    # Only the executable networks are used from here on.
    del mask_rcnn_net
    del text_enc_net
    del text_dec_net

    tracker = None if args.no_track else StaticIOUTracker()

    # An explicit non-zero delay wins; otherwise wait 1 ms for video/camera
    # input and block on a key press (0) for anything else.
    if args.delay:
        delay = args.delay
    else:
        delay = int(cap.get_type() in ('VIDEO', 'CAMERA'))

    visualizer = Visualizer(['__background__', 'text'],
                            show_boxes=args.show_boxes,
                            show_scores=args.show_scores)

    frames_processed = 0

    metrics = PerformanceMetrics()
    video_writer = cv2.VideoWriter()

    start_time = perf_counter()
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")

    presenter = monitors.Presenter(args.utilization_monitors, 45,
                                   (frame.shape[1] // 4, frame.shape[0] // 8))
    if args.output and not video_writer.open(
            args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
            (frame.shape[1], frame.shape[0])):
        raise RuntimeError("Can't open video writer")

    try:
        while frame is not None:
            if not args.keep_aspect_ratio:
                # Resize the image to a target size.
                scale_x = w / frame.shape[1]
                scale_y = h / frame.shape[0]
                input_image = cv2.resize(frame, (w, h))
            else:
                # Resize the image to keep the same aspect ratio and to fit it to a window of a target size.
                scale_x = scale_y = min(h / frame.shape[0],
                                        w / frame.shape[1])
                input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y)

            # Pad on the bottom/right up to the network input size.
            input_image_size = input_image.shape[:2]
            input_image = np.pad(input_image,
                                 ((0, h - input_image_size[0]),
                                  (0, w - input_image_size[1]),
                                  (0, 0)),
                                 mode='constant', constant_values=0)
            # Change data layout from HWC to CHW.
            input_image = input_image.transpose((2, 0, 1))
            input_image = input_image.reshape((n, c, h, w)).astype(np.float32)

            # Run the net.
            outputs = mask_rcnn_exec_net.infer({'image': input_image})

            # Parse detection results of the current request
            boxes = outputs['boxes'][:, :4]
            scores = outputs['boxes'][:, 4]
            classes = outputs['labels'].astype(np.uint32)
            raw_masks = outputs['masks']
            text_features = outputs['text_features.0']

            # Filter out detections with low confidence.
            detections_filter = scores > args.prob_threshold
            scores = scores[detections_filter]
            classes = classes[detections_filter]
            boxes = boxes[detections_filter]
            raw_masks = raw_masks[detections_filter]
            text_features = text_features[detections_filter]

            # Map box coordinates back to the original frame.
            boxes[:, 0::2] /= scale_x
            boxes[:, 1::2] /= scale_y

            masks = [
                segm_postprocess(box, raw_mask, frame.shape[0], frame.shape[1])
                for box, raw_mask in zip(boxes, raw_masks)
            ]

            texts = []
            for feature in text_features:
                feature = text_enc_exec_net.infer({'input': feature})['output']
                feature = np.reshape(feature,
                                     (feature.shape[0], feature.shape[1], -1))
                feature = np.transpose(feature, (0, 2, 1))

                hidden = np.zeros(hidden_shape)
                prev_symbol_index = np.ones((1, )) * SOS_INDEX

                text = ''
                text_confidence = 1.0
                # Greedy decoding: feed the most likely symbol back in until
                # EOS or the length limit is reached.
                for _ in range(MAX_SEQ_LEN):
                    decoder_output = text_dec_exec_net.infer({
                        args.trd_input_prev_symbol: prev_symbol_index,
                        args.trd_input_prev_hidden: hidden,
                        args.trd_input_encoder_outputs: feature
                    })
                    symbols_distr = decoder_output[
                        args.trd_output_symbols_distr]
                    symbols_distr_softmaxed = softmax(symbols_distr,
                                                      axis=1)[0]
                    # .item() extracts the single index without relying on the
                    # deprecated implicit array-to-scalar conversion.
                    prev_symbol_index = np.argmax(symbols_distr, axis=1).item()
                    text_confidence *= symbols_distr_softmaxed[
                        prev_symbol_index]
                    if prev_symbol_index == EOS_INDEX:
                        break
                    text += args.alphabet[prev_symbol_index]
                    hidden = decoder_output[args.trd_output_cur_hidden]

                texts.append(
                    text if text_confidence >= args.tr_threshold else '')

            if len(boxes) and args.raw_output_message:
                log.debug(
                    ' -------------------------- Frame # {} -------------------------- '
                    .format(frames_processed))
                log.debug(
                    ' Class ID | Confidence | XMIN | YMIN | XMAX | YMAX '
                )
                for box, cls, score in zip(boxes, classes, scores):
                    log.debug(
                        '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '
                        .format(cls, score, *box))

            # Get instance track IDs.
            masks_tracks_ids = None
            if tracker is not None:
                masks_tracks_ids = tracker(masks, classes)

            presenter.drawGraphs(frame)

            # Visualize masks.
            frame = visualizer(frame, boxes, classes, scores, masks, texts,
                               masks_tracks_ids)

            metrics.update(start_time, frame)

            frames_processed += 1
            if video_writer.isOpened() and (
                    args.output_limit <= 0
                    or frames_processed <= args.output_limit):
                video_writer.write(frame)

            if not args.no_show:
                # Show resulting image.
                cv2.imshow('Results', frame)

                key = cv2.waitKey(delay)
                esc_code = 27
                if key == esc_code:
                    break
                presenter.handleKey(key)

            start_time = perf_counter()
            frame = cap.read()

        metrics.log_total()
        for rep in presenter.reportMeans():
            log.info(rep)
    finally:
        # Finalize the output file and close windows even if processing failed.
        video_writer.release()
        cv2.destroyAllWindows()
def main():
    """Run the text spotting demo (two-input ``im_data``/``im_info`` model).

    Parses the command line, builds the Mask-RCNN detector and the text
    recognition encoder/decoder networks from their ``.xml``/``.bin`` pairs,
    loads them to ``args.device`` and processes frames from
    ``args.input_source`` (a directory, a camera index or a video path):
    detection, score filtering, mask post-processing, greedy text decoding,
    optional tracking, visualization and on-screen display.

    Exits with status 1 if the CPU plugin reports unsupported layers or the
    input source cannot be opened.

    Raises:
        AssertionError: If the Mask-RCNN inputs/outputs do not match the
            expected keys or the batch size is not 1.
    """
    log.basicConfig(format='[ %(levelname)s ] %(message)s',
                    level=log.INFO,
                    stream=sys.stdout)
    args = build_argparser().parse_args()

    # Weights are expected next to each .xml file with a .bin extension.
    mask_rcnn_model_xml = args.mask_rcnn_model
    mask_rcnn_model_bin = os.path.splitext(mask_rcnn_model_xml)[0] + '.bin'

    text_enc_model_xml = args.text_enc_model
    text_enc_model_bin = os.path.splitext(text_enc_model_xml)[0] + '.bin'

    text_dec_model_xml = args.text_dec_model
    text_dec_model_bin = os.path.splitext(text_dec_model_xml)[0] + '.bin'

    # Plugin initialization for specified device and load extensions library if specified.
    log.info('Creating Inference Engine...')
    ie = IECore()
    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')

    # Read IR
    log.info('Loading network files:\n\t{}\n\t{}'.format(
        mask_rcnn_model_xml, mask_rcnn_model_bin))
    mask_rcnn_net = IENetwork(model=mask_rcnn_model_xml,
                              weights=mask_rcnn_model_bin)

    log.info('Loading network files:\n\t{}\n\t{}'.format(
        text_enc_model_xml, text_enc_model_bin))
    text_enc_net = IENetwork(model=text_enc_model_xml,
                             weights=text_enc_model_bin)

    log.info('Loading network files:\n\t{}\n\t{}'.format(
        text_dec_model_xml, text_dec_model_bin))
    text_dec_net = IENetwork(model=text_dec_model_xml,
                             weights=text_dec_model_bin)

    if 'CPU' in args.device:
        supported_layers = ie.query_network(mask_rcnn_net, 'CPU')
        not_supported_layers = [
            layer for layer in mask_rcnn_net.layers.keys()
            if layer not in supported_layers
        ]
        if not_supported_layers:
            log.error(
                'Following layers are not supported by the plugin for specified device {}:\n {}'
                .format(args.device, ', '.join(not_supported_layers)))
            log.error(
                "Please try to specify cpu extensions library path in sample's command line parameters using -l "
                "or --cpu_extension command line argument")
            sys.exit(1)

    required_input_keys = {'im_data', 'im_info'}
    assert required_input_keys == set(mask_rcnn_net.inputs.keys()), \
        'Demo supports only topologies with the following input keys: {}'.format(', '.join(required_input_keys))
    required_output_keys = {
        'boxes', 'scores', 'classes', 'raw_masks', 'text_features'
    }
    assert required_output_keys.issubset(mask_rcnn_net.outputs.keys()), \
        'Demo supports only topologies with the following output keys: {}'.format(', '.join(required_output_keys))

    n, c, h, w = mask_rcnn_net.inputs['im_data'].shape
    assert n == 1, 'Only batch 1 is supported by the demo application'

    log.info('Loading IR to the plugin...')
    mask_rcnn_exec_net = ie.load_network(network=mask_rcnn_net,
                                         device_name=args.device,
                                         num_requests=2)
    text_enc_exec_net = ie.load_network(network=text_enc_net,
                                        device_name=args.device)
    text_dec_exec_net = ie.load_network(network=text_dec_net,
                                        device_name=args.device)

    # Shape of the decoder hidden-state input; used to build the initial state.
    hidden_shape = text_dec_net.inputs[args.trd_input_prev_hidden].shape

    # Only the executable networks are used from here on.
    del mask_rcnn_net
    del text_enc_net
    del text_dec_net

    # Test for a directory while the source is still a string: os.path.isdir()
    # given an int would inspect that file descriptor instead of a path.
    if os.path.isdir(args.input_source):
        cap = FolderCapture(args.input_source)
    else:
        try:
            # A purely numeric source is treated as a camera index.
            input_source = int(args.input_source)
        except ValueError:
            input_source = args.input_source
        cap = cv2.VideoCapture(input_source)

    if not cap.isOpened():
        # Nothing can be processed; fail loudly instead of exiting with 0.
        log.error('Failed to open "{}"'.format(args.input_source))
        sys.exit(1)
    if isinstance(cap, cv2.VideoCapture):
        cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    tracker = None if args.no_track else StaticIOUTracker()

    visualizer = Visualizer(['__background__', 'text'],
                            show_boxes=args.show_boxes,
                            show_scores=args.show_scores)

    # Rendering time of the previous frame (drawn on the current one).
    render_time = 0

    log.info('Starting inference...')
    print(
        "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
    )
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if not args.keep_aspect_ratio:
                # Resize the image to a target size.
                scale_x = w / frame.shape[1]
                scale_y = h / frame.shape[0]
                input_image = cv2.resize(frame, (w, h))
            else:
                # Resize the image to keep the same aspect ratio and to fit it to a window of a target size.
                scale_x = scale_y = min(h / frame.shape[0],
                                        w / frame.shape[1])
                input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y)

            # Pad on the bottom/right up to the network input size.
            input_image_size = input_image.shape[:2]
            input_image = np.pad(input_image,
                                 ((0, h - input_image_size[0]),
                                  (0, w - input_image_size[1]),
                                  (0, 0)),
                                 mode='constant', constant_values=0)
            # Change data layout from HWC to CHW.
            input_image = input_image.transpose((2, 0, 1))
            input_image = input_image.reshape((n, c, h, w)).astype(np.float32)
            input_image_info = np.asarray(
                [[input_image_size[0], input_image_size[1], 1]],
                dtype=np.float32)

            # Run the net.
            inf_start = time.time()
            outputs = mask_rcnn_exec_net.infer({
                'im_data': input_image,
                'im_info': input_image_info
            })

            # Parse detection results of the current request
            boxes = outputs['boxes']
            scores = outputs['scores']
            classes = outputs['classes'].astype(np.uint32)
            raw_masks = outputs['raw_masks']
            text_features = outputs['text_features']

            # Filter out detections with low confidence.
            detections_filter = scores > args.prob_threshold
            scores = scores[detections_filter]
            classes = classes[detections_filter]
            boxes = boxes[detections_filter]
            raw_masks = raw_masks[detections_filter]
            text_features = text_features[detections_filter]

            # Map box coordinates back to the original frame.
            boxes[:, 0::2] /= scale_x
            boxes[:, 1::2] /= scale_y

            # Each raw mask is indexed by the detected class before resizing.
            masks = [
                segm_postprocess(box, raw_mask[cls, ...], frame.shape[0],
                                 frame.shape[1])
                for box, cls, raw_mask in zip(boxes, classes, raw_masks)
            ]

            texts = []
            for feature in text_features:
                feature = text_enc_exec_net.infer({'input': feature})['output']
                feature = np.reshape(feature,
                                     (feature.shape[0], feature.shape[1], -1))
                feature = np.transpose(feature, (0, 2, 1))

                hidden = np.zeros(hidden_shape)
                prev_symbol_index = np.ones((1, )) * SOS_INDEX

                text = ''
                # Greedy decoding: feed the most likely symbol back in until
                # EOS or the length limit is reached.
                for _ in range(MAX_SEQ_LEN):
                    decoder_output = text_dec_exec_net.infer({
                        args.trd_input_prev_symbol: prev_symbol_index,
                        args.trd_input_prev_hidden: hidden,
                        args.trd_input_encoder_outputs: feature
                    })
                    symbols_distr = decoder_output[
                        args.trd_output_symbols_distr]
                    # .item() extracts the single index without relying on the
                    # deprecated implicit array-to-scalar conversion.
                    prev_symbol_index = np.argmax(symbols_distr, axis=1).item()
                    if prev_symbol_index == EOS_INDEX:
                        break
                    text += args.alphabet[prev_symbol_index]
                    hidden = decoder_output[args.trd_output_cur_hidden]

                texts.append(text)

            inf_end = time.time()
            inf_time = inf_end - inf_start

            render_start = time.time()

            if len(boxes) and args.raw_output_message:
                log.info('Detected boxes:')
                log.info(
                    ' Class ID | Confidence | XMIN | YMIN | XMAX | YMAX '
                )
                for box, cls, score in zip(boxes, classes, scores):
                    log.info(
                        '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '
                        .format(cls, score, *box))

            # Get instance track IDs.
            masks_tracks_ids = None
            if tracker is not None:
                masks_tracks_ids = tracker(masks, classes)

            # Visualize masks.
            frame = visualizer(frame, boxes, classes, scores, masks, texts,
                               masks_tracks_ids)

            # Draw performance stats.
            inf_time_message = 'Inference and post-processing time: {:.3f} ms'.format(
                inf_time * 1000)
            render_time_message = 'OpenCV rendering time: {:.3f} ms'.format(
                render_time * 1000)
            cv2.putText(frame, inf_time_message, (15, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
            cv2.putText(frame, render_time_message, (15, 30),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1)

            # Print performance counters.
            if args.perf_counts:
                perf_counts = mask_rcnn_exec_net.requests[0].get_perf_counts()
                log.info('Performance counters:')
                print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                    'name', 'layer_type', 'exec_type', 'status',
                    'real_time, us'))
                for layer, stats in perf_counts.items():
                    print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                        layer, stats['layer_type'], stats['exec_type'],
                        stats['status'], stats['real_time']))

            if not args.no_show:
                # Show resulting image.
                cv2.imshow('Results', frame)
            render_end = time.time()
            render_time = render_end - render_start

            if not args.no_show:
                key = cv2.waitKey(args.delay)
                esc_code = 27
                if key == esc_code:
                    break
    finally:
        # Close windows and free the capture even if processing failed.
        cv2.destroyAllWindows()
        cap.release()
def main():
    """Run the text spotting demo, supporting both Mask-RCNN model layouts.

    Parses the command line, reads the Mask-RCNN detector and the text
    recognition encoder/decoder networks, and selects the processing path
    from the detector inputs:

    * ``{'image'}`` - current layout (boxes carry the score in column 4);
    * ``{'im_data', 'im_info'}`` - old layout. For it ``args.alphabet`` and
      ``args.tr_threshold`` are overridden with fixed values.

    Frames from ``args.input_source`` (a directory, a camera index or a video
    path) are then processed one by one: detection, score filtering, mask
    post-processing, greedy text decoding, optional tracking, visualization
    and on-screen display.

    Raises:
        RuntimeError: If the detector inputs match neither layout, the input
            source cannot be opened, or the first frame cannot be read.
        AssertionError: If required outputs are missing or the batch size is
            not 1.
    """
    log.basicConfig(format='[ %(levelname)s ] %(message)s',
                    level=log.INFO,
                    stream=sys.stdout)
    args = build_argparser().parse_args()

    # Plugin initialization for specified device and load extensions library if specified.
    log.info('Creating Inference Engine...')
    ie = IECore()
    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')

    # Read IR
    log.info('Loading Mask-RCNN network')
    mask_rcnn_net = ie.read_network(
        args.mask_rcnn_model,
        os.path.splitext(args.mask_rcnn_model)[0] + '.bin')

    log.info('Loading encoder part of text recognition network')
    text_enc_net = ie.read_network(
        args.text_enc_model,
        os.path.splitext(args.text_enc_model)[0] + '.bin')

    log.info('Loading decoder part of text recognition network')
    text_dec_net = ie.read_network(
        args.text_dec_model,
        os.path.splitext(args.text_dec_model)[0] + '.bin')

    model_required_inputs = {'image'}
    old_model_required_inputs = {'im_data', 'im_info'}
    if set(mask_rcnn_net.input_info) == model_required_inputs:
        old_model = False
        required_output_keys = {'boxes', 'labels', 'masks', 'text_features.0'}
        n, c, h, w = mask_rcnn_net.input_info['image'].input_data.shape
    elif set(mask_rcnn_net.input_info) == old_model_required_inputs:
        old_model = True
        required_output_keys = {
            'boxes', 'scores', 'classes', 'raw_masks', 'text_features'
        }
        n, c, h, w = mask_rcnn_net.input_info['im_data'].input_data.shape
        # The old model path ignores the user-supplied alphabet and threshold.
        args.alphabet = ' 0123456789abcdefghijklmnopqrstuvwxyz'
        args.tr_threshold = 0
    else:
        raise RuntimeError(
            'Demo supports only topologies with the following input keys: '
            f'{model_required_inputs} or {old_model_required_inputs}.')
    # The two message parts are separated explicitly so they do not run together.
    assert required_output_keys.issubset(mask_rcnn_net.outputs.keys()), \
        f'Demo supports only topologies with the following output keys: {required_output_keys}. ' \
        f'Found: {mask_rcnn_net.outputs.keys()}.'
    assert n == 1, 'Only batch 1 is supported by the demo application'

    log.info('Loading IR to the plugin...')
    mask_rcnn_exec_net = ie.load_network(network=mask_rcnn_net,
                                         device_name=args.device,
                                         num_requests=2)
    text_enc_exec_net = ie.load_network(network=text_enc_net,
                                        device_name=args.device)
    text_dec_exec_net = ie.load_network(network=text_dec_net,
                                        device_name=args.device)

    # Shape of the decoder hidden-state input; used to build the initial state.
    hidden_shape = text_dec_net.input_info[
        args.trd_input_prev_hidden].input_data.shape

    # Only the executable networks are used from here on.
    del mask_rcnn_net
    del text_enc_net
    del text_dec_net

    input_source = args.input_source
    if os.path.isdir(input_source):
        cap = FolderCapture(input_source)
    else:
        # Only the int() conversion is guarded, so a ValueError raised
        # elsewhere is not mistaken for "not a camera index".
        try:
            input_source = int(args.input_source)
        except ValueError:
            cap = cv2.VideoCapture(input_source)
        else:
            cap = cv2.VideoCapture(input_source)
            cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    if not cap.isOpened():
        raise RuntimeError('Failed to open "{}"'.format(input_source))

    try:
        ret, frame = cap.read()
        if not ret:
            raise RuntimeError("Can't read an image from the input")

        tracker = None if args.no_track else StaticIOUTracker()

        visualizer = Visualizer(['__background__', 'text'],
                                show_boxes=args.show_boxes,
                                show_scores=args.show_scores)

        # Rendering time of the previous frame (drawn on the current one).
        render_time = 0

        presenter = monitors.Presenter(
            args.utilization_monitors, 45,
            (frame.shape[1] // 4, frame.shape[0] // 8))

        log.info('Starting inference...')
        print(
            "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
        )
        while ret:
            if not args.keep_aspect_ratio:
                # Resize the image to a target size.
                scale_x = w / frame.shape[1]
                scale_y = h / frame.shape[0]
                input_image = cv2.resize(frame, (w, h))
            else:
                # Resize the image to keep the same aspect ratio and to fit it to a window of a target size.
                scale_x = scale_y = min(h / frame.shape[0],
                                        w / frame.shape[1])
                input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y)

            # Pad on the bottom/right up to the network input size.
            input_image_size = input_image.shape[:2]
            input_image = np.pad(input_image,
                                 ((0, h - input_image_size[0]),
                                  (0, w - input_image_size[1]),
                                  (0, 0)),
                                 mode='constant', constant_values=0)
            # Change data layout from HWC to CHW.
            input_image = input_image.transpose((2, 0, 1))
            input_image = input_image.reshape((n, c, h, w)).astype(np.float32)
            input_image_info = np.asarray(
                [[input_image_size[0], input_image_size[1], 1]],
                dtype=np.float32)

            # Run the net.
            inf_start = time.time()
            if old_model:
                outputs = mask_rcnn_exec_net.infer({
                    'im_data': input_image,
                    'im_info': input_image_info
                })
            else:
                outputs = mask_rcnn_exec_net.infer({'image': input_image})

            # Parse detection results of the current request
            if old_model:
                boxes = outputs['boxes']
                scores = outputs['scores']
                classes = outputs['classes'].astype(np.uint32)
                raw_masks = outputs['raw_masks']
                text_features = outputs['text_features']
            else:
                boxes = outputs['boxes'][:, :4]
                scores = outputs['boxes'][:, 4]
                classes = outputs['labels'].astype(np.uint32)
                raw_masks = outputs['masks']
                text_features = outputs['text_features.0']

            # Filter out detections with low confidence.
            detections_filter = scores > args.prob_threshold
            scores = scores[detections_filter]
            classes = classes[detections_filter]
            boxes = boxes[detections_filter]
            raw_masks = raw_masks[detections_filter]
            text_features = text_features[detections_filter]

            # Map box coordinates back to the original frame.
            boxes[:, 0::2] /= scale_x
            boxes[:, 1::2] /= scale_y

            masks = []
            for box, cls, raw_mask in zip(boxes, classes, raw_masks):
                if old_model:
                    # Old-layout masks are indexed by the detected class.
                    raw_mask = raw_mask[cls, ...]
                masks.append(
                    segm_postprocess(box, raw_mask, frame.shape[0],
                                     frame.shape[1]))

            texts = []
            for feature in text_features:
                feature = text_enc_exec_net.infer({'input': feature})['output']
                feature = np.reshape(feature,
                                     (feature.shape[0], feature.shape[1], -1))
                feature = np.transpose(feature, (0, 2, 1))

                hidden = np.zeros(hidden_shape)
                prev_symbol_index = np.ones((1, )) * SOS_INDEX

                text = ''
                text_confidence = 1.0
                # Greedy decoding: feed the most likely symbol back in until
                # EOS or the length limit is reached.
                for _ in range(MAX_SEQ_LEN):
                    decoder_output = text_dec_exec_net.infer({
                        args.trd_input_prev_symbol: prev_symbol_index,
                        args.trd_input_prev_hidden: hidden,
                        args.trd_input_encoder_outputs: feature
                    })
                    symbols_distr = decoder_output[
                        args.trd_output_symbols_distr]
                    symbols_distr_softmaxed = softmax(symbols_distr,
                                                      axis=1)[0]
                    # .item() extracts the single index without relying on the
                    # deprecated implicit array-to-scalar conversion.
                    prev_symbol_index = np.argmax(symbols_distr, axis=1).item()
                    text_confidence *= symbols_distr_softmaxed[
                        prev_symbol_index]
                    if prev_symbol_index == EOS_INDEX:
                        break
                    text += args.alphabet[prev_symbol_index]
                    hidden = decoder_output[args.trd_output_cur_hidden]

                texts.append(
                    text if text_confidence >= args.tr_threshold else '')

            inf_end = time.time()
            inf_time = inf_end - inf_start

            render_start = time.time()

            if len(boxes) and args.raw_output_message:
                log.info('Detected boxes:')
                log.info(
                    ' Class ID | Confidence | XMIN | YMIN | XMAX | YMAX '
                )
                for box, cls, score in zip(boxes, classes, scores):
                    log.info(
                        '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '
                        .format(cls, score, *box))

            # Get instance track IDs.
            masks_tracks_ids = None
            if tracker is not None:
                masks_tracks_ids = tracker(masks, classes)

            presenter.drawGraphs(frame)

            # Visualize masks.
            frame = visualizer(frame, boxes, classes, scores, masks, texts,
                               masks_tracks_ids)

            # Draw performance stats.
            inf_time_message = 'Inference and post-processing time: {:.3f} ms'.format(
                inf_time * 1000)
            render_time_message = 'OpenCV rendering time: {:.3f} ms'.format(
                render_time * 1000)
            cv2.putText(frame, inf_time_message, (15, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
            cv2.putText(frame, render_time_message, (15, 30),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1)

            # Print performance counters.
            if args.perf_counts:
                perf_counts = mask_rcnn_exec_net.requests[0].get_perf_counts()
                log.info('Performance counters:')
                print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                    'name', 'layer_type', 'exec_type', 'status',
                    'real_time, us'))
                for layer, stats in perf_counts.items():
                    print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                        layer, stats['layer_type'], stats['exec_type'],
                        stats['status'], stats['real_time']))

            if not args.no_show:
                # Show resulting image.
                cv2.imshow('Results', frame)
            render_end = time.time()
            render_time = render_end - render_start

            if not args.no_show:
                key = cv2.waitKey(args.delay)
                esc_code = 27
                if key == esc_code:
                    break
                presenter.handleKey(key)

            ret, frame = cap.read()

        print(presenter.reportMeans())
    finally:
        # Close windows and free the capture even if processing failed.
        cv2.destroyAllWindows()
        cap.release()