def recognize(self, images, detection_kwargs=None, recognition_kwargs=None):
    """Run the detection and recognition pipeline on one or more images.

    Args:
        images: The images to parse. Either a numpy array of images, or an
            iterable whose items are passed to `tools.read` (actual images
            or filepaths).
        detection_kwargs: Extra keyword arguments forwarded to
            `self.detector.detect`.
        recognition_kwargs: Extra keyword arguments forwarded to
            `self.recognizer.recognize_from_boxes`.

    Returns:
        A list with one entry per image, each entry being a list of
        (text, box) tuples. Empty input yields an empty list.
    """
    # Make sure we have loaded images to start with.
    if not isinstance(images, np.ndarray):
        images = [tools.read(image) for image in images]

    # Nothing to do for empty input; without this guard the max() reduction
    # below fails on a zero-size array.
    if len(images) == 0:
        return []

    # resize_image hands back (image, scale) pairs; keep both so the boxes
    # can be mapped back to the original image coordinates afterwards.
    resized = [
        tools.resize_image(image, max_scale=self.scale, max_size=self.max_size)
        for image in images
    ]
    max_height, max_width = np.array(
        [image.shape[:2] for image, _ in resized]).max(axis=0)
    scales = [scale for _, scale in resized]

    # Pad every image to a common size so they can be stacked into one array.
    images = np.array([
        tools.pad(image, width=max_width, height=max_height)
        for image, _ in resized
    ])

    if detection_kwargs is None:
        detection_kwargs = {}
    if recognition_kwargs is None:
        recognition_kwargs = {}

    box_groups = self.detector.detect(images=images, **detection_kwargs)
    prediction_groups = self.recognizer.recognize_from_boxes(
        images=images, box_groups=box_groups, **recognition_kwargs)

    # Undo the resize on the boxes only where a resize actually happened.
    box_groups = [
        tools.adjust_boxes(boxes=boxes, boxes_format='boxes', scale=1 / scale)
        if scale != 1 else boxes
        for boxes, scale in zip(box_groups, scales)
    ]
    return [
        list(zip(predictions, boxes))
        for predictions, boxes in zip(prediction_groups, box_groups)
    ]
def get_recognizer_image_generator(labels,
                                   height,
                                   width,
                                   alphabet,
                                   augmenter=None,
                                   shuffle=True):
    """Generate augmented (image, text) tuples from a list
    of (filepath, box, label) tuples.

    Labels are cycled through indefinitely. Characters that are not in
    `alphabet` are stripped from each text, and a label whose text becomes
    empty is skipped without loading its image. If no label has any usable
    character the generator finishes immediately instead of looping forever.

    Args:
        labels: A list of (filepath, box, label) tuples
        height: The height of the images to return
        width: The width of the images to return
        alphabet: The alphabet which limits the characters returned
        augmenter: The augmenter to apply to images
        shuffle: Whether to shuffle the dataset on each iteration

    Yields:
        (image, text) tuples.
    """
    n_with_illegal_characters = sum(
        any(c not in alphabet for c in text) for _, _, text in labels)
    if n_with_illegal_characters > 0:
        print(
            f'{n_with_illegal_characters} / {len(labels)} instances have illegal characters.'
        )

    # Work on a copy so shuffling never reorders the caller's list.
    labels = labels.copy()

    # Every label would be skipped below, so the endless cycle would never
    # yield; stop up front, exactly as already happens for an empty list.
    if not any(any(c in alphabet for c in text) for _, _, text in labels):
        return

    for index in itertools.cycle(range(len(labels))):
        # Reshuffle at the start of each pass over the dataset.
        if index == 0 and shuffle:
            random.shuffle(labels)
        filepath, box, text = labels[index]

        # Filter the text first so no image is read for a discarded sample.
        text = ''.join([c for c in text if c in alphabet])
        if not text:
            continue

        # Three random uint8 values handed to the tools helpers as `cval`.
        cval = np.random.randint(low=0, high=255, size=3).astype('uint8')
        if box is not None:
            image = tools.warpBox(image=tools.read(filepath),
                                  box=box.astype('float32'),
                                  target_height=height,
                                  target_width=width,
                                  cval=cval)
        else:
            image = tools.read_and_fit(filepath_or_array=filepath,
                                       width=width,
                                       height=height,
                                       cval=cval)
        if augmenter:
            image = augmenter.augment_image(image)
        yield (image, text)
def read_images(self, plates_and_positions=None, image_paths=None, index_maximum=None, crop_plate=True): if image_paths is None: image_paths = self.image_paths if plates_and_positions is None: plates_and_positions = self.get_plates_and_positions() if index_maximum is None: index_maximum = self.index_maximum for image_path, plate_and_position in zip( image_paths[:index_maximum], plates_and_positions[:index_maximum]): image = read(image_path) if crop_plate: image = self.crop_plate_func(image, plate_and_position) self.images.append(image) return self.images # np.stack(images)
def recognize_from_boxes(self, images, box_groups,
                         **kwargs) -> typing.List[typing.List[str]]:
    """Recognize text from images using lists of bounding boxes.

    Args:
        images: A list of input images, supplied as numpy arrays with
            shape (H, W, 3). Each one is passed through `tools.read`.
        box_groups: A list of groups of boxes, one group for each image.
        **kwargs: Forwarded to `self.prediction_model.predict`.

    Returns:
        One list of recognized strings per image, in the same order as the
        boxes of that image. If there are no boxes at all, one empty list
        per image.

    Raises:
        AssertionError: If the number of box groups differs from the
            number of images.
    """
    assert len(box_groups) == len(images), \
        'You must provide the same number of box groups as images.'
    crops = []
    # (start, end) offsets into the flat crop list, one pair per image, so
    # the flat predictions can be split back into per-image groups.
    start_end = []
    for image, boxes in zip(images, box_groups):
        image = tools.read(image)
        if self.prediction_model.input_shape[-1] == 1 and image.shape[
                -1] == 3:
            # Convert color to grayscale
            image = cv2.cvtColor(image, code=cv2.COLOR_RGB2GRAY)
        for box in boxes:
            crops.append(
                tools.warpBox(image=image,
                              box=box,
                              target_height=self.model.input_shape[1],
                              target_width=self.model.input_shape[2]))
        start = 0 if not start_end else start_end[-1][1]
        start_end.append((start, start + len(boxes)))
    if not crops:
        return [[] for _ in images]

    X = np.float32(crops) / 255
    if len(X.shape) == 3:
        # Grayscale crops lack a channel axis; add one.
        X = X[..., np.newaxis]

    # Indices that never map to a character: the blank label and -1.
    ignored = (self.blank_label_idx, -1)
    predictions = [
        ''.join([self.alphabet[idx] for idx in row if idx not in ignored])
        for row in self.prediction_model.predict(X, **kwargs)
    ]
    return [predictions[start:end] for start, end in start_end]
def detect(self,
           images: typing.List[typing.Union[np.ndarray, str]],
           detection_threshold=0.7,
           text_threshold=0.4,
           link_threshold=0.4,
           size_threshold=10,
           **kwargs):
    """Find text boxes in a set of images.

    Every image is loaded with `tools.read`, preprocessed with
    `compute_input`, run through `self.model.predict` as a single batch,
    and the raw output is turned into boxes by `getBoxes`.

    Args:
        images: Can be a list of numpy arrays of shape HxWx3 or a list of
            filepaths.
        detection_threshold: Final per-word check. A word box is kept only
            if its maximum confidence exceeds this value, which avoids
            boxes covering large regions of low-confidence text.
        text_threshold: Binarization threshold for the text map. Confidence
            values (floats from zero to one) are converted to 0 (not text)
            or 1 (text) around this breakpoint; e.g. with a threshold of
            0.4 a value of 0.5 becomes 1. Higher values make it less likely
            that characters merge into a single word; lower values make it
            more likely that non-text is detected.
        link_threshold: Same as `text_threshold`, but applied to the link
            map instead of the text map.
        size_threshold: The minimum area for a word.
        **kwargs: Forwarded to `self.model.predict`.

    Returns:
        Whatever `getBoxes` produces for the batch.
    """
    batch = np.array([compute_input(tools.read(image)) for image in images])
    raw_predictions = self.model.predict(batch, **kwargs)
    return getBoxes(raw_predictions,
                    detection_threshold=detection_threshold,
                    text_threshold=text_threshold,
                    link_threshold=link_threshold,
                    size_threshold=size_threshold)
def recognize(self, images, detection_kwargs=None, recognition_kwargs=None):
    """Debugging variant of the pipeline: detect boxes, crop them, and
    return the raw prediction-model output for the first crop only.

    Args:
        images: The images to parse (can be a list of actual images or a
            list of filepaths)
        detection_kwargs: Arguments to pass to the detector call
        recognition_kwargs: Accepted and defaulted to an empty dict, but
            not used by this variant.

    Returns:
        The output of `self.prediction_model.predict` for the first crop,
        or one empty list per image when no boxes were detected. Decoding
        the output into strings is disabled here.
    """
    # If we were not given an array, read every entry (e.g. a filepath)
    # into a list of images.
    if not isinstance(images, np.ndarray):
        images = [tools.read(image) for image in images]

    # resize_image returns (image, scale) pairs (max_size: 2048, scale: 2
    # according to the original author's note).
    resized = [
        tools.resize_image(image, max_scale=self.scale, max_size=self.max_size)
        for image in images
    ]
    max_height, max_width = np.array(
        [image.shape[:2] for image, _ in resized]).max(axis=0)
    # Collected as in the regular pipeline; not used further in this variant.
    scales = [scale for _, scale in resized]
    images = np.array([
        tools.pad(image, width=max_width, height=max_height)
        for image, _ in resized
    ])

    if detection_kwargs is None:
        detection_kwargs = {}
    if recognition_kwargs is None:
        recognition_kwargs = {}
    box_groups = self.detector.detect(images=images, **detection_kwargs)

    crops = []
    start_end = []
    for image, boxes in zip(images, box_groups):
        image = tools.read(image)
        needs_grayscale = (self.prediction_model.input_shape[-1] == 1
                           and image.shape[-1] == 3)
        if needs_grayscale:
            # Convert color to grayscale
            image = cv2.cvtColor(image, code=cv2.COLOR_RGB2GRAY)
        # NOTE(review): original indentation was lost; this print is assumed
        # to run once per image rather than only after a grayscale
        # conversion - confirm.
        print("This is prediction model input shape",
              self.prediction_model.input_shape)
        for box in boxes:
            crops.append(
                tools.warpBox(image=image,
                              box=box,
                              target_height=self.model.input_shape[1],
                              target_width=self.model.input_shape[2]))
        offset = start_end[-1][1] if start_end else 0
        start_end.append((offset, offset + len(boxes)))

    if not crops:
        return [[] for image in images]

    print("this is crops", crops, np.asarray(crops).shape)
    X = np.float32(crops) / 255
    if len(X.shape) == 3:
        X = X[..., np.newaxis]

    # Only the first crop is sent through the model.
    rows = self.prediction_model.predict(np.asarray([X[0]]))

    # For every entry of the first prediction, report the largest value and
    # the first position where it occurs.
    for scores in rows[0]:
        best_index = 0
        best_score = scores[0]
        for position, score in enumerate(scores):
            if score > best_score:
                best_score = score
                best_index = position
        print("max row value", best_score, "max r id", best_index)
    return rows
def get_detector_image_generator(labels,
                                 width,
                                 height,
                                 augmenter=None,
                                 area_threshold=0.5,
                                 focused=False,
                                 min_area=None,
                                 shuffle=True):
    """Endlessly yield (image, lines, confidence) tuples built from a list
    of (filepath, lines, confidence) labels.

    Each image is read, optionally augmented, optionally cropped around a
    randomly chosen box, and finally letterboxed to `width` x `height` with
    the lines rescaled to match. The confidence value is passed through
    untouched.

    Args:
        labels: A list of (image, lines, confidence) tuples.
        width: The width to use for output images
        height: The height to use for output images
        augmenter: An augmenter to apply to the images.
        area_threshold: The area threshold to use to keep characters in
            augmented images.
        focused: Whether to pre-crop images to width/height containing a
            region containing text.
        min_area: The minimum area for a character to be included.
        shuffle: Whether to shuffle the data on each iteration.
    """
    # Private copy: shuffling must not reorder the caller's list.
    samples = labels.copy()
    for position in itertools.cycle(range(len(samples))):
        # A new pass over the data starts whenever the index wraps to zero.
        if shuffle and position == 0:
            random.shuffle(samples)
        image_filepath, lines, confidence = samples[position]
        image = tools.read(image_filepath)

        if augmenter is not None:
            image, lines = tools.augment(boxes=lines,
                                         boxes_format='lines',
                                         image=image,
                                         area_threshold=area_threshold,
                                         min_area=min_area,
                                         augmenter=augmenter)

        if focused:
            line_boxes = [tools.combine_line(line)[0] for line in lines]
            if line_boxes:
                chosen = np.array(
                    line_boxes[np.random.choice(len(line_boxes))])
                # Per-axis minimum of the chosen box, clamped to be
                # non-negative.
                left, top = chosen.min(axis=0).clip(0, np.inf).astype('int')
                # Pull the crop origin back by a random amount so the box
                # does not always sit in the corner.
                if left > 0:
                    left -= np.random.randint(0, min(left, width / 2))
                if top > 0:
                    top -= np.random.randint(0, min(top, height / 2))
                focus_crop = imgaug.augmenters.Sequential([
                    imgaug.augmenters.Crop(px=(int(top), 0, 0, int(left))),
                    imgaug.augmenters.CropToFixedSize(width=width,
                                                      height=height,
                                                      position='right-bottom')
                ])
                image, lines = tools.augment(boxes=lines,
                                             augmenter=focus_crop,
                                             boxes_format='lines',
                                             image=image,
                                             min_area=min_area,
                                             area_threshold=area_threshold)

        image, scale = tools.fit(image,
                                 width=width,
                                 height=height,
                                 mode='letterbox',
                                 return_scale=True)
        lines = tools.adjust_boxes(boxes=lines,
                                   boxes_format='lines',
                                   scale=scale)
        yield image, lines, confidence
def get_image_generator(
        height,
        width,
        font_groups,
        text_generator,
        font_size: typing.Union[int, typing.Tuple[int, int]] = 18,
        backgrounds: typing.Optional[typing.List[typing.Union[
            str, np.ndarray]]] = None,
        background_crop_mode='crop',
        rotationX: typing.Union[int, typing.Tuple[int, int]] = 0,
        rotationY: typing.Union[int, typing.Tuple[int, int]] = 0,
        rotationZ: typing.Union[int, typing.Tuple[int, int]] = 0,
        margin=0,
        use_ligatures=False,
        augmenter=None,
        draw_contour=False,
        draw_contour_text=False):
    """Create a generator for images containing text.

    Args:
        height: The height of the generated image
        width: The width of the generated image.
        font_groups: A dict mapping of { subalphabet: [path_to_font1, path_to_font2] }.
        text_generator: See get_text_generator
        font_size: The font size to use. Alternative, supply a tuple
            and the font size will be randomly selected between
            the two values.
        backgrounds: A list of paths to image backgrounds or actual images
            as numpy arrays with channels in RGB order. Defaults to a
            single black background. The list passed in is not modified.
        background_crop_mode: One of letterbox or crop, indicates
            how backgrounds will be resized to fit on the canvas.
        rotationX: The X-axis text rotation to use. Alternative, supply a tuple
            and the rotation will be randomly selected between
            the two values.
        rotationY: The Y-axis text rotation to use. Alternative, supply a tuple
            and the rotation will be randomly selected between
            the two values.
        rotationZ: The Z-axis text rotation to use. Alternative, supply a tuple
            and the rotation will be randomly selected between
            the two values.
        margin: The minimum margin around the edge of the image.
        use_ligatures: Whether to render ligatures (see `draw_text_image`)
        augmenter: An image augmenter to be applied to backgrounds
        draw_contour: Draw the permitted contour onto images (debugging only)
        draw_contour_text: Draw the permitted contour inside the text
            drawing function.

    Yields:
        Tuples of (image, lines) where image is the transparent text image
        and lines is a list of lines where each line itself is a list of
        (box, character) tuples and box is an array of points with shape
        (4, 2) providing the coordinates of the character box in clockwise
        order starting from the top left.

    Raises:
        AssertionError: If a character appears in more than one subalphabet.
    """
    if backgrounds is None:
        backgrounds = [np.zeros((height, width, 3), dtype='uint8')]
    else:
        # Shuffle a private copy below so the caller's list keeps its order.
        backgrounds = list(backgrounds)
    alphabet = ''.join(font_groups.keys())
    assert len(set(alphabet)) == len(
        alphabet
    ), 'Each character can appear in the subalphabet for only one font group.'
    # Advance texts, background indices and one font per font group in
    # lockstep; backgrounds and fonts cycle, so the text generator bounds
    # the loop.
    for text, background_index, current_font_groups in zip(
            text_generator, itertools.cycle(range(len(backgrounds))),
            zip(*[
                itertools.cycle([(subalphabet, font_filepath)
                                 for font_filepath in font_group_filepaths])
                for subalphabet, font_group_filepaths in font_groups.items()
            ])):
        # Reshuffle at the start of every pass over the backgrounds.
        if background_index == 0:
            random.shuffle(backgrounds)
        current_font_groups = dict(current_font_groups)
        current_font_size = np.random.randint(
            low=font_size[0], high=font_size[1]) if isinstance(
                font_size, tuple) else font_size
        # Rotations are supplied in degrees and converted to radians.
        current_rotation_X, current_rotation_Y, current_rotation_Z = [
            (np.random.uniform(low=rotation[0], high=rotation[1])
             if isinstance(rotation, tuple) else rotation) * np.pi / 180
            for rotation in [rotationX, rotationY, rotationZ]
        ]
        current_background_filepath_or_array = backgrounds[background_index]
        current_background = tools.read(
            current_background_filepath_or_array) if isinstance(
                current_background_filepath_or_array,
                str) else current_background_filepath_or_array
        if augmenter is not None:
            current_background = augmenter(images=[current_background])[0]
        if current_background.shape[0] != height or current_background.shape[
                1] != width:
            current_background = tools.fit(current_background,
                                           width=width,
                                           height=height,
                                           mode=background_crop_mode)
        permitted_contour, isDark = get_maximum_uniform_contour(
            image=current_background,
            fontsize=current_font_size,
            margin=margin)
        if permitted_contour is None:
            # We can't draw on this background. Boo!
            continue
        # Near-white text on dark backgrounds, near-black text otherwise.
        random_color_values = np.random.randint(low=0, high=50, size=3)
        text_color = tuple(np.array([255, 255, 255]) -
                           random_color_values) if isDark else tuple(
                               random_color_values)
        text_image, lines = draw_text_image(
            text=text,
            width=width,
            height=height,
            fontsize=current_font_size,
            fonts=current_font_groups,
            thetaX=current_rotation_X,
            thetaY=current_rotation_Y,
            thetaZ=current_rotation_Z,
            use_ligatures=use_ligatures,
            permitted_contour=permitted_contour,
            color=text_color,
            draw_contour=draw_contour_text)
        # Alpha-blend the text layer over the background using the last
        # channel of the text image as opacity.
        alpha = text_image[..., -1:].astype('float32') / 255
        image = (alpha * text_image[..., :3] +
                 (1 - alpha) * current_background).astype('uint8')
        if draw_contour:
            image = cv2.drawContours(
                image,
                contours=[
                    permitted_contour.reshape((-1, 1, 2)).astype('int32')
                ],
                contourIdx=0,
                color=(255, 0, 0),
                thickness=int(width / 100))
        yield image, lines