class GroundTruthDataset(Dataset):
    """
    Dataset for ground truth used during training.

    All data is cached in memory.
    """
    def __init__(self,
                 split: Callable[[str], str] = lambda x: os.path.splitext(x)[0],
                 suffix: str = '.gt.txt',
                 normalization: Optional[str] = None,
                 whitespace_normalization: bool = True,
                 reorder: bool = True,
                 im_transforms: Callable[[Any], torch.Tensor] = transforms.Compose([]),
                 preload: bool = True) -> None:
        """
        Reads a list of image-text pairs and creates a ground truth set.

        Args:
            split (func): Function for generating the base name without
                          extensions from paths.
            suffix (str): Suffix to attach to the image base name for text
                          retrieval.
            normalization (str): Unicode normalization applied to the ground
                                 truth.
            whitespace_normalization (bool): Normalizes unicode whitespace and
                                             strips whitespace.
            reorder (bool): Whether to rearrange code points in "display"/LTR
                            order.
            im_transforms (func): Function taking a PIL.Image and returning a
                                  tensor suitable for forward passes.
            preload (bool): Enables preloading and preprocessing of image files.
        """
        self.suffix = suffix
        self.split = lambda x: split(x) + self.suffix
        self._images = []  # type: Union[List[Image], List[torch.Tensor]]
        self._gt = []  # type: List[str]
        self.alphabet = Counter()  # type: Counter
        self.text_transforms = []  # type: List[Callable[[str], str]]
        self.transforms = im_transforms
        self.preload = preload
        # build text transformations
        if normalization:
            self.text_transforms.append(lambda x: unicodedata.normalize(cast(str, normalization), x))
        if whitespace_normalization:
            self.text_transforms.append(lambda x: regex.sub(r'\s', ' ', x).strip())
        if reorder:
            self.text_transforms.append(bd.get_display)

    def add(self, image: str) -> None:
        """
        Adds a line-image-text pair to the dataset.

        Args:
            image (str): Input image path
        """
        with open(self.split(image), 'r', encoding='utf-8') as fp:
            gt = fp.read().strip('\n\r')
            for func in self.text_transforms:
                gt = func(gt)
            if not gt:
                raise KrakenInputException('Text line is empty ({})'.format(fp.name))
        if self.preload:
            im = Image.open(image)
            try:
                im = self.transforms(im)
            except ValueError:
                raise KrakenInputException('Image transforms failed on {}'.format(image))
            self._images.append(im)
        else:
            self._images.append(image)
        self._gt.append(gt)
        self.alphabet.update(gt)

    def add_loaded(self, image: Image.Image, gt: str) -> None:
        """
        Adds an already loaded line-image-text pair to the dataset.

        Args:
            image (PIL.Image.Image): Line image
            gt (str): Text contained in the line image
        """
        if self.preload:
            try:
                im = self.transforms(image)
            except ValueError:
                raise KrakenInputException('Image transforms failed on {}'.format(image))
            self._images.append(im)
        else:
            self._images.append(image)
        for func in self.text_transforms:
            gt = func(gt)
        self._gt.append(gt)
        self.alphabet.update(gt)

    def encode(self, codec: Optional[PytorchCodec] = None) -> None:
        """
        Adds a codec to the dataset and encodes all text lines. Has to be run
        before sampling from the dataset.
        """
        if codec:
            self.codec = codec
        else:
            self.codec = PytorchCodec(''.join(self.alphabet.keys()))
        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], torch.Tensor]]
        for im, gt in zip(self._images, self._gt):
            self.training_set.append((im, self.codec.encode(gt)))

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        if self.preload:
            return self.training_set[index]
        else:
            item = self.training_set[index]
            try:
                logger.debug('Attempting to load {}'.format(item[0]))
                im = item[0]
                if not isinstance(im, Image.Image):
                    im = Image.open(im)
                return (self.transforms(im), item[1])
            except Exception:
                idx = np.random.randint(0, len(self.training_set))
                logger.debug('Failed. Replacing with sample {}'.format(idx))
                return self[idx]

    def __len__(self) -> int:
        return len(self.training_set)
class GroundTruthDataset(Dataset):
    """
    Dataset for training a line recognition model.

    All data is cached in memory.
    """
    def __init__(self,
                 split: Callable[[str], str] = lambda x: path.splitext(x)[0],
                 suffix: str = '.gt.txt',
                 normalization: Optional[str] = None,
                 whitespace_normalization: bool = True,
                 reorder: bool = True,
                 im_transforms: Callable[[Any], torch.Tensor] = transforms.Compose([]),
                 preload: bool = True,
                 augmentation: bool = False) -> None:
        """
        Reads a list of image-text pairs and creates a ground truth set.

        Args:
            split (func): Function for generating the base name without
                          extensions from paths.
            suffix (str): Suffix to attach to the image base name for text
                          retrieval.
            normalization (str): Unicode normalization applied to the ground
                                 truth.
            whitespace_normalization (bool): Normalizes unicode whitespace and
                                             strips whitespace.
            reorder (bool): Whether to rearrange code points in "display"/LTR
                            order.
            im_transforms (func): Function taking a PIL.Image and returning a
                                  tensor suitable for forward passes.
            preload (bool): Enables preloading and preprocessing of image files.
            augmentation (bool): Enables image augmentation of the line images.
        """
        self.suffix = suffix
        self.split = lambda x: split(x) + self.suffix
        self._images = []  # type: Union[List[Image], List[torch.Tensor]]
        self._gt = []  # type: List[str]
        self.alphabet = Counter()  # type: Counter
        self.text_transforms = []  # type: List[Callable[[str], str]]
        # split image transforms into two parts: one giving the final PIL
        # image before conversion to a tensor and the actual tensor
        # conversion part.
        self.head_transforms = transforms.Compose(im_transforms.transforms[:2])
        self.tail_transforms = transforms.Compose(im_transforms.transforms[2:])
        self.aug = None
        self.preload = preload
        self.seg_type = 'bbox'
        # build text transformations
        if normalization:
            self.text_transforms.append(lambda x: unicodedata.normalize(cast(str, normalization), x))
        if whitespace_normalization:
            self.text_transforms.append(lambda x: regex.sub(r'\s', ' ', x).strip())
        if reorder:
            self.text_transforms.append(bd.get_display)
        if augmentation:
            from albumentations import (Compose, ToFloat, FromFloat, Flip,
                                        OneOf, MotionBlur, MedianBlur, Blur,
                                        ShiftScaleRotate, OpticalDistortion,
                                        ElasticTransform,
                                        RandomBrightnessContrast)

            self.aug = Compose([ToFloat(),
                                OneOf([
                                    MotionBlur(p=0.2),
                                    MedianBlur(blur_limit=3, p=0.1),
                                    Blur(blur_limit=3, p=0.1),
                                ], p=0.2),
                                ShiftScaleRotate(shift_limit=0.0625,
                                                 scale_limit=0.2,
                                                 rotate_limit=45,
                                                 p=0.2),
                                OneOf([
                                    OpticalDistortion(p=0.3),
                                    ElasticTransform(p=0.1),
                                ], p=0.2),
                                ], p=0.5)

        self.im_mode = '1'

    def add(self, image: Union[str, Image.Image], *args, **kwargs) -> None:
        """
        Adds a line-image-text pair to the dataset.

        Args:
            image (str): Input image path
        """
        with open(self.split(image), 'r', encoding='utf-8') as fp:
            gt = fp.read().strip('\n\r')
            for func in self.text_transforms:
                gt = func(gt)
            if not gt:
                raise KrakenInputException(f'Text line is empty ({fp.name})')
        if self.preload:
            try:
                im = Image.open(image)
                im = self.head_transforms(im)
                if not is_bitonal(im):
                    self.im_mode = im.mode
                im = self.tail_transforms(im)
            except ValueError:
                raise KrakenInputException(f'Image transforms failed on {image}')
            self._images.append(im)
        else:
            self._images.append(image)
        self._gt.append(gt)
        self.alphabet.update(gt)

    def add_loaded(self, image: Image.Image, gt: str) -> None:
        """
        Adds an already loaded line-image-text pair to the dataset.

        Args:
            image (PIL.Image.Image): Line image
            gt (str): Text contained in the line image
        """
        if self.preload:
            try:
                im = self.head_transforms(image)
                if not is_bitonal(im):
                    self.im_mode = im.mode
                im = self.tail_transforms(im)
            except ValueError:
                raise KrakenInputException(f'Image transforms failed on {image}')
            self._images.append(im)
        else:
            self._images.append(image)
        for func in self.text_transforms:
            gt = func(gt)
        self._gt.append(gt)
        self.alphabet.update(gt)

    def encode(self, codec: Optional[PytorchCodec] = None) -> None:
        """
        Adds a codec to the dataset and encodes all text lines. Has to be run
        before sampling from the dataset.
        """
        if codec:
            self.codec = codec
        else:
            self.codec = PytorchCodec(''.join(self.alphabet.keys()))
        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], torch.Tensor]]
        for im, gt in zip(self._images, self._gt):
            self.training_set.append((im, self.codec.encode(gt)))

    def no_encode(self) -> None:
        """
        Creates an unencoded dataset.
        """
        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], str]]
        for im, gt in zip(self._images, self._gt):
            self.training_set.append((im, gt))

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        if self.preload:
            x, y = self.training_set[index]
            if self.aug:
                im = x.permute((1, 2, 0)).numpy()
                o = self.aug(image=im)
                im = torch.tensor(o['image'].transpose(2, 0, 1))
                return {'image': im, 'target': y}
            return {'image': x, 'target': y}
        else:
            item = self.training_set[index]
            try:
                logger.debug(f'Attempting to load {item[0]}')
                im = item[0]
                if not isinstance(im, Image.Image):
                    im = Image.open(im)
                im = self.head_transforms(im)
                if not is_bitonal(im):
                    self.im_mode = im.mode
                im = self.tail_transforms(im)
                if self.aug:
                    im = im.permute((1, 2, 0)).numpy()
                    o = self.aug(image=im)
                    im = torch.tensor(o['image'].transpose(2, 0, 1))
                return {'image': im, 'target': item[1]}
            except Exception:
                idx = np.random.randint(0, len(self.training_set))
                logger.debug(traceback.format_exc())
                logger.info(f'Failed. Replacing with sample {idx}')
                return self[idx]

    def __len__(self) -> int:
        return len(self.training_set)
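
# Usage sketch (illustration only, not part of the original module): the
# variant above returns dict samples and slices the transform pipeline after
# the second stage, so this example hands it a three-stage Compose -- two
# PIL-level stages for head_transforms and ToTensor for tail_transforms. The
# blank line image, the transform choices and augmentation=False (set to True
# only if albumentations is installed) are assumptions made for the example.
def _example_dict_dataset_usage():
    line_transforms = transforms.Compose([transforms.Grayscale(),
                                          transforms.Resize(48),
                                          transforms.ToTensor()])
    ds = GroundTruthDataset(im_transforms=line_transforms,
                            preload=True,
                            augmentation=False)
    ds.add_loaded(Image.new('L', (200, 48), 255), 'example transcription')
    ds.encode()
    sample = ds[0]  # {'image': line tensor, 'target': encoded label tensor}
    return sample['image'], sample['target']
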
class PolygonGTDataset(Dataset):
    """
    Dataset for training a line recognition model from polygonal/baseline data.
    """
    def __init__(self,
                 normalization: Optional[str] = None,
                 whitespace_normalization: bool = True,
                 reorder: bool = True,
                 im_transforms: Callable[[Any], torch.Tensor] = transforms.Compose([]),
                 preload: bool = True,
                 augmentation: bool = False) -> None:
        self._images = []  # type: Union[List[Image], List[torch.Tensor]]
        self._gt = []  # type: List[str]
        self.alphabet = Counter()  # type: Counter
        self.text_transforms = []  # type: List[Callable[[str], str]]
        # split image transforms into two parts: one giving the final PIL
        # image before conversion to a tensor and the actual tensor
        # conversion part.
        self.head_transforms = transforms.Compose(im_transforms.transforms[:2])
        self.tail_transforms = transforms.Compose(im_transforms.transforms[2:])
        self.transforms = im_transforms
        self.preload = preload
        self.aug = None
        self.seg_type = 'baselines'
        # build text transformations
        if normalization:
            self.text_transforms.append(lambda x: unicodedata.normalize(cast(str, normalization), x))
        if whitespace_normalization:
            self.text_transforms.append(lambda x: regex.sub(r'\s', ' ', x).strip())
        if reorder:
            self.text_transforms.append(bd.get_display)
        if augmentation:
            from albumentations import (Compose, ToFloat, FromFloat, Flip,
                                        OneOf, MotionBlur, MedianBlur, Blur,
                                        ShiftScaleRotate, OpticalDistortion,
                                        ElasticTransform,
                                        RandomBrightnessContrast)

            self.aug = Compose([ToFloat(),
                                OneOf([
                                    MotionBlur(p=0.2),
                                    MedianBlur(blur_limit=3, p=0.1),
                                    Blur(blur_limit=3, p=0.1),
                                ], p=0.2),
                                ShiftScaleRotate(shift_limit=0.0625,
                                                 scale_limit=0.2,
                                                 rotate_limit=3,
                                                 p=0.2),
                                OneOf([
                                    OpticalDistortion(p=0.3),
                                    ElasticTransform(p=0.1),
                                ], p=0.2),
                                ], p=0.5)

        self.im_mode = '1'

    def add(self,
            image: Union[str, Image.Image],
            text: str,
            baseline: List[Tuple[int, int]],
            boundary: List[Tuple[int, int]],
            *args,
            **kwargs):
        """
        Adds a line to the dataset.

        Args:
            image (str or PIL.Image.Image): Path to the whole page image or
                                            the loaded page image itself.
            text (str): Transcription of the line.
            baseline (list): A list of coordinates [[x0, y0], ..., [xn, yn]].
            boundary (list): A polygon mask for the line.
        """
        for func in self.text_transforms:
            text = func(text)
        if not text:
            raise KrakenInputException('Text line is empty after transformations')
        if self.preload:
            if not isinstance(image, Image.Image):
                im = Image.open(image)
            else:
                im = image
            im, _ = next(extract_polygons(im, {'type': 'baselines',
                                               'lines': [{'baseline': baseline,
                                                          'boundary': boundary}]}))
            try:
                im = self.head_transforms(im)
                if not is_bitonal(im):
                    self.im_mode = im.mode
                im = self.tail_transforms(im)
            except ValueError:
                raise KrakenInputException('Image transforms failed on {}'.format(image))
            self._images.append(im)
        else:
            self._images.append((image, baseline, boundary))
        self._gt.append(text)
        self.alphabet.update(text)

    def encode(self, codec: Optional[PytorchCodec] = None) -> None:
        """
        Adds a codec to the dataset and encodes all text lines. Has to be run
        before sampling from the dataset.
        """
        if codec:
            self.codec = codec
        else:
            self.codec = PytorchCodec(''.join(self.alphabet.keys()))
        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], torch.Tensor]]
        for im, gt in zip(self._images, self._gt):
            self.training_set.append((im, self.codec.encode(gt)))

    def no_encode(self) -> None:
        """
        Creates an unencoded dataset.
        """
        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], str]]
        for im, gt in zip(self._images, self._gt):
            self.training_set.append((im, gt))

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        if self.preload:
            x, y = self.training_set[index]
            if self.aug:
                x = x.permute((1, 2, 0)).numpy()
                o = self.aug(image=x)
                x = torch.tensor(o['image'].transpose(2, 0, 1))
            return {'image': x, 'target': y}
        else:
            item = self.training_set[index]
            try:
                logger.debug('Attempting to load {}'.format(item[0]))
                im = item[0][0]
                if not isinstance(im, Image.Image):
                    im = Image.open(im)
                im, _ = next(extract_polygons(im, {'type': 'baselines',
                                                   'lines': [{'baseline': item[0][1],
                                                              'boundary': item[0][2]}]}))
                im = self.head_transforms(im)
                if not is_bitonal(im):
                    self.im_mode = im.mode
                im = self.tail_transforms(im)
                if self.aug:
                    im = im.permute((1, 2, 0)).numpy()
                    o = self.aug(image=im)
                    im = torch.tensor(o['image'].transpose(2, 0, 1))
                return {'image': im, 'target': item[1]}
            except Exception:
                idx = np.random.randint(0, len(self.training_set))
                logger.debug('Failed. Replacing with sample {}'.format(idx))
                return self[idx]

    def __len__(self) -> int:
        return len(self.training_set)
class GroundTruthDataset(Dataset):
    """
    Dataset for ground truth used during training.

    All data is cached in memory.
    """
    def __init__(self,
                 split: Callable[[str], str] = lambda x: os.path.splitext(x)[0],
                 suffix: str = '.gt.txt',
                 normalization: Optional[str] = None,
                 reorder: bool = True,
                 im_transforms: Callable[[Any], torch.Tensor] = transforms.Compose([]),
                 preload: bool = True) -> None:
        """
        Reads a list of image-text pairs and creates a ground truth set.

        Args:
            split (func): Function for generating the base name without
                          extensions from paths.
            suffix (str): Suffix to attach to the image base name for text
                          retrieval.
            normalization (str): Unicode normalization applied to the ground
                                 truth.
            reorder (bool): Whether to rearrange code points in "display"/LTR
                            order.
            im_transforms (func): Function taking a PIL.Image and returning a
                                  tensor suitable for forward passes.
            preload (bool): Enables preloading and preprocessing of image files.
        """
        self.suffix = suffix
        self.split = lambda x: split(x) + self.suffix
        self._images = []  # type: Union[List[Image], List[torch.Tensor]]
        self._gt = []  # type: List[str]
        self.alphabet = Counter()  # type: Counter
        self.text_transforms = []  # type: List[Callable[[str], str]]
        self.transforms = im_transforms
        self.preload = preload
        # build text transformations
        if normalization:
            self.text_transforms.append(lambda x: unicodedata.normalize(cast(str, normalization), x))
        if reorder:
            self.text_transforms.append(bd.get_display)

    def add(self, image: str) -> None:
        """
        Adds a line-image-text pair to the dataset.

        Args:
            image (str): Input image path
        """
        with open(self.split(image), 'r', encoding='utf-8') as fp:
            gt = fp.read().strip('\n\r')
            for func in self.text_transforms:
                gt = func(gt)
            if not gt:
                raise KrakenInputException('Text line is empty ({})'.format(fp.name))
        if self.preload:
            im = Image.open(image)
            try:
                im = self.transforms(im)
            except ValueError:
                raise KrakenInputException('Image transforms failed on {}'.format(image))
            self._images.append(im)
        else:
            self._images.append(image)
        self._gt.append(gt)
        self.alphabet.update(gt)

    def encode(self, codec: Optional[PytorchCodec] = None) -> None:
        """
        Adds a codec to the dataset and encodes all text lines. Has to be run
        before sampling from the dataset.
        """
        if codec:
            self.codec = codec
        else:
            self.codec = PytorchCodec(''.join(self.alphabet.keys()))
        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], torch.Tensor]]
        for im, gt in zip(self._images, self._gt):
            self.training_set.append((im, self.codec.encode(gt)))

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        if self.preload:
            return self.training_set[index]
        else:
            item = self.training_set[index]
            try:
                logger.debug('Attempting to load {}'.format(item[0]))
                return (self.transforms(Image.open(item[0])), item[1])
            except Exception:
                idx = np.random.randint(0, len(self.training_set))
                logger.debug('Failed. Replacing with sample {}'.format(idx))
                return self[idx]

    def __len__(self) -> int:
        return len(self.training_set)