class RegionVisualizer(object):
    """Draw region outlines and optional text labels onto an image.

    Only these region members are used: ``path``, ``pos``, ``size``,
    ``cls``, ``text`` and ``get_bottom_right()``.
    """

    def __init__(self, config=None):
        """Create the visualizer.

        Args:
            config: Optional settings, wrapped in a ``Configuration`` that is
                backed by ``DEFAULT_CONFIG``. ``None`` means "no overrides".
        """
        # A fresh dict per instance: a ``{}`` default would be one shared
        # object for every visualizer created without a config.
        self.config = Configuration({} if config is None else config,
                                    DEFAULT_CONFIG)

    def __call__(self, image, regions, is_gt=False):
        """Draw every region in ``regions`` onto ``image`` and return it.

        The image is modified in place; the same object is returned.
        """
        for region in regions:
            self._viz_region(image, region, is_gt)
        return image

    def _draw_lines(self, image, region, color):
        """Draw the region as a polygon (if it has a path) or a rectangle."""
        filled = self.config["filled"]
        if len(region.path) > 0:
            outline = [np.array(region.path)]
            if filled:
                cv2.fillPoly(image, outline, color)
            else:
                # Third argument 1 == closed polyline.
                cv2.polylines(image, outline, 1, color)
        else:
            # Thickness -1 asks OpenCV for a filled rectangle.
            cv2.rectangle(image, region.pos, region.get_bottom_right(), color,
                          -1 if filled else 1)

    def _color(self, region, is_gt=False):
        """Return the drawing colour tuple for ``region``.

        Ground truth is always ``(0, 255, 0)``; class ``0`` regions get
        ``(255, 0, 0)``; everything else gets ``(0, 0, 255)``.
        """
        if is_gt:
            return (0, 255, 0)
        if region.cls is not None and region.cls == 0:
            return (255, 0, 0)
        return (0, 0, 255)

    def _draw_text(self, image, region, color):
        """Write ``region.text`` next to the region when labelling applies.

        Text is only drawn for regions with text whose class is unset or
        ``1``, and only while the ``"text"`` option is enabled.
        """
        if region.text is None:
            return
        if not (region.cls is None or region.cls == 1):
            return
        if not self.config.default("text", True):
            return

        x, y = region.pos
        # Font scale and stroke thickness both double in "large" mode.
        scale = 2 if self.config["large"] else 1
        thickness = 2 if self.config["large"] else 1
        reloc = 5 * scale
        # place text below if there is not enough space above
        if y - (20 + reloc) < 0:
            y = y + reloc + region.size[1]
        else:
            y = y - reloc
        cv2.putText(image, region.text, (x, y), cv2.FONT_HERSHEY_PLAIN, scale,
                    color, thickness)

    def _viz_region(self, image, region, is_gt=False):
        """Draw outline and label of a single region."""
        color = self._color(region, is_gt)
        self._draw_lines(image, region, color)
        self._draw_text(image, region, color)

    def store(self, vizimage, original_file):
        """Write ``vizimage`` into the configured ``"store"`` directory.

        The output keeps the base name of ``original_file``. Nothing happens
        when the ``"store"`` option is unset or falsy.
        """
        if not self.config.default("store", False):
            return
        os.makedirs(self.config["store"], exist_ok=True)
        filename = os.path.basename(original_file)
        cv2.imwrite(os.path.join(self.config["store"], filename), vizimage)
class Layer(object):
    """Base class for network layers with config/default lookup.

    Subclasses implement ``__call__(x, is_train)``.
    """

    # NOTE(review): ``__metaclass__`` is the Python 2 spelling; left as is so
    # instantiation behaviour does not change.
    __metaclass__ = abc.ABCMeta

    def __init__(self, config, defaults, data_format='nhwc'):
        """Store the layer config, its defaults and the tensor data format."""
        self._config = Configuration(config)
        self._defaults = Configuration(defaults)
        self._format = data_format

    def __getitem__(self, key):
        """Return the configured value for ``key``.

        Falls back to the layer defaults, and to ``None`` if the key is in
        neither.
        """
        fallback = self._defaults.default(key, None)
        return self._config.default(key, fallback)

    def _parse_format(self):
        """Translate ``'nchw'`` / anything else into a ``channels_*`` name."""
        if self._format == 'nchw':
            return 'channels_first'
        return 'channels_last'

    @abc.abstractmethod
    def __call__(self, x, is_train):
        """Apply the layer to ``x``; to be provided by subclasses."""
        pass
class SeparatedVisualizer(object):
    """Show the original image and a processed image side by side."""

    def __init__(self, config=None):
        """Create the visualizer.

        Args:
            config: Optional settings, wrapped in a ``Configuration`` that is
                backed by ``DEFAULT_CONFIG``. ``None`` means "no overrides".
        """
        # A fresh dict per instance instead of one shared ``{}`` default.
        self.config = Configuration({} if config is None else config,
                                    DEFAULT_CONFIG)

    def __call__(self, original, merged, is_gt=False):
        """Return ``original`` and ``merged`` concatenated horizontally.

        A three-channel ``original`` is converted to grayscale first so that
        both halves have the same number of dimensions. ``is_gt`` is
        accepted for interface compatibility and is not used.
        """
        if len(original.shape) > 2 and original.shape[2] == 3:
            original = cv2.cvtColor(original, cv2.COLOR_BGR2GRAY)
        return np.concatenate((original, merged), axis=1)

    # ``store`` used to be defined twice with identical bodies; the second
    # definition simply replaced the first, so only one is kept.
    def store(self, vizimage, original_file):
        """Write ``vizimage`` into the configured ``"store"`` directory.

        The output keeps the base name of ``original_file``. Nothing happens
        when the ``"store"`` option is unset or falsy.
        """
        if not self.config.default("store", False):
            return
        os.makedirs(self.config["store"], exist_ok=True)
        filename = os.path.basename(original_file)
        cv2.imwrite(os.path.join(self.config["store"], filename), vizimage)
class AlgorithmBase(object):
    """Base class for algorithms with config/default lookup.

    Subclasses implement ``build_graph``.
    """

    # NOTE(review): ``__metaclass__`` is the Python 2 spelling; left as is so
    # instantiation behaviour does not change.
    __metaclass__ = abc.ABCMeta

    # Class-level default; ``set_cpu`` overrides it per instance.
    _cpu = False

    def set_cpu(self, is_cpu):
        """Record whether the algorithm should run on the CPU."""
        self._cpu = is_cpu

    def __init__(self, config, defaults):
        """Store the algorithm config and its defaults."""
        self._config = Configuration(config)
        self._defaults = Configuration(defaults)

    def __getitem__(self, key):
        """Return the configured value for ``key``.

        Falls back to the algorithm defaults, and to ``None`` if the key is
        in neither.
        """
        default = self._defaults.default(key, None)
        return self._config.default(key, default)

    @abc.abstractmethod
    def build_graph(self):
        """Build the computation graph; to be provided by subclasses.

        The original declaration had no ``self`` parameter, so calling it on
        an instance raised ``TypeError``.
        """
        pass
class ImageVisualizer(object):
    """Visualizer that shows the processed image unchanged."""

    def __init__(self, config=None):
        """Create the visualizer.

        Args:
            config: Optional settings, wrapped in a ``Configuration`` that is
                backed by ``DEFAULT_CONFIG``. ``None`` means "no overrides".
        """
        # A fresh dict per instance instead of one shared ``{}`` default.
        self.config = Configuration({} if config is None else config,
                                    DEFAULT_CONFIG)

    def __call__(self, original, merged, is_gt=False):
        """Return ``merged`` as is.

        ``original`` and ``is_gt`` are accepted for interface compatibility
        and are not used.
        """
        return merged

    def store(self, vizimage, original_file):
        """Write ``vizimage`` into the configured ``"store"`` directory.

        The output keeps the base name of ``original_file``. Nothing happens
        when the ``"store"`` option is unset or falsy.
        """
        if not self.config.default("store", False):
            return
        os.makedirs(self.config["store"], exist_ok=True)
        filename = os.path.basename(original_file)
        cv2.imwrite(os.path.join(self.config["store"], filename), vizimage)
class PaperNoteWords(Dataset):
    """Word-level dataset read from a paper-notes directory tree.

    ``self.data`` maps subset names (``train``/``dev``/``test`` and their
    ``print_*`` classification counterparts) to lists of
    ``{"path": ..., "truth": ...}`` dicts.
    """

    def __init__(self, **kwargs):
        """Load and compile all subsets.

        Keyword Args:
            paper_note_path: Root folder of the word data.
            meta: Dict with at least ``height`` and ``width`` (read by
                ``load_image``).
            data_config: Preprocessing / augmentation settings.
            vocab: Pair of lookups: ``vocab[0]`` index -> char,
                ``vocab[1]`` char -> index.
            pure: If true, subsets are read from ``pure_*`` folders.
            max_length: Length every compiled label is padded to.
        """
        self.paper_note_path = kwargs.get('paper_note_path',
                                          '../paper-notes/data/words')
        self.meta = Configuration(kwargs.get('meta', {}))
        self.data_config = Configuration(kwargs.get('data_config', {}))
        self.vocab = kwargs.get('vocab', {})
        self.pure = kwargs.get('pure', True)
        # NOTE(review): defaults to None, but ``compile`` does arithmetic on
        # it - callers apparently must always pass it; confirm.
        self.max_length = kwargs.get('max_length')
        self._load_data()
        self._compile_sets()
        self.augmenter = ImageAugmenter(self.data_config)

    def info(self):
        """Intentionally does nothing."""
        pass

    def _compile_set(self, dataset):
        """Attach the compiled (index-encoded) truth to every item."""
        for item in self.data[dataset]:
            item['compiled'] = self.compile(item['truth'])

    def _compile_sets(self):
        """Compile the three transcription subsets."""
        for subset in ("train", "dev", "test"):
            self._compile_set(subset)

    def _load_data(self):
        """Populate ``self.data`` with word lists and classification lists."""
        prefix = "pure_" if self.pure else ""
        # The loading order is kept as is because ``_load_classlist``
        # consumes the shared random state via ``shuffle``.
        self.data = {
            "dev": self._load_wordlist("{}dev".format(prefix)),
            "train": self._load_wordlist("{}train".format(prefix)),
            "test": self._load_wordlist("{}test".format(prefix)),
            "print_dev": self._load_classlist("dev"),
            "print_test": self._load_classlist("test"),
            "print_train": self._load_classlist("train"),
        }

    def _load_wordlist(self, subset):
        """Return file objects for every word in the subset's ``words`` JSON.

        The JSON maps a word id to its transcription; the image is expected
        at ``<subset>/<id>.png``.
        """
        basepath = os.path.join(self.paper_note_path, subset)
        words = util.loadJson(basepath, "words")
        return [
            self._fileobj(basepath, "{}.png".format(word), words[word])
            for word in words
        ]

    def _load_classlist(self, subset):
        """Return handwritten files (truth 1) followed by printed files (0).

        The printed part is capped at the number of handwritten files.
        """
        files = self._load_filelist(subset, 1)
        files.extend(
            self._load_filelist("print_{}".format(subset), 0, len(files)))
        return files

    def _load_filelist(self, subset, is_htr, length=None) -> list:
        """Return up to ``length`` shuffled ``.png`` file objects of a folder.

        Args:
            subset: Folder name below ``paper_note_path``.
            is_htr: Value stored as ``truth`` for every file.
            length: Maximum number of files; ``None`` means all.

        Returns:
            A list of file objects, empty if the folder does not exist.
        """
        basepath = os.path.join(self.paper_note_path, subset)
        if not os.path.exists(basepath):
            return []
        # Filter first, then truncate: previously the directory listing was
        # cut to ``length`` before dropping non-png entries, so stray files
        # reduced the number of images returned.
        images = [
            name for name in os.listdir(basepath) if name.endswith(".png")
        ]
        shuffle(images)
        if length is not None:
            images = images[:length]
        return [self._fileobj(basepath, name, is_htr) for name in images]

    def _fileobj(self, basepath: str, filename: str, truth):
        """Build the dict describing one sample."""
        return {
            "path": os.path.join(basepath, filename),
            "truth": truth,
        }

    def compile(self, text):
        """Encode ``text`` as vocab indices, right-padded with ``-1``."""
        parsed = [self.vocab[1][c] for c in text]
        parsed.extend([-1] * (self.max_length - len(text)))
        return parsed

    def decompile(self, values):
        """Decode indices back to text; unknown indices are dropped."""

        def getKey(key):
            try:
                return self.vocab[0][str(key)]
            except KeyError:
                return ''

        return ''.join([getKey(c) for c in values])

    def getBatchCount(self, batch_size, max_batches=0, dataset="train"):
        """Return the number of batches, capped by ``max_batches`` if > 0."""
        total_len = len(self.data[dataset])
        num_batches = int(math.ceil(float(total_len) / batch_size))
        return min(num_batches, max_batches) if max_batches > 0 else num_batches

    def generateBatch(self, batch_size, max_batches=0, dataset="train",
                      with_filepath=False, augmentable=False):
        """Yield the batches of one pass over ``dataset``.

        The subset is shuffled in place first when ``shuffle_epoch`` is set
        in the data config.
        """
        num_batches = self.getBatchCount(batch_size, max_batches, dataset)
        if self.data_config.default('shuffle_epoch', False):
            shuffle(self.data[dataset])
        for b in range(num_batches):
            yield self._load_batch(b, batch_size, dataset, with_filepath,
                                   augmentable=augmentable)

    def load_image(self, path, transpose=False, augmentable=False):
        """Read, preprocess and optionally augment one grayscale image.

        Returns:
            The image with a trailing channel axis, padded to the meta
            ``height`` x ``width``, or ``None`` if the file cannot be read,
            is empty, or preprocessing rejects it. ``transpose`` is unused.
        """
        padding = self.data_config.default('preprocess.padding', 0)
        # Preprocessing adds the padding back, so aim for the inner size.
        target_size = (int(self.meta["height"] - (padding * 2)),
                       int(self.meta["width"] - (padding * 2)))
        x = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if x is None or x.shape[0] == 0 or x.shape[1] == 0:
            return None
        x = self.augmenter.preprocess(x, target_size)
        if x is None:
            return None
        if self.data_config.default("otf_augmentations", False) and augmentable:
            x = self.augmenter.augment(x)
        else:
            x = self.augmenter.add_graychannel(x)
        if (x.shape[1] != self.meta["width"]
                or x.shape[0] != self.meta["height"]):
            x = self.augmenter.pad_to_size(x, width=self.meta["width"],
                                           height=self.meta["height"])
        return self.augmenter.add_graychannel(x)

    def _loadline(self, line, transpose=True, augmentable=False):
        """Return ``(image, compiled label, label length, path)``."""
        l = len(line["truth"])
        y = np.asarray(line["compiled"])
        x = self.load_image(line["path"], augmentable=augmentable)
        return x, y, l, line["path"]

    def _loadprintline(self, line, transpose=True, augmentable=False):
        """Return ``(image, [class], 0, path)`` for a classification item."""
        y = line["truth"]
        x = self.load_image(line["path"], augmentable=augmentable)
        return x, [y], 0, line["path"]

    def _load_batch(self, index, batch_size, dataset, with_filepath=False,
                    augmentable=False):
        """Load batch number ``index``; unreadable images are skipped.

        Returns:
            ``(X, Y, L)`` as numpy arrays, plus the list of file paths when
            ``with_filepath`` is true.
        """
        X = []
        Y = []
        L = []
        F = []
        if dataset.startswith("print_"):
            parseline = self._loadprintline
        else:
            parseline = self._loadline
        first = index * batch_size
        last = min((index + 1) * batch_size, len(self.data[dataset]))
        for idx in range(first, last):
            x, y, l, f = parseline(self.data[dataset][idx],
                                   augmentable=augmentable)
            if x is not None:
                X.append(x)
                Y.append(y)
                L.append(l)
                F.append(f)
        X = np.asarray(X)
        Y = np.asarray(Y)
        L = np.asarray(L)
        if not with_filepath:
            return X, Y, L
        return X, Y, L, F

    # deprecated
    def generateEpochs(self, batch_size, num_epochs, max_batches=0,
                       dataset="train", with_filepath=False,
                       augmentable=False):
        """Yield one batch generator per epoch (deprecated)."""
        for e in range(num_epochs):
            yield self.generateBatch(batch_size, max_batches=max_batches,
                                     dataset=dataset,
                                     with_filepath=with_filepath,
                                     augmentable=augmentable)
class E2ERunner(object):
    """Run a configured chain of processing blocks over a set of images.

    Optionally visualizes results, loads ground truth and evaluates the
    results with the configured evaluators.
    """

    def __init__(self, config=None, globalConfig=None):
        """Parse the configuration and instantiate all components.

        Args:
            config: Runner settings (``blocks``, ``data``, optional ``viz``,
                ``gt``, ``eval``, ``ranger``).
            globalConfig: Settings handed to blocks that need them.
        """
        # Fresh dicts per instance instead of shared ``{}`` defaults;
        # ``_range_exec`` writes into ``self.config``.
        self.config = Configuration({} if config is None else config)
        self.globalConfig = Configuration(
            {} if globalConfig is None else globalConfig)
        self._parse_config()
        self.logger = Logger()
        # NOTE(review): calls the Configuration object with no arguments;
        # presumably this prints the configuration - confirm.
        self.config()

    def _parse_config(self):
        """(Re)build blocks, visualizer, GT provider and evaluators."""
        self._parse_blocks(self.config["blocks"])
        self.viz = self._parse_visualizer(self.config.default("viz", None))
        self.gtprov = self._parse_gt(self.config.default("gt", None))
        self.evals = self._parse_evals(self.config.default('eval', []))

    def _parse_blocks(self, blocks):
        """Instantiate every block that is not marked as ``disabled``."""
        self.blocks = [
            self._parse_block(block) for block in blocks
            if "disabled" not in block or not block["disabled"]
        ]

    def _parse_block(self, block):
        """Instantiate one block by its ``type``; ``None`` if unknown."""
        if block["type"] == "TextSeparation":
            return TextSeparation(self.globalConfig, block)
        elif block["type"] == "WordDetection":
            return WordDetection(block)
        elif block["type"] == "LineSegmentation":
            return LineSegmentation(block)
        elif block["type"] == "ParagraphDetection":
            return ParagraphDetection(block)
        elif block["type"] == "UnigramLanguageModel":
            return UnigramLanguageModel(block)
        elif block["type"] == "Ceiling":
            return Ceiling(block)
        elif block["type"] == "TranscriptionAndClassification":
            return TranscriptionAndClassification(self.globalConfig, block)

    def _parse_evals(self, eval_configs):
        """Instantiate all configured evaluators."""
        return [self._parse_eval(config) for config in eval_configs]

    def _parse_eval(self, config):
        """Instantiate one evaluator by its ``type``; ``None`` if unknown."""
        if config is None:
            return None
        if config["type"] == "IoU":
            return IoU(config)
        elif config["type"] == "IoUPixelSum":
            return IoUPixelSum(config)
        elif config["type"] == "BagOfWords":
            return BagOfWords(config)
        elif config["type"] == "IoUCER":
            return IoUCER(config)

    def _parse_data(self, data_config):
        """Return the list of input files.

        A list is returned unchanged. Otherwise the files in
        ``data_config["path"]`` matching the optional ``prefix`` and the
        ``suffix`` are returned, cut to ``limit`` entries when ``limit > 0``.
        """
        if isinstance(data_config, list):
            return data_config
        prefix = data_config["prefix"] if "prefix" in data_config else ""
        filenames = [
            f for f in os.listdir(data_config["path"])
            if f.endswith(data_config["suffix"]) and f.startswith(prefix)
        ]
        if data_config["limit"] > 0:
            filenames = filenames[:data_config["limit"]]
        return [
            os.path.join(data_config["path"], filename)
            for filename in filenames
        ]

    def _parse_visualizer(self, viz_config):
        """Instantiate the visualizer by ``type``; ``None`` if unset/unknown."""
        if viz_config is None:
            return None
        if viz_config["type"] == "RegionVisualizer":
            return RegionVisualizer(viz_config)
        elif viz_config["type"] == "ImageVisualizer":
            return ImageVisualizer(viz_config)
        elif viz_config["type"] == "SeparatedVisualizer":
            return SeparatedVisualizer(viz_config)

    def _parse_gt(self, gt_config):
        """Instantiate the GT provider by ``type``; ``None`` if unset/unknown."""
        if gt_config is None:
            return None
        if gt_config["type"] == "WordRegion":
            return WordRegionGTProvider()
        elif gt_config["type"] == "ParagraphRegion":
            return ParagraphRegionGTProvider()
        elif gt_config["type"] == "LineRegion":
            return LineRegionGTProvider()

    def __call__(self, log_prefix="E2E", skip_range_evaluation=False):
        """Process all configured files and return the per-file results.

        When a ``ranger`` section is configured (and not explicitly
        skipped), control is handed to ``_range_exec`` instead, whose
        result (``None``) is returned.
        """
        if not skip_range_evaluation and self.config.default("ranger", False):
            self.logger.write("Entering Range Execution Mode")
            return self._range_exec()
        start = time()
        self.scores = {}
        data = self._parse_data(self.config["data"])
        results = []
        times = []
        for idx, file in enumerate(data):
            file_time = time()
            self.logger.progress(log_prefix, idx, len(data))
            results.append(self._exec(file))
            times.append(time() - file_time)
        for block in self.blocks:
            block.close()
        if len(self.evals) > 0:
            final_scores = {
                "time": time() - start,
                "median time": np.median(times),
                "avg time": np.average(times)
            }
            for score_key in self.scores:
                final_scores[score_key] = np.average(self.scores[score_key])
            self.logger.summary(log_prefix, final_scores)
        return results

    def _get_range(self):
        """Return the value range described by ``ranger.values``.

        Only the ``{from, to, step}`` dict form is handled; any other form
        yields ``None``.
        """
        # ``type(...) is dict`` is kept on purpose: it is unknown from here
        # whether Configuration returns dict subclasses.
        if type(self.config["ranger.values"]) is dict:
            return frange(self.config["ranger.values.from"],
                          self.config["ranger.values.to"],
                          self.config["ranger.values.step"])

    def _range_exec(self):
        """Run the whole pipeline once per value of the configured range."""

        def set_config(value):
            """Write ``value`` to every ranged config path, then re-parse."""
            paths = self.config.default(
                "ranger.paths", [self.config.default("ranger.path", [])])
            for path in paths:
                current = self.config
                for step in path[:-1]:
                    current = current[step]
                current[path[-1]] = value
            self._parse_config()

        for val in self._get_range():
            set_config(val)
            prefix = self.config.default("ranger.template", "value {}")
            self(log_prefix=prefix.format(val), skip_range_evaluation=True)

    def _exec(self, file):
        """Run all blocks on one file; visualize and evaluate if configured.

        Returns:
            Dict with ``file``, ``original``, ``result`` and, when a
            visualizer is configured, ``viz``.

        Raises:
            ValueError: If evaluators are configured without a ground-truth
                provider. Previously this surfaced as an
                ``UnboundLocalError`` on ``gt``.
        """
        original = cv2.imread(file)
        last_output = original.copy()
        for block in self.blocks:
            last_output = block(last_output, file)
        res = {"file": file, "original": original, "result": last_output}

        gt = None
        if self.gtprov is not None:
            gt = self.gtprov(file, original)

        if self.viz is not None:
            vizimage = res["original"].copy()
            if self.gtprov is not None and self.config.default(
                    'gt.viz', False):
                vizimage = self.viz(vizimage, gt, True)
            if len(self.blocks) > 0:
                vizimage = self.viz(vizimage, res["result"], False)
            self.viz.store(vizimage, file)
            res["viz"] = vizimage

        if len(self.evals) > 0:
            if self.gtprov is None:
                raise ValueError(
                    "evaluation requires a ground-truth provider ('gt')")
            for evl in self.evals:
                scores = evl(gt, res["result"])
                for score_key in scores.keys():
                    # Newest score first, as before, but without rebuilding
                    # the whole list on every insert.
                    self.scores.setdefault(score_key, []).insert(
                        0, scores[score_key])
        return res
class ImageAugmenter(object):
    """Preprocess, pad and randomly augment grayscale images.

    All behaviour is driven by the ``preprocess.*``, ``postprocess.*`` and
    ``otf_augmentations.*`` entries of the given configuration.
    """

    def __init__(self, config):
        """Wrap ``config`` in a ``Configuration``."""
        self.config = Configuration(config)

    def augment(self, img, get_settings=False):
        """Apply the configured on-the-fly augmentations in a fixed order.

        Order: warp, affine, morph, binarize, blur, sharpen, brighten,
        darken. Augmentations with a ``prob`` setting are applied with that
        probability.

        Args:
            img: Grayscale image array.
            get_settings: If true, also return the parameters actually used,
                in the form ``apply_augmentation`` reads.

        Returns:
            The augmented image with a channel axis, or
            ``(image, Configuration(settings))`` when ``get_settings`` is
            true.
        """
        cfg = self.config
        enabled = cfg["otf_augmentations"]
        invert = cfg.default('preprocess.invert', False)
        augmentation_settings = {}

        if "warp" in enabled:
            if np.random.uniform() < cfg['otf_augmentations.warp.prob']:
                # Warping happens on the inverted image when the
                # preprocessing did not already invert it.
                if not invert:
                    img = 255 - img
                reshaped = False
                if len(img.shape) > 2:
                    reshaped = True
                    img = np.reshape(img, (img.shape[0], img.shape[1]))
                img = convert._cv2pil(img)
                img, mat = warp._warp(
                    img,
                    gridsize=cfg['otf_augmentations.warp.gridsize'],
                    deviation=cfg['otf_augmentations.warp.deviation'],
                    return_mat=True)
                augmentation_settings["warp"] = {
                    "gridsize": cfg['otf_augmentations.warp.gridsize'],
                    "mat": mat
                }
                img = convert._pil2cv2(img)
                if reshaped:
                    img = np.reshape(img, (img.shape[0], img.shape[1], 1))
                if not invert:
                    img = 255 - img

        if "affine" in enabled:
            if invert:
                img = 255 - img
            img, mat = affine._affine(
                img, cfg["otf_augmentations.affine"], return_mat=True)
            augmentation_settings["affine"] = {"mat": mat}
            if invert:
                img = 255 - img

        if "morph" in enabled:
            img, op_name, op_values = morph._random_morph(
                img, cfg["otf_augmentations.morph"], invert, True)
            # Stored under "morph": this used to overwrite the "affine"
            # entry, while ``apply_augmentation`` reads ``morph.op_name``.
            augmentation_settings["morph"] = {
                "op_name": op_name,
                "op_values": op_values
            }

        if "binarize" in enabled:
            if np.random.uniform() < cfg['otf_augmentations.binarize.prob']:
                img = binarize._binarize(img)
                # NOTE(review): ``apply_augmentation`` tests this entry for
                # truthiness; an empty dict may count as "not applied"
                # depending on how Configuration wraps it - confirm.
                augmentation_settings["binarize"] = {}

        if "blur" in enabled:
            if np.random.uniform() < cfg['otf_augmentations.blur.prob']:
                img = cv2.GaussianBlur(
                    img, tuple(cfg['otf_augmentations.blur.kernel']),
                    cfg['otf_augmentations.blur.sigma'])
                augmentation_settings["blur"] = {
                    "kernel": cfg['otf_augmentations.blur.kernel'],
                    "sigma": cfg['otf_augmentations.blur.sigma']
                }

        if "sharpen" in enabled:
            if np.random.uniform() < cfg['otf_augmentations.sharpen.prob']:
                img = self._unsharp_mask_filter(
                    img, tuple(cfg['otf_augmentations.sharpen.kernel']),
                    cfg['otf_augmentations.sharpen.sigma'])
                augmentation_settings["sharpen"] = {
                    "kernel": cfg['otf_augmentations.sharpen.kernel'],
                    "sigma": cfg['otf_augmentations.sharpen.sigma']
                }

        if "brighten" in enabled:
            if np.random.uniform() < cfg['otf_augmentations.brighten.prob']:
                factor = np.random.normal(
                    cfg['otf_augmentations.brighten.center'],
                    cfg['otf_augmentations.brighten.stdv'])
                # Factors below 1 are clamped to 1.
                factor = factor if factor >= 1 else 1
                img = np.uint8(np.clip(img * factor, 0, 255))
                augmentation_settings["brighten"] = {"factor": factor}

        if "darken" in enabled:
            if np.random.uniform() < cfg['otf_augmentations.darken.prob']:
                factor = np.random.normal(
                    cfg['otf_augmentations.darken.center'],
                    cfg['otf_augmentations.darken.stdv'])
                factor = factor if factor >= 1 else 1
                # Scale the distance to white, i.e. scale in inverted space.
                img = 255 - np.uint8(
                    np.clip((255 - img) * factor, 0.0, 255.0))
                augmentation_settings["darken"] = {"factor": factor}

        if not get_settings:
            return self.add_graychannel(img)
        return self.add_graychannel(img), Configuration(augmentation_settings)

    def binarization(self, img):
        """Threshold ``img`` at 200, in non-inverted space."""
        invert = self.config.default('preprocess.invert', False)
        if invert:
            img = 255 - img
        _, img = cv2.threshold(img, 200, 255, cv2.THRESH_BINARY)
        if invert:
            img = 255 - img
        return self.add_graychannel(img)

    def apply_augmentation(self, img, settings):
        """Replay the augmentations recorded in ``settings`` on ``img``.

        ``settings`` has the layout produced by
        ``augment(..., get_settings=True)``.
        """
        invert = self.config.default('preprocess.invert', False)
        if settings.default("warp", False):
            if not invert:
                img = 255 - img
            reshaped = False
            if len(img.shape) > 2:
                reshaped = True
                img = np.reshape(img, (img.shape[0], img.shape[1]))
            img = convert._cv2pil(img)
            img = warp._warp(
                img, gridsize=settings['warp.gridsize'],
                mat=settings['warp.mat'])
            img = convert._pil2cv2(img)
            if reshaped:
                img = np.reshape(img, (img.shape[0], img.shape[1], 1))
            if not invert:
                img = 255 - img
        if settings.default("affine", False):
            img = affine._affine(
                img, mat=settings["affine.mat"], background=255.0)
        if settings.default("morph", False):
            img = morph._morph(img, settings['morph.op_name'],
                               settings['morph.op_values'], invert)
        if settings.default("binarize", False):
            img = binarize._binarize(img)
        if settings.default("blur", False):
            img = cv2.GaussianBlur(
                img, tuple(settings['blur.kernel']), settings['blur.sigma'])
        if settings.default("sharpen", False):
            img = self._unsharp_mask_filter(
                img, tuple(settings['sharpen.kernel']),
                settings['sharpen.sigma'])
        if settings.default("brighten", False):
            img = np.uint8(
                np.clip(img * settings["brighten.factor"], 0.0, 255.0))
        if settings.default("darken", False):
            img = 255 - np.uint8(
                np.clip((255 - img) * settings["darken.factor"], 0.0, 255.0))
        return self.add_graychannel(img)

    def _unsharp_mask_filter(self, image, kernel, sigma):
        """Sharpen ``image`` with an unsharp mask.

        ``image`` is also passed as the destination, so it is presumably
        overwritten in place - confirm before relying on the input
        afterwards.
        """
        gaussian_3 = cv2.GaussianBlur(image, kernel, sigma)
        return cv2.addWeighted(image, 1.5, gaussian_3, -0.5, 0, image)

    def add_graychannel(self, img):
        """Return ``img`` with a trailing channel axis if it is 2-D."""
        if len(img.shape) == 2:
            return np.reshape(img, [img.shape[0], img.shape[1], 1])
        return img

    def pad_to_size(self, img, height, width):
        """Zero-pad ``img`` (top-left aligned) to ``height`` x ``width`` x 1."""
        return self._pad(img, (height, width, 1))

    def _scale(self, img, factor, target_size=None):
        """Shrink ``img`` by ``factor``; ``None`` if nothing would remain.

        ``target_size`` is unused.
        """
        height = int(img.shape[0] / factor)
        width = int(img.shape[1] / factor)
        if width <= 0 or height <= 0:
            return None
        return cv2.resize(img, (width, height))

    def _scale_img(self, img, scale_factor, target_size=None):
        """Scale ``img`` so it fits ``target_size``, by at least ``scale_factor``.

        Args:
            img: Image array.
            scale_factor: Minimum divisor applied to both dimensions.
            target_size: Optional ``(height, width)`` the result must fit
                into. Without it only ``scale_factor`` is applied
                (previously ``None`` raised ``TypeError``).

        Returns:
            The scaled image, or ``None`` for empty input or output.
        """
        if img.shape[0] == 0 or img.shape[1] == 0:
            return None
        if target_size is None:
            factor = scale_factor
        else:
            factor = max(img.shape[0] / target_size[0],
                         img.shape[1] / target_size[1],
                         scale_factor)
        return self._scale(img, factor)

    def preprocess(self, img, target_size=None):
        """Run the configured preprocessing steps on ``img``.

        Steps, each optional: invert, crop, scale, pad by
        ``preprocess.padding``. With ``target_size`` the result is finally
        zero-padded to ``target_size`` plus twice the padding.

        Returns:
            The processed image with a channel axis, or ``None`` if cropping
            or scaling left nothing.
        """
        bg = 255
        if self.config.default('preprocess.invert', False):
            img = invert._invert(img)
            bg = 255 - bg
        if self.config.default('preprocess.crop', False):
            if img.shape[0] == 0 or img.shape[1] == 0:
                return None
            img = crop._crop(img)
            if img is None:
                return None
        if self.config.default('preprocess.scale', False):
            img = self._scale_img(
                img, self.config['preprocess.scale'], target_size)
            if img is None:
                return None
        if self.config.default('preprocess.padding', False):
            img = padding._pad_cv2(img, self.config['preprocess.padding'], bg)
        img = self.add_graychannel(img)
        if target_size is not None:
            extra = self.config.default('preprocess.padding', 0) * 2
            target_size = (target_size[0] + extra, target_size[1] + extra, 1)
            img = self._pad(img, target_size)
        return img

    def postprocesss(self, img):
        """Binarize ``img`` when ``postprocess.binarize`` is set.

        The triple "s" in the name is kept because callers use it.
        """
        if self.config.default('postprocess.binarize', False):
            img = self.binarization(img)
        return img

    def _pad(self, array, reference_shape, offsets=None):
        """
        array: Array to be padded
        reference_shape: tuple of size of ndarray to create
        offsets: list of offsets (number of elements must be equal to the
        dimension of the array)
        will throw a ValueError if offsets is too big and the reference_shape
        cannot handle the offsets
        """
        if offsets is None:
            offsets = [0] * len(array.shape)
        # Create an array of zeros with the reference shape
        result = np.zeros(reference_shape)
        # Create a list of slices from offset to offset + shape in each
        # dimension
        insertHere = [
            slice(offsets[dim], offsets[dim] + array.shape[dim])
            for dim in range(array.ndim)
        ]
        # Insert the array in the result at the specified offsets
        result[tuple(insertHere)] = array
        return result
class RegionDataset(Dataset):
    """In-memory dataset built from image regions, for running a model.

    Vocabulary and image meta data are read from ``model_path``. Batches
    only carry image data; labels, lengths and paths are empty lists.
    """

    def __init__(self, regions, model_path, data_config=None):
        """Load vocab and meta data and preprocess the given regions.

        Args:
            regions: Objects with an ``img`` array, or ``None`` to defer
                loading until ``set_regions``.
            model_path: Folder holding the ``vocab`` and ``data_meta`` JSON.
            data_config: Optional overrides of ``DEFAULT_DATACONFIG``.
        """
        self.model_path = model_path
        self._load_vocab()
        self._load_meta()
        self._scaling = 1.0
        self._max_height = 10000
        self._max_width = 10000
        # NOTE(review): ``set_regions`` runs before ``self.augmenter``
        # exists, so non-None ``regions`` would fail in ``_loadimage``
        # unless the base class provides an augmenter - confirm.
        self.set_regions(regions)
        # A fresh dict per instance: ``scaling`` writes into the augmenter
        # config, which must not leak into a shared default.
        self.data_config = Configuration(
            {} if data_config is None else data_config, DEFAULT_DATACONFIG)
        self.augmenter = ImageAugmenter(self.data_config)

    def info(self):
        """Invoke the meta Configuration with a title string."""
        self.meta('Dataset Configuration')

    def scaling(self, scaling, max_height, max_width):
        """Set the preprocessing scale factor and the maximum dimensions."""
        self.augmenter.config['preprocess.scale'] = scaling
        self._max_height = max_height
        self._max_width = max_width

    def _load_meta(self):
        """Read ``data_meta`` from the model folder."""
        self.meta = Configuration(util.loadJson(self.model_path, "data_meta"))

    def _load_vocab(self):
        """Read ``vocab`` from the model folder and record its size."""
        self.vocab = util.loadJson(self.model_path, "vocab")
        self.vocab_length = len(self.vocab[0])

    def _load_sets(self):
        """Preprocess all regions into ``self.data``."""
        images = [self._loadimage(region) for region in self.regions]
        self.data = np.asarray([img for img in images if img is not None])

    def _loadimage(self, region):
        """Return the preprocessed image of one region.

        Empty regions, and regions rejected by preprocessing, become an
        all-zero image of the meta ``height`` x ``width``.
        """
        if region.img.shape[0] == 0 or region.img.shape[1] == 0:
            img = np.zeros((self.meta["height"], self.meta["width"]))
        elif len(region.img.shape) > 2:
            img = cv2.cvtColor(region.img, cv2.COLOR_BGR2GRAY)
        else:
            img = region.img
        padding = self.data_config.default('preprocess.padding', 0)
        # Preprocessing adds the padding back, so aim for the inner size.
        target_size = (int(self.meta["height"] - (padding * 2)),
                       int(self.meta["width"] - (padding * 2)))
        img = self.augmenter.preprocess(img, target_size)
        if img is not None:
            img = self.augmenter.postprocesss(img)
        if img is None:
            img = np.zeros((self.meta["height"], self.meta["width"]))
        return self.augmenter.add_graychannel(img)

    def set_regions(self, regions):
        """Replace the regions and, unless ``None``, reload the data."""
        self.regions = regions
        if regions is not None:
            self._load_sets()

    def compile(self, text):
        """Encode ``text`` as vocab indices, right-padded with ``-1``."""
        parsed = [self.vocab[1][c] for c in text]
        # NOTE(review): ``max_length`` is not assigned in this class;
        # presumably set elsewhere - confirm.
        parsed.extend([-1] * (self.max_length - len(text)))
        return parsed

    def decompile(self, values):
        """Decode indices back to text; unknown indices are dropped."""

        def getKey(key):
            try:
                return self.vocab[0][str(key)]
            except KeyError:
                return ''

        return ''.join([getKey(c) for c in values])

    def _load_batch(self, index, batch_size, dataset, with_filepath=False):
        """Return batch ``index`` as ``(images, [], [])`` (plus ``[]`` paths)."""
        first = index * batch_size
        last = min((index + 1) * batch_size, len(self.data))
        batch_data = np.asarray(self.data[first:last])
        if with_filepath:
            return batch_data, [], [], []
        return batch_data, [], []

    def generateBatch(self, batch_size=0, max_batches=0, dataset="",
                      with_filepath=False):
        """Yield all batches of the loaded regions.

        ``dataset`` is ignored. ``batch_size`` must be non-zero.
        """
        num_batches = self.getBatchCount(batch_size, max_batches, "")
        for b in range(num_batches):
            yield self._load_batch(b, batch_size, "", with_filepath)

    def generateEpochs(self, batch_size, num_epochs, max_batches=0,
                       dataset="train", with_filepath=False):
        """Return a one-element list holding a single batch generator.

        As before, only one pass is produced regardless of ``num_epochs``.
        The arguments are now forwarded: calling ``generateBatch()`` with
        its default ``batch_size=0`` divided by zero on first iteration.
        """
        return [
            self.generateBatch(batch_size, max_batches, dataset,
                               with_filepath)
        ]

    def getBatchCount(self, batch_size, max_batches=0, dataset=""):
        """Return the number of batches; ``max_batches`` is not applied."""
        return int(np.ceil(len(self.data) / float(batch_size)))
class PrintGenerator(object):
    """Render text with a random font, height and foreground as an image."""

    # Filter name -> callable(image, filter_config) returning a new image.
    FILTERS = {
        'blur':
        lambda i, c: i.filter(ImageFilter.GaussianBlur(c['radius'])),
        'sharpen':
        lambda i, c: i.filter(
            ImageFilter.UnsharpMask(c['radius'], c['percent'], c['threshold'])
        ),
        'warp':
        lambda i, c: _warp(i, c['grid'], c['deviation']),
        'affine':
        lambda i, c: PrintGenerator._affine_filter(i, c['config'])
    }

    def __init__(self, config=None):
        """Create the generator; ``config`` overrides ``DEFAULTS``.

        ``max_height`` / ``max_width`` of ``-1`` mean "no limit".
        """
        # A fresh dict per instance instead of one shared ``{}`` default.
        self.config = Configuration({} if config is None else config)
        self.default = Configuration(DEFAULTS)
        self.max_size = (0, 0)
        self.max_height = -1
        self.max_width = -1

    def __getitem__(self, key):
        """Return the configured value for ``key``, else its default."""
        default = self.default.default(key, None)
        return self.config.default(key, default)

    def _random_font(self):
        """Pick one of the configured fonts uniformly at random."""
        return self['fonts'][np.random.randint(0, len(self['fonts']))]

    def _random_height(self):
        """Draw a normally distributed height, clamped to [min, max]."""
        # NOTE(review): ``self[key, True]`` passes the tuple ``(key, True)``
        # to ``__getitem__``; left untouched because it is unclear from here
        # how Configuration treats tuple keys - confirm.
        return max(
            min(
                int(
                    np.random.normal(self['height.center', True],
                                     self['height.scale', True])),
                self['height.max', True]), self['height.min', True])

    def _random_foreground(self):
        """Draw a foreground value from ``[foreground.low, foreground.high)``."""
        if self['foreground.low'] == self['foreground.high']:
            return self['foreground.low']
        return np.random.randint(self['foreground.low'],
                                 self['foreground.high'])

    def _iterate_height(self, text, fontname, height):
        """Shrink the font until the text fits ``max_height`` / ``max_width``.

        Returns:
            ``(font, offset, image_size)`` for the first fitting height.
        """
        font = ImageFont.truetype(fontname, size=height)
        size, offset = font.font.getsize(text)
        image_size = (size[0] + offset[0] + self['printing_padding'] * 2,
                      size[1] + offset[1] + self['printing_padding'] * 2)
        if self.max_height > -1 and image_size[1] > self.max_height:
            height = int(height * (self.max_height / float(image_size[1])))
            return self._iterate_height(text, fontname, height)
        elif self.max_width > -1 and image_size[0] > self.max_width:
            height = int(height * (self.max_width / float(image_size[0])))
            return self._iterate_height(text, fontname, height)
        else:
            return font, offset, image_size

    def _create_text_image(self, text, font, height, background, foreground):
        """Render ``text`` on a new "L" image and update ``max_size``.

        ``font`` is a font file name on entry.
        """
        font, offset, image_size = self._iterate_height(text, font, height)
        self.max_size = np.max([self.max_size, image_size], axis=0)
        image = Image.new("L", image_size, background)
        draw = ImageDraw.Draw(image)
        draw.text((self['printing_padding'],
                   -(offset[1] / 2) + self['printing_padding']),
                  text,
                  font=font,
                  fill=foreground)
        return image

    def _apply_filter(self, image, filter_config):
        """Apply one filter with probability ``filter_config['prob']``."""
        if filter_config['prob'] > np.random.rand():
            image = self.FILTERS[filter_config['type']](image, filter_config)
        return image

    def _apply_filters(self, image):
        """Apply all configured filters in order."""
        for _filter in self['filters']:
            image = self._apply_filter(image, _filter)
        return image

    def _crop(self, image, invert):
        """Crop ``image`` when the ``crop`` option is set.

        Cropping runs on the inverted image unless ``invert`` is set.

        Returns:
            The (possibly cropped) image, or ``None`` if there is no image
            or the crop helper returned ``None``.
        """
        if image is None:
            return None
        if self['crop']:
            image = _pil2cv2(image)
            if not invert:
                image = 255 - image
            image = _crop(image)
            # ``__call__`` checks this method's result for None; pass a
            # failed crop through instead of doing arithmetic on None.
            if image is None:
                return None
            if not invert:
                image = 255 - image
            image = _cv2pil(image)
        return image

    def _pad(self, image, background):
        """Pad ``image`` with ``background`` when ``padding`` is positive."""
        if self['padding'] > 0:
            image = _pad_pil(image, self['padding'], background)
        return image

    def __call__(self, text, invert=False):
        """Render ``text`` and return the image, or ``None`` if cropping fails.

        Args:
            text: The string to print.
            invert: If true, foreground and background are inverted.
        """
        foreground = self._random_foreground()
        background = self['background']
        if invert:
            foreground = 255 - foreground
            background = 255 - background
        font = self._random_font()
        height = self._random_height()
        image = self._create_text_image(text, font, height, background,
                                        foreground)
        image = self._apply_filters(image)
        image = self._crop(image, invert)
        if image is None:
            return None
        # Pad with the background colour; the boolean ``invert`` used to be
        # passed here.
        image = self._pad(image, background)
        return image

    @staticmethod
    def clean_text(text):
        """Remove ``PUNCTUATION_REGEX`` matches; replace ``REGULAR_REGEX`` matches by a space."""
        text = PUNCTUATION_REGEX.sub('', text)
        text = REGULAR_REGEX.sub(' ', text)
        return text

    @staticmethod
    def _affine_filter(image, config):
        """Apply the affine helper to a PIL image via a cv2 round trip."""
        image = _pil2cv2(image)
        image = _affine(image, config)
        return _cv2pil(image)
class PageHandwritingBlender(object):
    """Blend handwritten line images into a page image.

    A second image, ``truth``, has the page's shape, starts out filled with
    the background value and receives the same lines.
    """

    DEFAULTS = Configuration({
        "background": 255,
        "augmentation": {
            "line": {
                "scale": {
                    "prob": 1.0,
                    "center": -.25,
                    "stdv": 0.15
                }
            },
            "page": [{
                'type': 'blur',
                'prob': 0.5,
                'kernel': (3, 3),
                'sigma': 1
            }, {
                'type': 'sharpen',
                'prob': 0.5,
                'kernel': (3, 3),
                'sigma': 1
            }, {
                'type': 'warp',
                'prob': 0.5,
                'config': {
                    'deviation': 2.7,
                    'gridsize': [100, 30]
                }
            }]
        },
        # Filter name -> callable(image, filter_config) returning an image.
        "filters": {
            'blur':
            lambda i, c: cv2.GaussianBlur(i, c['kernel'], c['sigma']),
            'sharpen':
            lambda i, c: PageHandwritingBlender._unsharp_mask_filter(
                i, c['kernel'], c['sigma']),
            'warp':
            lambda i, c: PageHandwritingBlender._warp_filter(i, c['config']),
            'affine':
            lambda i, c: PageHandwritingBlender._affine_filter(i, c['config'])
        }
    })

    #################################
    # PUBLIC METHODS
    ###############################
    def __init__(self, page, config=None):
        """Create the blender and augment the page once.

        Args:
            page: Three-dimensional page image array.
            config: Optional overrides of ``DEFAULTS``.
        """
        self.page = page
        # A fresh dict per instance instead of one shared ``{}`` default.
        self.config = Configuration({} if config is None else config)
        self.truth = np.full(page.shape, self['background'])
        self._augment_page()

    def __call__(self, line):
        """Augment ``line`` and blend it into the page at a random position."""
        line = self._augment_line(line)
        h, w, _ = line.shape
        x, y = self._random_position(h, w)
        self._insert(line, x, y)

    def save(self, pagefile, truthfile):
        """Write the page and the truth image to the given paths."""
        cv2.imwrite(pagefile, self.page)
        cv2.imwrite(truthfile, self.truth)

    def __getitem__(self, key):
        """Return the configured value for ``key``, else its default."""
        return self.config.default(key, self.DEFAULTS.default(key, None))

    ############################
    # PRIVATE METHODS
    ################################
    def _random_position(self, h, w):
        """Return a uniformly random ``(x, y)`` for a line of size ``h`` x ``w``."""
        ph, pw, _ = self.page.shape

        def rand(mx):
            return int(np.random.uniform(0, mx))

        return rand(pw - w), rand(ph - h)

    def _insert(self, line, x, y):
        """Bitwise-AND ``line`` into page and truth at ``(x, y)``.

        The position is shifted left/up when the line would otherwise cross
        the right or bottom page border.
        """
        ph, pw, _ = self.page.shape
        lh, lw, _ = line.shape
        # Same as "x if the line fits, else the last position where it fits".
        off_x = min(x, pw - lw)
        off_y = min(y, ph - lh)
        self.page[off_y:off_y + lh, off_x:off_x + lw, :] &= line
        self.truth[off_y:off_y + lh, off_x:off_x + lw, :] &= line

    def _augment_line(self, line):
        """Threshold ``line`` and apply the configured line augmentation."""
        line = cv2.cvtColor(line, cv2.COLOR_BGR2GRAY)
        line = _threshold(line, False)
        line = cv2.cvtColor(line, cv2.COLOR_GRAY2BGR)
        at = AffineTransformation(line)
        at.configure(self['augmentation.line'])
        return at(background=[self['background']] * 3)

    def _augment_page(self):
        """Apply each configured page filter with its own probability."""
        for _filter in self['augmentation.page']:
            if _filter['prob'] > np.random.rand():
                self.page = self['filters'][_filter['type']](self.page,
                                                             _filter)

    #######################################
    # STATIC METHODS
    #######################################
    @staticmethod
    def _affine_filter(image, config):
        """Apply a configured ``AffineTransformation`` to ``image``."""
        at = AffineTransformation(image)
        at.configure(config)
        return at()

    @staticmethod
    def _warp_filter(image, config):
        """Warp ``image`` via a PIL 'RGB' round trip."""
        image = _cv2pil(image, 'RGB')
        image = _warp(image, config['gridsize'], config['deviation'])
        return _pil2cv2(image, 'RGB')

    @staticmethod
    def _unsharp_mask_filter(image, kernel, sigma):
        """Sharpen ``image`` with an unsharp mask.

        ``image`` is also passed as the destination, so it is presumably
        overwritten in place - confirm before relying on the input
        afterwards.
        """
        gaussian_3 = cv2.GaussianBlur(image, kernel, sigma)
        return cv2.addWeighted(image, 1.5, gaussian_3, -0.5, 0, image)
class PreparedDataset(Dataset):
    """Dataset backed by JSON files under ``util.OUTPUT_PATH/<name>``.

    The constructor loads ``vocab``, ``meta`` and the ``train``/``dev``/
    ``test`` sets (plus the ``print_*`` sets when the meta file has
    ``printed`` set), encodes every ground-truth string with the vocabulary
    and records set sizes in ``meta``.
    """

    def __init__(self, name, transpose=True, data_config=None):
        """Load everything belonging to the dataset called ``name``.

        Args:
            name: Directory name below ``util.OUTPUT_PATH``.
            transpose: Stored on the instance and forwarded to the line
                loaders by ``_load_batch``.
            data_config: Optional data options (e.g. ``sort_by_width``,
                ``dynamic_width``, ``otf_augmentations``, ``type_probs``,
                ``shuffle_epoch``). ``None`` means "no options" (a fresh
                dict is used instead of a shared mutable default).
        """
        self.name = name
        self.data_config = Configuration(
            {} if data_config is None else data_config)
        # Used by _load_batch for the minimum batch width in dynamic mode.
        self.min_width_factor = 15
        self.max_min_width = 400
        self.datapath = os.path.join(util.OUTPUT_PATH, name)
        self._load_vocab()
        self._load_meta()
        self._load_sets()
        self._calc_max_length()
        self._compile_sets()
        self.transpose = transpose
        self.channels = 1
        self._fill_meta()
        self.augmenter = ImageAugmenter(self.data_config)
        # subset name -> full sample list, kept once type filtering starts
        self.unfiltered = {}

    def load_vocab(self, path):
        """Replace the vocabulary with the one at ``path`` and re-encode."""
        self._load_vocab(path)
        self._compile_sets()
        self._fill_meta()

    def info(self):
        """Invoke the meta configuration with the title 'Dataset Configuration'."""
        self.meta('Dataset Configuration')

    def _load_meta(self):
        """Load ``meta`` from the dataset directory."""
        self.meta = Configuration(util.loadJson(self.datapath, "meta"))

    def _load_vocab(self, path=None):
        """Load ``vocab`` from ``path`` (default: the dataset directory).

        ``vocab[0]`` is used for decoding and ``vocab[1]`` for encoding.
        """
        path = path or self.datapath
        self.vocab = util.loadJson(path, "vocab")
        self.vocab_length = len(self.vocab[0])

    def _fill_meta(self):
        """Store the vocabulary size and per-set sample counts in ``meta``."""
        self.meta['vocab.size'] = self.vocab_length
        for subset in ('train', 'dev', 'test'):
            self.meta[subset + '.count'] = len(self.data[subset])
        if 'print_train' in self.data:
            for subset in ('print_train', 'print_dev', 'print_test'):
                self.meta[subset + '.count'] = len(self.data[subset])

    def _load_sets(self):
        """Load all sets and optionally sort them by image width."""
        self.data = {
            "train": util.loadJson(self.datapath, "train"),
            "dev": util.loadJson(self.datapath, "dev"),
            "test": util.loadJson(self.datapath, "test")
        }
        printed = self.meta.default('printed', False)
        if printed:
            for subset in ("print_train", "print_dev", "print_test"):
                self.data[subset] = util.loadJson(self.datapath, subset)
        if self.data_config.default('sort_by_width', False):
            # The test sets are intentionally left in their stored order.
            self._sort_by_width("train")
            self._sort_by_width("dev")
            if printed:
                self._sort_by_width("print_train")
                self._sort_by_width("print_dev")

    def _sort_by_width(self, dataset):
        """Annotate every sample with its image width and sort widest first.

        Raises:
            IOError: If an image cannot be read. Without this check the
                failure would surface as an ``AttributeError`` on ``None``
                that does not name the file.
        """
        print("Sorting {} dataset by width...".format(dataset))
        for datapoint in self.data[dataset]:
            img = cv2.imread(datapoint["path"], cv2.IMREAD_GRAYSCALE)
            if img is None:
                raise IOError("Could not read image: {}".format(
                    datapoint["path"]))
            datapoint["width"] = img.shape[1]
        self.data[dataset].sort(key=lambda x: x["width"], reverse=True)

    def _compile_set(self, dataset):
        """Attach the encoded ground truth to each sample as ``compiled``."""
        for item in self.data[dataset]:
            item['compiled'] = self.compile(item['truth'])

    def _filter_by_type(self, subset):
        """Randomly subsample ``subset`` by sample ``type``.

        A sample whose type is listed in ``type_probs`` is kept with that
        probability; samples of other types are always kept. The complete
        list is remembered in ``self.unfiltered`` so that every call draws
        from the full set again.
        """
        if subset not in self.unfiltered:
            self.unfiltered[subset] = self.data[subset]
        type_probs = self.data_config['type_probs']
        filtered = []
        for sample in self.unfiltered[subset]:
            if sample['type'] in type_probs:
                if np.random.uniform() <= type_probs[sample['type']]:
                    filtered.append(sample)
            else:
                filtered.append(sample)
        self.data[subset] = filtered

    def _filter_data(self):
        """Apply type filtering to the training sets if ``type_probs`` is set."""
        if self.data_config.default('type_probs', False):
            self._filter_by_type('train')
            if self.meta.default('printed', False):
                self._filter_by_type('print_train')

    def _compile_sets(self):
        """Encode the ground truth of the train, dev and test sets."""
        self._compile_set("train")
        self._compile_set("dev")
        self._compile_set("test")

    def _calc_max_length(self):
        """Set ``max_length`` to the longest truth over train, test and dev."""
        _all = []
        _all.extend(self.data["train"])
        _all.extend(self.data["test"])
        _all.extend(self.data["dev"])
        self.max_length = max(len(x["truth"]) for x in _all)

    def compile(self, text):
        """Encode ``text`` via ``vocab[1]`` and pad with -1 to ``max_length``.

        Raises:
            KeyError: If a character is missing from the vocabulary.
        """
        parsed = [self.vocab[1][c] for c in text]
        parsed.extend([-1] * (self.max_length - len(text)))
        return parsed

    def decompile(self, values):
        """Decode ``values`` via ``vocab[0]``; unknown keys are skipped."""

        def getKey(key):
            try:
                return self.vocab[0][str(key)]
            except KeyError:
                return ''

        return ''.join(getKey(c) for c in values)

    def load_image(self, path, transpose=False, augmentable=False):
        """Read a grayscale image and bring it into network input form.

        The image is augmented when ``otf_augmentations`` is enabled and
        ``augmentable`` is true; otherwise only a gray channel is added.
        Unless ``dynamic_width`` is enabled, the image is padded to the
        width and height stored in ``meta``.

        NOTE(review): if the transposition raises ``ValueError`` this
        returns a 4-tuple of ``None`` instead of an image. That looks like a
        leftover from a loader that returned ``(x, y, l, path)``; it is kept
        because changing it would alter what callers receive.
        """
        x = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if self.data_config.default("otf_augmentations",
                                    False) and augmentable:
            x = self.augmenter.augment(x)
        else:
            x = self.augmenter.add_graychannel(x)
        if transpose:
            try:
                x = np.transpose(x, [1, 0])
                if self.data_config.default('dynamic_width', False):
                    return self.augmenter.add_graychannel(x)
            except ValueError:
                return None, None, None, None
            # After transposition axis 0 is compared against the width.
            if (x.shape[0] != self.meta["width"]
                    or x.shape[1] != self.meta["height"]):
                x = self.augmenter.pad_to_size(x,
                                               width=self.meta["width"],
                                               height=self.meta["height"])
            return self.augmenter.add_graychannel(x)
        if self.data_config.default('dynamic_width', False):
            return self.augmenter.add_graychannel(x)
        if (x.shape[1] != self.meta["width"]
                or x.shape[0] != self.meta["height"]):
            x = self.augmenter.pad_to_size(x,
                                           width=self.meta["width"],
                                           height=self.meta["height"])
        return self.augmenter.add_graychannel(x)

    def _loadline(self, line, transpose=True, augmentable=False):
        """Return ``(image, encoded truth, truth length, path)`` for ``line``.

        NOTE(review): ``transpose`` is accepted but not forwarded to
        ``load_image`` - confirm this is intended before changing it.
        """
        l = len(line["truth"])
        y = np.asarray(line["compiled"])
        x = self.load_image(line["path"], augmentable=augmentable)
        return self.augmenter.postprocesss(x), y, l, line["path"]

    def _loadprintline(self, line, transpose=True, augmentable=False):
        """Return ``(image, [truth], 0, path)`` for a ``print_*`` sample.

        The truth is passed through unencoded and the length is always 0.
        ``transpose`` is accepted but not forwarded to ``load_image``.
        """
        y = line["truth"]
        x = self.load_image(line["path"], augmentable=augmentable)
        return self.augmenter.postprocesss(x), [y], 0, line["path"]

    def _load_batch(self,
                    index,
                    batch_size,
                    dataset,
                    with_filepath=False,
                    augmentable=False):
        """Load batch number ``index`` of ``dataset``.

        Samples whose image comes back as ``None`` are dropped. With
        ``dynamic_width`` enabled the images are copied into a zero-filled
        int32 array whose width is the widest image in the batch (raised to
        ``max(L) * min_width_factor`` when below ``max_min_width``), and
        every length is increased by 5.

        Returns:
            ``(X, Y, L)`` or, when ``with_filepath`` is true,
            ``(X, Y, L, F)`` with ``F`` the list of file paths.
        """
        X = []
        Y = []
        L = []
        F = []
        if dataset.startswith("print_"):
            parseline = self._loadprintline
        else:
            parseline = self._loadline
        samples = self.data[dataset]
        first = index * batch_size
        last = min((index + 1) * batch_size, len(samples))
        for idx in range(first, last):
            x, y, l, f = parseline(samples[idx],
                                   self.transpose,
                                   augmentable=augmentable)
            if x is not None:
                X.append(x)
                Y.append(y)
                L.append(l)
                F.append(f)
        if self.data_config.default('dynamic_width', False):
            L = np.asarray(L) + 5
            batch_width = np.max([img.shape[1] for img in X])
            if batch_width < self.max_min_width:
                batch_width = max(batch_width,
                                  np.max(L) * self.min_width_factor)
            X_ = np.zeros((len(X), self.meta["height"], batch_width, 1),
                          dtype=np.int32)
            for idx, img in enumerate(X):
                X_[idx, 0:img.shape[0], 0:img.shape[1], :] = img
            X = X_
        else:
            X = np.asarray(X)
        Y = np.asarray(Y)
        L = np.asarray(L)
        if with_filepath:
            return X, Y, L, F
        return X, Y, L

    def before_epoch(self, subset):
        """Shuffle ``subset`` (if ``shuffle_epoch`` is set) and re-filter.

        When a full, unfiltered copy of the subset exists, that copy is
        shuffled so the subsequent filtering draws from shuffled data.
        """
        if self.data_config.default('shuffle_epoch', False):
            if subset in self.unfiltered:
                shuffle(self.unfiltered[subset])
            else:
                shuffle(self.data[subset])
        self._filter_data()

    def generateBatch(self,
                      batch_size,
                      max_batches=0,
                      dataset="train",
                      with_filepath=False,
                      augmentable=False):
        """Yield the batches of ``dataset`` one after another.

        ``max_batches`` greater than 0 caps the number of batches.
        """
        num_batches = self.getBatchCount(batch_size, max_batches, dataset)
        for b in range(num_batches):
            yield self._load_batch(b,
                                   batch_size,
                                   dataset,
                                   with_filepath,
                                   augmentable=augmentable)

    # deprecated
    def generateEpochs(self,
                       batch_size,
                       num_epochs,
                       max_batches=0,
                       dataset="train",
                       with_filepath=False,
                       augmentable=False):
        """Yield one ``generateBatch`` generator per epoch (deprecated)."""
        for e in range(num_epochs):
            yield self.generateBatch(batch_size,
                                     max_batches=max_batches,
                                     dataset=dataset,
                                     with_filepath=with_filepath,
                                     augmentable=augmentable)

    def getBatchCount(self, batch_size, max_batches=0, dataset="train"):
        """Return how many batches ``dataset`` yields.

        The count is ``ceil(len / batch_size)``, capped by ``max_batches``
        when that is greater than 0.
        """
        total_len = len(self.data[dataset])
        num_batches = int(math.ceil(float(total_len) / batch_size))
        if max_batches > 0:
            return min(num_batches, max_batches)
        return num_batches
class Extendable(object):
    """Mixin that lazily builds and caches TensorFlow decoding/metric ops.

    Every ``build_*`` method creates its op on first use, stores it on the
    instance and returns the stored op on later calls. ``graph`` is a
    mapping of tensors; depending on the method the keys ``logits``, ``l``,
    ``y``, ``class_pred``, ``class_y`` and ``output`` are read.

    In the confusion counts, class index 0 is the positive class and class
    index 1 the negative class.
    """

    # Class-level defaults; instances shadow them once an op has been built.
    _decoded_dense = None
    _decoder = None
    _pred_thresholded = None
    _cer = None
    _accuracy = None
    _tp = None
    _tn = None
    _fn = None
    _fp = None
    _pred_res = None
    _y_res = None
    _sep_acc = None
    _sep_prec = None
    _sep_rec = None
    _sep_f = None

    def __init__(self, **kwargs):
        """Store the optional ``config`` keyword as a ``Configuration``."""
        self.config = Configuration(kwargs.get('config', {}))

    def build_decoded_dense(self, graph):
        """Return the first decoded CTC sequence as a dense tensor.

        Missing positions are filled with -1.
        """
        if self._decoded_dense is None:
            decoded = self.build_decoder(graph)
            self._decoded_dense = tf.sparse_to_dense(
                decoded[0].indices, decoded[0].dense_shape,
                decoded[0].values, tf.constant(-1, tf.int64))
        return self._decoded_dense

    def build_decoder(self, graph):
        """Return the CTC decoder output for ``graph['logits']``.

        ``config['ctc'] == "greedy"`` selects the greedy decoder and any
        other truthy value selects beam search. With a falsy value no
        decoder is built and ``None`` is returned.
        """
        if self._decoder is None:
            if self.config['ctc'] == "greedy":
                self._decoder, _ = tf.nn.ctc_greedy_decoder(
                    graph['logits'], graph['l'], merge_repeated=True)
            elif self.config['ctc']:
                self._decoder, _ = tf.nn.ctc_beam_search_decoder(
                    graph['logits'], graph['l'], merge_repeated=True)
        return self._decoder

    def build_pred_thresholding(self, graph):
        """Return ``graph['class_pred']`` binarised as int32.

        The threshold is ``accuracy_threshold`` from the config
        (default 0.5).
        """
        if self._pred_thresholded is None:
            threshold = self.config.default('accuracy_threshold', 0.5)
            self._pred_thresholded = tf.to_int32(
                graph['class_pred'] > threshold)
        return self._pred_thresholded

    def build_cer(self, graph):
        """Return the edit distance between decoded output and ``graph['y']``."""
        if self._cer is None:
            decoded = self.build_decoder(graph)
            self._cer = tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                         tf.cast(graph['y'], tf.int32))
        return self._cer

    def build_accuracy(self, graph):
        """Return the mean agreement of thresholded predictions and ``class_y``."""
        if self._accuracy is None:
            predictions = self.build_pred_thresholding(graph)
            equality = tf.equal(predictions,
                                tf.cast(graph['class_y'], tf.int32))
            self._accuracy = tf.reduce_mean(tf.cast(equality, tf.float32))
        return self._accuracy

    def _confusion_count(self, graph, truth_class, pred_class):
        """Count positions labelled ``truth_class`` predicted as ``pred_class``.

        Returns a scalar float32 tensor.
        """
        pred_res = self.build_pred_res(graph)
        y_res = self.build_y_res(graph)
        selected = tf.boolean_mask(pred_res, tf.equal(y_res, truth_class))
        return tf.reduce_sum(
            tf.cast(tf.equal(selected, pred_class), tf.float32))

    def build_tp(self, graph):
        """Return the number of true positives (truth 0, predicted 0)."""
        if self._tp is None:
            self._tp = self._confusion_count(graph, 0, 0)
        return self._tp

    def build_fp(self, graph):
        """Return the number of false positives (truth 1, predicted 0)."""
        if self._fp is None:
            self._fp = self._confusion_count(graph, 1, 0)
        return self._fp

    def build_fn(self, graph):
        """Return the number of false negatives (truth 0, predicted 1)."""
        if self._fn is None:
            self._fn = self._confusion_count(graph, 0, 1)
        return self._fn

    def build_tn(self, graph):
        """Return the number of true negatives (truth 1, predicted 1).

        The matches are summed so that the result is a scalar count like
        the other three confusion entries, not a per-element tensor.
        """
        if self._tn is None:
            self._tn = self._confusion_count(graph, 1, 1)
        return self._tn

    def build_sep_accuracy(self, graph):
        """Return the fraction of positions where prediction equals truth."""
        if self._sep_acc is None:
            pred_res = self.build_pred_res(graph)
            y_res = self.build_y_res(graph)
            self._sep_acc = tf.reduce_mean(
                tf.cast(tf.equal(pred_res, y_res), tf.float32))
        return self._sep_acc

    def build_sep_recall(self, graph):
        """Return recall, ``tp / (tp + fn)``."""
        if self._sep_rec is None:
            tp = self.build_tp(graph)
            fn = self.build_fn(graph)
            self._sep_rec = tp / (tp + fn)
        return self._sep_rec

    def build_sep_precision(self, graph):
        """Return precision, ``tp / (tp + fp)``."""
        if self._sep_prec is None:
            tp = self.build_tp(graph)
            fp = self.build_fp(graph)
            self._sep_prec = tp / (tp + fp)
        return self._sep_prec

    def build_sep_fmeasure(self, graph):
        """Return the F-measure, ``2 * prec * rec / (prec + rec)``."""
        if self._sep_f is None:
            prec = self.build_sep_precision(graph)
            rec = self.build_sep_recall(graph)
            self._sep_f = (tf.constant(2.0) * prec * rec) / (prec + rec)
        return self._sep_f

    def build_pred_res(self, graph):
        """Return the argmax of ``graph['output']`` over axis 3."""
        if self._pred_res is None:
            self._pred_res = tf.argmax(graph['output'], 3)
        return self._pred_res

    def build_y_res(self, graph):
        """Return the argmax of ``graph['y']`` over axis 3."""
        if self._y_res is None:
            self._y_res = tf.argmax(graph['y'], 3)
        return self._y_res