class IamLinesDataset(Dataset):
    """
    "The IAM Lines dataset, first published at the ICDAR 1999, contains forms of unconstrained
    handwritten text, which were scanned at a resolution of 300dpi and saved as PNG images with
    256 gray levels. From http://www.fki.inf.unibe.ch/databases/iam-handwriting-database

    The data split we will use is IAM lines Large Writer Independent Text Line Recognition Task
    (lwitlrt): 9,862 text lines. The validation set has been merged into the train set. The train
    set has 7,101 lines from 326 writers. The test set has 1,861 lines from 128 writers. The text
    lines of all data sets are mutually exclusive, thus each writer has contributed to one set only.

    Note that we use cachedproperty because data takes time to load.
    """
    def __init__(self):
        # Reuse the EMNIST character mapping (int label -> character) so both
        # datasets share one vocabulary.
        self.mapping = EmnistDataset().mapping
        # character -> int label, for re-encoding cleaned-up strings.
        self.inverse_mapping = {v: k for k, v in self.mapping.items()}
        self.num_classes = len(self.mapping)
        # Fixed line-image size: 28 px tall, 952 px wide.
        self.input_shape = (28, 952)
        # Up to 97 characters per line, one class distribution per position.
        self.output_shape = (97, self.num_classes)

    def cleanup(self, labels):
        """Decode integer label sequences to strings, pad each to the fixed
        output length with '_', and re-encode as integer label arrays.

        Returns a 2-D np.array of shape (num_labels, output_shape[0]).
        """
        labels_clean = []
        for label in labels:
            # Unknown label ids map to '' and are dropped from the string.
            s = ''.join([self.mapping.get(i, '') for i in label])
            # NOTE(review): this replace looks like a no-op (identical character
            # on both sides) — possibly mojibake of a curly-quote normalization
            # such as replace('”', '"'); confirm against the original pipeline.
            t = s.replace('"', '"')
            # Right-pad with '_' up to the fixed sequence length.
            t = t + '_' * (self.output_shape[0] - len(t))
            labels_clean.append([self.inverse_mapping[c] for c in t])
        return np.array(labels_clean)

    def load_or_generate_data(self):
        """Download the preprocessed IAM lines HDF5 file if absent, then load
        train/test images and cleaned-up integer labels into memory."""
        if not PROCESSED_DATA_FILENAME.exists():
            PROCESSED_DATA_DIRNAME.mkdir(parents=True, exist_ok=True)
            print('Downloading IAM lines...')
            urlretrieve(PROCESSED_DATA_URL, PROCESSED_DATA_FILENAME)
        with h5py.File(PROCESSED_DATA_FILENAME, 'r') as f:
            # [:] materializes each HDF5 dataset as an in-memory numpy array.
            self.x_train = f['x_train'][:]
            self.y_train_int = self.cleanup(f['y_train'][:])
            self.x_test = f['x_test'][:]
            self.y_test_int = self.cleanup(f['y_test'][:])

    @cachedproperty
    def y_train(self):
        # One-hot encode lazily and cache: the expanded array is large.
        return to_categorical(self.y_train_int, self.num_classes)

    @cachedproperty
    def y_test(self):
        # One-hot encode lazily and cache (see y_train).
        return to_categorical(self.y_test_int, self.num_classes)

    def __repr__(self):
        # Note: shapes are only available after load_or_generate_data().
        return ('IAM Lines Dataset\n'
                f'Num classes: {self.num_classes}\n'
                f'Mapping: {self.mapping}\n'
                f'Train: {self.x_train.shape} {self.y_train.shape}\n'
                f'Test: {self.x_test.shape} {self.y_test.shape}\n')
def test_evaluate(self):
    """Evaluate a CharacterPredictor on EMNIST: require accuracy above 0.7
    and the full evaluation to finish in under 10 seconds."""
    dataset = EmnistDataset()
    dataset.load_or_generate_data()
    predictor = CharacterPredictor()
    started = time()
    metric = predictor.evaluate(dataset)
    time_taken = time() - started
    print(f'acc: {metric}, time_taken: {time_taken}')
    self.assertGreater(metric, 0.7)
    self.assertLess(time_taken, 10)
def create_emnist_support_files():
    """Regenerate the EMNIST support images: wipe the support directory,
    then write one labeled PNG for each of a few fixed test samples."""
    # Start from an empty directory so stale images cannot linger.
    shutil.rmtree(SUPPORT_DIRNAME, ignore_errors=True)
    SUPPORT_DIRNAME.mkdir()
    emnist = EmnistDataset()
    emnist.load_or_generate_data()
    for sample_index in (5, 7, 9):
        image = emnist.x_test[sample_index]
        # argmax over the one-hot vector recovers the integer class id.
        label = emnist.mapping[np.argmax(emnist.y_test[sample_index])]
        print(sample_index, label)
        util.write_image(image, str(SUPPORT_DIRNAME / f'{label}.png'))
def __init__(self, max_length: int = 34, max_overlap: float = 0.33,
             num_train: int = 10000, num_test: int = 1000):
    """Configure a synthetic-lines dataset built from EMNIST characters.

    Args:
        max_length: maximum number of characters per generated line.
        max_overlap: maximum horizontal overlap fraction between characters.
        num_train: number of training lines to generate.
        num_test: number of test lines to generate.
    """
    emnist = EmnistDataset()
    self.emnist = emnist
    self.mapping = emnist.mapping
    self.num_classes = len(emnist.mapping)
    self.max_length = max_length
    self.max_overlap = max_overlap
    self.num_train = num_train
    self.num_test = num_test
    # A line image is one character tall and up to max_length characters wide.
    char_height, char_width = emnist.input_shape[0], emnist.input_shape[1]
    self.input_shape = (char_height, char_width * max_length)
    self.output_shape = (max_length, self.num_classes)
def __init__(self):
    """Set up the shared EMNIST character mapping and the fixed
    input/output geometry for IAM line images."""
    emnist_mapping = EmnistDataset().mapping
    self.mapping = emnist_mapping
    # Reverse lookup: character -> integer label.
    self.inverse_mapping = {char: idx for idx, char in emnist_mapping.items()}
    self.num_classes = len(emnist_mapping)
    # 28 px tall, 952 px wide line crops; up to 97 characters per line.
    self.input_shape = (28, 952)
    self.output_shape = (97, self.num_classes)