import os import shutil import zipfile from pathlib import Path import h5py import numpy as np import toml from boltons.cacheutils import cachedproperty from tensorflow.keras.utils import to_categorical from text_recognizer.datasets.dataset import _download_raw_dataset, Dataset, _parse_args SAMPLE_TO_BALANCE = True RAW_DATA_DIRNAME = Dataset.data_dirname() / 'raw' / 'emnist' METADATA_FILENAME = RAW_DATA_DIRNAME / 'metadata.toml' PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / 'processed' / 'emnist' PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / 'byclass.h5' ESSENTIALS_FILENAME = Path( __file__).parents[0].resolve() / 'emnist_essentials.json' class EmnistDataset(Dataset): """ "The EMNIST dataset is a set of handwritten character digits derived from the NIST Special Database 19 and converted to a 28x28 pixel image format and dataset structure that directly matches the MNIST dataset." From https://www.nist.gov/itl/iad/image-group/emnist-dataset
"""Emnist Lines dataset: synthetic handwriting lines dataset made from EMNIST characters.""" import os from collections import defaultdict from pathlib import Path import h5py import numpy as np from tensorflow.keras.utils import to_categorical from text_recognizer.datasets.dataset import Dataset from text_recognizer.datasets.emnist_dataset import EmnistDataset DATA_DIRNAME = Dataset.data_dirname() / "processed" / "emnist_lines" ESSENTIALS_FILENAME = Path(__file__).parents[0].resolve() / "emnist_lines_essentials.json" class EmnistLinesDataset(Dataset): """ EmnistLinesDataset class. Parameters ---------- max_length Max line length in characters. max_overlap Max overlap between characters in a line. num_train Number of training examples to generate. num_test Number of test examples to generate.
from pathlib import Path

import numpy as np

from text_recognizer.datasets.dataset import Dataset

# Directory for this dataset: <data_dirname>/processed/emnist_lines.
DATA_DIRNAME = Dataset.data_dirname() / 'processed' / 'emnist_lines'
# 'emnist_lines_essentials.json', resolved relative to the directory holding this module.
ESSENTIALS_FILENAME = Path(
    __file__).parents[0].resolve() / 'emnist_lines_essentials.json'


class EmnistLinesDataset(Dataset):
    """Interface skeleton for the EMNIST Lines dataset.

    Only the signatures are defined here; every method body is an empty
    placeholder that returns None.
    """

    def __init__(
        self,
        max_length: int = 34,
        max_overlap: float = 0.33,
        num_train: int = 10000,
        num_test: int = 1000,
    ):
        """Accept the dataset configuration without storing or using it (stub).

        Parameters
        ----------
        max_length
            Presumably the max line length in characters -- TODO confirm
            against the full implementation.
        max_overlap
            Presumably the max overlap between characters in a line -- TODO
            confirm.
        num_train
            Presumably the number of training examples to generate -- TODO
            confirm.
        num_test
            Presumably the number of test examples to generate -- TODO confirm.
        """

    @property
    def data_filename(self):
        """Placeholder property; currently evaluates to None."""

    def load_or_generate_data(self):
        """Placeholder; currently does nothing and returns None."""

    def __repr__(self):
        """Placeholder; currently returns None.

        NOTE(review): while this stays a stub, calling ``repr()`` on an
        instance raises TypeError because ``__repr__`` must return a str.
        """

    def _load_data(self):
        """Placeholder; currently does nothing and returns None."""
import zipfile from boltons.cacheutils import cachedproperty from tensorflow.keras.utils import to_categorical import h5py import numpy as np import toml import sys sys.path.append(r"C:\Users\bcche\fsdl-text-recognizer-project\lab1") from text_recognizer.datasets.dataset import _download_raw_dataset, Dataset, _parse_args SAMPLE_TO_BALANCE = True # If true, take at most the mean number of instances per class. RAW_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "emnist" METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml" PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / "processed" / "emnist" PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / "byclass.h5" ESSENTIALS_FILENAME = Path(__file__).parents[0].resolve() / "emnist_essentials.json" class EmnistDataset(Dataset): """ "The EMNIST dataset is a set of handwritten character digits derived from the NIST Special Database 19 and converted to a 28x28 pixel image format and dataset structure that directly matches the MNIST dataset." From https://www.nist.gov/itl/iad/image-group/emnist-dataset The data split we will use is
"""Class for loading our own FSDL Handwriting dataset, which encompasses both paragraphs and lines.""" import json import numpy as np import toml from text_recognizer import util from text_recognizer.datasets.dataset import Dataset RAW_DATA_DIRNAME = Dataset.data_dirname() / 'raw' / 'fsdl_handwriting' METADATA_FILENAME = RAW_DATA_DIRNAME / 'metadata.toml' PAGES_DIRNAME = RAW_DATA_DIRNAME / 'pages' class FsdlHandwritingDataset(Dataset): """ FSDL Handwriting dataset gathered in class. """ def __init__(self): self.metadata = toml.load(METADATA_FILENAME) with open(RAW_DATA_DIRNAME / self.metadata['filename']) as f: page_data = [json.loads(line) for line in f.readlines()] self.data_by_page_id = { id_: data for id_, data in (_extract_id_and_data(page_datum) for page_datum in page_data) } def load_or_generate_data(self): if len(self.page_filenames) < len(self.data_by_page_id): self._download_pages()
"""SentenceGenerator class and supporting functions."""
from typing import Optional

from text_recognizer.datasets.dataset import Dataset

# Directory for NLTK data: <data_dirname>/raw/nltk.
NLTK_DATA_DIRNAME = Dataset.data_dirname() / 'raw' / 'nltk'


class SentenceGenerator:
    """Produce text sentences drawn from the Brown corpus.

    Only the signatures are defined here; both methods are empty placeholders.
    """

    def __init__(self, max_length: Optional[int] = None):
        """Accept an optional ``max_length`` without storing it (stub)."""

    def generate(self, max_length: Optional[int] = None) -> str:
        """Placeholder for sentence sampling.

        NOTE(review): the signature is annotated ``-> str``, but this stub
        currently returns None.
        """


def brown_text():
    """Placeholder; currently does nothing and returns None."""


def load_nltk_brown_corpus():
    """Placeholder; currently does nothing and returns None."""
"""IamParagraphsDataset class and functions for data processing.""" from boltons.cacheutils import cachedproperty import cv2 import numpy as np from text_recognizer.datasets.dataset import Dataset, _parse_args from text_recognizer.datasets.iam_dataset import IamDataset from text_recognizer import util INTERIM_DATA_DIRNAME = Dataset.data_dirname() / 'interim' / 'iam_paragraphs' DEBUG_CROPS_DIRNAME = INTERIM_DATA_DIRNAME / 'debug_crops' PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / 'processed' / 'iam_paragraphs' CROPS_DIRNAME = PROCESSED_DATA_DIRNAME / 'crops' GT_DIRNAME = PROCESSED_DATA_DIRNAME / 'gt' PARAGRAPH_BUFFER = 50 # pixels in the IAM form images to leave around the lines TEST_FRACTION = 0.2 class IamParagraphsDataset(Dataset): """ Paragraphs from the IAM dataset. """ def __init__(self, load_data: bool = True, subsample_fraction: float = None): self.iam_dataset = IamDataset() if load_data: self.iam_dataset.load_or_generate_data() self.num_classes = 3 self.input_shape = (256, 256) self.output_shape = (256, 256, self.num_classes)
"""Class for loading our own FSDL Handwriting dataset, which encompasses both paragraphs and lines.""" import json import numpy as np import toml from text_recognizer import util from text_recognizer.datasets.dataset import Dataset RAW_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "fsdl_handwriting" METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml" PAGES_DIRNAME = RAW_DATA_DIRNAME / "pages" class FsdlHandwritingDataset(Dataset): """ FSDL Handwriting dataset gathered in class. """ def __init__(self): self.metadata = toml.load(METADATA_FILENAME) with open(RAW_DATA_DIRNAME / self.metadata["filename"]) as f: page_data = [json.loads(line) for line in f.readlines()] # NOTE: pylint bug https://github.com/PyCQA/pylint/issues/3164 # pylint: disable=unnecessary-comprehension self.data_by_page_id = { id_: data for id_, data in (_extract_id_and_data(page_datum) for page_datum in page_data) } # pylint: enable=unnecessary-comprehension
"""SentenceGenerator class and supporting functions.""" import itertools import re import string from typing import Optional import nltk import numpy as np from text_recognizer.datasets.dataset import Dataset NLTK_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "nltk" class SentenceGenerator: """Generate text sentences using the Brown corpus.""" def __init__(self, max_length: Optional[int] = None): self.text = brown_text() self.word_start_inds = [0] + [ _.start(0) + 1 for _ in re.finditer(" ", self.text) ] self.max_length = max_length def generate(self, max_length: Optional[int] = None) -> str: """ Sample a string from text of the Brown corpus of length at least one word and at most max_length, padding it to max_length with the '_' character. """ if max_length is None: max_length = self.max_length if max_length is None:
IamLinesDataset class. We will use a processed version of this dataset, without including code that did the processing. We will look at how to generate processed data from raw IAM data in the IamParagraphsDataset. """ from boltons.cacheutils import cachedproperty import h5py from tensorflow.keras.utils import to_categorical from text_recognizer import util from text_recognizer.datasets.dataset import Dataset, _parse_args from text_recognizer.datasets.emnist_dataset import EmnistDataset PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / 'processed' / 'iam_lines' PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / 'iam_lines.h5' PROCESSED_DATA_URL = 'https://s3-us-west-2.amazonaws.com/fsdl-public-assets/iam_lines.h5' class IamLinesDataset(Dataset): """ Note that we use cachedproperty because data takes time to load. """ def __init__(self, subsample_fraction: float = None): self.mapping = EmnistDataset().mapping self.inverse_mapping = {v: k for k, v in self.mapping.items()} self.num_classes = len(self.mapping) self.input_shape = (28, 952) self.output_shape = (97, self.num_classes)
"""Class for loading the IAM dataset, which encompasses both paragraphs and lines, with associated utilities.""" import os from typing import Dict, List import xml.etree.ElementTree as ElementTree import zipfile from boltons.cacheutils import cachedproperty import toml from text_recognizer.datasets.dataset import Dataset, _download_raw_dataset RAW_DATA_DIRNAME = Dataset.data_dirname() / 'raw' / 'iam' METADATA_FILENAME = RAW_DATA_DIRNAME / 'metadata.toml' EXTRACTED_DATASET_DIRNAME = RAW_DATA_DIRNAME / 'iamdb' DOWNSAMPLE_FACTOR = 2 # If images were downsampled, the regions must also be. LINE_REGION_PADDING = 0 # add this many pixels around the exact coordinates class IamDataset(Dataset): """ "The IAM Lines dataset, first published at the ICDAR 1999, contains forms of unconstrained handwritten text, which were scanned at a resolution of 300dpi and saved as PNG images with 256 gray levels. From http://www.fki.inf.unibe.ch/databases/iam-handwriting-database The data split we will use is IAM lines Large Writer Independent Text Line Recognition Task (lwitlrt): 9,862 text lines. The validation set has been merged into the train set. The train set has 7,101 lines from 326 writers. The test set has 1,861 lines from 128 writers.
"""Class for loading the IAM dataset, which encompasses both paragraphs and lines, with associated utilities.""" import os from typing import Dict, List import xml.etree.ElementTree as ElementTree import zipfile from boltons.cacheutils import cachedproperty import toml from text_recognizer.datasets.dataset import Dataset, _download_raw_dataset RAW_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "iam" METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml" EXTRACTED_DATASET_DIRNAME = RAW_DATA_DIRNAME / "iamdb" DOWNSAMPLE_FACTOR = 2 # If images were downsampled, the regions must also be. LINE_REGION_PADDING = 0 # add this many pixels around the exact coordinates class IamDataset(Dataset): """ "The IAM Lines dataset, first published at the ICDAR 1999, contains forms of unconstrained handwritten text, which were scanned at a resolution of 300dpi and saved as PNG images with 256 gray levels. From http://www.fki.inf.unibe.ch/databases/iam-handwriting-database The data split we will use is IAM lines Large Writer Independent Text Line Recognition Task (lwitlrt): 9,862 text lines. The validation set has been merged into the train set. The train set has 7,101 lines from 326 writers. The test set has 1,861 lines from 128 writers.
"""IamParagraphsDataset class and functions for data processing.""" from boltons.cacheutils import cachedproperty from tensorflow.keras.utils import to_categorical import cv2 import numpy as np from text_recognizer.datasets.dataset import Dataset, _parse_args from text_recognizer.datasets.iam_dataset import IamDataset from text_recognizer import util INTERIM_DATA_DIRNAME = Dataset.data_dirname() / "interim" / "iam_paragraphs" DEBUG_CROPS_DIRNAME = INTERIM_DATA_DIRNAME / "debug_crops" PROCESSED_DATA_DIRNAME = Dataset.data_dirname( ) / "processed" / "iam_paragraphs" CROPS_DIRNAME = PROCESSED_DATA_DIRNAME / "crops" GT_DIRNAME = PROCESSED_DATA_DIRNAME / "gt" PARAGRAPH_BUFFER = 50 # pixels in the IAM form images to leave around the lines TEST_FRACTION = 0.2 class IamParagraphsDataset(Dataset): """ Paragraphs from the IAM dataset. """ def __init__(self, subsample_fraction: float = None): self.iam_dataset = IamDataset() self.iam_dataset.load_or_generate_data() self.num_classes = 3 self.input_shape = (256, 256)
IamLinesDataset class. We will use a processed version of this dataset, without including code that did the processing. We will look at how to generate processed data from raw IAM data in the IamParagraphsDataset. """ from boltons.cacheutils import cachedproperty import h5py from tensorflow.keras.utils import to_categorical from text_recognizer import util from text_recognizer.datasets.dataset import Dataset, _parse_args from text_recognizer.datasets.emnist_lines_dataset import EmnistLinesDataset PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / "processed" / "iam_lines" PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / "iam_lines.h5" PROCESSED_DATA_URL = "https://s3-us-west-2.amazonaws.com/fsdl-public-assets/iam_lines.h5" class IamLinesDataset(Dataset): """ Note that we use cachedproperty because data takes time to load. Parameters ---------- categorical_format If True, then y labels are given as one-hot vectors. with_start_and_end_tokens If True, start and end each sequence with special tokens subsample_fraction