def __init__(self, regions, model_path, data_config=None):
     """Prepare a region dataset for the given trained model.

     Args:
         regions: image regions to wrap, or None to set later via
             set_regions().
         model_path: folder containing the model's "vocab" and
             "data_meta" JSON files.
         data_config: optional configuration dict merged over
             DEFAULT_DATACONFIG. Bug fix: defaults to None instead of a
             shared mutable {} (a dict default is evaluated once and
             shared across all calls).
     """
     self.model_path = model_path
     self._load_vocab()
     self._load_meta()
     self._scaling = 1.0
     self._max_height = 10000
     self._max_width = 10000
     # NOTE(review): original order kept — set_regions() runs before
     # data_config/augmenter are assigned, so `regions` is presumably
     # None at construction time; confirm with callers.
     self.set_regions(regions)
     self.data_config = Configuration(
         {} if data_config is None else data_config, DEFAULT_DATACONFIG)
     self.augmenter = ImageAugmenter(self.data_config)
Example #2
0
    def __init__(self, **kwargs):
        """Initialize the word dataset from keyword options, then load
        and compile the data and build the augmenter."""
        opt = kwargs.get
        self.paper_note_path = opt('paper_note_path',
                                   '../paper-notes/data/words')
        self.meta = Configuration(opt('meta', {}))
        self.data_config = Configuration(opt('data_config', {}))
        self.vocab = opt('vocab', {})
        self.pure = opt('pure', True)
        self.max_length = opt('max_length')

        # Options are in place: materialize the data and the augmenter.
        self._load_data()
        self._compile_sets()
        self.augmenter = ImageAugmenter(self.data_config)
Example #3
0
 def __init__(self, name, transpose=True, data_config=None):
     """Load the prepared dataset `name` from util.OUTPUT_PATH.

     Args:
         name: dataset folder name below util.OUTPUT_PATH.
         transpose: whether batched images get transposed.
         data_config: optional configuration dict. Bug fix: defaults to
             None instead of a shared mutable {} (a dict default is
             evaluated once and shared across all calls).
     """
     self.name = name
     self.data_config = Configuration(
         {} if data_config is None else data_config)
     # Heuristics for minimum batch width in dynamic-width mode.
     self.min_width_factor = 15
     self.max_min_width = 400
     self.datapath = os.path.join(util.OUTPUT_PATH, name)
     self._load_vocab()
     self._load_meta()
     self._load_sets()
     self._calc_max_length()
     self._compile_sets()
     self.transpose = transpose
     self.channels = 1
     self._fill_meta()
     self.augmenter = ImageAugmenter(self.data_config)
     # Per-subset backup of unfiltered file lists (used by filtering).
     self.unfiltered = {}
Example #4
0
 def __init__(self, **kwargs):
     """Configure the slice dataset: geometry, filtering, file lists
     and the augmenter."""
     opt = kwargs.get
     self.paper_note_path = opt(
         'paper_note_path', '../paper-notes/data/final')
     self.slice_width = opt('slice_width', 320)
     self.slice_height = opt('slice_height', 320)
     self.filter = opt('filter', True)
     self.binarize = opt('binarize', False)
     self.single_page = opt('single_page', False)
     self.shuffle = opt('shuffle', True)
     self.slicer = Slicer(**kwargs)
     self.meta = Configuration({})
     self.vocab = {}
     self._load_filelists()
     default_aug = {
         "otf_augmentations": {}
     }
     self.augmenter = ImageAugmenter(opt('config', default_aug))
     self.otf_mentioned = False
Example #5
0
class PaperNoteWords(Dataset):
    """Word-level dataset built from the paper-notes corpus.

    Provides handwritten word images with text ground truth ("train",
    "dev", "test") plus printed-vs-handwritten classification subsets
    ("print_train", "print_dev", "print_test").
    """

    def __init__(self, **kwargs):
        # Root folder with one subfolder per subset.
        self.paper_note_path = kwargs.get('paper_note_path',
                                          '../paper-notes/data/words')
        self.meta = Configuration(kwargs.get('meta', {}))
        self.data_config = Configuration(kwargs.get('data_config', {}))
        # vocab: presumably a pair (index->char, char->index) used by
        # compile()/decompile() — TODO confirm against callers.
        self.vocab = kwargs.get('vocab', {})
        # pure=True selects the "pure_"-prefixed word lists on disk.
        self.pure = kwargs.get('pure', True)

        # NOTE(review): defaults to None; compile() then fails on the
        # padding arithmetic — callers apparently always supply it.
        self.max_length = kwargs.get('max_length')
        self._load_data()
        self._compile_sets()
        self.augmenter = ImageAugmenter(self.data_config)

    def info(self):
        """No-op; kept for interface compatibility with other datasets."""
        pass

    def _compile_set(self, dataset):
        # Cache the numeric label sequence for every item of a subset.
        for item in self.data[dataset]:
            item['compiled'] = self.compile(item['truth'])

    def _compile_sets(self):
        # Only the text subsets need compiled labels; the print_* subsets
        # carry plain class ids.
        self._compile_set("train")
        self._compile_set("dev")
        self._compile_set("test")

    def _load_data(self):
        """Load word lists and classification file lists for all subsets."""
        prefix = "pure_" if self.pure else ""
        self.data = {
            "dev": self._load_wordlist("{}dev".format(prefix)),
            "train": self._load_wordlist("{}train".format(prefix)),
            "test": self._load_wordlist("{}test".format(prefix)),
            "print_dev": self._load_classlist("dev"),
            "print_test": self._load_classlist("test"),
            "print_train": self._load_classlist("train"),
        }

    def _load_wordlist(self, subset):
        """Read <subset>/words.json and map every entry to a file record."""
        basepath = os.path.join(self.paper_note_path, subset)
        words = util.loadJson(basepath, "words")
        parsed = []
        for word in words:
            parsed.append(
                self._fileobj(basepath, "{}.png".format(word), words[word]))
        return parsed

    def _load_classlist(self, subset):
        # Handwritten files are labeled 1; an equally sized random sample
        # of printed files is labeled 0.
        files = self._load_filelist(subset, 1)
        files.extend(
            self._load_filelist("print_{}".format(subset), 0, len(files)))
        return files

    def _load_filelist(self, subset, is_htr, length=None) -> list:
        """List up to `length` random .png files of a subset, labeled
        `is_htr`. Returns [] when the subset folder does not exist."""
        basepath = os.path.join(self.paper_note_path, subset)
        if os.path.exists(basepath):
            all_files = os.listdir(basepath)
            shuffle(all_files)
            # NOTE(review): the length cap is applied before the ".png"
            # filter, so fewer than `length` files may be returned.
            length = len(all_files) if length is None else min(
                length, len(all_files))
            files = list(
                filter(lambda x: x.endswith(".png"), all_files[:length]))
            return list(
                map(lambda x: self._fileobj(basepath, x, is_htr), files))
        return []

    def _fileobj(self, basepath: str, filename: str, truth):
        # Minimal record: image path plus its ground truth (text for word
        # lists, class id for file lists).
        return {
            "path": os.path.join(basepath, filename),
            "truth": truth,
        }

    def compile(self, text):
        """Encode `text` via the vocab and right-pad with -1 to max_length."""
        parsed = [self.vocab[1][c] for c in text]
        parsed.extend([-1] * (self.max_length - len(text)))
        return parsed

    def decompile(self, values):
        """Decode numeric `values` back to a string; unknown ids become ''."""
        def getKey(key):
            try:
                return self.vocab[0][str(key)]
            except KeyError:
                return ''

        return ''.join([getKey(c) for c in values])

    def getBatchCount(self, batch_size, max_batches=0, dataset="train"):
        """Number of batches in `dataset`, capped by max_batches when > 0."""
        total_len = len(self.data[dataset])
        num_batches = int(math.ceil(float(total_len) / batch_size))
        return min(num_batches,
                   max_batches) if max_batches > 0 else num_batches

    def generateBatch(self,
                      batch_size,
                      max_batches=0,
                      dataset="train",
                      with_filepath=False,
                      augmentable=False):
        """Yield batches of (X, Y, L[, F]) for `dataset`.

        Shuffles the subset in place first when the 'shuffle_epoch'
        data_config option is set.
        """
        num_batches = self.getBatchCount(batch_size, max_batches, dataset)
        if self.data_config.default('shuffle_epoch', False):
            shuffle(self.data[dataset])
        for b in range(num_batches):
            yield self._load_batch(b,
                                   batch_size,
                                   dataset,
                                   with_filepath,
                                   augmentable=augmentable)
        pass

    def load_image(self, path, transpose=False, augmentable=False):
        """Load a grayscale image, preprocess/augment it and pad it to the
        meta target size. Returns None when the file is unreadable or
        preprocessing fails."""
        # Target size shrinks by the configured padding on every side.
        target_size = (
            int(self.meta["height"] -
                (self.data_config.default('preprocess.padding', 0) * 2)),
            int(self.meta["width"] -
                (self.data_config.default('preprocess.padding', 0) * 2)))
        x = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if x is None or x.shape[0] == 0 or x.shape[1] == 0:
            return None
        x = self.augmenter.preprocess(x, target_size)
        if x is None:
            return None
        if self.data_config.default("otf_augmentations",
                                    False) and augmentable:
            x = self.augmenter.augment(x)
        else:
            x = self.augmenter.add_graychannel(x)

        if x.shape[1] != self.meta["width"] or x.shape[0] != self.meta[
                "height"]:
            x = self.augmenter.pad_to_size(x,
                                           width=self.meta["width"],
                                           height=self.meta["height"])

        return self.augmenter.add_graychannel(x)

    def _loadline(self, line, transpose=True, augmentable=False):
        # Word sample: (image, compiled label, label length, path).
        l = len(line["truth"])
        y = np.asarray(line["compiled"])
        x = self.load_image(line["path"], augmentable=augmentable)
        return x, y, l, line["path"]

    def _loadprintline(self, line, transpose=True, augmentable=False):
        # Classification sample: label wrapped in a list, length unused (0).
        y = line["truth"]
        x = self.load_image(line["path"], augmentable=augmentable)
        return x, [y], 0, line["path"]

    def _load_batch(self,
                    index,
                    batch_size,
                    dataset,
                    with_filepath=False,
                    augmentable=False):
        """Assemble batch `index`: images X, labels Y, lengths L, paths F.

        Samples whose image fails to load are silently skipped.
        """
        X = []
        Y = []
        L = []
        F = []

        # print_* subsets carry class labels instead of compiled text.
        parseline = self._loadline if not dataset.startswith(
            "print_") else self._loadprintline

        for idx in range(
                index * batch_size,
                min((index + 1) * batch_size, len(self.data[dataset]))):
            x, y, l, f = parseline(self.data[dataset][idx],
                                   augmentable=augmentable)
            if x is not None:
                X.append(x)
                Y.append(y)
                L.append(l)
                F.append(f)
        X = np.asarray(X)
        Y = np.asarray(Y)
        L = np.asarray(L)
        if not with_filepath:
            return X, Y, L
        else:
            return X, Y, L, F

    # deprecated

    def generateEpochs(self,
                       batch_size,
                       num_epochs,
                       max_batches=0,
                       dataset="train",
                       with_filepath=False,
                       augmentable=False):
        """Deprecated: yield one generateBatch generator per epoch."""
        for e in range(num_epochs):
            yield self.generateBatch(batch_size,
                                     max_batches=max_batches,
                                     dataset=dataset,
                                     with_filepath=with_filepath,
                                     augmentable=augmentable)
Example #6
0
class PaperNoteSlices(Dataset):
    """Slices of full paper-note pages paired with their "stripped"
    (handwriting-only) counterparts, for segmentation-style training.

    Slice size -1/-1 switches to whole-page mode (one page per batch).
    """

    # just need to be regenerated if the dataset changes completely!
    # Average page sizes per subset; only used to estimate batch counts.
    average_sizes = {
        "dev": [3078, 2217],
        "test": [3066, 2206],
        "train": [3079, 2225]
    }
    # Lazily created iterator over the files of the current subset.
    file_iter = None

    def __init__(self, **kwargs):
        self.paper_note_path = kwargs.get(
            'paper_note_path', '../paper-notes/data/final')
        # Slice geometry; -1/-1 means whole-page mode.
        self.slice_width = kwargs.get('slice_width', 320)
        self.slice_height = kwargs.get('slice_height', 320)
        # filter: True drops mostly-empty slices; a float keeps empty
        # slices with that probability (see _load_file).
        self.filter = kwargs.get('filter', True)
        self.binarize = kwargs.get('binarize', False)
        self.single_page = kwargs.get('single_page', False)
        self.slicer = Slicer(**kwargs)
        self.meta = Configuration({})
        self.shuffle = kwargs.get('shuffle', True)
        self.vocab = {}
        self._load_filelists()
        self.augmenter = ImageAugmenter(kwargs.get('config', {
            "otf_augmentations": {}
        }))
        self.otf_mentioned = False

    def info(self):
        """No-op; kept for interface compatibility with other datasets."""
        pass

    def _load_filelists(self):
        # One file list per subset folder.
        self.data = {
            "dev": self._load_filelist("dev"),
            "train": self._load_filelist("train"),
            "test": self._load_filelist("test")
        }

    def _load_filelist(self, subset):
        """Collect all "*-paper.png" pages of a subset folder."""
        basepath = os.path.join(self.paper_note_path, subset)
        all_files = os.listdir(basepath)
        files = list(filter(lambda x: x.endswith("-paper.png"), all_files))
        return list(map(lambda x: self._fileobj(basepath, x), files))

    def _fileobj(self, basepath: str, filename: str):
        # Pair the full page with its stripped (handwriting-only) version,
        # matched by the numeric prefix of the filename.
        num = filename.split("-")[0]
        return {
            "paper": os.path.join(basepath, filename),
            "stripped": os.path.join(basepath, "{}-stripped.png".format(num)),
        }

    def compile(self, text):
        """Identity; this dataset has no text labels."""
        return text

    def decompile(self, values):
        """Identity; this dataset has no text labels."""
        return values

    def merge_slices(self, slices, original_shape):
        """Reassemble slices into a full page via the slicer."""
        return self.slicer.merge(slices, original_shape)

    def binarization(self, img):
        # Threshold at 254: everything below pure white becomes ink.
        _, out = cv2.threshold(img, 254, 255, cv2.THRESH_BINARY)
        return self.graychannel(out)

    def graychannel(self, img):
        """Ensure a trailing single-channel axis on a 2-D image."""
        if len(img.shape) > 2:
            return img
        return np.reshape(img, [img.shape[0], img.shape[1], 1])

    def _load_file(self, fileobj, augmentable=False):
        """Load a page pair and slice it.

        Returns (paper_slices, stripped_slices); in whole-page mode the
        unsliced images are returned as one-element lists.
        """
        paper = cv2.imread(fileobj["paper"], cv2.IMREAD_GRAYSCALE)
        stripped = cv2.imread(fileobj["stripped"], cv2.IMREAD_GRAYSCALE)
        if self.slice_height == -1 and self.slice_width == -1:
            paper = np.reshape(paper, [paper.shape[0], paper.shape[1], 1])
            stripped = np.reshape(
                stripped, [stripped.shape[0], stripped.shape[1], 1])
            return [paper], [stripped]
        slices_paper, slices_stripped = self.slicer(
            paper), self.slicer(stripped)
        final_paper, final_stripped = [], []
        for i in range(len(slices_paper)):
            # Keep a slice if it contains ink (dark pixel in the stripped
            # version), filtering is off, or the probabilistic filter hits.
            m = np.min(slices_stripped[i])
            if m < 125 or not self.filter or (isinstance(self.filter, float) and np.random.uniform() < self.filter):
                p_slice = slices_paper[i]
                s_slice = slices_stripped[i]
                if augmentable:
                    p_slice, s_slice = self._augment_slice(p_slice, s_slice)
                # The label side is always binarized; the input only on demand.
                s_slice = self.binarization(s_slice)
                if self.binarize:
                    p_slice = self.binarization(p_slice)
                final_paper.append(p_slice)
                final_stripped.append(s_slice)
        return final_paper, final_stripped

    def _augment_slice(self, paper, stripped):
        # Apply identical augmentation parameters to input and label.
        paper, settings = self.augmenter.augment(paper, True)
        stripped = self.augmenter.apply_augmentation(stripped, settings)
        return paper, stripped

    def _get_slices(self, paper, stripped, free):
        """Split slice lists into (taken_paper, taken_stripped,
        surplus_paper, surplus_stripped) given `free` batch slots."""
        if free > len(paper):
            return paper, stripped, [], []
        else:
            return paper[:free], stripped[:free], paper[free:], stripped[free:]

    def _process_labels(self, label):
        """Turn a binarized label image into a 2-channel one-hot map
        (channel 0 = background, channel 1 = ink)."""
        label = np.asarray(label)
        label = np.int32(label/255.0)
        nx = label.shape[1]
        ny = label.shape[0]
        label = np.reshape(label, (ny, nx))
        labels = np.zeros((ny, nx, 2), dtype=np.float32)
        labels[..., 1] = label
        labels[..., 0] = 1 - label
        return labels

    # override
    def next_file(self, dataset):
        """Advance self.file to the next page; False when exhausted.

        NOTE: shuffles self.data[dataset] in place when self.shuffle is set.
        """
        if self.file_iter is None:
            files = self.data[dataset]
            if self.shuffle:
                shuffle(files)
            self.file_iter = iter(files)
        try:
            self.file = next(self.file_iter)
            return True
        except StopIteration:
            return False

    def generateBatch(self, batch_size=0, max_batches=0, dataset="train", with_filepath=False, augmentable=False):
        """Yield (paper_batch, label_list, [][, []]) tuples.

        In whole-page mode the batch size is forced to one page;
        otherwise slices from consecutive pages are packed into batches
        of `batch_size`, carrying surplus slices over between pages.
        """
        surplus_paper = []
        surplus_stripped = []
        batch_paper = []
        batch_stripped = []
        total_batches = 0
        # Whole-page mode forces one page per batch.
        batch_size = batch_size if self.slice_height != - \
            1 or self.slice_width != -1 else 1
        # cf tracks the current file in single_page mode; last_batch flags
        # the final (possibly short) batch.
        cf = None
        last_batch = False
        while True:
            if self.slice_height == -1 and self.slice_width == -1:
                if not self.single_page:
                    if not self.next_file(dataset):
                        self.file_iter = None
                        break
                else:
                    # single_page: stop once the same file comes up again.
                    if cf == self.file:
                        break
                batch_paper, batch_stripped = self._load_file(
                    self.file, augmentable)
            else:
                # First consume surplus slices left over from the last page.
                if len(surplus_paper) > 0 and len(batch_paper) < batch_size:
                    new_paper, new_stripped, surplus_paper, surplus_stripped = self._get_slices(
                        surplus_paper, surplus_stripped, batch_size - len(batch_paper))
                    batch_paper.extend(new_paper)
                    batch_stripped.extend(new_stripped)

                if len(batch_paper) < batch_size:
                    if not self.single_page:
                        if not self.next_file(dataset):
                            self.file_iter = None
                            break
                    else:
                        if cf == self.file:
                            last_batch = True
                        else:
                            cf = self.file
                    if not last_batch:
                        paper, stripped = self._load_file(
                            self.file, augmentable)
                        new_paper, new_stripped, surplus_paper, surplus_stripped = self._get_slices(
                            paper, stripped, batch_size - len(batch_paper))
                        batch_paper.extend(new_paper)
                        batch_stripped.extend(new_stripped)

            if len(batch_paper) >= batch_size or last_batch:
                if len(batch_paper) == 0:
                    break
                # Scale inputs to [0, 1]; labels become one-hot maps.
                batch_paper = np.asarray(batch_paper)/255.0
                Y_ = []
                for y in batch_stripped:
                    Y_.append(self._process_labels(y))
                if with_filepath:
                    yield batch_paper, Y_, [], []
                else:
                    yield batch_paper, Y_, []
                total_batches += 1
                if max_batches > 0 and total_batches >= max_batches or last_batch:
                    break
                batch_paper = []
                batch_stripped = []
        self.file_iter = None
        pass

    # override
    def generateEpochs(self, batch_size, num_epochs, max_batches=0, dataset="train", with_filepath=False):
        # NOTE(review): all arguments are ignored and a single generator
        # with default settings is returned — looks unintended; confirm
        # against callers before changing.
        return [self.generateBatch()]

    # override
    def getBatchCount(self, batch_size, max_batches=0, dataset="train"):
        """Estimate the batch count from the subset's average page size."""
        batch_size = batch_size if self.slice_height != - \
            1 or self.slice_width != -1 else 1
        if self.slice_height == -1 and self.slice_width == -1:
            num_batches = len(self.data[dataset])
        else:
            # Expected slices per page = page area / slice area.
            num_batches = np.floor(float(np.prod(self.average_sizes[dataset]))/np.prod(
                [self.slice_height, self.slice_width]))*len(self.data[dataset])
        # NOTE(review): returns a numpy float, not a Python int.
        batch_count = np.ceil(num_batches/batch_size)
        return batch_count if max_batches == 0 else min(max_batches, batch_count)

    def _averageSize(self, subset):
        """Mean (height, width) over the subset's paper pages."""
        def get_image_size(file):
            img = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
            return img.shape[:2]
        sizes = list(map(lambda x: get_image_size(
            x["paper"]), self.data[subset]))
        return np.average(sizes, axis=0)
class RegionDataset(Dataset):
    """In-memory dataset over pre-cut image regions, used for inference
    with a trained model's vocab and meta configuration."""

    def __init__(self, regions, model_path, data_config=None):
        """Create the dataset.

        Args:
            regions: iterable of region objects exposing an `img`
                attribute, or None to defer via set_regions().
            model_path: folder holding "vocab" and "data_meta" JSON files.
            data_config: optional dict merged over DEFAULT_DATACONFIG.
                Bug fix: defaults to None instead of a shared mutable {}.
        """
        self.model_path = model_path
        self._load_vocab()
        self._load_meta()
        self._scaling = 1.0
        self._max_height = 10000
        self._max_width = 10000
        # NOTE(review): original order kept — set_regions() runs before
        # data_config/augmenter exist, so `regions` is presumably None at
        # construction time; confirm with callers.
        self.set_regions(regions)
        self.data_config = Configuration(
            {} if data_config is None else data_config, DEFAULT_DATACONFIG)
        self.augmenter = ImageAugmenter(self.data_config)

    def info(self):
        """Print the dataset configuration via the meta Configuration."""
        self.meta('Dataset Configuration')

    def scaling(self, scaling, max_height, max_width):
        """Set the preprocessing scale factor and the clamp dimensions."""
        self.augmenter.config['preprocess.scale'] = scaling
        self._max_height = max_height
        self._max_width = max_width

    def _load_meta(self):
        # Image geometry etc. exported at training time.
        self.meta = Configuration(util.loadJson(self.model_path, "data_meta"))

    def _load_vocab(self):
        # vocab[0]: id -> char, vocab[1]: char -> id (see compile/decompile).
        self.vocab = util.loadJson(self.model_path, "vocab")
        self.vocab_length = len(self.vocab[0])

    def _load_sets(self):
        """Preprocess all regions into one array, dropping failures."""
        self.data = np.asarray(
            list(
                filter(lambda x: x is not None,
                       [self._loadimage(region) for region in self.regions])))

    def _loadimage(self, region):
        """Convert one region image to the preprocessed gray format.

        Falls back to a zero image of the target size when the region is
        empty or preprocessing fails.
        """
        if region.img.shape[0] == 0 or region.img.shape[1] == 0:
            img = np.zeros((self.meta["height"], self.meta["width"]))
        elif len(region.img.shape) > 2:
            img = cv2.cvtColor(region.img, cv2.COLOR_BGR2GRAY)
        else:
            img = region.img
        # Target size shrinks by the configured padding on every side.
        target_size = (
            int(self.meta["height"] -
                (self.data_config.default('preprocess.padding', 0) * 2)),
            int(self.meta["width"] -
                (self.data_config.default('preprocess.padding', 0) * 2)))
        img = self.augmenter.preprocess(img, target_size)
        if img is not None:
            img = self.augmenter.postprocesss(img)
        if img is None:
            img = np.zeros((self.meta["height"], self.meta["width"]))
        return self.augmenter.add_graychannel(img)

    def set_regions(self, regions):
        """Replace the current regions and rebuild the data array."""
        self.regions = regions
        if regions is not None:
            self._load_sets()

    def compile(self, text):
        """Encode `text` via the vocab, right-padded with -1.

        NOTE(review): relies on self.max_length which is never assigned in
        this class — confirm it is unused here or set externally.
        """
        parsed = [self.vocab[1][c] for c in text]
        parsed.extend([-1] * (self.max_length - len(text)))
        return parsed

    def decompile(self, values):
        """Decode numeric label ids to a string; unknown ids become ''."""
        def getKey(key):
            try:
                return self.vocab[0][str(key)]
            except KeyError:
                return ''

        return ''.join([getKey(c) for c in values])

    def _load_batch(self, index, batch_size, dataset, with_filepath=False):
        # Labels/lengths are unknown at inference time, hence empty lists.
        batch_data = np.asarray(
            self.data[index * batch_size:min((index + 1) *
                                             batch_size, len(self.data))])
        if with_filepath:
            return batch_data, [], [], []
        else:
            return batch_data, [], []

    def generateBatch(self,
                      batch_size=0,
                      max_batches=0,
                      dataset="",
                      with_filepath=False):
        """Yield (X, [], [][, []]) batches over all regions."""
        num_batches = self.getBatchCount(batch_size, max_batches, "")
        for b in range(num_batches):
            yield self._load_batch(b, batch_size, "", with_filepath)

    def generateEpochs(self,
                       batch_size,
                       num_epochs,
                       max_batches=0,
                       dataset="train",
                       with_filepath=False):
        """Return a single batch generator wrapped in a list.

        Bug fix: the caller's arguments are now forwarded — previously
        generateBatch() was invoked with its defaults, so batch_size=0
        caused a ZeroDivisionError in getBatchCount once iterated.
        num_epochs is still ignored, matching the original
        single-generator contract of this override.
        """
        return [
            self.generateBatch(batch_size,
                               max_batches=max_batches,
                               with_filepath=with_filepath)
        ]

    def getBatchCount(self, batch_size, max_batches=0, dataset=""):
        """Batch count over the regions.

        Bug fix: max_batches (> 0) now caps the result, consistent with
        the other dataset implementations; it used to be ignored.
        """
        num_batches = int(np.ceil(len(self.data) / float(batch_size)))
        return min(num_batches,
                   max_batches) if max_batches > 0 else num_batches
Example #8
0
class PreparedDataset(Dataset):
    """Dataset prepared on disk under util.OUTPUT_PATH/<name>, with JSON
    subset lists ("train"/"dev"/"test", optionally "print_*") plus vocab
    and meta files."""

    def __init__(self, name, transpose=True, data_config=None):
        """Load the prepared dataset `name`.

        Args:
            name: dataset folder name below util.OUTPUT_PATH.
            transpose: whether batched images get transposed.
            data_config: optional configuration dict. Bug fix: defaults
                to None instead of a shared mutable {}.
        """
        self.name = name
        self.data_config = Configuration(
            {} if data_config is None else data_config)
        # Heuristics for the minimum batch width in dynamic-width mode.
        self.min_width_factor = 15
        self.max_min_width = 400
        self.datapath = os.path.join(util.OUTPUT_PATH, name)
        self._load_vocab()
        self._load_meta()
        self._load_sets()
        self._calc_max_length()
        self._compile_sets()
        self.transpose = transpose
        self.channels = 1
        self._fill_meta()
        self.augmenter = ImageAugmenter(self.data_config)
        # Per-subset backup of the unfiltered lists (see _filter_by_type).
        self.unfiltered = {}

    def load_vocab(self, path):
        """Swap in the vocab found at `path` and recompile the labels."""
        self._load_vocab(path)
        self._compile_sets()
        self._fill_meta()

    def info(self):
        """Print the dataset configuration via the meta Configuration."""
        self.meta('Dataset Configuration')

    def _load_meta(self):
        self.meta = Configuration(util.loadJson(self.datapath, "meta"))

    def _load_vocab(self, path=None):
        # vocab[0]: id -> char, vocab[1]: char -> id.
        path = path or self.datapath
        self.vocab = util.loadJson(path, "vocab")
        self.vocab_length = len(self.vocab[0])

    def _fill_meta(self):
        """Mirror the vocab size and subset counts into the meta config."""
        self.meta['vocab.size'] = self.vocab_length
        # Bug fix: 'train.count' was assigned twice (duplicate line removed).
        self.meta['train.count'] = len(self.data['train'])
        self.meta['dev.count'] = len(self.data['dev'])
        self.meta['test.count'] = len(self.data['test'])
        if 'print_train' in self.data:
            self.meta['print_train.count'] = len(self.data['print_train'])
            self.meta['print_dev.count'] = len(self.data['print_dev'])
            self.meta['print_test.count'] = len(self.data['print_test'])

    def _load_sets(self):
        """Load the subset JSON lists; optionally the printed sets and a
        width-descending sort."""
        self.data = {
            "train": util.loadJson(self.datapath, "train"),
            "dev": util.loadJson(self.datapath, "dev"),
            "test": util.loadJson(self.datapath, "test")
        }
        if self.meta.default('printed', False):
            self.data['print_train'] = util.loadJson(self.datapath,
                                                     "print_train")
            self.data['print_dev'] = util.loadJson(self.datapath, "print_dev")
            self.data['print_test'] = util.loadJson(self.datapath,
                                                    "print_test")
        if self.data_config.default('sort_by_width', False):
            self._sort_by_width("train")
            self._sort_by_width("dev")
            if self.meta.default('printed', False):
                self._sort_by_width("print_train")
                self._sort_by_width("print_dev")

    def _sort_by_width(self, dataset):
        """Sort a subset by image width, widest first (reads every file)."""
        print("Sorting {} dataset by width...".format(dataset))
        for datapoint in self.data[dataset]:
            img = cv2.imread(datapoint["path"], cv2.IMREAD_GRAYSCALE)
            datapoint["width"] = img.shape[1]
        self.data[dataset].sort(key=lambda x: x["width"], reverse=True)

    def _compile_set(self, dataset):
        # Cache the numeric label sequence for every item of a subset.
        for item in self.data[dataset]:
            item['compiled'] = self.compile(item['truth'])

    def _filter_by_type(self, subset):
        """Probabilistically keep items based on data_config 'type_probs'.

        The first call backs up the unfiltered list so filtering can be
        re-rolled every epoch.
        """
        filtered = []

        if subset not in self.unfiltered:
            self.unfiltered[subset] = self.data[subset]
        for file in self.unfiltered[subset]:
            if file['type'] in self.data_config['type_probs']:
                if np.random.uniform() <= self.data_config['type_probs'][
                        file['type']]:
                    filtered.append(file)
            else:
                filtered.append(file)
        self.data[subset] = filtered

    def _filter_data(self):
        # Only the training subsets are filtered.
        if self.data_config.default('type_probs', False):
            self._filter_by_type('train')
            if self.meta.default('printed', False):
                self._filter_by_type('print_train')

    def _compile_sets(self):
        # Only the text subsets need compiled labels.
        self._compile_set("train")
        self._compile_set("dev")
        self._compile_set("test")

    def _calc_max_length(self):
        """Longest ground-truth text across train/dev/test."""
        _all = []
        _all.extend(self.data["train"])
        _all.extend(self.data["test"])
        _all.extend(self.data["dev"])
        self.max_length = max(map(lambda x: len(x["truth"]), _all))

    def compile(self, text):
        """Encode `text` via the vocab and right-pad with -1 to max_length."""
        parsed = [self.vocab[1][c] for c in text]
        # if not self.dynamic_width:
        parsed.extend([-1] * (self.max_length - len(text)))
        return parsed

    def decompile(self, values):
        """Decode numeric `values` back to a string; unknown ids become ''."""
        def getKey(key):
            try:
                return self.vocab[0][str(key)]
            except KeyError:
                return ''

        return ''.join([getKey(c) for c in values])

    def load_image(self, path, transpose=False, augmentable=False):
        """Load a grayscale image, augment/graychannel it and pad it to
        the meta size (skipped in dynamic_width mode).

        NOTE(review): on a transpose ValueError a 4-tuple of Nones is
        returned instead of a single value — kept as-is because callers
        appear to depend on a non-None result there; confirm intent.
        Bug fix: removed the unreachable trailing `return x` (both
        branches of the if/else already return).
        """
        x = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if self.data_config.default("otf_augmentations",
                                    False) and augmentable:
            x = self.augmenter.augment(x)
        else:
            x = self.augmenter.add_graychannel(x)
        if transpose:
            try:
                x = np.transpose(x, [1, 0])
                if self.data_config.default('dynamic_width', False):
                    return self.augmenter.add_graychannel(x)
            except ValueError:
                return None, None, None, None
            if x.shape[0] != self.meta["width"] or x.shape[1] != self.meta[
                    "height"]:
                x = self.augmenter.pad_to_size(x,
                                               width=self.meta["width"],
                                               height=self.meta["height"])
            return self.augmenter.add_graychannel(x)
        else:
            if self.data_config.default('dynamic_width', False):
                return self.augmenter.add_graychannel(x)
            if x.shape[1] != self.meta["width"] or x.shape[0] != self.meta[
                    "height"]:
                x = self.augmenter.pad_to_size(x,
                                               width=self.meta["width"],
                                               height=self.meta["height"])
            return self.augmenter.add_graychannel(x)

    def _loadline(self, line, transpose=True, augmentable=False):
        # Word sample: (postprocessed image, compiled label, length, path).
        l = len(line["truth"])
        y = np.asarray(line["compiled"])
        x = self.load_image(line["path"], augmentable=augmentable)
        return self.augmenter.postprocesss(x), y, l, line["path"]

    def _loadprintline(self, line, transpose=True, augmentable=False):
        # Classification sample: label wrapped in a list, length unused (0).
        y = line["truth"]
        x = self.load_image(line["path"], augmentable=augmentable)
        return self.augmenter.postprocesss(x), [y], 0, line["path"]

    def _load_batch(self,
                    index,
                    batch_size,
                    dataset,
                    with_filepath=False,
                    augmentable=False):
        """Assemble batch `index` of (X, Y, L[, F]).

        In dynamic_width mode images are right-padded into a common-width
        int32 tensor; the width is at least L.max() * min_width_factor
        while below max_min_width. Failed image loads are skipped.
        """
        X = []
        Y = []
        L = []
        F = []

        # print_* subsets carry class labels instead of compiled text.
        parseline = self._loadline if not dataset.startswith(
            "print_") else self._loadprintline
        for idx in range(
                index * batch_size,
                min((index + 1) * batch_size, len(self.data[dataset]))):
            x, y, l, f = parseline(self.data[dataset][idx],
                                   self.transpose,
                                   augmentable=augmentable)
            if x is not None:
                X.append(x)
                Y.append(y)
                L.append(l)
                F.append(f)
        if self.data_config.default('dynamic_width', False):
            # Small safety margin on the label lengths.
            L = np.asarray(L) + 5
            batch_width = np.max(list(map(lambda _x: _x.shape[1], X)))
            if batch_width < self.max_min_width:
                batch_width = max(batch_width,
                                  np.max(L) * self.min_width_factor)
            X_ = np.zeros((len(X), self.meta["height"], batch_width, 1),
                          dtype=np.int32)
            for idx in range(len(X)):
                X_[idx, 0:X[idx].shape[0], 0:X[idx].shape[1], :] = X[idx]
            X = X_
            Y = np.asarray(Y)
        else:
            X = np.asarray(X)
            Y = np.asarray(Y)
            L = np.asarray(L)
        if not with_filepath:
            return X, Y, L
        else:
            return X, Y, L, F

    def before_epoch(self, subset):
        """Reshuffle (when configured) and re-roll the type filtering."""
        if self.data_config.default('shuffle_epoch', False):
            if subset in self.unfiltered:
                shuffle(self.unfiltered[subset])
            else:
                shuffle(self.data[subset])
        self._filter_data()

    def generateBatch(self,
                      batch_size,
                      max_batches=0,
                      dataset="train",
                      with_filepath=False,
                      augmentable=False):
        """Yield batches of (X, Y, L[, F]) for `dataset`."""
        num_batches = self.getBatchCount(batch_size, max_batches, dataset)
        for b in range(num_batches):
            yield self._load_batch(b,
                                   batch_size,
                                   dataset,
                                   with_filepath,
                                   augmentable=augmentable)

    # deprecated
    def generateEpochs(self,
                       batch_size,
                       num_epochs,
                       max_batches=0,
                       dataset="train",
                       with_filepath=False,
                       augmentable=False):
        """Deprecated: yield one generateBatch generator per epoch."""
        for e in range(num_epochs):
            yield self.generateBatch(batch_size,
                                     max_batches=max_batches,
                                     dataset=dataset,
                                     with_filepath=with_filepath,
                                     augmentable=augmentable)

    def getBatchCount(self, batch_size, max_batches=0, dataset="train"):
        """Number of batches in `dataset`, capped by max_batches when > 0."""
        total_len = len(self.data[dataset])
        num_batches = int(math.ceil(float(total_len) / batch_size))
        return min(num_batches,
                   max_batches) if max_batches > 0 else num_batches