Example #1
0
    def load_samples(self, processes=1, progress_bar=False):
        """Load all samples into memory and attach image/text to each sample.

        Parameters
        ----------
        processes : int
            number of processes used by parallel_map for loading
        progress_bar : bool
            show a progress bar while loading

        Returns
        -------
        list of samples
            the dataset's internal sample list (loaded in place)
        """
        # already loaded: nothing to do
        if self.loaded:
            return self._samples

        data = parallel_map(self._load_sample,
                            self._samples,
                            desc="Loading Dataset",
                            processes=processes,
                            progress_bar=progress_bar)

        invalid_samples = []
        for i, ((line, text), sample) in enumerate(zip(data, self._samples)):
            sample["image"] = line
            sample["text"] = text
            # BUGFIX: also treat a missing (None) image as invalid; previously
            # `line is not None and (...)` let None images pass through
            # silently and break downstream processing.
            if line is None or (line.size == 0
                                or np.amax(line) == np.amin(line)):
                if self.skip_invalid:
                    invalid_samples.append(i)
                    print("Empty data: Image at '{}' is empty".format(
                        sample['id']))
                else:
                    raise Exception(
                        "Empty data: Image at '{}' is empty".format(
                            sample['id']))

        if self.remove_invalid:
            # remove all invalid samples (reversed order!)
            for i in sorted(invalid_samples, reverse=True):
                del self._samples[i]

        self.loaded = True

        return self._samples
Example #2
0
    def evaluate(_sentinel=None, gt_data=None, pred_data=None, processes=1, progress_bar=False):
        """ evaluate on the given raw data

        Parameters
        ----------
        _sentinel : do not use
            Forcing the use of `gt_data` and `pred_data` for safety
        gt_data : Dataset, optional
            the ground truth
        pred_data : Dataset
            the prediction dataset
        processes : int, optional
            the processes to use for preprocessing and evaluation
        progress_bar : bool, optional
            show a progress bar

        Returns
        -------
        evaluation dictionary
        """
        # BUGFIX: the sentinel was documented but never enforced; a positional
        # call used to bind the ground truth to _sentinel silently.
        if _sentinel is not None:
            raise Exception("Call this function by specifying gt_data and pred_data as keyword arguments")
        if len(gt_data) != len(pred_data):
            raise Exception("Mismatch in gt and pred files count: {} vs {}".format(len(gt_data), len(pred_data)))

        # evaluate single lines
        out = parallel_map(Evaluator.evaluate_single_args, list(zip(gt_data, pred_data)),
                           processes=processes, progress_bar=progress_bar, desc="Evaluation")

        return Evaluator.evaluate_single_list(out, True)
    def augment_datas(self,
                      datas,
                      gt_txts,
                      n_augmentations,
                      processes=1,
                      progress_bar=False):
        """Create augmented copies of the given data/text pairs.

        Parameters
        ----------
        datas : list
            input data lines
        gt_txts : list
            ground-truth texts, parallel to `datas`
        n_augmentations : int
            number of augmented variants to generate per sample (>= 0)
        processes : int
            processes used by parallel_map
        progress_bar : bool
            show a progress bar

        Returns
        -------
        tuple(list, list)
            the original data followed by the augmented data, and the
            matching texts in the same order
        """
        # BUGFIX: check the type first so a non-numeric argument raises the
        # documented ValueError instead of a TypeError from the comparison.
        if not isinstance(n_augmentations, int) or n_augmentations < 0:
            raise ValueError("Number of augmentation must be an integer >= 0")

        if n_augmentations == 0:
            return datas, gt_txts

        out = parallel_map(self.augment_data_tuple,
                           list(
                               zip(datas, gt_txts,
                                   [n_augmentations] * len(datas))),
                           desc="Augmentation",
                           processes=processes,
                           progress_bar=progress_bar)
        out_d, out_t = [], []
        for d, t in out:
            out_d += d
            out_t += t

        return datas + out_d, gt_txts + out_t
Example #4
0
    def apply(self, txts, processes=1, progress_bar=False):
        """Preprocess a single text or a list of texts.

        A plain string is processed directly; a list is processed in
        parallel via parallel_map. Any other type is rejected.
        """
        if isinstance(txts, str):
            return self._apply_single(txts)

        if isinstance(txts, list):
            if not txts:
                return []
            return parallel_map(self._apply_single,
                                txts,
                                desc="Text Preprocessing",
                                processes=processes,
                                progress_bar=progress_bar)

        raise Exception("Unknown instance of txts: {}. Supported list and str".format(type(txts)))
Example #5
0
    def apply(self, data, processes=1, progress_bar=False):
        """Preprocess a single np.ndarray or a list of arrays.

        Parameters
        ----------
        data : np.ndarray or list
            one array (processed directly) or a list of arrays
            (processed in parallel via parallel_map)
        processes : int
            number of worker processes
        progress_bar : bool
            show a progress bar

        Returns
        -------
        the preprocessed array, or a list of preprocessed arrays
        """
        if isinstance(data, np.ndarray):
            return self._apply_single(data)
        elif isinstance(data, list):
            if len(data) == 0:
                return []

            return parallel_map(self._apply_single, data, desc="Data Preprocessing", processes=processes, progress_bar=progress_bar)
        else:
            # BUGFIX: message previously said "txts" (copy-paste from the
            # text preprocessor); this parameter is named "data".
            raise Exception("Unknown instance of data: {}. Supported list and str".format(type(data)))
Example #6
0
    def load_samples(self, processes=1, progress_bar=False):
        """ Load the samples into the memory

        Useful when a FileDataset shall actually read its files.

        Parameters
        ----------
        processes : int
            number of processes to use for loading
        progress_bar : bool
            show a progress bar of the progress

        Returns
        -------
        list of samples
        """
        if self.loaded:
            return self._samples

        loaded = parallel_map(self._load_sample,
                              self._samples,
                              desc="Loading Dataset",
                              processes=processes,
                              progress_bar=progress_bar)

        # content checks only apply when images will actually be consumed
        check_images = self.mode == DataSetMode.PREDICT or self.mode == DataSetMode.TRAIN
        invalid_samples = []
        for idx, ((img, txt), sample) in enumerate(zip(loaded, self._samples)):
            sample["image"] = img
            sample["text"] = txt
            if not check_images:
                continue
            # skip invalid images (e. g. corrupted or empty files)
            is_invalid = img is None or (img.size == 0
                                         or np.amax(img) == np.amin(img))
            if not is_invalid:
                continue
            if not self.skip_invalid:
                raise Exception(
                    "Empty data: Image at '{}' is empty".format(
                        sample['id']))
            invalid_samples.append(idx)
            if img is None:
                print(
                    "Empty data: Image at '{}' is None (possibly corrupted)"
                    .format(sample['id']))
            else:
                print("Empty data: Image at '{}' is empty".format(
                    sample['id']))

        if self.remove_invalid:
            # delete from the highest index down so earlier indices stay valid
            for idx in sorted(invalid_samples, reverse=True):
                del self._samples[idx]

        self.loaded = True

        return self._samples
Example #7
0
    def evaluate(_sentinel=None, gt_data=None, pred_data=None, processes=1, progress_bar=False):
        """ evaluate on the given raw data

        Parameters
        ----------
        _sentinel : do not use
            Forcing the use of `gt_data` and `pred_data` for safety
        gt_data : Dataset, optional
            the ground truth
        pred_data : Dataset
            the prediction dataset
        processes : int, optional
            the processes to use for preprocessing and evaluation
        progress_bar : bool, optional
            show a progress bar

        Returns
        -------
        evaluation dictionary
        """
        # BUGFIX: the sentinel was documented but never enforced; a positional
        # call used to bind the ground truth to _sentinel silently.
        if _sentinel is not None:
            raise Exception("Call this function by specifying gt_data and pred_data as keyword arguments")
        if len(gt_data) != len(pred_data):
            raise Exception("Mismatch in gt and pred files count: {} vs {}".format(len(gt_data), len(pred_data)))

        # evaluate single lines
        out = parallel_map(Evaluator.evaluate_single, list(zip(gt_data, pred_data)),
                           processes=processes, progress_bar=progress_bar, desc="Evaluation")

        # sum all errors up
        total_chars = 0
        total_char_errs = 0
        confusion = {}
        total_sync_errs = 0
        for chars, char_errs, sync_errs, conf in out:
            total_chars += chars
            total_char_errs += char_errs
            total_sync_errs += sync_errs
            for key, value in conf.items():
                if key not in confusion:
                    confusion[key] = value
                else:
                    confusion[key] += value

        # Note the sync errs can be higher than the true edit distance because
        # replacements are counted as 1
        # e.g. ed(in ewych, ierg ch) = 5
        #      sync(in ewych, ierg ch) = [{i: i}, {n: erg}, {ewy: }, {ch: ch}] = 6

        return {
            "single": out,
            # BUGFIX: guard against ZeroDivisionError on empty datasets
            "avg_ler": total_char_errs / total_chars if total_chars else 0,
            "total_chars": total_chars,
            "total_char_errs": total_char_errs,
            "total_sync_errs": total_sync_errs,
            "confusion": confusion,
        }
Example #8
0
    def apply(self, data, processes=1, progress_bar=False, max_tasks_per_child=100):
        """Preprocess one np.ndarray directly, or a list/tuple of arrays in parallel."""
        if isinstance(data, np.ndarray):
            # single array: no parallelism needed
            return self._apply_single(data)

        if isinstance(data, (list, tuple)):
            if not data:
                return []
            return parallel_map(self._apply_single, data,
                                desc="Data Preprocessing",
                                processes=processes,
                                progress_bar=progress_bar,
                                max_tasks_per_child=max_tasks_per_child)

        raise Exception("Unknown instance of data: {}. Supported list and str".format(type(data)))
Example #9
0
    def augment_datas(self, datas, gt_txts, n_augmentations, processes=1, progress_bar=False):
        """Return the originals extended by n_augmentations augmented copies per sample."""
        # nothing requested: hand back the inputs untouched
        if n_augmentations <= 0:
            return datas, gt_txts

        tasks = list(zip(datas, gt_txts, [n_augmentations] * len(datas)))
        results = parallel_map(self.augment_data_tuple, tasks,
                               desc="Augmentation", processes=processes,
                               progress_bar=progress_bar)

        aug_d, aug_t = [], []
        for d_part, t_part in results:
            aug_d.extend(d_part)
            aug_t.extend(t_part)

        return datas + aug_d, gt_txts + aug_t
    def augment_datas(self, datas, gt_txts, n_augmentations, processes=1, progress_bar=False):
        """Augment each (data, text) pair n_augmentations times and append the results."""
        if n_augmentations <= 0:
            # no augmentation requested
            return datas, gt_txts

        pairs = parallel_map(self.augment_data_tuple,
                             list(zip(datas, gt_txts, [n_augmentations] * len(datas))),
                             desc="Augmentation", processes=processes, progress_bar=progress_bar)

        # flatten the per-sample result lists
        extra_data = [d for d_part, _ in pairs for d in d_part]
        extra_txts = [t for _, t_part in pairs for t in t_part]

        return datas + extra_data, gt_txts + extra_txts
Example #11
0
    def evaluate(_sentinel=None,
                 gt_data=None,
                 pred_data=None,
                 processes=1,
                 progress_bar=False):
        """Evaluate predictions against ground truth.

        Parameters
        ----------
        _sentinel : do not use
            forces `gt_data` and `pred_data` to be passed as keywords
        gt_data : Dataset, optional
            the ground truth
        pred_data : Dataset
            the prediction dataset
        processes : int, optional
            the processes to use for preprocessing and evaluation
        progress_bar : bool, optional
            show a progress bar

        Returns
        -------
        dict
            evaluation dictionary with per-line results and error totals
        """
        # BUGFIX: the sentinel was never enforced; a positional call used to
        # bind the ground truth to _sentinel silently.
        if _sentinel is not None:
            raise Exception("Call this function by specifying gt_data and pred_data as keyword arguments")
        if len(gt_data) != len(pred_data):
            raise Exception(
                "Mismatch in gt and pred files count: {} vs {}".format(
                    len(gt_data), len(pred_data)))

        # evaluate single lines
        out = parallel_map(Evaluator.evaluate_single,
                           list(zip(gt_data, pred_data)),
                           processes=processes,
                           progress_bar=progress_bar,
                           desc="Evaluation")

        # sum all errors up
        total_chars = 0
        total_char_errs = 0
        confusion = {}
        total_sync_errs = 0
        for chars, char_errs, sync_errs, conf in out:
            total_chars += chars
            total_char_errs += char_errs
            total_sync_errs += sync_errs
            for key, value in conf.items():
                if key not in confusion:
                    confusion[key] = value
                else:
                    confusion[key] += value

        # Note the sync errs can be higher than the true edit distance because
        # replacements are counted as 1
        # e.g. ed(in ewych, ierg ch) = 5
        #      sync(in ewych, ierg ch) = [{i: i}, {n: erg}, {ewy: }, {ch: ch}] = 6

        return {
            "single": out,
            # BUGFIX: guard against ZeroDivisionError on empty datasets
            "avg_ler": total_char_errs / total_chars if total_chars else 0,
            "total_chars": total_chars,
            "total_char_errs": total_char_errs,
            "total_sync_errs": total_sync_errs,
            "confusion": confusion,
        }
Example #12
0
    def apply(self,
              data,
              processes=1,
              progress_bar=False,
              max_tasks_per_child=100):
        """Run the data preprocessor on one array or on a list/tuple of arrays."""
        if isinstance(data, np.ndarray):
            # single array: process directly, no worker pool
            return self._apply_single(data)

        if not isinstance(data, (list, tuple)):
            raise Exception(
                "Unknown instance of data: {}. Supported list and str".format(
                    type(data)))

        if len(data) == 0:
            return []

        return parallel_map(self._apply_single,
                            data,
                            desc="Data Preprocessing",
                            processes=processes,
                            progress_bar=progress_bar,
                            max_tasks_per_child=max_tasks_per_child)