Example #1
0
    def preload_gt(self, gt_dataset, progress_bar=False):
        """ Preload gt to be used for several experiments

        Use this method to specify ground truth data to be tested versus many predictions

        Parameters
        ----------
        gt_dataset : Dataset
            the ground truth
        progress_bar : bool, optional
            show a progress bar

        """
        # Stream the ground truth through the text preprocessor once and keep
        # only the preprocessed text of each sample for later comparisons.
        with StreamingInputDataset(gt_dataset,
                                   None,
                                   self.text_preprocessor,
                                   processes=1) as streaming_gt:
            sample_iter = tqdm_wrapper(
                streaming_gt.generator(text_only=True),
                total=len(gt_dataset),
                progress_bar=progress_bar,
                desc="Loading GT",
            )
            texts = []
            for _, text, _ in sample_iter:
                texts.append(text)
            self.preloaded_gt = texts
Example #2
0
    def run(self,
            _sentinel=None,
            gt_dataset=None,
            pred_dataset=None,
            processes=1,
            progress_bar=False):
        """ evaluate on the given dataset

        Parameters
        ----------
        _sentinel : do not use
            Forcing the use of `gt_dataset` and `pred_dataset` for safety
        gt_dataset : Dataset, optional
            the ground truth (ignored if ground truth was preloaded via `preload_gt`)
        pred_dataset : Dataset
            the prediction dataset
        processes : int, optional
            the processes to use for preprocessing and evaluation
        progress_bar : bool, optional
            show a progress bar

        Returns
        -------
        evaluation dictionary
        """
        # Any positional argument lands in `_sentinel`; compare by identity so
        # that falsy-but-valid objects (e.g. an empty dataset) are also caught.
        if _sentinel is not None:
            raise Exception("You must call run by using parameter names.")

        if self.preloaded_gt:
            gt_data = self.preloaded_gt
        else:
            gt_data = self._load_texts(gt_dataset, processes, progress_bar,
                                       desc="Loading GT")

        pred_data = self._load_texts(pred_dataset, processes, progress_bar,
                                     desc="Loading Prediction")

        return self.evaluate(gt_data=gt_data,
                             pred_data=pred_data,
                             processes=processes,
                             progress_bar=progress_bar,
                             skip_empty_gt=self.skip_empty_gt)

    def _load_texts(self, dataset, processes, progress_bar, desc):
        """ Run `dataset` through the text preprocessor and return its texts.

        Shared loading path for ground truth and prediction datasets.
        """
        input_dataset = InputDataset(dataset,
                                     None,
                                     self.text_preprocessor,
                                     processes=processes)
        return [
            txt for _, txt, _ in tqdm_wrapper(
                input_dataset.generator(text_only=True),
                total=len(dataset),
                progress_bar=progress_bar,
                desc=desc,
            )
        ]
Example #3
0
    def __init__(
        self,
        n_folds,
        dataset,
        output_dir,
        progress_bar=True,
    ):
        """ Prepare cross fold training

        This class creates folds out of the given source files.
        The individual splits are the optionally written to the `output_dir` in a json format.

        The file with index i will be assigned to fold i % n_folds (not randomly!)

        Parameters
        ----------
        n_folds : int
            the number of folds to create
        dataset : Dataset
            dataset containing all files
        output_dir : str
            where to store the folds
        progress_bar : bool, optional
            show a progress bar while writing hdf5 fold files
        """
        self.n_folds = n_folds
        self.dataset = dataset
        self.output_dir = os.path.abspath(output_dir)

        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(self.output_dir, exist_ok=True)

        if len(self.dataset) == 0:
            raise Exception("Empty dataset")

        if self.n_folds <= 1:
            raise Exception("At least two folds are required")

        # fill single fold files

        # if a FileDataSet, we can just use the paths of the images
        if isinstance(self.dataset, FileDataSet):
            self.dataset_type = DataSetType.FILE
            self.folds = [[] for _ in range(self.n_folds)]
            # round-robin assignment: sample i goes to fold i % n_folds
            for i, sample in enumerate(self.dataset.samples()):
                self.folds[i % n_folds].append(sample['image_path'])
        else:
            self.dataset_type = DataSetType.HDF5
            # else load the data of each fold and write it to hd5 data files
            input_dataset = InputDataset(self.dataset,
                                         NoopDataPreprocessor(),
                                         NoopTextProcessor(),
                                         processes=1)
            # ExitStack keeps one Hdf5DatasetWriter open per fold and closes
            # them all when the block exits
            with ExitStack() as stack:
                folds = [
                    stack.enter_context(
                        Hdf5DatasetWriter(
                            os.path.join(self.output_dir, 'fold{}'.format(i))))
                    for i in range(self.n_folds)
                ]

                for i, (data, text, _) in tqdm_wrapper(
                        enumerate(input_dataset.generator(epochs=1)),
                        progress_bar=progress_bar,
                        total=len(dataset),
                        desc="Creating hdf5 files"):
                    folds[i % self.n_folds].write(data, text)

                self.folds = [f.files for f in folds]