def preload_gt(self, gt_dataset, progress_bar=False):
    """Preload ground truth to be reused across several experiments

    Use this method to specify ground truth data once and evaluate many
    different predictions against it without re-reading the data each time.

    Parameters
    ----------
    gt_dataset : Dataset
        the ground truth
    progress_bar : bool, optional
        show a progress bar
    """
    with StreamingInputDataset(gt_dataset, None, self.text_preprocessor, processes=1) as streamed_gt:
        # only the text component of each sample is needed, hence text_only=True
        sample_iter = tqdm_wrapper(
            streamed_gt.generator(text_only=True),
            total=len(gt_dataset),
            progress_bar=progress_bar,
            desc="Loading GT",
        )
        self.preloaded_gt = [text for _, text, _ in sample_iter]
def run(self, _sentinel=None, gt_dataset=None, pred_dataset=None, processes=1, progress_bar=False):
    """ evaluate on the given dataset

    Parameters
    ----------
    _sentinel : do not use
        Forcing the use of `gt_dataset` and `pred_dataset` fore safety
    gt_dataset : Dataset, optional
        the ground truth; ignored if ground truth was preloaded via `preload_gt`
    pred_dataset : Dataset
        the prediction dataset
    processes : int, optional
        the processes to use for preprocesing and evaluation
    progress_bar : bool, optional
        show a progress bar

    Returns
    -------
        evaluation dictionary
    """
    if _sentinel:
        raise Exception("You must call run by using parameter names.")

    # prefer the preloaded ground truth (see preload_gt) over reading gt_dataset
    if self.preloaded_gt:
        gt_data = self.preloaded_gt
    else:
        gt_data = self._load_texts(gt_dataset, processes, progress_bar)

    pred_data = self._load_texts(pred_dataset, processes, progress_bar)

    return self.evaluate(gt_data=gt_data, pred_data=pred_data, processes=processes,
                         progress_bar=progress_bar, skip_empty_gt=self.skip_empty_gt)

def _load_texts(self, dataset, processes, progress_bar):
    """Run the text preprocessor over `dataset` and collect all text samples."""
    input_dataset = InputDataset(dataset, None, self.text_preprocessor, processes=processes)
    return [
        txt for _, txt, _ in tqdm_wrapper(
            input_dataset.generator(text_only=True),
            total=len(dataset),
            progress_bar=progress_bar,
        )
    ]
def __init__(
        self,
        n_folds,
        dataset,
        output_dir,
        progress_bar=True,
):
    """ Prepare cross fold training

    This class creates folds out of the given source files.
    The individual splits are the optionally written to the `output_dir`
    in a json format.

    The file with index i will be assigned to fold i % n_folds (not randomly!)

    Parameters
    ----------
    n_folds : int
        the number of folds to create
    dataset : Dataset
        dataset containing all files
    output_dir : str
        where to store the folds
    progress_bar : bool, optional
        show a progress bar while writing the fold files
    """
    self.n_folds = n_folds
    self.dataset = dataset
    self.output_dir = os.path.abspath(output_dir)
    # exist_ok avoids the race between a separate exists() check and makedirs();
    # create the normalized (abspath) directory that is stored on self
    os.makedirs(self.output_dir, exist_ok=True)

    if len(self.dataset) == 0:
        raise Exception("Empty dataset")

    if self.n_folds <= 1:
        raise Exception("At least two folds are required")

    # fill single fold files

    # if a FileDataSet, we can just use the paths of the images
    if isinstance(self.dataset, FileDataSet):
        self.dataset_type = DataSetType.FILE
        self.folds = [[] for _ in range(self.n_folds)]
        for i, sample in enumerate(self.dataset.samples()):
            self.folds[i % n_folds].append(sample['image_path'])
    else:
        self.dataset_type = DataSetType.HDF5
        # else load the data of each fold and write it to hd5 data files
        input_dataset = InputDataset(self.dataset, NoopDataPreprocessor(), NoopTextProcessor(), processes=1)
        with ExitStack() as stack:
            folds = [
                stack.enter_context(
                    Hdf5DatasetWriter(
                        os.path.join(self.output_dir, 'fold{}'.format(i))))
                for i in range(self.n_folds)
            ]
            # distribute sample i to fold i % n_folds (deterministic, not random)
            for i, (data, text, _) in tqdm_wrapper(
                    enumerate(input_dataset.generator(epochs=1)),
                    progress_bar=progress_bar,
                    total=len(dataset),
                    desc="Creating hdf5 files"):
                folds[i % self.n_folds].write(data, text)

            self.folds = [f.files for f in folds]