def to_raw_input_dataset(self, processes=1, progress_bar=False, text_only=False) -> RawInputDataset:
    """Preload one epoch of samples and wrap them in a ``RawInputDataset``.

    One pass of ``self.generator`` is consumed with augmentation disabled.
    If the wrapped dataset is in TRAIN mode and ``data_augmentation_amount``
    is positive, the preloaded samples are then passed to
    ``self.data_augmenter.augment_datas``.

    Args:
        processes: forwarded to ``data_augmenter.augment_datas``.
        progress_bar: forwarded to ``tqdm_wrapper`` and the augmenter.
        text_only: forwarded to ``self.generator``.

    Returns:
        A ``RawInputDataset`` built from ``self.mode`` and the preloaded
        (and possibly augmented) datas, texts and params.

    Raises:
        ValueError: if the generator yields no samples (the three-way
            unpacking of an empty ``zip`` fails).
    """
    print("Preloading dataset type {} with size {}".format(self.dataset.mode, len(self)))

    # Temporarily force the generator to emit only non-augmented samples.
    # The previous flag value is restored in ``finally`` so that a failure
    # while preloading cannot leave the flag stuck at True.
    prev = self._generate_only_non_augmented.value
    self._generate_only_non_augmented.value = True
    try:
        # Star-unpacking fully consumes the generator while the flag is set.
        datas, texts, params = zip(*tqdm_wrapper(
            self.generator(epochs=1, text_only=text_only),
            desc="Preloading data",
            total=len(self.dataset),
            progress_bar=progress_bar))
    finally:
        self._generate_only_non_augmented.value = prev

    preloaded_datas, preloaded_texts, preloaded_params = datas, texts, params

    if self.dataset.mode == DataSetMode.TRAIN and self.data_augmentation_amount > 0:
        # An amount >= 1 is taken as an absolute count; a value in (0, 1)
        # is taken as a fraction of the dataset size.
        if self.data_augmentation_amount >= 1:
            abs_n_augs = int(self.data_augmentation_amount)
        else:
            abs_n_augs = int(self.data_augmentation_amount * len(self))

        # NOTE(review): only datas and texts are replaced by the augmenter's
        # output; params stays the pre-augmentation tuple. Confirm that
        # RawInputDataset tolerates this if the augmenter changes the
        # number of samples.
        preloaded_datas, preloaded_texts = self.data_augmenter.augment_datas(
            list(datas), list(texts),
            n_augmentations=abs_n_augs,
            processes=processes,
            progress_bar=progress_bar)

    return RawInputDataset(self.mode, preloaded_datas, preloaded_texts, preloaded_params)
def _compute_current_cer_on_validation_set(self, count):
    """Average the per-step prediction score over ``count`` steps.

    For each of ``count`` items drawn from ``self.data_gen``, the mean of
    the first element returned by ``self.predict_func`` is computed.
    Non-finite step values are discarded before the overall mean is taken.

    Args:
        count: number of items to pull from ``self.data_gen``.

    Returns:
        ``np.mean`` over all finite per-step values.
    """
    def generate_cer():
        batches = iter(self.data_gen)
        for _ in range(count):
            prediction = self.predict_func(next(batches))
            yield np.mean(prediction[0])

    steps = tqdm_wrapper(generate_cer(), total=count,
                         progress_bar=self.progress_bar, desc="Early stopping")

    finite_cers = []
    for step_cer in steps:
        # Skip NaN/inf results so they do not poison the average.
        if np.isfinite(step_cer):
            finite_cers.append(step_cer)

    return np.mean(finite_cers)
def from_input_dataset(input_dataset: List[InputDataset], whitelist=None, progress_bar=False):
    """Build a ``Codec`` from every character found in the given datasets.

    Args:
        input_dataset: datasets to scan; falsy entries are skipped.
        whitelist: optional iterable of characters that are always included.
            ``None`` (the default) means no extra characters. A ``None``
            sentinel is used instead of a mutable ``set()`` default.
        progress_bar: forwarded to ``tqdm_wrapper``.

    Returns:
        A ``Codec`` constructed from the sorted list of collected characters.
    """
    # Copy the whitelist so the caller's collection is never modified.
    chars = set() if whitelist is None else set(whitelist)

    for ds in input_dataset:
        if not ds:
            continue

        for text in tqdm_wrapper(ds.text_generator(), total=len(ds),
                                 desc="Computing codec", progress_bar=progress_bar):
            for c in text:
                chars.add(c)

    return Codec(sorted(chars))
def from_input_dataset(input_dataset: List[InputDataset], whitelist=None, progress_bar=False):
    """Collect the character set of the datasets and wrap it in a ``Codec``.

    Args:
        input_dataset: datasets whose texts are scanned; falsy entries are
            ignored.
        whitelist: optional iterable of characters to include regardless of
            whether they occur in the texts.
        progress_bar: forwarded to ``tqdm_wrapper``.

    Returns:
        A ``Codec`` built from the sorted collected characters.
    """
    alphabet = set(whitelist) if whitelist is not None else set()

    for dataset in input_dataset:
        if dataset:
            texts = tqdm_wrapper(dataset.text_generator(), total=len(dataset),
                                 desc="Computing codec", progress_bar=progress_bar)
            for line in texts:
                # Adds every character of the text in one call.
                alphabet.update(line)

    return Codec(sorted(alphabet))