def load_dictionary(cls, filename): """Load the dictionary from the filename Args: filename (str): the filename """ return Dictionary.load(filename)
def setup_dictionary(cls, args, **kwargs): dictionary = None output_dictionary = None if args.data: paths = utils.split_paths(args.data) assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) output_dictionary = dictionary if args.output_dictionary_size >= 0: output_dictionary = TruncatedDictionary( dictionary, args.output_dictionary_size) return (dictionary, output_dictionary)
def setup_task(cls, args, **kwargs): """Setup the task (e.g., load dictionaries).""" dict_path = os.path.join(args.data, "dict.txt") if not os.path.isfile(dict_path): raise FileNotFoundError("Dict not found: {}".format(dict_path)) tgt_dict = Dictionary.load(dict_path) if args.criterion == "ctc_loss": tgt_dict.add_symbol("<ctc_blank>") elif args.criterion == "asg_loss": for i in range(1, args.max_replabel + 1): tgt_dict.add_symbol(replabel_symbol(i)) print("| dictionary: {} types".format(len(tgt_dict))) return cls(args, tgt_dict)
def load_dataset(self, split, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ manifest = os.path.join(self.args.data, "{}.tsv".format(split)) self.datasets[split] = FileAudioDataset( manifest, sample_rate=self.args.sample_rate, max_sample_size=self.args.max_sample_size, min_sample_size=self.args.max_sample_size, min_length=self.args.min_sample_size, pad=self.args.labels is not None or self.args.enable_padding, normalize=self.args.normalize, ) dict_path = os.path.join(self.args.data, f"dict.{self.args.labels}.txt") self._target_dictionary = Dictionary.load(dict_path) label_path = os.path.join(self.args.data, f"{split}.{self.args.labels}") labels = [] with open(label_path, "r") as f: for line in f: labels.append(line) process_label = LabelEncoder(self.target_dictionary) self.datasets[split] = AddTargetDataset( self.datasets[split], labels, bos=self.target_dictionary.bos(), pad=self.target_dictionary.pad(), eos=None, batch_targets=True, process_label=process_label, add_to_input=False, )