Example 1
    def load_dataset(self, split, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        manifest = os.path.join(self.args.data, "{}.tsv".format(split))
        self.datasets[split] = FileAudioDataset(
            manifest,
            sample_rate=self.args.sample_rate,
            max_sample_size=self.args.max_sample_size,
            min_sample_size=self.args.max_sample_size,
            min_length=self.args.min_sample_size,
            pad=self.args.labels is not None or self.args.enable_padding,
            normalize=self.args.normalize,
        )
        if self.args.labels:
            label_path = os.path.join(self.args.data, f"{split}.{self.args.labels}")
            labels = []
            with open(label_path, "r") as f:
                for line in f:
                    labels.append(line)

            process_label = LabelEncoder(self.target_dictionary)

            self.datasets[split] = AddTargetDataset(
                self.datasets[split],
                labels,
                pad=self.target_dictionary.pad(),
                eos=self.target_dictionary.eos(),
                batch_targets=True,
                process_label=process_label,
                add_to_input=not self.is_ctc,
            )
        # debug: show the (possibly label-wrapped) dataset for this split
        print('|*****dataset|', self.datasets[split])
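Every variant in this section wraps a fairseq Dictionary in a LabelEncoder callable. In upstream fairseq this is a thin wrapper over Dictionary.encode_line; a minimal sketch consistent with that API (treat it as a reference sketch rather than the verbatim class from each fork):

class LabelEncoder(object):
    """Callable mapping one raw label line to a tensor of token ids."""

    def __init__(self, dictionary):
        self.dictionary = dictionary

    def __call__(self, label):
        # No eos is appended here; AddTargetDataset decides how to add
        # eos/bos when it assembles batched targets.
        return self.dictionary.encode_line(
            label, append_eos=False, add_if_not_exist=False)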
Example 2
    def load_dataset(self, split, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        manifest = os.path.join(self.args.data, "{}.tsv".format(split))
        self.datasets[split] = FileAudioDataset(
            manifest,
            sample_rate=self.args.sample_rate,
            max_sample_size=self.args.max_sample_size,
            min_sample_size=self.args.max_sample_size,
            min_length=self.args.min_sample_size,
            pad=True,
            normalize=self.args.normalize,
        )

        label_path = os.path.join(self.args.data,
                                  f"{split}.{self.args.labels}")
        with open(label_path, "r") as f:
            labels = [
                line for i, line in enumerate(f)
                if i in self.datasets[split].line_inds
            ]
        process_label = LabelEncoder(self.dictionary)

        self.datasets[split] = AddTargetDataset(
            self.datasets[split],
            labels,
            pad=self.dictionary.pad(),
            bos=None,
            eos=None,
            batch_targets=True,
            process_label=process_label,
        )
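The filter over line_inds in this variant is what keeps audio and labels aligned: FileAudioDataset silently drops manifest rows outside the configured length bounds, so the label file must be reduced to the same surviving row indices. A toy illustration (file name and indices hypothetical):

# Suppose the dataset kept manifest rows 0, 2 and 4 after length filtering.
line_inds = {0, 2, 4}
with open("train.ltr", "r") as f:
    labels = [line for i, line in enumerate(f) if i in line_inds]
# len(labels) now equals len(dataset), and labels[k] still matches clip k.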
Example 3
    def load_dataset(self, split, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        manifest = os.path.join(self.args.data, "{}.tsv".format(split))
        self.datasets[split] = FileAudioDataset(
            manifest,
            sample_rate=self.args.sample_rate,
            max_sample_size=self.args.max_sample_size,
            min_sample_size=self.args.max_sample_size,
            min_length=self.args.min_sample_size,
            pad=True,
            normalize=self.args.normalize,
        )

        label_path = os.path.join(self.args.data,
                                  f"{split}.{self.args.labels}")
        labels = self.load_labels(label_path)
        process_label = LabelEncoder(self.dictionary)

        self.datasets[split] = AddTargetDataset(self.datasets[split],
                                                labels,
                                                bos=self.dictionary.eos(),
                                                pad=self.dictionary.pad(),
                                                eos=None,
                                                batch_targets=True,
                                                process_label=process_label)
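Example 3 calls a self.load_labels helper that is not shown. A hypothetical minimal implementation, mirroring how the other examples read label files line by line:

    def load_labels(self, label_path):
        # One raw label line per manifest row; encoding is deferred to
        # process_label inside AddTargetDataset.
        with open(label_path, "r") as f:
            return [line for line in f]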
Example 4
    def load_dataset(self,
                     split: str,
                     task_cfg: FairseqDataclass = None,
                     **kwargs):
        data_path = self.cfg.data
        task_cfg = task_cfg or self.cfg

        # upgrade old task
        if isinstance(task_cfg, Namespace):
            if not hasattr(task_cfg, "autoregressive"):
                task_cfg.autoregressive = task_cfg.criterion != 'ctc'

        manifest = os.path.join(data_path, "{}.tsv".format(split))
        self.datasets[split] = FileAudioDataset(
            manifest,
            sample_rate=task_cfg.get('sample_rate', self.cfg.sample_rate),
            max_sample_size=self.cfg.max_sample_size,
            min_sample_size=self.cfg.min_sample_size,
            pad=task_cfg.labels is not None or task_cfg.enable_padding,
            normalize=task_cfg.normalize,
            num_buckets=self.cfg.num_batch_buckets or int(self.cfg.tpu),
            compute_mask_indices=(self.cfg.precompute_mask_indices
                                  or self.cfg.tpu),
            **self._get_mask_precompute_kwargs(task_cfg),
        )

        if self.cfg.tpu and task_cfg['mask_channel_prob'] == 0.0:
            logger.info(
                "Pretraining on TPUs may suffer convergence "
                "issues when training with `mask_channel_prob` value of "
                "0. You may want to set this to a low value close to 0.")

        if task_cfg.labels:
            label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}")
            with open(label_path, "r") as f:
                labels = [
                    line for i, line in enumerate(f)
                    if i in self.datasets[split].line_inds
                ]

            assert len(labels) == len(self.datasets[split]), (
                f"labels length ({len(labels)}) and dataset length "
                f"({len(self.datasets[split])}) do not match")

            process_label = LabelEncoder(self.target_dictionary)

            self.datasets[split] = AddTargetDataset(
                self.datasets[split],
                labels,
                pad=self.target_dictionary.pad(),
                eos=self.target_dictionary.eos(),
                batch_targets=True,
                process_label=process_label,
                add_to_input=task_cfg.get('autoregressive', False),
            )
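_get_mask_precompute_kwargs is referenced but not shown. Its contract: return extra keyword arguments (masking hyperparameters) for the dataset only when mask indices are precomputed in the data loader, and nothing otherwise. A hypothetical sketch; the parameter names below are assumptions, not the verbatim source:

    def _get_mask_precompute_kwargs(self, cfg):
        # Hypothetical: forward masking hyperparameters to the dataset only
        # when it is asked to precompute mask indices (e.g., TPU training).
        if not (self.cfg.precompute_mask_indices or self.cfg.tpu):
            return {}
        return {
            "mask_prob": cfg.mask_prob,
            "mask_length": cfg.mask_length,
            "mask_channel_prob": cfg.mask_channel_prob,
            "mask_channel_length": cfg.mask_channel_length,
        }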
Example 5
    def load_dataset(self,
                     split: str,
                     task_cfg: FairseqDataclass = None,
                     **kwargs):
        data_path = self.cfg.data
        task_cfg = task_cfg or self.cfg

        # upgrade old task
        if isinstance(task_cfg, Namespace):
            if not hasattr(task_cfg, "autoregressive"):
                task_cfg.autoregressive = task_cfg.criterion != 'ctc'

        manifest = os.path.join(data_path, "{}.tsv".format(split))
        self.datasets[split] = FileAudioDataset(
            manifest,
            sample_rate=task_cfg.sample_rate,
            max_sample_size=self.cfg.max_sample_size,
            min_sample_size=self.cfg.max_sample_size,
            min_length=self.cfg.min_sample_size,
            pad=task_cfg.labels is not None or task_cfg.enable_padding,
            normalize=task_cfg.normalize,
            num_buckets=self.cfg.num_batch_buckets or int(self.cfg.tpu),
            compute_mask_indices=(self.cfg.precompute_mask_indices
                                  or self.cfg.tpu),
            **self._get_mask_precompute_kwargs(task_cfg),
        )

        if task_cfg.labels:
            label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}")
            with open(label_path, "r") as f:
                labels = [
                    line for i, line in enumerate(f)
                    if i in self.datasets[split].line_inds
                ]

            assert len(labels) == len(self.datasets[split]), (
                f"labels length ({len(labels)}) and dataset length "
                f"({len(self.datasets[split])}) do not match")

            process_label = LabelEncoder(self.target_dictionary)

            self.datasets[split] = AddTargetDataset(
                self.datasets[split],
                labels,
                pad=self.target_dictionary.pad(),
                eos=self.target_dictionary.eos(),
                batch_targets=True,
                process_label=process_label,
                add_to_input=task_cfg.autoregressive,
            )
Example 6
    def load_dataset(self,
                     split: str,
                     task_cfg: FairseqDataclass = None,
                     **kwargs):
        data_path = self.cfg.data
        task_cfg = task_cfg or self.cfg

        # upgrade old task
        if isinstance(task_cfg, Namespace):
            if not hasattr(task_cfg, "autoregressive"):
                task_cfg.autoregressive = task_cfg.criterion != 'ctc'

        manifest = os.path.join(data_path, "{}.tsv".format(split))
        self.datasets[split] = FileAudioDataset(
            manifest,
            sample_rate=task_cfg.get('sample_rate', self.cfg.sample_rate),
            max_sample_size=self.cfg.max_sample_size,
            min_sample_size=self.cfg.min_sample_size,
            pad=task_cfg.labels is not None or task_cfg.enable_padding,
            normalize=task_cfg.normalize,
        )

        if task_cfg.labels:
            label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}")
            skipped_indices = getattr(self.datasets[split], 'skipped_indices',
                                      set())
            with open(label_path, "r") as f:
                # strip letter-level spacing, lowercase, and map the '|' word
                # separator back to spaces, then BPE-encode each label line
                labels = [
                    self.encode(
                        ''.join(line.split()).lower().replace('|', ' '))
                    for i, line in enumerate(f) if i not in skipped_indices
                ]

            assert len(labels) == len(self.datasets[split]), (
                f"labels length ({len(labels)}) and dataset length "
                f"({len(self.datasets[split])}) do not match")

            process_label = LabelEncoder(self.target_dictionary)

            self.datasets[split] = AddTargetDataset(
                self.datasets[split],
                labels,
                pad=self.target_dictionary.pad(),
                eos=self.target_dictionary.eos(),
                batch_targets=True,
                process_label=process_label,
                add_to_input=task_cfg.get('autoregressive', False),
            )
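The chained normalization inside that comprehension is easy to misread; here is what it does to one letter-format label line:

line = "H E L L O | W O R L D |\n"
text = ''.join(line.split()).lower().replace('|', ' ')
# ''.join(line.split()) removes every whitespace char -> "HELLO|WORLD|"
# .lower()                                            -> "hello|world|"
# .replace('|', ' ')                                  -> "hello world "
# the result is then passed to self.encode for BPE encoding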
Example 7
    def load_dataset(self,
                     split: str,
                     task_cfg: FairseqDataclass = None,
                     **kwargs):
        task_cfg = task_cfg or self.cfg
        if split == 'train':
            data_dir = os.path.join(self.cfg.data, self.cfg.train_data)
        elif split == 'valid':
            data_dir = os.path.join(self.cfg.data, self.cfg.dev_data)
        else:
            data_dir = os.path.join(self.cfg.data, split)
        manifest = os.path.join(data_dir, 'cmvn_by_len_2.scp')

        ali_labels = os.path.join(
            self.cfg.data,
            '{}_ali'.format(split)) if self.cfg.ali_labels else None

        self.datasets[split] = KaldiFileDataset(
            manifest,
            task_cfg.enable_padding,
            task_cfg.max_sample_size,
            task_cfg.min_sample_size,
            task_cfg.low_frame_rate,
            ali_labels,
        )

        if task_cfg.labels:
            label_path = os.path.join(data_dir, task_cfg.labels)
            with open(label_path, "r") as f:
                labels = [
                    line for i, line in enumerate(f)
                    if i in self.datasets[split].line_inds
                ]

            assert len(labels) == len(self.datasets[split]), (
                f"labels length ({len(labels)}) and dataset length "
                f"({len(self.datasets[split])}) do not match")

            process_label = LabelEncoder(self.target_dictionary)

            self.datasets[split] = AddTargetDataset(
                self.datasets[split],
                labels,
                pad=self.target_dictionary.pad(),
                eos=self.target_dictionary.eos(),
                batch_targets=True,
                process_label=process_label)
Example 8
    def load_dataset(self, split, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        if split == 'train':
            path = 'data/train_960_splited'
        else:
            path = 'data/dev_clean_splited'
        manifest = os.path.join(path, "cmvn_by_len_2.scp")
        self.datasets[split] = KaldiFileDataset(
            manifest,
            sample_rate=self.args.sample_rate,
            max_sample_size=self.args.max_sample_size,
            min_sample_size=(self.args.max_sample_size
                             if self.args.no_min_cropping
                             else self.args.min_sample_size),
            min_length=self.args.min_sample_size,
            pad=self.args.labels is not None or self.args.enable_padding,
            normalize=self.args.normalize,
        )

        if self.args.labels:
            dict_path = os.path.join(self.args.data,
                                     f"dict.{self.args.labels}.txt")
            self._target_dictionary = Dictionary.load(dict_path)
            label_path = os.path.join(self.args.data,
                                      f"{split}.{self.args.labels}")
            labels = []
            with open(label_path, "r") as f:
                for line in f:
                    labels.append(line)

            process_label = LabelEncoder(self.target_dictionary)

            self.datasets[split] = AddTargetDataset(
                self.datasets[split],
                labels,
                pad=self.target_dictionary.pad(),
                eos=self.target_dictionary.eos(),
                batch_targets=True,
                process_label=process_label,
                add_to_input=not self.is_ctc,
            )
Example 9
    def load_dataset(self,
                     split: str,
                     task_cfg: FairseqDataclass = None,
                     **kwargs):
        data_path = self.cfg.data
        task_cfg = task_cfg or self.cfg

        # upgrade old task
        if isinstance(task_cfg, Namespace):
            if not hasattr(task_cfg, "autoregressive"):
                task_cfg.autoregressive = task_cfg.criterion != 'ctc'

        manifest = os.path.join(data_path, "{}.tsv".format(split))
        self.datasets[split] = FileAudioDataset(
            manifest,
            sample_rate=task_cfg.sample_rate,
            max_sample_size=self.cfg.max_sample_size,
            min_sample_size=self.cfg.max_sample_size,
            min_length=self.cfg.min_sample_size,
            pad=task_cfg.labels is not None or task_cfg.enable_padding,
            normalize=task_cfg.normalize,
        )

        if task_cfg.labels:
            label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}")
            labels = []
            with open(label_path, "r") as f:
                for line in f:
                    labels.append(line)

            process_label = LabelEncoder(self.target_dictionary)

            self.datasets[split] = AddTargetDataset(
                self.datasets[split],
                labels,
                pad=self.target_dictionary.pad(),
                eos=self.target_dictionary.eos(),
                batch_targets=True,
                process_label=process_label,
                add_to_input=task_cfg.autoregressive,
            )
Example 10
    def load_dataset(self, split, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        self.datasets[split] = FileHandwritingDataset(
            self.args.data,
            split=split,
            max_sample_size=self.args.max_sample_size,
            min_sample_size=self.args.max_sample_size,
            pad_to_multiples_of=self.args.pad_to_multiples_of,
            min_length=self.args.min_sample_size,
            pad=self.args.labels is not None or self.args.enable_padding,
            normalize=self.args.normalize,
        )

        if self.args.labels:
            assert False  # TODO(JCh): we must load labels from scribblelens.
            dict_path = os.path.join(self.args.data,
                                     f"dict.{self.args.labels}.txt")
            self._target_dictionary = Dictionary.load(dict_path)
            label_path = os.path.join(self.args.data,
                                      f"{split}.{self.args.labels}")
            labels = []
            with open(label_path, "r") as f:
                for line in f:
                    labels.append(line)

            process_label = LabelEncoder(self.target_dictionary)

            self.datasets[split] = AddTargetDataset(
                self.datasets[split],
                labels,
                pad=self.target_dictionary.pad(),
                eos=self.target_dictionary.eos(),
                batch_targets=True,
                process_label=process_label,
                add_to_input=not self.is_ctc,
            )
Example 11
    def load_dataset(
        self, split: str, task_cfg: AudioFinetuningConfig = None, **kwargs
    ):
        super().load_dataset(split, task_cfg, **kwargs)

        task_cfg = task_cfg or self.cfg
        assert task_cfg.labels is not None
        text_compression_level = getattr(
            TextCompressionLevel, str(self.cfg.text_compression_level)
        )
        data_path = self.cfg.data
        label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}")
        skipped_indices = getattr(self.datasets[split], "skipped_indices", set())
        text_compressor = TextCompressor(level=text_compression_level)
        with open(label_path, "r") as f:
            labels = [
                text_compressor.compress(l)
                for i, l in enumerate(f)
                if i not in skipped_indices
            ]

        assert len(labels) == len(self.datasets[split]), (
            f"labels length ({len(labels)}) and dataset length "
            f"({len(self.datasets[split])}) do not match"
        )

        process_label = LabelEncoder(self.target_dictionary)

        self.datasets[split] = AddTargetDataset(
            self.datasets[split],
            labels,
            pad=self.target_dictionary.pad(),
            eos=self.target_dictionary.eos(),
            batch_targets=True,
            process_label=process_label,
            label_len_fn=label_len_fn,
            add_to_input=task_cfg.get("autoregressive", False),
            text_compression_level=text_compression_level,
        )
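label_len_fn is passed in but defined elsewhere; in upstream fairseq's audio finetuning task it counts whitespace-separated tokens, along these lines:

def label_len_fn(label):
    # Length of the target in tokens, used when batching by target size.
    return len(label.split(" "))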
Example 12
    def load_dataset(self,
                     split: str,
                     task_cfg: FairseqDataclass = None,
                     **kwargs):
        data_path_parent = self.cfg.data
        task_cfg = task_cfg or self.cfg
        data_path_list = [
            os.path.join(data_path_parent, path)
            for path in os.listdir(data_path_parent)
        ]

        # upgrade old task
        if isinstance(task_cfg, Namespace):
            if not hasattr(task_cfg, "autoregressive"):
                task_cfg.autoregressive = task_cfg.criterion != "ctc"

        dataset_map = OrderedDict()
        datasets_lengths = []
        for data_path in data_path_list:
            if getattr(task_cfg, "binarized_dataset", False):
                dataset_map[data_path] = BinarizedAudioDataset(
                    data_path,
                    split=split,
                    sample_rate=task_cfg.get("sample_rate",
                                             self.cfg.sample_rate),
                    max_sample_size=self.cfg.max_sample_size,
                    min_sample_size=self.cfg.min_sample_size,
                    pad=task_cfg.labels is not None or task_cfg.enable_padding,
                    normalize=task_cfg.normalize,
                    num_buckets=self.cfg.num_batch_buckets
                    or int(self.cfg.tpu),
                    compute_mask_indices=(self.cfg.precompute_mask_indices
                                          or self.cfg.tpu),
                    **self._get_mask_precompute_kwargs(task_cfg),
                )
            else:
                manifest_path = os.path.join(data_path, "{}.tsv".format(split))

                dataset_map[data_path] = FileAudioDataset(
                    manifest_path=manifest_path,
                    sample_rate=task_cfg.get("sample_rate",
                                             self.cfg.sample_rate),
                    max_sample_size=self.cfg.max_sample_size,
                    min_sample_size=self.cfg.min_sample_size,
                    pad=task_cfg.labels is not None or task_cfg.enable_padding,
                    normalize=task_cfg.normalize,
                    num_buckets=self.cfg.num_batch_buckets
                    or int(self.cfg.tpu),
                    compute_mask_indices=(self.cfg.precompute_mask_indices
                                          or self.cfg.tpu),
                    **self._get_mask_precompute_kwargs(task_cfg),
                )

            if self.cfg.tpu and task_cfg["mask_channel_prob"] == 0.0:
                logger.info(
                    "Pretraining on TPUs may suffer convergence "
                    "issues when training with `mask_channel_prob` value of "
                    "0. You may want to set this to a low value close to 0.")

            if task_cfg.labels:
                label_path = os.path.join(data_path,
                                          f"{split}.{task_cfg.labels}")
                if os.path.exists(label_path):
                    skipped_indices = getattr(dataset_map[data_path],
                                              "skipped_indices", set())

                    with open(label_path, "r") as f:
                        labels = [
                            line for i, line in enumerate(f)
                            if i not in skipped_indices
                        ]

                    assert len(labels) == len(dataset_map[data_path]), (
                        f"labels length ({len(labels)}) and dataset length "
                        f"({len(dataset_map[data_path])}) do not match")

                    process_label = LabelEncoder(self.target_dictionary)

                    dataset_map[data_path] = AddTargetDataset(
                        dataset_map[data_path],
                        labels,
                        pad=self.target_dictionary.pad(),
                        eos=self.target_dictionary.eos(),
                        batch_targets=True,
                        process_label=process_label,
                        add_to_input=task_cfg.get("autoregressive", False),
                    )

            # total samples -> seconds -> hours of audio in this corpus
            datasets_lengths.append(
                sum(dataset_map[data_path].sizes) / task_cfg.sample_rate / 3600)

        datasets_lengths = np.array(datasets_lengths)
        self.sample_probs = self._get_sample_prob(datasets_lengths)
        size_ratio = (self.sample_probs *
                      datasets_lengths.sum()) / datasets_lengths
        for i, data_path in enumerate(data_path_list):
            logger.info(
                "Up/Down sampling ratio by dataset: {}: {:.2f} to prob: {:.2f}".format(
                    data_path.split('/')[-1], size_ratio[i], self.sample_probs[i]))

        self.datasets[split] = MultiCorpusSampledDataset(
            dataset_map, sampling_func=self.dataset_sampler)
        logger.info('{} {} examples'.format(split, len(self.datasets[split])))
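_get_sample_prob and dataset_sampler are defined elsewhere in this task. A common temperature-based scheme looks like the sketch below; sampling_alpha and the sampler signature are assumptions, not the verbatim source:

import numpy as np

def _get_sample_prob(self, dataset_lens):
    # Temperature smoothing over corpus sizes (in hours): alpha = 1 keeps
    # the natural proportions, alpha < 1 upsamples the smaller corpora.
    prob = dataset_lens / dataset_lens.sum()
    smoothed = prob ** self.cfg.sampling_alpha
    return smoothed / smoothed.sum()

def dataset_sampler(self, keys):
    # Choose which corpus the next example is drawn from.
    return np.random.choice(len(keys), p=self.sample_probs)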