Example no. 1
    def __init__(
        self,
        utt_ids: List[str],
        rxfiles: List[str],
        utt2num_frames: Optional[List[int]] = None,
        feat_dim: Optional[int] = None,  # only relevant when reading from raw waveforms
        feature_type: Optional[str] = None,  # currently supports fbank or mfcc; only relevant when reading from raw waveforms
        seed=1,
        feature_transforms_config: Optional[Dict[str, Any]] = None,
    ):
        super().__init__()
        assert len(utt_ids) == len(rxfiles)
        self.dtype = np.float64  # np.float was an alias for float64 and is removed in NumPy >= 1.24
        self.utt_ids = utt_ids
        self.rxfiles = rxfiles
        self.size = len(utt_ids)  # number of utterances
        self.sizes = []  # length of each utterance in terms of the number of frames
        if utt2num_frames is not None and len(utt2num_frames) > 0:
            assert len(utt2num_frames) == self.size
            self.sizes = utt2num_frames

        first_rxfile = rxfiles[0]
        if re.search(r"\.ark:\d+$", first_rxfile.strip()) is not None:  # from feats.scp
            self.input_format = "feat"
            self.feat_dim = kaldi_io.read_mat(first_rxfile).shape[1]  # feature dimension
        else:
            self.input_format = ("command" if re.search(
                r"\|$", first_rxfile.strip()) is not None else "wave")
            self.feat_dim = feat_dim
            self.feature_type = feature_type
            assert self.feat_dim is not None
            assert self.feature_type in ["fbank", "mfcc"]

        if len(self.sizes) == 0:
            logger.info("Computing number of frames from audios...")
            with ThreadPoolExecutor(max_workers=32) as ex:
                futures = []
                for rxfile in self.rxfiles:
                    futures.append(
                        ex.submit(compute_num_frames_from_feat_or_waveform, rxfile)
                    )

                for future in tqdm(futures, desc="Processing", leave=False):
                    result = future.result()
                    self.sizes.append(result)

        assert len(self.sizes) == self.size
        self.sizes = np.array(self.sizes, dtype=np.int32)
        self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict(
            config=feature_transforms_config)
        self.seed = seed
        self.epoch = 1
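
A minimal usage sketch for the constructor above, assuming it belongs to a Kaldi-feature dataset class (named KaldiFeatureDataset here purely for illustration) and that a standard feats.scp file with "<utt_id> <ark>:<offset>" lines is available; neither the class name nor the path is confirmed by the snippet.

# Hypothetical usage sketch; "KaldiFeatureDataset" stands in for the class that
# owns the __init__ above, and the scp path is a placeholder.
utt_ids, rxfiles = [], []
with open("data/train/feats.scp") as f:
    for line in f:
        utt_id, rxfile = line.rstrip("\n").split(None, 1)  # "<utt_id> <ark>:<offset>"
        utt_ids.append(utt_id)
        rxfiles.append(rxfile)

dataset = KaldiFeatureDataset(
    utt_ids,
    rxfiles,
    utt2num_frames=None,             # frame counts are computed in the constructor
    feature_transforms_config=None,  # no feature transforms in this sketch
)
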
Example no. 2
    def __init__(
        self,
        split: str,
        is_train_split: bool,
        cfg: S2TDataConfig,
        audio_paths: List[str],
        n_frames: List[int],
        src_texts: Optional[List[str]] = None,
        tgt_texts: Optional[List[str]] = None,
        speakers: Optional[List[str]] = None,
        src_langs: Optional[List[str]] = None,
        tgt_langs: Optional[List[str]] = None,
        ids: Optional[List[str]] = None,
        tgt_dict: Optional[Dictionary] = None,
        pre_tokenizer=None,
        bpe_tokenizer=None,
        n_frames_per_step=1,
        speaker_to_id=None,
        append_eos=True,
    ):
        self.split, self.is_train_split = split, is_train_split
        self.cfg = cfg
        self.audio_paths, self.n_frames = audio_paths, n_frames
        self.n_samples = len(audio_paths)
        assert len(n_frames) == self.n_samples > 0
        assert src_texts is None or len(src_texts) == self.n_samples
        assert tgt_texts is None or len(tgt_texts) == self.n_samples
        assert speakers is None or len(speakers) == self.n_samples
        assert src_langs is None or len(src_langs) == self.n_samples
        assert tgt_langs is None or len(tgt_langs) == self.n_samples
        assert ids is None or len(ids) == self.n_samples
        assert (tgt_dict is None and tgt_texts is None) or (
            tgt_dict is not None and tgt_texts is not None
        )
        self.src_texts, self.tgt_texts = src_texts, tgt_texts
        self.src_langs, self.tgt_langs = src_langs, tgt_langs
        self.speakers = speakers
        self.tgt_dict = tgt_dict
        self.check_tgt_lang_tag()
        self.ids = ids
        self.shuffle = cfg.shuffle if is_train_split else False

        self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict(
            self.cfg.get_feature_transforms(split, is_train_split)
        )

        self.pre_tokenizer = pre_tokenizer
        self.bpe_tokenizer = bpe_tokenizer
        self.n_frames_per_step = n_frames_per_step
        self.speaker_to_id = speaker_to_id

        self.tgt_lens = self.get_tgt_lens_and_check_oov()
        self.append_eos = append_eos

        logger.info(self.__repr__())
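
A hedged construction example for this constructor; the class name SpeechToTextDataset is inferred from the signature and not confirmed by the snippet, and data_cfg / tgt_dict are assumed to be a valid S2TDataConfig and fairseq Dictionary built elsewhere. It mainly illustrates the constraints enforced by the asserts: every optional per-utterance list must be parallel to audio_paths, and tgt_texts and tgt_dict must be given together or not at all.

# Hedged sketch: the optional lists are parallel to audio_paths, and
# tgt_texts/tgt_dict are supplied together.
dataset = SpeechToTextDataset(      # class name assumed from the signature
    split="train",
    is_train_split=True,
    cfg=data_cfg,                   # S2TDataConfig instance, built elsewhere
    audio_paths=["a.wav", "b.wav"],
    n_frames=[12000, 8500],
    tgt_texts=["hello world", "good morning"],
    tgt_dict=tgt_dict,              # fairseq Dictionary; required whenever tgt_texts is set
    pre_tokenizer=None,
    bpe_tokenizer=None,
)
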
Example no. 3
    def __init__(
        self,
        split: str,
        is_train_split: bool,
        data_cfg: S2TDataConfigSrc,
        audio_paths: List[str],
        n_frames: List[int],
        src_texts: Optional[List[str]] = None,
        tgt_texts: Optional[List[str]] = None,
        speakers: Optional[List[str]] = None,
        src_langs: Optional[List[str]] = None,
        tgt_langs: Optional[List[str]] = None,
        ids: Optional[List[str]] = None,
        tgt_dict: Optional[Dictionary] = None,
        src_dict: Optional[Dictionary] = None,
        pre_tokenizer=None,
        bpe_tokenizer=None,
    ):
        super().__init__(split, is_train_split, data_cfg, audio_paths,
                         n_frames, src_texts, tgt_texts, speakers, src_langs,
                         tgt_langs, ids, tgt_dict, pre_tokenizer,
                         bpe_tokenizer)
        self.split, self.is_train_split = split, is_train_split
        self.data_cfg = data_cfg
        self.audio_paths, self.n_frames = audio_paths, n_frames
        self.n_samples = len(audio_paths)
        assert len(n_frames) == self.n_samples > 0
        assert src_texts is None or len(src_texts) == self.n_samples
        assert tgt_texts is None or len(tgt_texts) == self.n_samples
        assert speakers is None or len(speakers) == self.n_samples
        assert src_langs is None or len(src_langs) == self.n_samples
        assert tgt_langs is None or len(tgt_langs) == self.n_samples
        assert ids is None or len(ids) == self.n_samples
        assert (tgt_dict is None and tgt_texts is None) or (
            tgt_dict is not None and tgt_texts is not None
        )
        assert (src_dict is None and src_texts is None) or (
            src_dict is not None and src_texts is not None
        )
        self.src_texts, self.tgt_texts = src_texts, tgt_texts
        self.src_langs, self.tgt_langs = src_langs, tgt_langs
        self.tgt_dict = tgt_dict
        self.src_dict = src_dict
        self.check_tgt_lang_tag()
        self.ids = ids
        self.shuffle = data_cfg.shuffle if is_train_split else False

        self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict(
            self.data_cfg.get_feature_transforms(split, is_train_split))

        self.pre_tokenizer = pre_tokenizer
        self.bpe_tokenizer = bpe_tokenizer

        logger.info(self.__repr__())
Example no. 4
    def __init__(
        self,
        split: str,
        is_train_split: bool,
        data_cfg: S2TDataConfig,
        audio_paths: List[str],
        n_frames: List[int],
        audio_dict,
        align_time_min,
        align_time_max,
        total_time,
        src_texts: Optional[List[str]] = None,
        tgt_texts: Optional[List[str]] = None,
        speakers: Optional[List[str]] = None,
        src_langs: Optional[List[str]] = None,
        tgt_langs: Optional[List[str]] = None,
        ids: Optional[List[str]] = None,
        tgt_dict: Optional[Dictionary] = None,
        pre_tokenizer=None,
        bpe_tokenizer=None,
    ):
        self.split, self.is_train_split = split, is_train_split
        self.data_cfg = data_cfg
        self.audio_paths, self.n_frames = audio_paths, n_frames
        self.n_samples = len(audio_paths)
        assert len(n_frames) == self.n_samples > 0
        assert src_texts is None or len(src_texts) == self.n_samples
        assert tgt_texts is None or len(tgt_texts) == self.n_samples
        assert speakers is None or len(speakers) == self.n_samples
        assert src_langs is None or len(src_langs) == self.n_samples
        assert tgt_langs is None or len(tgt_langs) == self.n_samples
        assert ids is None or len(ids) == self.n_samples
        assert (tgt_dict is None and tgt_texts is None) or (
            tgt_dict is not None and tgt_texts is not None
        )
        self.src_texts, self.tgt_texts = src_texts, tgt_texts
        self.src_langs, self.tgt_langs = src_langs, tgt_langs
        self.tgt_dict = tgt_dict
        self.check_tgt_lang_tag()
        self.ids = ids
        self.shuffle = data_cfg.shuffle if is_train_split else False

        self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict(
            self.data_cfg.get_feature_transforms(split, is_train_split))

        # For aligned augmentation
        self.align_time_min = align_time_min
        self.align_time_max = align_time_max
        self.audio_dict = audio_dict
        self.audio_dict_size = len(self.audio_dict)
        self.total_time = total_time

        # Used in the +AudioDict part of ADA-LM/ADA-RT
        self.max_samp_fbank = self.data_cfg.max_samp_fbank
        if self.max_samp_fbank is not None:
            assert isinstance(self.max_samp_fbank, int) and \
                self.max_samp_fbank >= 1
        self.num_samp_fbank = self.data_cfg.num_samp_fbank

        # Used in aligned masking (target side only, without an audio dict)
        self.max_mask_fbank = self.data_cfg.max_mask_fbank
        self.num_mask_fbank = self.data_cfg.num_mask_fbank

        # Fraction of examples in a mini-batch to which sampleFbank is applied;
        # should be -1 when sample_fbank is not used
        self.sampleFbank_prob = self.data_cfg.sampleFbank_prob

        self.apply_alignAugment = self.data_cfg.apply_alignAugment

        self.roberta = None
        self.skip_roberta = self.data_cfg.skip_roberta
        logger.info('Skip roberta: {}'.format(self.skip_roberta))
        if self.apply_alignAugment:
            if not self.skip_roberta:
                from fairseq.models.roberta import RobertaModel
                self.roberta = RobertaModel.from_pretrained(
                    self.data_cfg.path_roberta, checkpoint_file='model.pt')

                if self.data_cfg.roberta_fp16:
                    self.roberta.half()

                logger.info('Inference of roberta with dtype: {}'.format(
                    (next(self.roberta.parameters())).dtype))
                self.roberta.cuda()
                self.roberta.eval()
            else:
                self.audio_dict_keys = list(self.audio_dict.keys())

        self.alignAugment_prob = self.data_cfg.alignAugment_prob
        self.alignMask = self.data_cfg.alignMask
        self.skip_source = self.data_cfg.skip_source
        self.percentMaskedTokens = self.data_cfg.percentMaskedTokens
        self.thresholdMaskedTokens = self.data_cfg.thresholdMaskedTokens
        if 0 < self.alignAugment_prob <= 1:
            assert self.thresholdMaskedTokens >= 1

        self.random_time_mask_N = self.data_cfg.random_time_mask_N
        self.random_time_mask_T = self.data_cfg.random_time_mask_T
        self.random_time_mask_p = self.data_cfg.random_time_mask_p
        self.random_time_mask_limited = self.data_cfg.random_time_mask_limited
        if (self.random_time_mask_N is not None
                and self.random_time_mask_T is not None):
            self.time_mask_max = self.random_time_mask_N * self.random_time_mask_T

        self.random_freq_mask_N = self.data_cfg.random_freq_mask_N
        self.random_freq_mask_F = self.data_cfg.random_freq_mask_F
        self.random_mask_value = self.data_cfg.random_mask_value  # SpecAugment after ADA
        self.align_mask_value = self.data_cfg.align_mask_value

        self.pre_tokenizer = pre_tokenizer
        self.bpe_tokenizer = bpe_tokenizer

        logger.info(self.__repr__())
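
The constructor above reads a large number of attributes from data_cfg. The sketch below gathers those fields in one place with illustrative values, using a SimpleNamespace as a stand-in; the real S2TDataConfig subclass and its defaults are not shown in the snippet, so treat every value here as an assumption.

# Hedged overview of the data_cfg fields consumed above; values are illustrative only.
from types import SimpleNamespace

ada_cfg_sketch = SimpleNamespace(
    # +AudioDict sampling (ADA-LM / ADA-RT)
    max_samp_fbank=3, num_samp_fbank=2,
    # aligned masking on the target side (no audio dict)
    max_mask_fbank=None, num_mask_fbank=None,
    # fraction of a mini-batch that sampleFbank is applied to; -1 disables it
    sampleFbank_prob=-1,
    # aligned augmentation and RoBERTa-based token masking
    apply_alignAugment=True, skip_roberta=False,
    path_roberta="/path/to/roberta", roberta_fp16=True,
    alignAugment_prob=0.3, alignMask=False, skip_source=False,
    percentMaskedTokens=0.15, thresholdMaskedTokens=1,
    # SpecAugment-style random masking applied after ADA
    random_time_mask_N=2, random_time_mask_T=40, random_time_mask_p=1.0,
    random_time_mask_limited=False,
    random_freq_mask_N=2, random_freq_mask_F=30,
    random_mask_value=0.0, align_mask_value=0.0,
)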