def get_model_input(cls, task, audio: Union[str, torch.Tensor]):
    input_type = task.data_cfg.hub.get("input_type", "fbank80")
    if input_type == "fbank80_w_utt_cmvn":
        if isinstance(audio, str):
            feat = utt_cmvn.UtteranceCMVN()(get_fbank(audio))
            feat = feat.unsqueeze(0)  # T x D -> 1 x T x D
        else:
            feat = kaldi.fbank(audio, num_mel_bins=80).numpy()  # 1 x T x D
    elif input_type in {"waveform", "standardized_waveform"}:
        if isinstance(audio, str):
            feat, sr = get_wav(audio)  # C x T
            feat, _ = convert_wav(
                feat, sr, to_sample_rate=16_000, to_mono=True)  # C x T -> 1 x T
        else:
            feat = audio.numpy()
    else:
        raise ValueError(f"Unknown value: input_type = {input_type}")

    src_lengths = torch.Tensor([feat.shape[1]]).long()
    src_tokens = torch.from_numpy(feat)  # 1 x T (x D)
    if input_type == "standardized_waveform":
        with torch.no_grad():
            src_tokens = F.layer_norm(src_tokens, src_tokens.shape)

    return {
        "net_input": {
            "src_tokens": src_tokens,
            "src_lengths": src_lengths,
            "prev_output_tokens": None,
        },
        "target_lengths": None,
        "speaker": None,
    }
def __getitem__(self, index):
    import torchaudio
    import torchaudio.compliance.kaldi as kaldi

    tgt_item = self.tgt[index] if self.tgt is not None else None
    path = self.aud_paths[index]
    if not os.path.exists(path):
        raise FileNotFoundError("Audio file not found: {}".format(path))

    sound, sample_rate = torchaudio.load_wav(path)
    output = kaldi.fbank(sound,
                         num_mel_bins=self.num_mel_bins,
                         frame_length=self.frame_length,
                         frame_shift=self.frame_shift)
    output_cmvn = data_utils.apply_mv_norm(output)
    self.s2s_collater = Seq2SeqCollater(
        0, 1,
        pad_index=self.tgt_dict.pad(),
        eos_index=self.tgt_dict.eos(),
        move_eos_to_beginning=True)
    return {"id": index, "data": [output_cmvn.detach(), tgt_item]}
def __getitem__(self, index):
    import torchaudio
    import torchaudio.compliance.kaldi as kaldi

    tgt_item = self.tgt[index] if self.tgt is not None else None
    path = self.aud_paths[index]
    if not os.path.exists(path):
        raise FileNotFoundError("Audio file not found: {}".format(path))

    vid_data = self.load_video(index)
    sound, sample_rate = torchaudio.load_wav(path)
    if self.video_offset > 0:
        # positive offset: pad the video at the front
        padding_frame = np.zeros(
            [self.video_offset, np.shape(vid_data)[1]], dtype='float32')
        vid_data = np.concatenate((padding_frame, vid_data), axis=0)
    elif self.video_offset < 0:
        # negative offset: pad the video at the end and the audio at the front
        padding_frame = np.zeros(
            [abs(self.video_offset), np.shape(vid_data)[1]], dtype='float32')
        vid_data = np.concatenate((vid_data, padding_frame), axis=0)
        # one video frame corresponds to 40 ms of audio
        aud_padding_size = int(abs(self.video_offset) * 40 * sample_rate * 0.001)
        aud_padding = torch.zeros_like(sound)[:, 0:aud_padding_size]
        sound = torch.cat((aud_padding, sound), 1)

    output = kaldi.fbank(
        sound,
        num_mel_bins=self.num_mel_bins,
        frame_length=self.frame_length,
        frame_shift=self.frame_shift
    )
    output_cmvn = data_utils.apply_mv_norm(output)
    return {"id": index,
            "audio_data": [output_cmvn.detach(), tgt_item],
            "video_data": [vid_data, tgt_item]}
def __call__(self, batch):
    mean_stat = torch.zeros(self.feat_dim)
    var_stat = torch.zeros(self.feat_dim)
    number = 0
    for item in batch:
        value = item[1].strip().split(",")
        assert len(value) == 3 or len(value) == 1
        wav_path = value[0]
        sample_rate = torchaudio.backend.sox_backend.info(wav_path)[0].rate
        # len(value) == 3 means segmented wav.scp,
        # len(value) == 1 means original wav.scp
        if len(value) == 3:
            start_frame = int(float(value[1]) * sample_rate)
            end_frame = int(float(value[2]) * sample_rate)
            waveform, sample_rate = torchaudio.backend.sox_backend.load(
                filepath=wav_path,
                num_frames=end_frame - start_frame,
                offset=start_frame)
            waveform = waveform * (1 << 15)
        else:
            waveform, sample_rate = torchaudio.load_wav(item[1])
        mat = kaldi.fbank(waveform,
                          num_mel_bins=self.feat_dim,
                          dither=0.0,
                          energy_floor=0.0)
        mean_stat += torch.sum(mat, axis=0)
        var_stat += torch.sum(torch.square(mat), axis=0)
        number += mat.shape[0]
    return number, mean_stat, var_stat
def get_torchaudio_fbank_or_mfcc(
    waveform: np.ndarray,
    sample_rate: float,
    n_bins: int = 80,
    feature_type: str = "fbank",
) -> np.ndarray:
    """Get mel-filter bank or MFCC features via TorchAudio."""
    try:
        import torchaudio.compliance.kaldi as ta_kaldi

        waveform = torch.from_numpy(waveform)
        if feature_type == "fbank":
            features = ta_kaldi.fbank(
                waveform, num_mel_bins=n_bins, sample_frequency=sample_rate
            )
        else:
            features = ta_kaldi.mfcc(
                waveform,
                num_mel_bins=n_bins,
                num_ceps=40,
                low_freq=20,
                high_freq=-400,
                sample_frequency=sample_rate,
            )
        return features.numpy()
    except ImportError:
        raise ImportError(
            "Please install torchaudio to enable online feature extraction: "
            "pip install torchaudio"
        )
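# A minimal usage sketch for the helper above, assuming a one-second mono
# recording already shaped 1 x T as torchaudio.compliance.kaldi expects
# (channel-first 2-D input); the array content here is purely illustrative.
import numpy as np

waveform = (np.random.randn(1, 16000) * 0.1).astype(np.float32)
feats = get_torchaudio_fbank_or_mfcc(waveform, sample_rate=16000, n_bins=80)
print(feats.shape)  # (98, 80): 98 frames from 16000 samples at 25 ms / 10 ms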
def compute_fbank(data,
                  num_mel_bins=23,
                  frame_length=25,
                  frame_shift=10,
                  dither=0.0):
    """ Extract fbank

        Args:
            data: Iterable[{key, wav, label, sample_rate}]

        Returns:
            Iterable[{key, feat, label}]
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'wav' in sample
        assert 'key' in sample
        assert 'label' in sample
        sample_rate = sample['sample_rate']
        waveform = sample['wav']
        waveform = waveform * (1 << 15)
        # Only keep key, feat, label
        mat = kaldi.fbank(waveform,
                          num_mel_bins=num_mel_bins,
                          frame_length=frame_length,
                          frame_shift=frame_shift,
                          dither=dither,
                          energy_floor=0.0,
                          sample_frequency=sample_rate)
        yield dict(key=sample['key'], label=sample['label'], feat=mat)
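# Hedged usage sketch for compute_fbank: it consumes an iterable of dicts
# and yields dicts, so it composes with other generator-style pipeline
# stages. The sample values below are illustrative, not from the source.
import torch

sample = dict(key='utt1', label=[1, 2, 3], sample_rate=16000,
              wav=torch.randn(1, 16000) * 0.1)
for out in compute_fbank([sample], num_mel_bins=80):
    print(out['key'], out['feat'].shape)  # utt1 torch.Size([98, 80])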
def __call__(self, new_samples):
    samples = self.previous_residual_samples + new_samples
    if len(samples) < self.num_samples_per_window:
        self.previous_residual_samples = samples
        return

    # num_frames is the number of frames from the new segment
    num_frames = math.floor(
        (len(samples) - self.len_ms_to_samples(self.window_size - self.shift_size))
        / self.num_samples_per_shift
    )

    # the number of samples used for feature extraction,
    # including some part of the previous segment
    effective_num_samples = int(
        num_frames * self.len_ms_to_samples(self.shift_size)
        + self.len_ms_to_samples(self.window_size - self.shift_size)
    )

    input_samples = samples[:effective_num_samples]
    self.previous_residual_samples = samples[
        num_frames * self.num_samples_per_shift:]

    torch.manual_seed(1)
    output = kaldi.fbank(
        torch.FloatTensor(input_samples).unsqueeze(0),
        num_mel_bins=self.feature_dim,
        frame_length=self.window_size,
        frame_shift=self.shift_size,
    ).numpy()

    output = self.transform(output)
    return torch.from_numpy(output)
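# A worked example of the frame bookkeeping above, assuming the common
# 25 ms window / 10 ms shift at 16 kHz (400 samples per window, 160 per
# shift); all numbers are illustrative.
samples = list(range(1000))   # residual + new segment
window, shift = 400, 160
overlap = window - shift      # 240 samples shared between adjacent frames

num_frames = (len(samples) - overlap) // shift  # (1000 - 240) // 160 = 4
effective = num_frames * shift + overlap        # 4 * 160 + 240 = 880
residual = samples[num_frames * shift:]         # the last 360 samples
# 880 samples yield exactly 4 frames; the 360-sample tail (which includes
# the 240-sample overlap) is carried over to the next call.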
def _fbank_features(self, sound):
    output = kaldi.fbank(sound,
                         num_mel_bins=self.num_mel_bins,
                         frame_length=self.frame_length,
                         frame_shift=self.frame_shift)
    output_cmvn = data_utils.apply_mv_norm(output).detach()
    return output_cmvn
def get_output_fn(sound, args):
    output = kaldi.fbank(
        sound,
        blackman_coeff=args[1],
        dither=0.0,
        energy_floor=args[2],
        frame_length=args[3],
        frame_shift=args[4],
        high_freq=args[5],
        htk_compat=args[6],
        low_freq=args[7],
        num_mel_bins=args[8],
        preemphasis_coefficient=args[9],
        raw_energy=args[10],
        remove_dc_offset=args[11],
        round_to_power_of_two=args[12],
        snip_edges=args[13],
        subtract_mean=args[14],
        use_energy=args[15],
        use_log_fbank=args[16],
        use_power=args[17],
        vtln_high=args[18],
        vtln_low=args[19],
        vtln_warp=args[20],
        window_type=args[21])
    return output
def encode_signal(self, signal):
    if isinstance(signal, np.ndarray):
        signal = torch.from_numpy(signal)
    encoded = kaldi.fbank(signal,
                          num_mel_bins=self.num_mel_bins,
                          frame_length=self.frame_length,
                          frame_shift=self.frame_shift)
    encoded = apply_mv_norm(encoded)
    return encoded
def get_feats(self, file_path):
    wav, sr = sf.read(file_path)
    feats = torch.from_numpy(wav).float()
    feats = kaldi.fbank(
        feats.unsqueeze(0),
        num_mel_bins=self.num_mel_bins,
        frame_length=self.frame_length,
        sample_frequency=sr,
    )
    return feats
def _extract_fbank_features(
    self,
    waveform: np.ndarray,
) -> np.ndarray:
    """
    Get mel-filter bank features using TorchAudio.
    Note that TorchAudio requires 16-bit signed integers as inputs
    and hence the waveform should not be normalized before feature
    extraction.
    """
    waveform = waveform * (2 ** 15)  # Kaldi compliance: 16-bit signed integers
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    features = ta_kaldi.fbank(waveform,
                              num_mel_bins=self.num_mel_bins,
                              sample_frequency=self.sampling_rate)
    return features.numpy()
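# Why the (2 ** 15) scaling above matters: torchaudio.load and soundfile
# return floats in [-1, 1], while Kaldi-compatible features were calibrated
# for raw int16 ranges. A hedged sketch (input values illustrative): since
# fbank takes the log of the power spectrum, linearly rescaling the
# waveform shifts every log-mel bin by a constant.
import torch
import torchaudio.compliance.kaldi as ta_kaldi

normalized = torch.rand(1, 16000) * 2 - 1  # hypothetical [-1, 1] waveform
feats_raw = ta_kaldi.fbank(normalized, num_mel_bins=80)
feats_scaled = ta_kaldi.fbank(normalized * (2 ** 15), num_mel_bins=80)
print((feats_scaled - feats_raw).mean())  # ~2 * 15 * log(2) ≈ 20.79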
def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]:
    """Get mel-filter bank features via TorchAudio."""
    try:
        import torch
        import torchaudio.compliance.kaldi as ta_kaldi

        waveform = torch.from_numpy(waveform).unsqueeze(0)
        features = ta_kaldi.fbank(
            waveform, num_mel_bins=n_bins, sample_frequency=sample_rate
        )
        return features.numpy()
    except ImportError:
        return None
def set_feats_func(self):
    # initialize feats_func
    feats_type = self.configs["feats"]["type"]
    if feats_type == "mfcc_kaldi":
        from torchaudio.compliance.kaldi import mfcc
        self.feats_func = lambda x: mfcc(
            torch.from_numpy(x.astype("float32").reshape(1, -1)),
            **self.configs["mfcc_kaldi"]).transpose(0, 1)
    elif feats_type == "fbank_kaldi":
        from torchaudio.compliance.kaldi import fbank
        self.feats_func = lambda x: fbank(
            torch.from_numpy(x.astype("float32").reshape(1, -1)),
            **self.configs["fbank_kaldi"]).transpose(0, 1)
    elif feats_type == "spectrogram_kaldi":
        from torchaudio.compliance.kaldi import spectrogram
        self.feats_func = lambda x: spectrogram(
            torch.from_numpy(x.astype("float32").reshape(1, -1)),
            **self.configs["spectrogram_kaldi"]).transpose(0, 1)
    else:
        raise NotImplementedError
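# For context, a hedged sketch of the nested config layout the dispatcher
# above assumes; the keys are inferred from its lookups and the actual
# schema may differ. The "feats" block names the type, and the sibling
# block with the same name carries kwargs forwarded to the kaldi call.
configs = {
    "feats": {"type": "fbank_kaldi"},
    "fbank_kaldi": {"num_mel_bins": 80, "sample_frequency": 16000},
}
# After set_feats_func(), feats_func maps a 1-D numpy waveform to a
# (num_mel_bins, frames) tensor because of the trailing transpose(0, 1).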
def __call__(self, batch):
    mean_stat = torch.zeros(self.feat_dim)
    var_stat = torch.zeros(self.feat_dim)
    number = 0
    for item in batch:
        key = item[0]
        waveform, sample_rate = torchaudio.load_wav(item[1])
        mat = kaldi.fbank(waveform,
                          num_mel_bins=self.feat_dim,
                          dither=0.0,
                          energy_floor=0.0)
        mean_stat += torch.sum(mat, axis=0)
        var_stat += torch.sum(torch.square(mat), axis=0)
        number += mat.shape[0]
    return number, mean_stat, var_stat
def __getitem__(self, index):
    import torchaudio
    import torchaudio.compliance.kaldi as kaldi

    tgt_item = self.tgt[index] if self.tgt is not None else None
    path = self.aud_paths[index]
    if not os.path.exists(path):
        raise FileNotFoundError("Audio file not found: {}".format(path))

    sound, sample_rate = torchaudio.load_wav(path)
    output = kaldi.fbank(sound,
                         num_mel_bins=self.num_mel_bins,
                         frame_length=self.frame_length,
                         frame_shift=self.frame_shift)
    output_cmvn = data_utils.apply_mv_norm(output)
    return {"id": index, "data": [output_cmvn.detach(), tgt_item]}
def __call__(self, batch):
    mean_stat = torch.zeros(self.feat_dim)
    var_stat = torch.zeros(self.feat_dim)
    number = 0
    for item in batch:
        try:
            value = item[1].strip().split(",")
            assert len(value) == 3 or len(value) == 1
            wav_path = value[0]
            sample_rate = torchaudio.backend.sox_io_backend.info(
                wav_path).sample_rate
            resample_rate = sample_rate
            # len(value) == 3 means segmented wav.scp,
            # len(value) == 1 means original wav.scp
            if len(value) == 3:
                start_frame = int(float(value[1]) * sample_rate)
                end_frame = int(float(value[2]) * sample_rate)
                waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
                    filepath=wav_path,
                    num_frames=end_frame - start_frame,
                    frame_offset=start_frame)
            else:
                waveform, sample_rate = torchaudio.load(item[1])
            waveform = waveform * (1 << 15)
            if self.resample_rate != 0 and self.resample_rate != sample_rate:
                resample_rate = self.resample_rate
                waveform = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)
            mat = kaldi.fbank(waveform,
                              num_mel_bins=self.feat_dim,
                              dither=0.0,
                              energy_floor=0.0,
                              sample_frequency=resample_rate)
            mean_stat += torch.sum(mat, axis=0)
            var_stat += torch.sum(torch.square(mat), axis=0)
            number += mat.shape[0]
        except Exception:
            print('read utterance {} error'.format(item[0]), file=sys.stderr)
    return number, mean_stat, var_stat
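# The accumulators above only become usable CMVN statistics after a
# finalization step that lies outside the snippet; a hedged sketch of that
# reduction (the helper name and flooring value are illustrative).
import torch

def finalize_cmvn(number, mean_stat, var_stat, floor=1e-20):
    """Turn frame counts and running sums into per-dimension mean and
    inverse stddev, using var = E[x^2] - E[x]^2 with a numerical floor."""
    mean = mean_stat / number
    var = var_stat / number - mean * mean
    istd = 1.0 / torch.sqrt(torch.clamp(var, min=floor))
    return mean, istd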
def wav_feat(wav_file, feature_extraction_conf):
    # for torchaudio < 0.8
    if hasattr(wav_file, 'read'):
        waveform, sample_rate = wavform_filelike(wav_file)
    elif isinstance(wav_file, str):
        waveform, sample_rate = torchaudio.load_wav(wav_file)
    elif isinstance(wav_file, bytes):
        waveform, sample_rate = wavfrom_bytes(wav_file)
    else:
        raise ValueError('unsupported wav_file type: {}'.format(type(wav_file)))

    wav_dither = 1.0
    mat = kaldi.fbank(
        waveform,
        num_mel_bins=feature_extraction_conf['mel_bins'],
        frame_length=feature_extraction_conf['frame_length'],
        frame_shift=feature_extraction_conf['frame_shift'],
        dither=wav_dither,
        energy_floor=0.0,
        sample_frequency=sample_rate
    )
    feat = mat
    length = mat.shape[0]
    return feat.unsqueeze(0), torch.tensor([length])
def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]:
    """Get mel-filter bank features via TorchAudio."""
    try:
        import torch
        import torchaudio.compliance.kaldi as ta_kaldi
        import torchaudio.sox_effects as ta_sox

        waveform = torch.from_numpy(waveform)
        if len(waveform.shape) == 1:
            # Mono channel: T -> 1 x T
            waveform = waveform.unsqueeze(0)
        else:
            # Merge multiple channels into one: C x T -> 1 x T
            # (effects must be a list of lists of strings)
            waveform, _ = ta_sox.apply_effects_tensor(
                waveform, sample_rate, [['channels', '1']])
        features = ta_kaldi.fbank(waveform,
                                  num_mel_bins=n_bins,
                                  sample_frequency=sample_rate)
        return features.numpy()
    except ImportError:
        return None
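# Hedged usage sketch for the variant above: a stereo numpy waveform is
# downmixed to mono by the sox 'channels' effect before fbank extraction.
# Input values are illustrative.
import numpy as np

stereo = (np.random.randn(2, 16000) * 0.1).astype(np.float32)
feats = _get_torchaudio_fbank(stereo, sample_rate=16000, n_bins=80)
if feats is None:
    print("torchaudio not installed; fall back to another extractor")
else:
    print(feats.shape)  # (frames, 80) after the 2-channel downmix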
    predictions = {}
    for emo_representation in self.emo_representations:
        seq_len = self.args[emo_representation]['args'].seq_length
        mean = self.args[emo_representation]['norm_mean']
        std = self.args[emo_representation]['norm_std']
        # Feature normalization and padding have to be done for each
        # emotion representation individually because the means, standard
        # deviations, and sequence lengths can differ.
        features = normalize_and_pad_features(fbank, seq_len,
                                              torch.from_numpy(mean),
                                              torch.from_numpy(std))
        predictions[emo_representation] = softmax(
            self.models[emo_representation](features.unsqueeze(1)),
            dim=1
        ).detach().numpy()

    arousal_level = self.arousal_mapping[np.argmax(predictions['arousal'])]
    valence_level = self.valence_mapping[np.argmax(predictions['valence'])]
    category_label = self.category_mapping[np.argmax(predictions['category'])]
    return {'emotion': {'arousal': arousal_level,
                        'valence': valence_level,
                        'category': category_label,
                        'category_probabilities':
                            np.around(predictions['category'], 2).reshape(-1)}}


if __name__ == '__main__':
    recognition = EmotionRecognition()
    waveform, sample_rate = torchaudio.load_wav('test.wav')
    f = fbank(waveform, sample_frequency=sample_rate)
    emo = recognition.predict_from_audio(f)
    print(emo)
def _feature_fn(self, *args, **kwargs):
    from torchaudio.compliance.kaldi import fbank
    return fbank(*args, **kwargs)
def forward(self, input, sample_rate):
    feat = kaldi.fbank(
        input,
        channel=-1,
        sample_frequency=sample_rate,
        num_mel_bins=self.n_mels
    )
    return feat
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args.task)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    generator = task.build_generator(args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    def collate_frames(frames):
        """Convert a list of 2d frames into a padded 3d tensor

        Args:
            frames (list): list of 2d frames of size L[i]*f_dim, where L[i]
                is the length of the i-th frame and f_dim is the static
                feature dimension.

        Returns:
            3d tensor of size len(frames)*len_max*f_dim, where len_max is
            the max of L[i].
        """
        len_max = max(frame.size(0) for frame in frames)
        f_dim = frames[0].size(1)
        res = frames[0].new(len(frames), len_max, f_dim).fill_(0.0)
        for i, v in enumerate(frames):
            res[i, :v.size(0)] = v
        return res

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models])

    start_id = 0
    audio_root_path = "/search/odin/haiyang/fairseq_exp/e2e_trans/fairseq/examples/speech_recognition/datasets/zh_asr_data/train/train2"
    with open(args.input) as inp:
        input = inp.readline().strip()
        while input:
            audio_path = audio_root_path + "/" + input.split(" ")[0] + ".wav"
            inputs = [" ".join(input.split(" ")[1:])]
            results = []
            for batch in make_batches(inputs, args, task, max_positions, encode_fn):
                src_tokens = batch.src_tokens
                src_lengths = batch.src_lengths
                if use_cuda:
                    src_tokens = src_tokens.cuda()
                    src_lengths = src_lengths.cuda()
                if args.task == "translation":
                    sample = {
                        'net_input': {
                            'src_tokens': src_tokens,
                            'src_lengths': src_lengths,
                        },
                    }
                else:
                    if not os.path.exists(audio_path):
                        raise FileNotFoundError(
                            "Audio file not found: {}".format(audio_path))
                    sound, sample_rate = torchaudio.load_wav(audio_path)
                    num_mel_bins, frame_length, frame_shift = 80, 25.0, 10.0
                    output = kaldi.fbank(sound,
                                         num_mel_bins=num_mel_bins,
                                         frame_length=frame_length,
                                         frame_shift=frame_shift,
                                         dither=0.0,
                                         energy_floor=1.0)
                    frames = data_utils.apply_mv_norm(output).detach()[
                        None, :, :].type(torch.cuda.FloatTensor)
                    frames_lengths = torch.LongTensor(
                        [s.size(0) for s in frames])
                    sample = {
                        'net_input': {
                            'src_tokens': src_tokens,
                            'src_lengths': src_lengths,
                            "audio": frames,
                            "audio_lengths": frames_lengths,
                        },
                    }

                translations = task.inference_step(generator, models, sample)
                for i, (id, hypos) in enumerate(
                        zip(batch.ids.tolist(), translations)):
                    src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad())
                    results.append((start_id + id, src_tokens_i, hypos))

            # sort output to match input order
            for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]):
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.remove_bpe)
                    print('S-{}\t{}'.format(id, src_str))

                # Process top predictions
                for hypo in hypos[:min(len(hypos), args.nbest)]:
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )
                    hypo_str = decode_fn(hypo_str)
                    print('H-{}\t{}'.format(id, hypo_str))
                    if args.print_alignment:
                        alignment_str = " ".join([
                            "{}-{}".format(src, tgt)
                            for src, tgt in alignment
                        ])
                        print('A-{}\t{}'.format(id, alignment_str))

            input = inp.readline().strip()
            # update running id counter
            start_id += len(inputs)
audio_list = load_wav_segments(args.wav_scp, args.segments)
count = 0
with open(args.out_ark, 'wb') as ark_fout, \
        open(args.out_scp, 'w', encoding='utf8') as scp_fout:
    for item in audio_list:
        if len(item) == 2:
            key, wav_path = item
            waveform, sample_rate = torchaudio.load_wav(wav_path)
        else:
            assert len(item) == 4
            key, wav_path, start, end = item
            sample_rate = torchaudio.info(wav_path).sample_rate
            frame_offset = int(start * sample_rate)
            num_frames = int((end - start) * sample_rate)
            waveform, sample_rate = torchaudio.load_wav(
                wav_path, frame_offset, num_frames)
        mat = kaldi.fbank(waveform,
                          num_mel_bins=args.num_mel_bins,
                          frame_length=args.frame_length,
                          frame_shift=args.frame_shift,
                          dither=args.dither,
                          energy_floor=0.0,
                          sample_frequency=sample_rate)
        mat = mat.detach().numpy()
        kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout)
        count += 1
        if count % 10000 == 0:
            logging.info('Progress {}/{}'.format(count, len(audio_list)))
def _extract_feature(batch, speed_perturb, wav_distortion_conf,
                     feature_extraction_conf):
    """ Extract acoustic fbank features from the original waveform.

        Speed perturbation and wave amplitude distortion are optional.

        Args:
            batch: a list of tuples (wav id, wave path).
            speed_perturb: bool, whether or not to use speed perturbation.
            wav_distortion_conf: a dict, the config of wave amplitude distortion.
            feature_extraction_conf: a dict, the config of fbank extraction.

        Returns:
            (keys, feats, labels)
    """
    keys = []
    feats = []
    lengths = []
    wav_dither = wav_distortion_conf['wav_dither']
    wav_distortion_rate = wav_distortion_conf['wav_distortion_rate']
    distortion_methods_conf = wav_distortion_conf['distortion_methods']
    if speed_perturb:
        speeds = [1.0, 1.1, 0.9]
        weights = [1, 1, 1]
        speed = random.choices(speeds, weights, k=1)[0]
    for i, x in enumerate(batch):
        try:
            wav = x[1]
            value = wav.strip().split(",")
            # 1 for general wav.scp, 3 for segmented wav.scp
            assert len(value) == 1 or len(value) == 3
            wav_path = value[0]
            sample_rate = torchaudio.backend.sox_io_backend.info(
                wav_path).sample_rate
            if 'resample' in feature_extraction_conf:
                resample_rate = feature_extraction_conf['resample']
            else:
                resample_rate = sample_rate
            if speed_perturb:
                if len(value) == 3:
                    logging.error(
                        "speed perturb does not support segmented wav.scp now")
                assert len(value) == 1
                waveform, sample_rate = _load_wav_with_speed(wav_path, speed)
            else:
                # value of length 3 means a segmented wav.scp entry:
                # .wav path, start time, end time
                if len(value) == 3:
                    start_frame = int(float(value[1]) * sample_rate)
                    end_frame = int(float(value[2]) * sample_rate)
                    waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
                        filepath=wav_path,
                        num_frames=end_frame - start_frame,
                        frame_offset=start_frame)
                else:
                    waveform, sample_rate = torchaudio.load(wav_path)
            waveform = waveform * (1 << 15)
            if resample_rate != sample_rate:
                waveform = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)
            if wav_distortion_rate > 0.0:
                r = random.uniform(0, 1)
                if r < wav_distortion_rate:
                    waveform = waveform.detach().numpy()
                    waveform = _waveform_distortion(waveform,
                                                    distortion_methods_conf)
                    waveform = torch.from_numpy(waveform)
            mat = kaldi.fbank(
                waveform,
                num_mel_bins=feature_extraction_conf['mel_bins'],
                frame_length=feature_extraction_conf['frame_length'],
                frame_shift=feature_extraction_conf['frame_shift'],
                dither=wav_dither,
                energy_floor=0.0,
                sample_frequency=resample_rate)
            mat = mat.detach().numpy()
            feats.append(mat)
            keys.append(x[0])
            lengths.append(mat.shape[0])
        except Exception as e:
            print(e)
            logging.warning('read utterance {} error'.format(x[0]))
    # Sort because sorting is required by the pack/pad operation
    order = np.argsort(lengths)[::-1]
    sorted_keys = [keys[i] for i in order]
    sorted_feats = [feats[i] for i in order]
    labels = [x[2].split() for x in batch]
    labels = [np.fromiter(map(int, x), dtype=np.int32) for x in labels]
    sorted_labels = [labels[i] for i in order]
    return sorted_keys, sorted_feats, sorted_labels
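# The extractor above (and the similar ones later in this section) relies
# on a _load_wav_with_speed helper that is not shown. A plausible
# reconstruction using sox effects (the real helper may differ): the
# 'speed' effect changes tempo and pitch together, and the trailing 'rate'
# effect resamples back to the original rate so the frame shift stays
# consistent.
import torchaudio
import torchaudio.sox_effects as sox_effects

def _load_wav_with_speed(wav_path, speed):
    if speed == 1.0:
        return torchaudio.load(wav_path)
    sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate
    waveform, sample_rate = sox_effects.apply_effects_file(
        wav_path, [['speed', str(speed)], ['rate', str(sample_rate)]])
    return waveform, sample_rate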
def __getitem__(self, index):
    tgt_item = self.tgt[index] if self.tgt is not None else None
    src_item = self.src[index]
    # Append EOS to end of tgt sentence if it does not have an EOS and remove
    # EOS from end of src sentence if it exists. This is useful when we use
    # existing datasets for opposite directions, i.e., when we want to use
    # tgt_dataset as src_dataset and vice versa.
    if self.append_eos_to_target:
        eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos()
        if self.tgt and self.tgt[index][-1] != eos:
            tgt_item = torch.cat([self.tgt[index], torch.LongTensor([eos])])

    if self.append_bos:
        bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos()
        if self.tgt and self.tgt[index][0] != bos:
            tgt_item = torch.cat([torch.LongTensor([bos]), self.tgt[index]])
        bos = self.src_dict.bos()
        if self.src[index][0] != bos:
            src_item = torch.cat([torch.LongTensor([bos]), self.src[index]])

    if self.remove_eos_from_source:
        eos = self.src_dict.eos()
        if self.src[index][-1] == eos:
            src_item = self.src[index][:-1]

    path = self.audio[str(index)]['input']['path']
    if not os.path.exists(path):
        raise FileNotFoundError("Audio file not found: {}".format(path))

    sound, sample_rate = torchaudio.load_wav(path)
    output = kaldi.fbank(sound,
                         num_mel_bins=self.num_mel_bins,
                         frame_length=self.frame_length,
                         frame_shift=self.frame_shift,
                         dither=0.0,
                         energy_floor=1.0)
    output_cmvn = data_utils.apply_mv_norm(output)

    example = {
        'id': index,
        'audio': output_cmvn.detach(),
        'source': src_item,
        'target': tgt_item,
    }
    if self.align_dataset is not None:
        example['alignment'] = self.align_dataset[index]
    return example
def _extract_feature(batch, speed_perturb, wav_distortion_conf,
                     feature_extraction_conf, speech_aug, acousticsimulator,
                     acoustic_simulator_conf):
    """ Extract acoustic fbank features from the original waveform.

        Speed perturbation and wave amplitude distortion are optional.

        Args:
            batch: a list of tuples (wav id, wave path).
            speed_perturb: bool, whether or not to use speed perturbation.
            wav_distortion_conf: a dict, the config of wave amplitude distortion.
            feature_extraction_conf: a dict, the config of fbank extraction.

        Returns:
            (keys, feats, labels)
    """
    keys = []
    feats = []
    lengths = []
    wav_dither = wav_distortion_conf['wav_dither']
    wav_distortion_rate = wav_distortion_conf['wav_distortion_rate']
    distortion_methods_conf = wav_distortion_conf['distortion_methods']
    if acoustic_simulator_conf is not None:
        acoustic_simulator_samplerate = acoustic_simulator_conf['samplerate']
    if speed_perturb:
        speeds = [1.0, 1.1, 0.9]
        weights = [1, 1, 1]
        speed = random.choices(speeds, weights, k=1)[0]
    for i, x in enumerate(batch):
        try:
            wav = x[1]
            value = wav.strip().split(",")
            # 1 for general wav.scp, 3 for segmented wav.scp
            assert len(value) == 1 or len(value) == 3
            wav_path = value[0]
            sample_rate = torchaudio.backend.sox_io_backend.info(
                wav_path).sample_rate
            # Codec simulation, mainly downsampling 16 kHz audio to 8 kHz and
            # then simulating telephone-channel codecs. We therefore check
            # whether the original sample rate already equals
            # acoustic_simulator_conf['samplerate'] (only 16 kHz and 8 kHz
            # are supported): 8 kHz input is already telephone speech and is
            # left untouched; anything else goes through the simulator.
            if acousticsimulator is not None:
                if acoustic_simulator_samplerate != sample_rate:
                    simulatored_wav_path = acousticsimulator.cmdFile(wav_path)
                    sample_rate = acoustic_simulator_samplerate
                    wav_path = simulatored_wav_path
            if 'resample' in feature_extraction_conf:
                resample_rate = feature_extraction_conf['resample']
            else:
                resample_rate = sample_rate
            if speed_perturb:
                if len(value) == 3:
                    logging.error(
                        "speed perturb does not support segmented wav.scp now")
                assert len(value) == 1
                waveform, sample_rate = _load_wav_with_speed(wav_path, speed)
            else:
                # value of length 3 means a segmented wav.scp entry:
                # .wav path, start time, end time
                if len(value) == 3:
                    start_frame = int(float(value[1]) * sample_rate)
                    end_frame = int(float(value[2]) * sample_rate)
                    waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
                        filepath=wav_path,
                        num_frames=end_frame - start_frame,
                        frame_offset=start_frame)
                else:
                    waveform, sample_rate = torchaudio.load(wav_path)
            waveform = waveform * (1 << 15)
            if resample_rate != sample_rate:
                waveform = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)
            if wav_distortion_rate > 0.0:
                r = random.uniform(0, 1)
                if r < wav_distortion_rate:
                    waveform = waveform.detach().numpy()
                    waveform = _waveform_distortion(waveform,
                                                    distortion_methods_conf)
                    waveform = torch.from_numpy(waveform)
            # Speech augmentation (additive noise, reverberation, etc.).
            # If several augmentation methods are configured, one is picked
            # according to random_weight, and prob then decides whether the
            # chosen augmentation is actually applied.
            if speech_aug is not None:
                waveform, _ = speech_aug(waveform, torch.ones(1))
            mat = kaldi.fbank(
                waveform,
                num_mel_bins=feature_extraction_conf['mel_bins'],
                frame_length=feature_extraction_conf['frame_length'],
                frame_shift=feature_extraction_conf['frame_shift'],
                dither=wav_dither,
                energy_floor=0.0,
                sample_frequency=resample_rate)
            mat = mat.detach().numpy()
            feats.append(mat)
            keys.append(x[0])
            lengths.append(mat.shape[0])
        except Exception as e:
            print(e)
            logging.warning('read utterance {} error'.format(x[0]))
    # Sort because sorting is required by the pack/pad operation
    order = np.argsort(lengths)[::-1]
    sorted_keys = [keys[i] for i in order]
    sorted_feats = [feats[i] for i in order]
    labels = [x[2].split() for x in batch]
    labels = [np.fromiter(map(int, x), dtype=np.int32) for x in labels]
    sorted_labels = [labels[i] for i in order]
    return sorted_keys, sorted_feats, sorted_labels
def _extract_feature(batch, speed_perturb, wav_distortion_conf,
                     feature_extraction_conf):
    """ Extract acoustic fbank features from the original waveform.

        Speed perturbation and wave amplitude distortion are optional.

        Args:
            batch: a list of tuples (wav id, wave path).
            speed_perturb: bool, whether or not to use speed perturbation.
            wav_distortion_conf: a dict, the config of wave amplitude distortion.
            feature_extraction_conf: a dict, the config of fbank extraction.

        Returns:
            (keys, feats, labels)
    """
    keys = []
    feats = []
    lengths = []
    wav_dither = wav_distortion_conf['wav_dither']
    wav_distortion_rate = wav_distortion_conf['wav_distortion_rate']
    distortion_methods_conf = wav_distortion_conf['distortion_methods']
    if speed_perturb:
        speeds = [1.0, 1.1, 0.9]
        weights = [1, 1, 1]
        speed = random.choices(speeds, weights, k=1)[0]
    for i, x in enumerate(batch):
        try:
            if speed_perturb:
                waveform, sample_rate = _load_wav_with_speed(x[1], speed)
            else:
                waveform, sample_rate = torchaudio.load_wav(x[1])
            if wav_distortion_rate > 0.0:
                r = random.uniform(0, 1)
                if r < wav_distortion_rate:
                    waveform = waveform.detach().numpy()
                    waveform = _waveform_distortion(waveform,
                                                    distortion_methods_conf)
                    waveform = torch.from_numpy(waveform)
            mat = kaldi.fbank(
                waveform,
                num_mel_bins=feature_extraction_conf['mel_bins'],
                frame_length=feature_extraction_conf['frame_length'],
                frame_shift=feature_extraction_conf['frame_shift'],
                dither=wav_dither,
                energy_floor=0.0,
                sample_frequency=sample_rate
            )
            mat = mat.detach().numpy()
            feats.append(mat)
            keys.append(x[0])
            lengths.append(mat.shape[0])
        except Exception as e:
            print(e)
            logging.warning('read utterance {} error'.format(x[0]))
    # Sort because sorting is required by the pack/pad operation
    order = np.argsort(lengths)[::-1]
    sorted_keys = [keys[i] for i in order]
    sorted_feats = [feats[i] for i in order]
    labels = [x[2].split() for x in batch]
    labels = [np.fromiter(map(int, x), dtype=np.int32) for x in labels]
    sorted_labels = [labels[i] for i in order]
    return sorted_keys, sorted_feats, sorted_labels
def __get_sample_fbank():
    waveform, sample_rate = load('resource/SI1657.wav')
    return fbank(waveform, num_mel_bins=83, window_type=HAMMING)