def _get_kaldi_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]: """Get mel-filter bank features via PyKaldi.""" try: from kaldi.feat.mel import MelBanksOptions from kaldi.feat.fbank import FbankOptions, Fbank from kaldi.feat.window import FrameExtractionOptions from kaldi.matrix import Vector mel_opts = MelBanksOptions() mel_opts.num_bins = n_bins frame_opts = FrameExtractionOptions() frame_opts.samp_freq = sample_rate opts = FbankOptions() opts.mel_opts = mel_opts opts.frame_opts = frame_opts fbank = Fbank(opts=opts) features = fbank.compute(Vector(waveform), 1.0).numpy() return features except ImportError: return None
def otf_utt_generator(data_triplets, rir, noise, args):
    """Yield on-the-fly augmented feature/label batches for training.

    Iterates over (mrk, seq, ali) triplets, reads raw int16 audio from the
    seq files at the offsets recorded in the mrk files, applies speed and
    gain perturbation, extracts fbank features via PyKaldi, and yields
    padded batches as torch tensors.  Yields ``None`` once at the end as
    an exhaustion sentinel.

    Args:
        data_triplets: list of (mrk_fn, seq_fn, ali_rspecifier) triplets
        rir: list of room impulse responses, List[AudioSegment]
             (currently unused; kept for the commented-out augmentation)
        noise: list of noise segments, List[AudioSegment]
               (currently unused; kept for the commented-out augmentation)
        args: arguments for the loader (max_len, batch_size, speed_rate,
              gain_range, feat_config, sample_rate, stride, lctx, rctx,
              TU_limit, SOS, EOS, reverse_labels, batch_first, padding_tgt)

    Yields:
        (data, target, lens, ali_lens) per full batch, where data/target
        are torch tensors; the degenerate tuple
        (None, None, IntTensor([0]), IntTensor([0])) when a full batch
        produced no valid utterances; and finally ``None``.
    """
    max_len = args.max_len
    batch_size = args.batch_size
    # Pre-allocated, reused across batches; only the [:valid_idx] slices
    # of these buffers are meaningful at yield time.
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)
    batch_idx = 0       # utterances consumed in the current batch (valid or not)
    valid_idx = 0       # utterances kept in the current batch
    target_len = 0
    batch_max_len = -1  # longest feature sequence in the current batch
    target_max_len = -1 # longest label sequence in the current batch
    #rates for speed perturbation
    speed_rate = [float(rate) for rate in args.speed_rate.split(',')]
    #volume level perturbation; note the sign flip: gains are negated
    gain_lo, gain_hi = [-float(gain) for gain in args.gain_range.split(',')]
    #snr range for noise perturbation: 0-20db with mean of 10
    #mu, sigma = 10, 10
    #lo, hi = (0 - mu) / sigma, (20 - mu) / sigma
    #Fbank config read from args.feat_config via Kaldi option parsing
    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    #fbank_opt = MfccOptions()
    #fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    #fbank = Mfcc(fbank_opt)
    for data_triplet in data_triplets:
        mrk_fn, seq_fn = data_triplet[0], data_triplet[1]
        ali_rspec = data_triplet[2]
        with open(mrk_fn, 'r', encoding='utf-8') as mrk,\
                open(seq_fn, 'rb') as seq:
            ali_reader = SequentialIntVectorReader(ali_rspec)
            # mrk lines and alignments are assumed parallel and in the
            # same utterance order (checked by the assert below).
            for line, (uttid1, ali) in zip(mrk, ali_reader):
                uttid = line.split()[0]
                assert uttid == uttid1
                # mrk line format: <uttid> <byte offset> <byte length>
                seq.seek(int(line.split()[1]))
                num_bytes = int(line.split()[2])
                # make sure we read an even number of bytes (int16 samples)
                num_bytes -= num_bytes % 2
                audio_bytes = seq.read(num_bytes)
                audio_np = np.frombuffer(audio_bytes, dtype='int16')
                #data augmentation function goes here
                audio_seg = AudioSegment(audio_np, args.sample_rate)
                #speed perturbation: pick one rate uniformly at random
                spr = speed_rate[randint(0, len(speed_rate) - 1)]
                audio_seg.change_speed(spr)
                audio_seg.normalize(np.random.uniform(gain_lo, gain_hi))
                #noise adding example:
                #snr = truncnorm.rvs(lo, hi, scale=sigma, loc=mu, size=1)
                #audio_seg.add_noise(noise[randint(0, len(noise)-1)], snr)
                #rir adding example:
                #audio_seg.convolve_and_normalize(rir[randint(0, len(rir)-1)])
                # back to int16 range before Kaldi feature extraction
                audio_np = audio_seg._convert_samples_from_float32(\
                    audio_seg.samples, 'int16')
                wave_1ch = Vector(audio_np)
                feats = fbank.compute_features(wave_1ch, args.sample_rate,
                                               vtnl_warp=1.0)
                ali = np.array(ali)
                if args.reverse_labels:
                    ali = ali[::-1]
                # optional SOS/EOS markers (negative values disable them);
                # note: appended after the optional reversal above
                if args.SOS >= 0:
                    ali = np.concatenate(([args.SOS], ali))
                if args.EOS >= 0:
                    ali = np.concatenate((ali, [args.EOS]))
                feats = _matrix_ext.matrix_to_numpy(feats)
                # frame count after striding, rounded up
                utt_len = feats.shape[0] // args.stride + \
                    int(feats.shape[0] % args.stride != 0)
                #limits on T*U products due to RNNT.
                #this is pretty hacky now
                if ali.shape[0] * utt_len // 3 <= args.TU_limit:
                    ali_len[valid_idx] = ali.shape[0]
                    # splice left/right context frames, then subsample
                    data_buffer[valid_idx, :utt_len, :] = \
                        splice(feats, args.lctx, args.rctx)[::args.stride]
                    target_buffer[valid_idx, :ali_len[valid_idx]] = ali
                    len_buffer[valid_idx] = utt_len
                    if utt_len > batch_max_len:
                        batch_max_len = utt_len
                    if ali_len[valid_idx] > target_max_len:
                        target_max_len = ali_len[valid_idx]
                    valid_idx += 1
                batch_idx += 1
                if batch_idx == batch_size:
                    for b in range(valid_idx):
                        utt_len = len_buffer[b]
                        target_len = ali_len[b]
                        #data and target padding: features padded by
                        #repeating the last frame, targets by padding_tgt
                        if utt_len > 0:
                            data_buffer[b, utt_len:batch_max_len, :] = \
                                data_buffer[b, utt_len-1, :]
                            target_buffer[b, target_len:target_max_len] = \
                                args.padding_tgt
                    data = data_buffer[:valid_idx, :batch_max_len, :]
                    target = target_buffer[:valid_idx, :target_max_len]
                    if not args.batch_first:
                        # (B, T, D) -> (T, B, D) / (B, U) -> (U, B)
                        data = np.transpose(data, (1, 0, 2))
                        target = np.transpose(target, (1, 0))
                    # np.copy detaches the yielded tensors from the
                    # reused buffers before they are overwritten
                    data = torch.from_numpy(np.copy(data))
                    target = torch.from_numpy(np.copy(target))
                    lens = torch.from_numpy(np.copy(len_buffer[:valid_idx]))
                    ali_lens = torch.from_numpy(np.copy(ali_len[:valid_idx]))
                    if valid_idx > 0:
                        #not doing cuda() here, in main process instead
                        yield data, target, lens, ali_lens
                    else:
                        # whole batch rejected by the TU limit
                        yield None, None, \
                            torch.IntTensor([0]), torch.IntTensor([0])
                    # reset per-batch state
                    batch_idx = 0
                    valid_idx = 0
                    target_len = 0
                    batch_max_len = -1
                    target_max_len = -1
            ali_reader.close()
    # exhaustion sentinel for the consumer
    yield None
help='sample rate of waves') parser.add_argument('--feat_config', type=str, default=None, help='feature extraction config file') parser.add_argument('--feat_dim', type=int, default=80, help='feature dimension') args, unk = parser.parse_known_args() po = ParseOptions('') fbank_opt = FbankOptions() fbank_opt.register(po) po.read_config_file(args.feat_config) fbank = Fbank(fbank_opt) speed_rate = [0.9, 1.0, 1.1] cmvn = Cmvn(args.feat_dim) with open(args.data_lst, 'r', encoding='utf-8') as data_lst_f: for line in data_lst_f: mrk_fn = line.split()[0] seq_fn = line.split()[1] with open(mrk_fn, 'r', encoding='utf-8') as mrk, \ open(seq_fn, 'rb') as seq: for mrk_line in mrk: seq.seek(int(mrk_line.split()[1])) num_bytes = int(mrk_line.split()[2]) #this is making sure even number of bytes num_bytes -= num_bytes % 2 audio_bytes = seq.read(num_bytes)