import sys
import argparse
import numpy as np
from kaldi.util.table import SequentialWaveReader
from kaldi.matrix import Matrix, _matrix_ext

if __name__ == '__main__':
    #Tool: scan a kaldi wav rspecifier and emit one "uttid num_bytes" line
    #per utterance, where num_bytes is the size of the raw 16-bit mono PCM.
    parser = argparse.ArgumentParser(
        description='wav.scp to byte files, i.e., '
                    'each line: uttid num_bytes')
    parser.add_argument('wav_rspecifier', type=str,
                        help='input wav.scp filename')
    #fixed: help text previously duplicated the wav_rspecifier description
    parser.add_argument('byte_file', type=str,
                        help='output file, one "uttid num_bytes" line per utt')
    #parse_known_args so extra kaldi-style options are tolerated
    args, unk = parser.parse_known_args()

    wav_reader = SequentialWaveReader(args.wav_rspecifier)
    with open(args.byte_file, 'w') as bf:
        for uttid, wave in wav_reader:
            #wave.data() is (num_channels, num_samples)
            wave_data = _matrix_ext.matrix_to_numpy(wave.data())
            #has to be one channel
            assert wave_data.shape[0] == 1
            #2 bytes per 16-bit sample; len() is unchanged by astype, so the
            #previous astype('int16') copy was pure waste
            num_bytes = 2 * wave_data.shape[1]
            bf.write('{} {}\n'.format(uttid, num_bytes))
def otf_utt_generator(data_triplets, rir, noise, args):
    """On-the-fly training-batch generator with data augmentation.

    Reads raw 16-bit audio from (mrk, seq) file pairs, applies speed and
    gain perturbation, extracts fbank features, and yields padded batches.

    Args:
        data_triplets: list of (mrk_fn, seq_fn, ali_rspec) triplets; mrk lines
            are "uttid offset num_bytes" into the raw seq file
        rir: list of room impulse responses, List[AudioSegment]
            (unused here except in the commented-out augmentation example)
        noise: list of noise segments, List[AudioSegment]
            (likewise only used by the commented-out example)
        args: argument namespace for the loader (max_len, batch_size,
            speed_rate, gain_range, sample_rate, stride, lctx/rctx,
            TU_limit, SOS/EOS, reverse_labels, batch_first, padding_tgt, ...)

    Yields:
        (data, target, lens, ali_lens) per full batch of ``batch_size``
        utterances (None placeholders when no utterance in the batch passed
        the TU limit), and a final ``None`` sentinel at end of data.
        NOTE(review): utterances left over in a final partial batch are
        dropped — only ``batch_idx == batch_size`` triggers a yield.
    """
    max_len = args.max_len
    batch_size = args.batch_size
    #reusable batch buffers; only the [:valid_idx, :batch_max_len] slice of
    #each is meaningful at yield time
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)
    #batch_idx counts all utterances seen in the current batch window;
    #valid_idx counts only those that passed the TU_limit filter
    batch_idx = 0
    valid_idx = 0
    target_len = 0
    batch_max_len = -1
    target_max_len = -1
    #rates for speed perturbation, e.g. "0.9,1.0,1.1"
    speed_rate = [float(rate) for rate in args.speed_rate.split(',')]
    #volume level perturbation; note the sign flip: gain_range values are
    #negated, so e.g. "55,10" becomes the dB range [-55, -10]
    gain_lo, gain_hi = [-float(gain) for gain in args.gain_range.split(',')]
    #snr range for noise perturbation: 0-20db with mean of 10
    #mu, sigma = 10, 10
    #lo, hi = (0 - mu) / sigma, (20 - mu) / sigma
    #Fbank config, read from the kaldi-style feature config file
    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    #fbank_opt = MfccOptions()
    #fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    #fbank = Mfcc(fbank_opt)
    for data_triplet in data_triplets:
        mrk_fn, seq_fn = data_triplet[0], data_triplet[1]
        ali_rspec = data_triplet[2]
        with open(mrk_fn, 'r', encoding='utf-8') as mrk,\
                open(seq_fn, 'rb') as seq:
            ali_reader = SequentialIntVectorReader(ali_rspec)
            #mrk lines and the alignment reader are consumed in lockstep;
            #the uttid assert below guards against any misordering
            for line, (uttid1, ali) in zip(mrk, ali_reader):
                uttid = line.split()[0]
                assert uttid == uttid1
                #seek to this utterance's byte offset in the raw seq file
                seq.seek(int(line.split()[1]))
                num_bytes = int(line.split()[2])
                #force an even byte count so the buffer maps to int16 samples
                num_bytes -= num_bytes % 2
                audio_bytes = seq.read(num_bytes)
                audio_np = np.frombuffer(audio_bytes, dtype='int16')
                #data augmentation function goes here
                audio_seg = AudioSegment(audio_np, args.sample_rate)
                #speed perturbation: pick one of the configured rates
                spr = speed_rate[randint(0, len(speed_rate) - 1)]
                audio_seg.change_speed(spr)
                #random volume normalization within [gain_lo, gain_hi] dB
                audio_seg.normalize(np.random.uniform(gain_lo, gain_hi))
                #noise adding example:
                #snr = truncnorm.rvs(lo, hi, scale=sigma, loc=mu, size=1)
                #audio_seg.add_noise(noise[randint(0, len(noise)-1)], snr)
                #rir adding example:
                #audio_seg.convolve_and_normalize(rir[randint(0, len(rir)-1)])
                #back to int16 sample values for kaldi feature extraction
                audio_np = audio_seg._convert_samples_from_float32(
                    audio_seg.samples, 'int16')
                wave_1ch = Vector(audio_np)
                feats = fbank.compute_features(wave_1ch, args.sample_rate,
                                               vtnl_warp=1.0)
                ali = np.array(ali)
                if args.reverse_labels:
                    ali = ali[::-1]
                #negative SOS/EOS disables the corresponding token
                if args.SOS >= 0:
                    ali = np.concatenate(([args.SOS], ali))
                if args.EOS >= 0:
                    ali = np.concatenate((ali, [args.EOS]))
                feats = _matrix_ext.matrix_to_numpy(feats)
                #frame count after striding, rounded up
                utt_len = feats.shape[0] // args.stride + \
                    int(feats.shape[0] % args.stride != 0)
                #limits on T*U products due to RNNT.
                #this is pretty hacky now
                if ali.shape[0] * utt_len // 3 <= args.TU_limit:
                    ali_len[valid_idx] = ali.shape[0]
                    data_buffer[valid_idx, :utt_len, :] = \
                        splice(feats, args.lctx, args.rctx)[::args.stride]
                    target_buffer[valid_idx, :ali_len[valid_idx]] = ali
                    len_buffer[valid_idx] = utt_len
                    if utt_len > batch_max_len:
                        batch_max_len = utt_len
                    if ali_len[valid_idx] > target_max_len:
                        target_max_len = ali_len[valid_idx]
                    valid_idx += 1
                batch_idx += 1
                if batch_idx == batch_size:
                    for b in range(valid_idx):
                        utt_len = len_buffer[b]
                        target_len = ali_len[b]
                        #data and target padding: repeat the last frame for
                        #features, pad targets with args.padding_tgt
                        if utt_len > 0:
                            data_buffer[b, utt_len:batch_max_len, :] = \
                                data_buffer[b, utt_len-1, :]
                            target_buffer[b, target_len:target_max_len] = \
                                args.padding_tgt
                    data = data_buffer[:valid_idx, :batch_max_len, :]
                    target = target_buffer[:valid_idx, :target_max_len]
                    if not args.batch_first:
                        #time-major layout: (T, B, D) / (U, B)
                        data = np.transpose(data, (1, 0, 2))
                        target = np.transpose(target, (1, 0))
                    #np.copy detaches from the reused buffers before handing
                    #the tensors out of the generator
                    data = torch.from_numpy(np.copy(data))
                    target = torch.from_numpy(np.copy(target))
                    lens = torch.from_numpy(np.copy(len_buffer[:valid_idx]))
                    ali_lens = torch.from_numpy(np.copy(ali_len[:valid_idx]))
                    if valid_idx > 0:
                        #not doing cuda() here, in main process instead
                        yield data, target, lens, ali_lens
                    else:
                        #whole batch was filtered out by the TU limit
                        yield None, None, \
                            torch.IntTensor([0]), torch.IntTensor([0])
                    #reset per-batch state
                    batch_idx = 0
                    valid_idx = 0
                    target_len = 0
                    batch_max_len = -1
                    target_max_len = -1
            ali_reader.close()
    #end-of-data sentinel for the consumer
    yield None
def ctc_utt_generator(align_rspec, feats_rspec, shuffle, args):
    """Batch generator for CTC training from precomputed features.

    we do not really need 'target' generated in MMI/sMBR training from this
    generator so the interface is adjusted to fullfill warp_ctc: for CTC
    training, target is now a tuple of (label, label_size).

    Args:
        align_rspec: kaldi-style rspecifier for the label sequences
        feats_rspec: kaldi-style rspecifier for the features
        shuffle: unused here
        args: loader arguments (max_len, batch_size, stride, lctx/rctx,
            cuda, local_rank, batch_first, ...)

    Yields:
        (Variable(data), (Variable(target), ali_lens), lens, start_flag) per
        full batch, then a final ``None`` sentinel. Targets are concatenated
        into one flat 1-D vector (warp_ctc style), not padded per utterance.
        NOTE(review): a trailing partial batch is dropped — yields happen
        only when ``batch_idx == batch_size``.
    """
    ali_reader = SequentialIntVectorReader(align_rspec)
    feats_reader = SequentialMatrixReader(feats_rspec)
    max_len = args.max_len
    batch_size = args.batch_size
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    #flat label buffer: labels of all utts in the batch, back to back
    target_buffer = np.zeros((batch_size * max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)
    start_flag = torch.IntTensor([1] * batch_size)
    if args.cuda:
        start_flag = start_flag.cuda(args.local_rank)
    batch_idx = 0
    target_len = 0
    batch_max_len = -1
    #!!!make sure feature and ali
    #!!!has exact the same order
    for (uttid, ali), (uttid2, feats) in zip(ali_reader, feats_reader):
        assert uttid2 == uttid
        ali = np.array(ali)
        feats = _matrix_ext.matrix_to_numpy(feats)
        #in CTC training, the ali is shorter
        utt_len = feats.shape[0] // args.stride + \
            int(feats.shape[0] % args.stride != 0)
        assert ali.shape[0] <= utt_len
        ali_len[batch_idx] = ali.shape[0]
        data_buffer[batch_idx, :utt_len, :] = \
            splice(feats, args.lctx, args.rctx)[::args.stride]
        #append this utt's labels to the flat target buffer
        target_buffer[target_len:target_len + ali_len[batch_idx]] = ali
        target_len += ali_len[batch_idx]
        len_buffer[batch_idx] = utt_len
        if utt_len > batch_max_len:
            batch_max_len = utt_len
        batch_idx += 1
        if batch_idx == batch_size:
            #zero-pad features beyond each utterance's true length
            for b in range(batch_size):
                utt_len = len_buffer[b]
                data_buffer[b, utt_len:batch_max_len, :] = 0
                #target_buffer[b, ali_len[b]:batch_max_len] = -1
            data = data_buffer[:, :batch_max_len, :]
            target = target_buffer[:target_len]
            if not args.batch_first:
                #time-major layout: (T, B, D); the flat target is 1-D and
                #needs no transpose
                data = np.transpose(data, (1, 0, 2))
                #target = np.transpose(target, (1, 0))
            #detach from the reused buffers before yielding
            data = np.copy(data)
            target = np.copy(target)
            lens = np.copy(len_buffer)
            ali_lens = np.copy(ali_len)
            data = torch.from_numpy(data)
            target = torch.from_numpy(target)
            if args.cuda:
                #target deliberately stays on CPU — presumably because
                #warp_ctc consumes labels on the host; confirm against caller
                data, target = data.cuda(args.local_rank), target
            yield Variable(data), (Variable(target), ali_lens), lens, \
                start_flag
            #reset per-batch state
            batch_idx = 0
            target_len = 0
            batch_max_len = -1
    #end-of-data sentinel for the consumer
    yield None
#NOTE(review): fragment — the enclosing loop binding `line` (and the defs of
#`args`, `speed_rate`, `fbank`, `cmvn`) is outside this view. Per outer
#`line` ("mrk_fn seq_fn"), this pass re-reads raw audio, applies the same
#speed/gain perturbation as the training loader, computes fbank features,
#and accumulates them into CMVN statistics.
mrk_fn = line.split()[0]
seq_fn = line.split()[1]
with open(mrk_fn, 'r', encoding='utf-8') as mrk, \
        open(seq_fn, 'rb') as seq:
    #each mrk line is "uttid offset num_bytes" into the raw seq file
    for mrk_line in mrk:
        seq.seek(int(mrk_line.split()[1]))
        num_bytes = int(mrk_line.split()[2])
        #this is making sure even number of bytes (whole int16 samples)
        num_bytes -= num_bytes % 2
        audio_bytes = seq.read(num_bytes)
        audio_np = np.frombuffer(audio_bytes, dtype='int16')
        audio_seg = AudioSegment(audio_np, args.sample_rate)
        #speed perturbation: pick one of the configured rates
        spr = speed_rate[randint(0, len(speed_rate) - 1)]
        audio_seg.change_speed(spr)
        #-55 to -10 db volume normalization
        audio_seg.normalize(np.random.uniform(-55, -10))
        audio_np = audio_seg._convert_samples_from_float32(
            audio_seg.samples, 'int16')
        wave_1ch = Vector(audio_np)
        feats = fbank.compute_features(wave_1ch, args.sample_rate,
                                       vtnl_warp=1.0)
        if args.cmn:
            #per-utterance cepstral mean subtraction before accumulation
            feats = _matrix_ext.matrix_to_numpy(feats)
            feats -= np.mean(feats, axis=0)
            feats = Matrix(feats)
        cmvn.accumulate(feats)
#NOTE(review): original nesting of this write is not recoverable from the
#mangled source; placed at the fragment's outermost visible level — confirm
#it runs after accumulation is complete, not once per utterance
cmvn.write_stats(args.cmvn_stats, binary=False)
def utt_generator(align_rspec, feats_rspec, shuffle, args):
    """Batch generator for framewise (cross-entropy style) training.

    Args:
        align_rspec: kaldi style read rspecifier for alignment
        feats_rspec: kaldi stule read rspecifier for feature
        shuffle: deprecated
        args: arguments (max_len, batch_size, stride, lctx/rctx, cuda,
            local_rank, batch_first, padding_tgt, ...)

    Yields:
        (Variable(data), Variable(target), lens, ali_lens) per full batch,
        then a final ``None`` sentinel. Unlike ctc_utt_generator, targets
        are padded per utterance (with args.padding_tgt), not flattened.
        NOTE(review): a trailing partial batch is dropped — yields happen
        only when ``batch_idx == batch_size``.
    """
    ali_reader = SequentialIntVectorReader(align_rspec)
    feats_reader = SequentialMatrixReader(feats_rspec)
    max_len = args.max_len
    batch_size = args.batch_size
    #reusable batch buffers; stale rows from earlier batches are safe
    #because every yield fills all batch_size rows first
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)
    start_flag = torch.IntTensor([1] * batch_size)
    if args.cuda:
        start_flag = start_flag.cuda(args.local_rank)
    batch_idx = 0
    target_len = 0
    batch_max_len = -1
    target_max_len = -1
    #feature and alignment streams must be in the same utterance order;
    #the assert below guards against misordering
    for (uttid, ali), (uttid2, feats) in zip(ali_reader, feats_reader):
        assert uttid2 == uttid
        ali = np.array(ali)
        feats = _matrix_ext.matrix_to_numpy(feats)
        #frame count after striding, rounded up
        utt_len = feats.shape[0] // args.stride + int(
            feats.shape[0] % args.stride != 0)
        #ali/targets should be shorter
        #assert ali.shape[0] <= utt_len
        ali_len[batch_idx] = ali.shape[0]
        data_buffer[batch_idx, :utt_len, :] = \
            splice(feats, args.lctx, args.rctx)[::args.stride]
        target_buffer[batch_idx, :ali_len[batch_idx]] = ali
        #target_len += ali_len[batch_idx]
        len_buffer[batch_idx] = utt_len
        if utt_len > batch_max_len:
            batch_max_len = utt_len
        if ali_len[batch_idx] > target_max_len:
            target_max_len = ali_len[batch_idx]
        batch_idx += 1
        if batch_idx == batch_size:
            for b in range(batch_size):
                utt_len = len_buffer[b]
                target_len = ali_len[b]
                #data and target padding: repeat the last frame for features,
                #pad targets with args.padding_tgt
                data_buffer[b, utt_len:batch_max_len, :] = \
                    data_buffer[b, utt_len-1, :]
                target_buffer[b, target_len:target_max_len] = args.padding_tgt
            data = data_buffer[:, :batch_max_len, :]
            target = target_buffer[:, :target_max_len]
            if not args.batch_first:
                #time-major layout: (T, B, D) / (U, B)
                data = np.transpose(data, (1, 0, 2))
                target = np.transpose(target, (1, 0))
            #detach from the reused buffers before yielding
            data = np.copy(data)
            target = np.copy(target)
            lens = np.copy(len_buffer)
            ali_lens = np.copy(ali_len)
            data = torch.from_numpy(data)
            #.long() since loss functions index with int64 targets here
            target = torch.from_numpy(target).long()
            if args.cuda:
                data = data.cuda(args.local_rank)
                target = target.cuda(args.local_rank)
            yield Variable(data), Variable(target), lens, ali_lens
            #reset per-batch state
            batch_idx = 0
            target_len = 0
            batch_max_len = -1
            target_max_len = -1
    #end-of-data sentinel for the consumer
    yield None