def compute_fbank_pitch():
    """Compute FbankPitch features for every utterance and write them out.

    Command-line entry point. Options are taken from ``get_parser()``.
    Waveforms are read from ``args.rspecifier`` (optionally restricted by
    ``args.segments``), passed through an ``FbankPitch`` instance, evaluated
    in a TensorFlow session and stored under the utterance id in
    ``args.wspecifier``.
    """
    parser = get_parser()
    args = parser.parse_args()

    config = {
        'sample_rate': float(args.sample_rate),
        'upper_frequency_limit': float(args.upper_frequency_limit),
        'lower_frequency_limit': float(args.lower_frequency_limit),
        'filterbank_channel_count': float(args.filterbank_channel_count),
        'window_length': args.window_length,
        'frame_length': args.frame_length,
        'thres_autoc': args.thres_autoc,
        'output_type': args.output_type,
    }
    fbank_pitch = FbankPitch.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier,
                        write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            # The rate reported by the reader wins over the command-line
            # value; the override persists for the following utterances.
            # Note that ``config`` was built before the loop and is not
            # updated here.
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            fbank_pitch_test = fbank_pitch(audio_data, args.sample_rate)
            # One session per utterance, closed as soon as the features are
            # evaluated so sessions do not pile up over a long input list.
            with tf.compat.v1.Session() as sess:
                fbank_pitch_feats = fbank_pitch_test.eval(session=sess)
            writer[utt_id] = fbank_pitch_feats
def compute_spectrum():
    """Compute Spectrum features for every utterance and write them out.

    Command-line entry point. Options are taken from ``get_parser()``.
    Waveforms are read from ``args.rspecifier`` (optionally restricted by
    ``args.segments``), passed through a ``Spectrum`` instance, evaluated in
    a TensorFlow session and stored under the utterance id in
    ``args.wspecifier``.
    """
    parser = get_parser()
    args = parser.parse_args()

    config = {
        'sample_rate': float(args.sample_rate),
        'output_type': int(args.output_type),
        'window_length': args.window_length,
        'frame_length': args.frame_length,
    }
    spectrum = Spectrum.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier,
                        write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            # The rate reported by the reader wins over the command-line
            # value; the override persists for the following utterances.
            # Note that ``config`` was built before the loop and is not
            # updated here.
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            spectrum_test = spectrum(audio_data, args.sample_rate)
            # One session per utterance, closed as soon as the features are
            # evaluated so sessions do not pile up over a long input list.
            with tf.compat.v1.Session() as sess:
                spectrum_feats = spectrum_test.eval(session=sess)
            writer[utt_id] = spectrum_feats
def compute_mfcc():
    """Compute Mfcc features for every utterance and write them out.

    Command-line entry point. Options are taken from ``get_parser()``.
    Waveforms are read from ``args.rspecifier`` (optionally restricted by
    ``args.segments``), passed through an ``Mfcc`` instance, squeezed with
    ``tf.squeeze``, evaluated in a TensorFlow session and stored under the
    utterance id in ``args.wspecifier``.
    """
    parser = get_parser()
    args = parser.parse_args()

    config = {
        'sample_rate': int(args.sample_rate),
        'upper_frequency_limit': float(args.upper_frequency_limit),
        'lower_frequency_limit': float(args.lower_frequency_limit),
        'filterbank_channel_count': float(args.filterbank_channel_count),
        'window_length': args.window_length,
        'frame_length': args.frame_length,
        'output_type': args.output_type,
        'window_type': args.window_type,
        'snip_edges': args.snip_edges,
        'preeph_coeff': args.preeph_coeff,
        'remove_dc_offset': args.remove_dc_offset,
        'is_fbank': args.is_fbank,
        'cepstral_lifter': args.cepstral_lifter,
        'coefficient_count': args.coefficient_count,
    }
    mfcc = Mfcc.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier,
                        write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            # The rate reported by the reader wins over the command-line
            # value; the override persists for the following utterances.
            # Note that ``config`` was built before the loop and is not
            # updated here.
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            mfcc_test = tf.squeeze(mfcc(audio_data, args.sample_rate))
            # One session per utterance, closed as soon as the features are
            # evaluated so sessions do not pile up over a long input list.
            with tf.compat.v1.Session() as sess:
                mfcc_feats = mfcc_test.eval(session=sess)
            writer[utt_id] = mfcc_feats
def compute_stft():
    """Compute an STFT-derived spectrum for every utterance and write it out.

    Command-line entry point. Options are taken from ``get_parser()``.
    Waveforms are read from ``args.rspecifier`` (optionally restricted by
    ``args.segments``) and passed through an ``Analyfiltbank`` instance,
    which returns a power spectrum and a phase spectrum. When
    ``args.output_type == 1`` the power spectrum is written to
    ``args.wspecifier``; for any other value the phase spectrum is written.
    """
    parser = get_parser()
    args = parser.parse_args()

    config = {
        'sample_rate': int(args.sample_rate),
        'window_length': args.window_length,
        'frame_length': args.frame_length,
    }
    stft = Analyfiltbank.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier,
                        write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            # The rate reported by the reader wins over the command-line
            # value; the override persists for the following utterances.
            # Note that ``config`` was built before the loop and is not
            # updated here.
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            power_spectrum, phase_spectrum = stft(audio_data, args.sample_rate)
            # Only the requested tensor is evaluated.
            if args.output_type == 1:
                selected = power_spectrum
            else:
                selected = phase_spectrum
            # One session per utterance, closed as soon as the features are
            # evaluated so sessions do not pile up over a long input list.
            with tf.compat.v1.Session() as sess:
                out_feats = selected.eval(session=sess)
            writer[utt_id] = out_feats
def apply_cmvn():
    """Apply CMVN statistics to every feature matrix and write the result.

    Command-line entry point. Options are taken from ``get_parser()``.

    The statistics source is chosen from
    ``args.stats_rspecifier_or_rxfilename``:

    * if it contains ``':'`` it is read with ``KaldiReader`` into a dict and
      the filetype passed to ``CMVN`` is ``'ark'``;
    * otherwise it is loaded with ``kaldiio.load_mat`` as a single matrix,
      stored under the key ``None``, and the filetype is ``'mat'``.

    Each matrix read from ``args.rspecifier`` is transformed by
    ``cmvn.apply_cmvn(mat, utt)`` and written to ``args.wspecifier`` under
    the same utterance id.
    """
    args = get_parser().parse_args()

    if ':' in args.stats_rspecifier_or_rxfilename:
        stats_filetype = 'ark'
        stats_dict = dict(KaldiReader(args.stats_rspecifier_or_rxfilename))
    else:
        stats_filetype = 'mat'
        stats = kaldiio.load_mat(args.stats_rspecifier_or_rxfilename)
        # A single matrix has no key of its own, so it is filed under None.
        stats_dict = {None: stats}

    config = {
        'norm_means': args.norm_means,
        'norm_vars': args.norm_vars,
        'utt2spk': args.utt2spk,
        'spk2utt': args.spk2utt,
        'reverse': args.reverse,
        'std_floor': args.std_floor,
        'filetype': stats_filetype,
    }

    cmvn = CMVN.params(config).instantiate()
    cmvn.call(stats_dict)

    with KaldiWriter(args.wspecifier,
                     write_num_frames=args.write_num_frames,
                     compress=args.compress,
                     compression_method=args.compression_method) as writer, \
            kaldiio.ReadHelper(args.rspecifier) as reader:
        for utt, mat in reader:
            mat_new = cmvn.apply_cmvn(mat, utt)
            writer[utt] = mat_new
def compute_pitch():
    """Compute Pitch features for every utterance and write them out.

    Command-line entry point. Options are taken from ``get_parser()``.
    Waveforms are read from ``args.rspecifier`` (optionally restricted by
    ``args.segments``), passed through a ``Pitch`` instance, squeezed with
    ``tf.squeeze``, evaluated in a TensorFlow session and stored under the
    utterance id in ``args.wspecifier``.
    """
    parser = get_parser()
    args = parser.parse_args()

    config = {
        'sample_rate': int(args.sample_rate),
        'window_length': args.window_length,
        'frame_length': args.frame_length,
        'snip_edges': args.snip_edges,
        'preemph_coeff': args.preemph_coeff,
        'min_f0': args.min_f0,
        'max_f0': args.max_f0,
        'soft_min_f0': args.soft_min_f0,
        'penalty_factor': args.penalty_factor,
        'lowpass_cutoff': args.lowpass_cutoff,
        'resample_freq': args.resample_freq,
        'delta_pitch': args.delta_pitch,
        'nccf_ballast': args.nccf_ballast,
        'lowpass_filter_width': args.lowpass_filter_width,
        'upsample_filter_width': args.upsample_filter_width,
        'max_frames_latency': args.max_frames_latency,
        'frames_per_chunk': args.frames_per_chunk,
        'simulate_first_pass_online': args.simulate_first_pass_online,
        'recompute_frame': args.recompute_frame,
        'nccf_ballast_online': args.nccf_ballast_online,
    }
    pitch = Pitch.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier,
                        write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            # The rate reported by the reader wins over the command-line
            # value; the override persists for the following utterances.
            # Note that ``config`` was built before the loop and is not
            # updated here.
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            pitch_test = tf.squeeze(pitch(audio_data, args.sample_rate))
            # One session per utterance, closed as soon as the features are
            # evaluated so sessions do not pile up over a long input list.
            with tf.compat.v1.Session() as sess:
                pitch_feats = pitch_test.eval(session=sess)
            writer[utt_id] = pitch_feats
def compute_cmvn_stats():
    """Accumulate CMVN statistics from feature matrices and save them.

    e.g. compute_cmvn_stats.py scp:data/train/feats.scp data/train/cmvn.ark
    # compute global cmvn

    Command-line entry point. Options are taken from ``get_parser()``.
    The grouping of utterances depends on the output argument:

    * ``args.wspecifier_or_wxfilename`` contains ``':'`` and ``args.spk2utt``
      is given: statistics are accumulated per speaker, using the
      ``spk2utt`` file (one ``<spk> <utt> <utt> ...`` entry per line).
    * it contains ``':'`` and ``args.spk2utt`` is ``None``: statistics are
      accumulated per utterance.
    * it contains no ``':'``: all utterances share the single key ``None``
      and one matrix is saved with ``kaldiio.save_mat``.

    For each key the saved array has two rows. Row 0 holds the feature sums
    followed by the frame count; row 1 holds the sums of squares followed
    by ``0``.

    Raises:
        AssertionError: if ``args.rspecifier`` yields no utterances.
        KeyError: in per-speaker mode, if an utterance is missing from the
            ``spk2utt`` file.
    """
    args = get_parser().parse_args()

    is_wspecifier = ':' in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    spk, utts = line.rstrip().split(None, 1)
                    for utt in utts.split():
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]
        else:
            logging.info('Performing as utterance CMVN mode')

            def utt2spk(x):
                return x
    else:
        logging.info('Performing as gloabl CMVN model')
        if args.spk2utt is not None:
            logging.warning('spk2utt is not used for global CMVN mode')

        def utt2spk(x):
            return None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    idx = 0
    for idx, (utt, matrix) in enumerate(KaldiReader(args.rspecifier), 1):
        spk = utt2spk(utt)

        # Initialise the accumulators the first time a key is seen.
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # np.float64 rather than the removed ``np.float`` alias; it is
            # the same dtype and matches the output array below.
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix**2).sum(axis=0)

    # idx stays 0 only when the reader produced nothing.
    assert idx > 0, idx

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        # One extra slot on the first feature axis holds the frame count.
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.

        cmvn_stats[spk] = _cmvn_stats

    if is_wspecifier:
        with KaldiWriter(args.wspecifier_or_wxfilename) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat
    else:
        matrix = cmvn_stats[None]
        kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)