def load_model(config_file, online_config, models_path='models/',
               beam_size=10, frames_per_chunk=50):
    # Read YAML file
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']
    print(decoder_yaml_opts)

    feat_opts = OnlineNnetFeaturePipelineConfig()
    endpoint_opts = OnlineEndpointConfig()

    if not os.path.isfile(online_config):
        print(online_config + ' does not exist. Trying to create it from yaml file settings.')
        print('See also online_config_options.info.txt for possible settings.')
        with open(online_config, 'w') as online_config_file:
            online_config_file.write("--add_pitch=False\n")
            online_config_file.write("--mfcc_config=" + models_path +
                                     decoder_yaml_opts['mfcc-config'] + "\n")
            online_config_file.write("--feature_type=mfcc\n")
            online_config_file.write("--ivector_extraction_config=" + models_path +
                                     decoder_yaml_opts['ivector-extraction-config'] + '\n')
            online_config_file.write("--endpoint.silence-phones=" +
                                     decoder_yaml_opts['endpoint-silence-phones'] + '\n')
    else:
        print("Loading online conf from:", online_config)

    po = ParseOptions("")
    feat_opts.register(po)
    endpoint_opts.register(po)
    po.read_config_file(online_config)
    feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = beam_size
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleLoopedComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = frames_per_chunk
    asr = NnetLatticeFasterOnlineRecognizer.from_files(
        models_path + decoder_yaml_opts["model"],
        models_path + decoder_yaml_opts["fst"],
        models_path + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts,
        endpoint_opts=endpoint_opts)

    return asr, feat_info, decodable_opts
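# Hedged usage sketch for load_model: 'models/model.yaml' and
# 'models/online.conf' are assumed example paths, not files shipped with this
# snippet. The YAML must provide a 'decoder' section with the keys referenced
# above (model, fst, word-syms, mfcc-config, ...).
if __name__ == '__main__':
    asr, feat_info, decodable_opts = load_model(
        'models/model.yaml', 'models/online.conf',
        models_path='models/', beam_size=13, frames_per_chunk=150)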
def LoadModels(self):
    try:
        # Define online feature pipeline
        po = ParseOptions("")

        decoder_opts = LatticeFasterDecoderOptions()
        self.endpoint_opts = OnlineEndpointConfig()
        self.decodable_opts = NnetSimpleLoopedComputationOptions()
        feat_opts = OnlineNnetFeaturePipelineConfig()

        decoder_opts.register(po)
        self.endpoint_opts.register(po)
        self.decodable_opts.register(po)
        feat_opts.register(po)
        po.read_config_file(self.CONFIG_FILES_PATH + "/online.conf")
        self.feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

        # Set metadata parameters
        self.samp_freq = self.feat_info.mfcc_opts.frame_opts.samp_freq
        self.frame_shift = self.feat_info.mfcc_opts.frame_opts.frame_shift_ms / 1000
        self.acwt = self.decodable_opts.acoustic_scale

        # Load acoustic and graph models and other files
        self.transition_model, self.acoustic_model = NnetRecognizer.read_model(
            self.AM_PATH + "/final.mdl")
        graph = _fst.read_fst_kaldi(self.LM_PATH + "/HCLG.fst")
        self.decoder_graph = LatticeFasterOnlineDecoder(graph, decoder_opts)
        self.symbols = _fst.SymbolTable.read_text(self.LM_PATH + "/words.txt")
        self.info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                               self.LM_PATH + "/word_boundary.int")
        self.asr = NnetLatticeFasterOnlineRecognizer(
            self.transition_model, self.acoustic_model, self.decoder_graph,
            self.symbols, decodable_opts=self.decodable_opts,
            endpoint_opts=self.endpoint_opts)
        del graph, decoder_opts
    except Exception as e:
        self.log.error(e)
        raise ValueError("AM and LM loading failed! (see logs for more details)")
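# Hedged sketch: LoadModels above is a method of a larger class defined
# elsewhere. The attributes it assumes on its host object are, at minimum:
#
#   self.CONFIG_FILES_PATH  # directory containing online.conf
#   self.AM_PATH            # directory containing final.mdl
#   self.LM_PATH            # directory containing HCLG.fst, words.txt,
#                           #   word_boundary.int
#   self.log                # a logging.Logger instance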
def extract_spec(filename,
                 samp_freq,
                 frame_length_ms=25,
                 frame_shift_ms=10,
                 round_to_power_of_two=True,
                 snip_edges=True):
    """Extract spectrogram using Kaldi.

    Args:
        filename: wav file path
        samp_freq: sample frequency

    Returns:
        spectrogram: (frames, freq)
    """
    # get rspec and wspec
    with open('wav.scp', 'w') as f:
        f.write('test1 ' + filename + '\n')
    rspec = 'scp,p:' + 'wav.scp'
    wspec = 'ark,t:' + 'spec.ark'

    # set po
    usage = """Extract spectrogram features.

    Usage: example.py [opts...] <rspec> <wspec>
    """
    po = ParseOptions(usage)
    po.register_float("min-duration", 0.0, "minimum segment duration")
    opts = po.parse_args()

    # set options
    spec_opts = SpectrogramOptions()
    spec_opts.frame_opts.samp_freq = samp_freq
    spec_opts.frame_opts.frame_length_ms = frame_length_ms
    spec_opts.frame_opts.frame_shift_ms = frame_shift_ms
    spec_opts.frame_opts.round_to_power_of_two = round_to_power_of_two
    spec_opts.frame_opts.snip_edges = snip_edges
    spec_opts.register(po)

    spec = Spectrogram(spec_opts)
    sf = spec_opts.frame_opts.samp_freq
    with SequentialWaveReader(rspec) as reader, MatrixWriter(wspec) as writer:
        for key, wav in reader:
            if wav.duration < opts.min_duration:
                continue
            assert wav.samp_freq >= sf
            assert wav.samp_freq % sf == 0
            s = wav.data()
            s = s[:, ::int(wav.samp_freq / sf)]
            m = SubVector(mean(s, axis=0))
            f = spec.compute_features(m, sf, 1.0)
            f_array = np.array(f)
            writer[key] = f
    return f_array
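# Hedged usage sketch for extract_spec ('test.wav' is an assumed example
# file). Note the function also writes 'wav.scp' and 'spec.ark' into the
# current working directory as a side effect.
if __name__ == '__main__':
    spec = extract_spec('test.wav', samp_freq=16000)
    print(spec.shape)  # (frames, freq_bins)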
def compute_feat_KALDI(self, wav):
    try:
        po = ParseOptions("")
        mfcc_opts = MfccOptions()
        mfcc_opts.use_energy = False
        mfcc_opts.frame_opts.samp_freq = self.sr
        mfcc_opts.frame_opts.frame_length_ms = self.frame_length_s * 1000
        mfcc_opts.frame_opts.frame_shift_ms = self.frame_shift_s * 1000
        mfcc_opts.frame_opts.allow_downsample = False
        mfcc_opts.mel_opts.num_bins = self.num_bins
        mfcc_opts.mel_opts.low_freq = self.low_freq
        mfcc_opts.mel_opts.high_freq = self.high_freq
        mfcc_opts.num_ceps = self.num_ceps
        mfcc_opts.register(po)
        # Create MFCC object and obtain sample frequency
        mfccObj = Mfcc(mfcc_opts)
        mfccKaldi = mfccObj.compute_features(wav, self.sr, 1.0)
    except Exception as e:
        self.log.error(e)
        raise ValueError("Speaker diarization failed while extracting features!!!")
    else:
        return mfccKaldi
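# Hedged usage sketch for compute_feat_KALDI: the method expects a
# single-channel waveform sampled at self.sr. 'scp:wav.scp' and 'diarizer'
# are illustrative names, not part of the snippet above.
#
#   from kaldi.util.table import SequentialWaveReader
#   for key, wav in SequentialWaveReader('scp:wav.scp'):
#       feats = diarizer.compute_feat_KALDI(wav.data()[0])  # channel 0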
    ...
    Posterior-formatted posterior:
    <uttid> [[(0,0.1), (1,0.89), (5,0.01)],
             [(1,0.9), (5,0.1)],
             ...
             [(0,0.8), (1,0.2)]]
    ...

    Usage: feat-to-post.py [options] feature_rspecifier posteriors_wspecifier
    e.g. feat-to-post scp:feats.scp ark:post.ark
    """
    po = ParseOptions(usage)
    po.register_int("top-n", 10,
                    "only keep highest N posteriors per frame, 10 by default")
    po.register_bool("rescale", False,
                     "rescale top N posteriors so that they sum to 1, false by default")
    opts = po.parse_args()

    if po.num_args() != 2:
        po.print_usage()
        sys.exit()

    feature_rspecifier = po.get_arg(1)
    posterior_wspecifier = po.get_arg(2)
    isSuccess = feat_to_post(feature_rspecifier, posterior_wspecifier,
                             opts.top_n, opts.rescale)
    if not isSuccess:
print("Done {} out of {} utterances".format(num_success, num_utts), file=sys.stderr) if opts.vtln_map: vtln_map_reader.close() return num_success != 0 if __name__ == '__main__': usage = """Create MFCC feature files. Usage: compute-mfcc-feats [options...] <wav-rspecifier> <feats-wspecifier> """ po = ParseOptions(usage) mfcc_opts = MfccOptions() mfcc_opts.register(po) po.register_bool( "subtract-mean", False, "Subtract mean of each feature" "file [CMS]; not recommended to do it this way.") po.register_float( "vtln-warp", 1.0, "Vtln warp factor (only applicable " "if vtln-map not specified)") po.register_str( "vtln-map", "", "Map from utterance or speaker-id to " "vtln warp factor (rspecifier)") po.register_str( "utt2spk", "", "Utterance to speaker-id map rspecifier"
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
               "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)

    usage = """Convert features into posterior format, which is the generic
    format of NN training targets in Karel's nnet1 tools.
    (speed is not an issue for reasonably low NN-output dimensions)

    Usage: feat-to-post.py [options] feature_rspecifier posteriors_wspecifier
    e.g. feat-to-post scp:feats.scp ark:post.ark
    """
    po = ParseOptions(usage)
    po.register_int("top-n", 10, "N posteriors per frame, 10 by default")
    opts = po.parse_args()

    if po.num_args() != 2:
        po.print_usage()
        sys.exit()

    feature_rspecifier = po.get_arg(1)
    posterior_wspecifier = po.get_arg(2)
    isSuccess = feat_to_post(feature_rspecifier, posterior_wspecifier, opts.top_n)
    if not isSuccess:
        sys.exit()
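# Hedged sketch: feat_to_post itself is defined elsewhere in this script.
# The per-frame top-N selection it performs can be illustrated with plain
# numpy; 'topn_posteriors' is a hypothetical helper, not the real code.
import numpy as np

def topn_posteriors(frame, top_n):
    # indices of the top_n largest entries, largest first
    idx = np.argsort(frame)[::-1][:top_n]
    return [(int(i), float(frame[i])) for i in idx]

# Example: keep the 3 largest posteriors of one frame
assert topn_posteriors(np.array([0.1, 0.89, 0.0, 0.0, 0.0, 0.01]), 3) == \
    [(1, 0.89), (0, 0.1), (5, 0.01)]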
if __name__ == '__main__':
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, 'LOG')
    logging.basicConfig(
        format='%(levelname)s (%(module)s[{}]:%(funcName)s():'
               '%(filename)s:%(lineno)s) %(message)s'.format(__version__),
        level=logging.INFO)

    usage = """Extract segments from a large audio file in WAV format.

    Usage: extract-segments [options] <wav-rspecifier> <segments-file> <wav-wspecifier>
    """
    po = ParseOptions(usage)
    po.register_float(
        "min-segment-length", 0.1,
        "Minimum segment length in seconds (reject shorter segments)")
    po.register_float(
        "max_overshoot", 0.5,
        "End segments overshooting audio by less than this (in seconds) "
        "are truncated, else rejected.")
    opts = po.parse_args()

    if po.num_args() != 3:
        po.print_usage()
        sys.exit()

    wav_rspecifier = po.get_arg(1)
    segments_rxfilename = po.get_arg(2)
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleLoopedComputationOptions
from kaldi.online2 import (OnlineEndpointConfig,
                           OnlineIvectorExtractorAdaptationState,
                           OnlineNnetFeaturePipelineConfig,
                           OnlineNnetFeaturePipelineInfo,
                           OnlineNnetFeaturePipeline,
                           OnlineSilenceWeighting)
from kaldi.util.options import ParseOptions
from kaldi.util.table import SequentialWaveReader

chunk_size = 1440

# Define online feature pipeline
feat_opts = OnlineNnetFeaturePipelineConfig()
endpoint_opts = OnlineEndpointConfig()
po = ParseOptions("")
feat_opts.register(po)
endpoint_opts.register(po)
po.read_config_file("online.conf")
feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
asr = NnetLatticeFasterOnlineRecognizer.from_files(
    "final.mdl",
    print(
        "Done {} out of {} utterances".format(num_success, num_utts),
        file=sys.stderr,
    )

    return num_success != 0


if __name__ == "__main__":
    usage = """Compute VAD.

    Usage: compute-vad [options...] <wav-rspecifier> <feats-wspecifier>
    """
    po = ParseOptions(usage)
    po.register_float(
        "min-duration", 0.0,
        "Minimum duration of segments to process in seconds (default: 0.0).",
    )
    po.register_int(
        "channel", -1,
        "Channel to extract (-1 -> mono (default), 0 -> left, 1 -> right)",
    )
    po.register_int("frame-window", 25,
                    "Length of frame window in ms (default: 25)")
    po.register_int("frame-shift", 10,
                    "Length of frame shift in ms (default: 10)")
    side's stats). Reads a 'reco2file_and_channel' file, normally like
        sw02001-A sw02001 A
        sw02001-B sw02001 B
        sw02005-A sw02005 A
        sw02005-B sw02005 B
    interpreted as <utterance-id> <call-id> <side>, and for each <call-id>
    that has two sides, does the 'only-the-louder' computation; else does
    per-utterance stats in the normal way.

    Note: loudness is judged by the first feature component, either energy
    or c0, so this is only applicable to MFCCs or PLPs (this code could be
    modified to handle filterbanks).

    Usage: compute-cmvn-stats-two-channel [options] <reco2file-and-channel> <feats-rspecifier> <stats-wspecifier>
    e.g.: compute-cmvn-stats-two-channel data/train_unseg/reco2file_and_channel scp:data/train_unseg/feats.scp ark,t:-
    """
    po = ParseOptions(usage)
    po.register_float(
        "quieter_channel_weight", 0.01,
        "For the quieter channel, apply this weight to the stats, so that "
        "we still get stats if one channel always dominates.")

    opts = po.parse_args()

    if po.num_args() != 3:
        po.print_usage()
        sys.exit(1)

    reco2file_and_channel_rxfilename = po.get_arg(1)
    feats_rspecifier = po.get_arg(2)
    stats_wspecifier = po.get_arg(3)
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
               "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)

    usage = """Decode features using GMM-based model.

    Usage: gmm-decode-faster.py [options] model-in fst-in features-rspecifier
               words-wspecifier [alignments-wspecifier [lattice-wspecifier]]

    Note: lattices, if output, will just be linear sequences;
    use gmm-latgen-faster if you want "real" lattices.
    """
    po = ParseOptions(usage)
    decoder_opts = FasterDecoderOptions()
    decoder_opts.register(po, True)
    po.register_float("acoustic-scale", 0.1,
                      "Scaling factor for acoustic likelihoods")
    po.register_bool("allow-partial", True,
                     "Produce output even when final state was not reached")
    po.register_str("word-symbol-table", "",
                    "Symbol table for words [for debug output]")
    opts = po.parse_args()

    if po.num_args() < 4 or po.num_args() > 6:
        po.print_usage()
        sys.exit()

    model_rxfilename = po.get_arg(1)
help="apply cepstrum mean normalizaiton per utterance") parser.add_argument('--sample_rate', type=int, default=16000, help='sample rate of waves') parser.add_argument('--feat_config', type=str, default=None, help='feature extraction config file') parser.add_argument('--feat_dim', type=int, default=80, help='feature dimension') args, unk = parser.parse_known_args() po = ParseOptions('') fbank_opt = FbankOptions() fbank_opt.register(po) po.read_config_file(args.feat_config) fbank = Fbank(fbank_opt) speed_rate = [0.9, 1.0, 1.1] cmvn = Cmvn(args.feat_dim) with open(args.data_lst, 'r', encoding='utf-8') as data_lst_f: for line in data_lst_f: mrk_fn = line.split()[0] seq_fn = line.split()[1] with open(mrk_fn, 'r', encoding='utf-8') as mrk, \ open(seq_fn, 'rb') as seq: for mrk_line in mrk: seq.seek(int(mrk_line.split()[1]))
# =============================================================================
# ----------------------------- Model Loading
# =============================================================================

log_file = open(log_filepath, "w")
summ_file = open(summ_filepath, "w")

chunk_size = 1440

# Define online feature pipeline
#feats_args = "--mfcc-config=" + mfcc_hires_path + " " +\
#             "--ivector-extraction-config=" + ivector_extractor_path +\
#             "-verbose=1"

feat_opts = OnlineNnetFeaturePipelineConfig()
endpoint_opts = OnlineEndpointConfig()
po = ParseOptions("")
feat_opts.register(po)
endpoint_opts.register(po)
po.read_config_file(online_config_path)
feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
print('Loading inference model from files\n {} \n {} \n {}\n'\
if __name__ == '__main__':
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
               "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)

    usage = """Save the visualization plot of 2-dimensional vectors to hard disk.

    Usage: two-dim-vector-visual.py [options] <vector-rspecifier> <utt2spk-rxfilename> <figure-rxfilename>
    e.g. two-dim-vector-visual.py scp:data/train/2d_vectors.scp
             data/train/utt2spk data/train/2d_vectors.png
    """
    po = ParseOptions(usage)
    opts = po.parse_args()

    if po.num_args() != 3:
        po.print_usage()
        sys.exit()

    vector_rspecifier = po.get_arg(1)
    utt2spk_rxfilename = po.get_arg(2)
    figure_rxfilename = po.get_arg(3)
    isSuccess = two_dim_vector_visual(vector_rspecifier, utt2spk_rxfilename,
                                      figure_rxfilename)
    if not isSuccess:
        sys.exit()
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
               "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)

    usage = """Use MiniBatchKMeans for vector clustering. It outputs cluster assignments.

    For the details, please refer to:
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans

    Usage: kmeans-vector.py [options] <vector-rspecifier> <utt2clusterid-rxfilename>
    e.g. kmeans-vector.py scp:data/train/ivector.scp data/train/utt2clusterid
    """
    po = ParseOptions(usage)
    po.register_int(
        "n-clusters", 8,
        "The number of clusters to form as well as the number of centroids "
        "to generate. default=8")
    po.register_int(
        "random-state", 0,
        "Determines random number generation for centroid initialization and "
        "random reassignment. Use an int to make the randomness deterministic.")
    po.register_int("batch-size", 6, "Size of the mini batches.")
    po.register_int(
        "max-iter", 100,
        "Maximum number of iterations over the complete dataset before "
        "stopping independently of any early stopping criterion heuristics.")
    opts = po.parse_args()
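    # Hedged sketch: the clustering function that consumes these options is
    # defined elsewhere in this script; the registered values would map onto
    # scikit-learn's MiniBatchKMeans roughly as follows.
    from sklearn.cluster import MiniBatchKMeans

    kmeans = MiniBatchKMeans(n_clusters=opts.n_clusters,
                             random_state=opts.random_state,
                             batch_size=opts.batch_size,
                             max_iter=opts.max_iter)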
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
               "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)

    usage = """Use t-SNE (t-distributed Stochastic Neighbor Embedding) for dimension reduction.

    For the details, please refer to:
    https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE

    Usage: tsne-vector.py [options] <vector-rspecifier> <vector-wspecifier>
    e.g. tsne-vector.py scp:data/train/ivector.scp ark:data/train/low_dim_vector.ark
    """
    po = ParseOptions(usage)
    po.register_int(
        "output-dim", 2,
        "dimension of the output vectors. For visualization, only 2 is "
        "allowed in this program. (2 by default)")
    po.register_double(
        "perplexity", 30,
        "The perplexity is related to the number of nearest neighbors that is used"
        " in other manifold learning algorithms. Large datasets usually require a"
        " large perplexity. Consider selecting a value between 5 and 50. Different"
        " values can result in significantly different results. (30 by default)")
    po.register_double(
        "learning-rate", 200.0,
        "The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If the"
        " learning rate is too high, the data may look like a 'ball' with any point"
def otf_utt_generator(data_triplets, rir, noise, args):
    """
    Args:
        data_triplets: list of mrk and seq of input audios, and label ark
        rir: list of rir, List[AudioSegment]
        noise: list of noise, List[AudioSegment]
        args: arguments for loader
    """
    max_len = args.max_len
    batch_size = args.batch_size
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)

    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)

    batch_idx = 0
    valid_idx = 0
    target_len = 0
    batch_max_len = -1
    target_max_len = -1

    # rates for speed perturbation
    speed_rate = [float(rate) for rate in args.speed_rate.split(',')]
    # volume level perturbation
    gain_lo, gain_hi = [-float(gain) for gain in args.gain_range.split(',')]
    # snr range for noise perturbation: 0-20db with mean of 10
    #mu, sigma = 10, 10
    #lo, hi = (0 - mu) / sigma, (20 - mu) / sigma

    # Fbank config
    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    #fbank_opt = MfccOptions()
    #fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    #fbank = Mfcc(fbank_opt)

    for data_triplet in data_triplets:
        mrk_fn, seq_fn = data_triplet[0], data_triplet[1]
        ali_rspec = data_triplet[2]
        with open(mrk_fn, 'r', encoding='utf-8') as mrk, \
             open(seq_fn, 'rb') as seq:
            ali_reader = SequentialIntVectorReader(ali_rspec)
            for line, (uttid1, ali) in zip(mrk, ali_reader):
                uttid = line.split()[0]
                assert uttid == uttid1
                seq.seek(int(line.split()[1]))
                num_bytes = int(line.split()[2])
                num_bytes -= num_bytes % 2
                audio_bytes = seq.read(num_bytes)
                audio_np = np.frombuffer(audio_bytes, dtype='int16')

                # data augmentation function goes here
                audio_seg = AudioSegment(audio_np, args.sample_rate)
                # speed perturbation
                spr = speed_rate[randint(0, len(speed_rate) - 1)]
                audio_seg.change_speed(spr)
                audio_seg.normalize(np.random.uniform(gain_lo, gain_hi))
                # noise adding example:
                #snr = truncnorm.rvs(lo, hi, scale=sigma, loc=mu, size=1)
                #audio_seg.add_noise(noise[randint(0, len(noise)-1)], snr)
                # rir adding example:
                #audio_seg.convolve_and_normalize(rir[randint(0, len(rir)-1)])
                audio_np = audio_seg._convert_samples_from_float32(
                    audio_seg.samples, 'int16')

                wave_1ch = Vector(audio_np)
                feats = fbank.compute_features(wave_1ch, args.sample_rate, 1.0)
                ali = np.array(ali)
                if args.reverse_labels:
                    ali = ali[::-1]
                if args.SOS >= 0:
                    ali = np.concatenate(([args.SOS], ali))
                if args.EOS >= 0:
                    ali = np.concatenate((ali, [args.EOS]))
                feats = _matrix_ext.matrix_to_numpy(feats)
                utt_len = feats.shape[0] // args.stride + \
                    int(feats.shape[0] % args.stride != 0)

                # limits on T*U products due to RNNT
                # this is pretty hacky now
                if ali.shape[0] * utt_len // 3 <= args.TU_limit:
                    ali_len[valid_idx] = ali.shape[0]
                    data_buffer[valid_idx, :utt_len, :] = \
                        splice(feats, args.lctx, args.rctx)[::args.stride]
                    target_buffer[valid_idx, :ali_len[valid_idx]] = ali
                    len_buffer[valid_idx] = utt_len
                    if utt_len > batch_max_len:
                        batch_max_len = utt_len
                    if ali_len[valid_idx] > target_max_len:
                        target_max_len = ali_len[valid_idx]
                    valid_idx += 1
                batch_idx += 1

                if batch_idx == batch_size:
                    for b in range(valid_idx):
                        utt_len = len_buffer[b]
                        target_len = ali_len[b]
                        # data and target padding
                        if utt_len > 0:
                            data_buffer[b, utt_len:batch_max_len, :] = \
                                data_buffer[b, utt_len-1, :]
                            target_buffer[b, target_len:target_max_len] = \
                                args.padding_tgt

                    data = data_buffer[:valid_idx, :batch_max_len, :]
                    target = target_buffer[:valid_idx, :target_max_len]

                    if not args.batch_first:
                        data = np.transpose(data, (1, 0, 2))
                        target = np.transpose(target, (1, 0))

                    data = torch.from_numpy(np.copy(data))
                    target = torch.from_numpy(np.copy(target))
                    lens = torch.from_numpy(np.copy(len_buffer[:valid_idx]))
                    ali_lens = torch.from_numpy(np.copy(ali_len[:valid_idx]))

                    if valid_idx > 0:
                        # not doing cuda() here, in main process instead
                        yield data, target, lens, ali_lens
                    else:
                        yield None, None, \
                            torch.IntTensor([0]), torch.IntTensor([0])

                    batch_idx = 0
                    valid_idx = 0
                    target_len = 0
                    batch_max_len = -1
                    target_max_len = -1
            ali_reader.close()
    yield None
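# Hedged sketch: 'splice' is used above but defined elsewhere in this repo.
# A minimal frame-splicing helper with the same assumed contract (stack each
# frame with lctx left and rctx right context frames, clamping at the edges)
# could look like this; 'splice_frames' is a hypothetical name.
import numpy as np

def splice_frames(feats, lctx, rctx):
    # feats: (T, D) -> (T, D * (lctx + 1 + rctx))
    T = feats.shape[0]
    idx = np.arange(T)[:, None] + np.arange(-lctx, rctx + 1)[None, :]
    idx = np.clip(idx, 0, T - 1)  # repeat edge frames at the boundaries
    return feats[idx].reshape(T, -1)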
if __name__ == '__main__':
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
               "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)

    usage = """Compute the counts of *feature-formatted* posteriors for each mixture.
    If --normalize=True and --per-utt=False, the counts will be averaged by
    the number of utterances.

    Usage: post-count.py [options] feature_rspecifier posteriors_wspecifier
    e.g. post-count scp:feats.scp ark,t:count.txt
    """
    po = ParseOptions(usage)
    po.register_bool("normalize", False,
                     "normalize the counts, False by default")
    po.register_bool("per-utt", False,
                     "Count per utterance, False by default")
    opts = po.parse_args()

    if po.num_args() != 2:
        po.print_usage()
        sys.exit()

    feature_rspecifier = po.get_arg(1)
    posterior_wspecifier = po.get_arg(2)
    isSuccess = post_to_count(feature_rspecifier, posterior_wspecifier,
                              normalize=opts.normalize, per_utt=opts.per_utt)
    if not isSuccess:
        sys.exit()
          file=sys.stderr)
    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)

    return num_success != 0


if __name__ == "__main__":
    usage = """Compute VAD.

    Usage: compute-vad [options...] <wav-rspecifier> <feats-wspecifier>
    """
    po = ParseOptions(usage)
    po.register_float(
        "min-duration", 0.0,
        "Minimum duration of segments to process (in seconds).")
    po.register_int(
        "channel", -1,
        "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)")
    po.register_int("frame-window", 25,
                    "Length of frame window in ms (default is 25ms)")
    po.register_int("frame-shift", 10,
                    "Length of frame shift in ms (default is 10ms)")
    po.register_int("nfft", 256,
                    "Number of DFT points (default is 256)")
    po.register_int(
        "arma-order", 5,
        "Length of ARMA window that will be applied to the spectrogram")
if __name__ == '__main__': usage = """Copy matrices, or archives of matrices (e.g. features or transforms) Also see copy-feats which has other format options Usage: copy-matrix [options] <matrix-in-rspecifier> <matrix-out-wspecifier> or copy-matrix [options] <matrix-in-rxfilename> <matrix-out-wxfilename> e.g. copy-matrix --binary=false 1.mat - copy-matrix ark:2.trans ark,t:- """ po = ParseOptions(usage) po.register_bool( "binary", True, "Write in binary mode (only relevant if output is a wxfilename)") po.register_float( "scale", 1.0, "This option can be used to scale the matrices being copied.") po.register_bool( "apply-log", False, "This option can be used to apply log on the matrices. Must be avoided if matrix has negative quantities." ) po.register_bool("apply-exp", False, "This option can be used to apply exp on the matrices") po.register_float( "apply-power", 1.0,
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
               "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)

    usage = """Use principal component analysis for dimension reduction.

    For the details, please refer to:
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

    Usage: pca-vector.py [options] <vector-rspecifier> <vector-wspecifier>
    e.g. pca-vector.py scp:data/train/ivector.scp ark:data/train/low_dim_vector.ark

    see also: two-dim-vector-visual.py
    """
    po = ParseOptions(usage)
    po.register_int(
        "output-dim", 2,
        "dimension of the output vectors."
        " For visualization, only 2 is allowed in this program. (2 by default)")
    opts = po.parse_args()

    if po.num_args() != 2:
        po.print_usage()
        sys.exit()

    vector_rspecifier = po.get_arg(1)
    vector_wspecifier = po.get_arg(2)
    isSuccess = pca_vector(vector_rspecifier, vector_wspecifier,
                           output_dim=opts.output_dim)
    if not isSuccess:
        if num_utts % 10 == 0:
            print("Processed {} utterances".format(num_utts), file=sys.stderr)

    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)

    return num_success != 0


if __name__ == '__main__':
    usage = """Create MFCC feature files.

    Usage: compute-mfcc-feats [options...] <wav-rspecifier> <feats-wspecifier>
    """
    po = ParseOptions(usage)
    mfcc_opts = MfccOptions()
    mfcc_opts.register(po)
    po.register_int("sampling-rate", 16000,
                    "Sampling rate of waveforms and labels.")
    po.register_int(
        "signal-window-length", 200,
        "Window length in ms (what will be presented to the network).")
    po.register_int("label-window-length", 25,
                    "Window length of alignments / labels in ms.")
    po.register_int("label-window-shift", 10,
                    "Window shift of alignments / labels in ms.")
    po.register_bool(
        "subtract-mean", False,
        "Subtract mean of each feature"