"ltsv-ctx-window", 50, "Context window for LTSV computation (default: 50)", ) po.register_float( "threshold", 0.01, "Parameter for sigmoid scaling in LTSV (default: 0.01)", ) po.register_float( "slope", 0.001, "Parameter for sigmoid scaling in LTSV (default: 0.001)") po.register_bool("sigmoid-scale", True, "Apply sigmoid scaling in LTSV (default: True)") po.register_int("dct-num-cep", 5, "DCT number of coefficitents (default: 5)") po.register_int("dct-ctx-window", 30, "DCT context window (default: 30)") po.register_bool("test-plot", False, "Produces a plot for testing (default: False)") opts = po.parse_args() if po.num_args() != 2: po.print_usage() sys.exit() wav_rspecifier = po.get_arg(1) feats_wspecifier = po.get_arg(2) compute_vad(wav_rspecifier, feats_wspecifier, opts)
"%(filename)s:%(lineno)s) %(message)s".format(__version__), level=logging.INFO) usage = """Use Principal component analysis for dimension reduction. For the details, Please refer to website: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html Usage: pca-vector.py [options] <vector-rspecifier> <vector-wspecifier e.g. pca-vector.py scp:data/train/ivector.scp ark:data/train/low_dim_vector.ark see also: two-dim-vector-visual.py """ po = ParseOptions(usage) po.register_int( "output-dim", 2, "dimension of the output vectors." " For visualization, only 2 is allowed in this program. (2 by default)" ) opts = po.parse_args() if (po.num_args() != 2): po.print_usage() sys.exit() vector_rspecifier = po.get_arg(1) vector_wspecifier = po.get_arg(2) isSuccess = pca_vector(vector_rspecifier, vector_wspecifier, output_dim=opts.output_dim) if not isSuccess: sys.exit()
# Tail of feat-to-post.py's main section: closes the usage docstring
# (which illustrates the Kaldi Posterior format: per frame, a list of
# (index, weight) pairs), registers --top-n and --rescale, parses args,
# and runs feat_to_post on the feature archive.
# NOTE(review): this fragment begins mid-docstring — the `usage = """` opening
# is outside this view — so the code is left byte-identical.
# NOTE(review): `(1,0,9)` in the example looks like a typo for `(1,0.9)`
# (the neighbouring pairs are all (index, probability) 2-tuples, and 0.9
# would make the frame sum to 1.0) — runtime string, not changed here.
# NOTE(review): `sys.exit()` on failure exits with status 0 — presumably
# sys.exit(1) was intended; confirm.
Posterior-formatted posterior: <uttid> [[(0,0.1), (1,0.89), (5,0.01)], [(1,0,9), (5,0.1)], ... [(0,0.8), (1,0.2)]] ... Usage: feat-to-post.py [options] feature_rspecifier posteriors_wspecifier e.g. feat-to-post scp:feats.scp ark:post.ark """ po = ParseOptions(usage) po.register_int("top-n", 10, "only keep highest N posteriors per frame, 10 by default") po.register_bool("rescale", False, "rescale top N posteriors to let summation equals to 1, false by default") opts = po.parse_args() if (po.num_args() != 2): po.print_usage() sys.exit() feature_rspecifier = po.get_arg(1) posterior_wspecifier = po.get_arg(2) isSuccess = feat_to_post(feature_rspecifier, posterior_wspecifier, opts.top_n, opts.rescale) if not isSuccess: sys.exit()
# Tail of a gmm-decode-faster example's main section: closes the usage
# docstring, registers decoder options plus --acoustic-scale,
# --allow-partial and --word-symbol-table, accepts 4 required and up to 2
# optional positional args (model, FST, features, words; optionally
# alignments and lattices), and runs gmm_decode_faster.
# NOTE(review): this fragment begins mid-docstring — the `usage = """`
# opening is outside this view — so the code is left byte-identical.
# NOTE(review): the trailing semicolon after the register_str(...) call is
# un-Pythonic (likely carried over from the C++ original) — code token,
# so not removed in this documentation-only pass.
Note: lattices, if output, will just be linear sequences; use gmm-latgen-faster if you want "real" lattices. """ po = ParseOptions(usage) decoder_opts = FasterDecoderOptions() decoder_opts.register(po, True) po.register_float("acoustic-scale", 0.1, "Scaling factor for acoustic likelihoods") po.register_bool("allow-partial", True, "Produce output even when final state was not reached") po.register_str("word-symbol-table", "", "Symbol table for words [for debug output]"); opts = po.parse_args() if po.num_args() < 4 or po.num_args() > 6: po.print_usage() sys.exit() model_rxfilename = po.get_arg(1) fst_rxfilename = po.get_arg(2) feature_rspecifier = po.get_arg(3) words_wspecifier = po.get_arg(4) alignment_wspecifier = po.get_opt_arg(5) lattice_wspecifier = po.get_opt_arg(6) gmm_decode_faster(model_rxfilename, fst_rxfilename, feature_rspecifier, words_wspecifier, alignment_wspecifier, lattice_wspecifier, opts.word_symbol_table, opts.acoustic_scale, opts.allow_partial, decoder_opts)
# Main section of an extract-segments example script: configures logging to
# mimic native Kaldi log lines, sets up the command-line options, and runs
# extract_segments on the three positional arguments.
from kaldi import __version__

# Level 20 (INFO) is renamed so records read like Kaldi's "LOG (...)" lines.
logging.addLevelName(20, 'LOG')
logging.basicConfig(
    format='%(levelname)s (%(module)s[{}]:%(funcName)s():'
           '%(filename)s:%(lineno)s) %(message)s'.format(__version__),
    level=logging.INFO)

usage = """Extract segments from a large audio file in WAV format.

Usage: extract-segments [options] <wav-rspecifier> <segments-file> <wav-wspecifier>
"""

po = ParseOptions(usage)
po.register_float(
    "min-segment-length", 0.1,
    "Minimum segment length in seconds (reject shorter segments)")
# NOTE(review): registered with an underscore unlike "min-segment-length";
# presumably ParseOptions normalizes '_' to '-' on the command line as in
# Kaldi proper — confirm before renaming.
po.register_float(
    "max_overshoot", 0.5,
    "End segments overshooting audio by less than this (in seconds) are "
    "truncated, else rejected.")

opts = po.parse_args()

# Exactly three positional arguments are required.
if po.num_args() != 3:
    po.print_usage()
    sys.exit()

wav_rspecifier = po.get_arg(1)
segments_rxfilename = po.get_arg(2)
wav_wspecifier = po.get_arg(3)

extract_segments(wav_rspecifier, segments_rxfilename, wav_wspecifier, opts)
# Middle of a copy-matrix-style script's main section: parses args, rejects
# mutually exclusive transform flags (at most one of --apply-log,
# --apply-exp, --apply-softmax-per-row may be given), classifies the two
# positional args as either both archives (rspecifier/wspecifier) or both
# regular files (rxfilename/wxfilename) — mixing the two is an error — and
# begins the single-matrix path.
# NOTE(review): this fragment ends mid-branch (`if opts.scale != 1.0:` has
# no visible body), so the code is left byte-identical rather than
# reformatted.
opts = po.parse_args() if po.num_args() != 2: po.print_usage() sys.exit(1) if (opts.apply_log and opts.apply_exp) or (opts.apply_softmax_per_row and opts.apply_exp) or ( opts.apply_softmax_per_row and opts.apply_log): print( "Only one of apply-log, apply-exp and apply-softmax-per-row can be given", file=sys.stderr) sys.exit(1) matrix_in_fn = po.get_arg(1) matrix_out_fn = po.get_arg(2) in_is_rspecifier = classify_rspecifier( matrix_in_fn)[0] != RspecifierType.NO_SPECIFIER out_is_wspecifier = classify_wspecifier( matrix_out_fn)[0] != WspecifierType.NO_SPECIFIER if in_is_rspecifier != out_is_wspecifier: print("Cannot mix archives with regular files (copying matrices)", file=sys.stderr) sys.exit(1) if not in_is_rspecifier: mat = read_matrix(matrix_in_fn) if opts.scale != 1.0:
# Tail of compute-cmvn-stats-two-channel's main section: closes the usage
# docstring (which explains the reco2file_and_channel format and the
# 'only-the-louder' per-call CMVN computation), registers the quieter-channel
# weight option, and runs compute_cmvn_stats_two_channel on the three
# positional args.
# NOTE(review): this fragment begins mid-docstring — the `usage = """`
# opening is outside this view — so the code is left byte-identical.
# NOTE(review): "quieter_channel_weight" uses underscores where sibling
# scripts use dashes; presumably ParseOptions normalizes the name — confirm.
sw02005-B sw02005 B interpreted as <utterance-id> <call-id> <side> and for each <call-id> that has two sides, does the 'only-the-louder' computation, else does per-utterance stats in the normal way. Note: loudness is judged by the first feature component, either energy or c0 only applicable to MFCCs or PLPs (this code could be modified to handle filterbanks). Usage: compute-cmvn-stats-two-channel [options] <reco2file-and-channel> <feats-rspecifier> <stats-wspecifier> e.g.: compute-cmvn-stats-two-channel data/train_unseg/reco2file_and_channel scp:data/train_unseg/feats.scp ark,t:- """ po = ParseOptions(usage) po.register_float( "quieter_channel_weight", 0.01, "For the quieter channel," " apply this weight to the stats, so that we still get " "stats if one channel always dominates.") opts = po.parse_args() if po.num_args() != 3: po.print_usage() sys.exit(1) reco2file_and_channel_rxfilename = po.get_arg(1) feats_rspecifier = po.get_arg(2) stats_wspecifier = po.get_arg(3) compute_cmvn_stats_two_channel(reco2file_and_channel_rxfilename, feats_rspecifier, stats_wspecifier, opts)
if __name__ == '__main__':
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
                               "%(filename)s:%(lineno)s) %(message)s"
                        .format(__version__), level=logging.INFO)

    # Fixed: "hardisk" -> "hard disk" in the user-facing usage text.
    usage = """save the visualization plot of 2-dimensional vectors to hard disk.

    Usage: two-dim-vector-visual.py [options] <vector-rspecifier> <utt2spk-rxfilename> <figure-rxfilename>

    e.g. two-dim-vector-visual.py scp:data/train/2d_vectors.scp data/train/utt2spk data/train/2d_vectors.png
    """
    po = ParseOptions(usage)
    opts = po.parse_args()

    # Exactly three positional arguments are required.
    if po.num_args() != 3:
        po.print_usage()
        sys.exit()

    vector_rspecifier = po.get_arg(1)
    utt2spk_rxfilename = po.get_arg(2)
    # NOTE(review): arg 3 is an output path despite the "rxfilename" name.
    figure_rxfilename = po.get_arg(3)

    isSuccess = two_dim_vector_visual(vector_rspecifier, utt2spk_rxfilename,
                                      figure_rxfilename)
    if not isSuccess:
        # Fixed: bare sys.exit() exits with status 0, which reports success
        # to the calling shell even though the plot failed.
        sys.exit(1)
# Option setup and entry point for a MiniBatch-KMeans vector-clustering
# script: registers the clustering hyper-parameters, parses the command
# line, and runs kmeans_vector on the two positional arguments
# (vector rspecifier in, utt2clusterid file out).
po = ParseOptions(usage)
po.register_int(
    "n-clusters", 8,
    "The number of clusters to form as well as the number of "
    "centroids to generate. default=8")
po.register_int(
    "random-state", 0,
    "Determines random number generation for centroid initialization "
    "and random reassignment. Use an int to make the randomness "
    "deterministic. ")
po.register_int("batch-size", 6, "Size of the mini batches.")
po.register_int(
    "max-iter", 100,
    "Maximum number of iterations over the complete dataset before "
    "stopping independently of any early stopping criterion heuristics.")

opts = po.parse_args()

# Exactly two positional arguments are required.
if po.num_args() != 2:
    po.print_usage()
    sys.exit()

vector_rspecifier = po.get_arg(1)
utt2clusterid_rxfilename = po.get_arg(2)

isSuccess = kmeans_vector(vector_rspecifier,
                          utt2clusterid_rxfilename,
                          n_clusters=opts.n_clusters,
                          random_state=opts.random_state,
                          batch_size=opts.batch_size,
                          max_iter=opts.max_iter)
if not isSuccess:
    # Fixed: bare sys.exit() exits with status 0, which reports success
    # to the calling shell even though clustering failed.
    sys.exit(1)