usage = """Compute VAD. Usage: compute-vad [options...] <wav-rspecifier> <feats-wspecifier> """ po = ParseOptions(usage) po.register_float( "min-duration", 0.0, "Minimum duration of segments to process in seconds (default: 0.0).", ) po.register_int( "channel", -1, "Channel to extract (-1 -> mono (default), 0 -> left, 1 -> right)", ) po.register_int("frame-window", 25, "Length of frame window in ms (default: 25)") po.register_int("frame-shift", 10, "Length of frame shift in ms (default: 10)") po.register_int("nfft", 512, "Number of DFT points (default: 256)") po.register_int( "arma-order", 5, "Length of ARMA window that will be applied to the spectrogram", ) po.register_int( "ltsv-ctx-window", 50,
"%(filename)s:%(lineno)s) %(message)s".format(__version__), level=logging.INFO) usage = """Use Principal component analysis for dimension reduction. For the details, Please refer to website: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html Usage: pca-vector.py [options] <vector-rspecifier> <vector-wspecifier e.g. pca-vector.py scp:data/train/ivector.scp ark:data/train/low_dim_vector.ark see also: two-dim-vector-visual.py """ po = ParseOptions(usage) po.register_int( "output-dim", 2, "dimension of the output vectors." " For visualization, only 2 is allowed in this program. (2 by default)" ) opts = po.parse_args() if (po.num_args() != 2): po.print_usage() sys.exit() vector_rspecifier = po.get_arg(1) vector_wspecifier = po.get_arg(2) isSuccess = pca_vector(vector_rspecifier, vector_wspecifier, output_dim=opts.output_dim) if not isSuccess: sys.exit()
# Option registration and argument handling for a compute-mfcc-feats style CLI.
# NOTE(review): this chunk starts mid-script -- `po` (a ParseOptions) and
# `mfcc_opts` (an MfccOptions instance) are created above this excerpt.
mfcc_opts.register(po)  # expose all MFCC-specific options on the parser
po.register_bool(
    "subtract-mean", False, "Subtract mean of each feature"
    "file [CMS]; not recommended to do it this way.")
# NOTE(review): the adjacent string literals above concatenate to
# "...featurefile..." -- a space appears to be missing in the help text.
po.register_float(
    "vtln-warp", 1.0, "Vtln warp factor (only applicable "
    "if vtln-map not specified)")
po.register_str(
    "vtln-map", "", "Map from utterance or speaker-id to "
    "vtln warp factor (rspecifier)")
po.register_str(
    "utt2spk", "", "Utterance to speaker-id map rspecifier"
    "(if doing VTLN and you have warps per speaker)")
# NOTE(review): "rspecifier" and "(if..." also concatenate without a space.
po.register_int(
    "channel", -1, "Channel to extract (-1 -> expect mono, "
    "0 -> left, 1 -> right)")
po.register_float(
    "min-duration", 0.0, "Minimum duration of segments "
    "to process (in seconds).")

opts = po.parse_args()

# Exactly two positional arguments are required: the wav input specifier and
# the feature output specifier.
if (po.num_args() != 2):
    po.print_usage()
    sys.exit()

wav_rspecifier = po.get_arg(1)    # first positional argument
feats_wspecifier = po.get_arg(2)  # second positional argument
compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts)
if __name__ == "__main__":
    # Command-line setup for an LTSV-based voice-activity-detection tool.
    usage = """Compute VAD.
    Usage: compute-vad [options...] <wav-rspecifier> <feats-wspecifier>
    """
    po = ParseOptions(usage)
    po.register_float(
        "min-duration", 0.0, "Minimum duration of segments "
        "to process (in seconds).")
    po.register_int(
        "channel", -1, "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    po.register_int("frame-window", 25, "Length of frame window in ms "
                    "default is 25ms")
    po.register_int("frame-shift", 10, "Length of frame shift in ms "
                    "default is 10ms")
    # NOTE(review): another compute-vad variant in this codebase registers
    # nfft with default 512; here the default is 256 -- confirm which is
    # intended before unifying.
    po.register_int("nfft", 256, "Number of DFT points "
                    "default is 256")
    po.register_int(
        "arma-order", 5, "Length of ARMA window that will be applied "
        "to the spectrogram")
    po.register_int("ltsv-ctx-window", 50,
                    "Context window for LTSV computation "
                    "default is 50")
    po.register_float(
        "threshold", 0.01, "Parameter for sigmoid scaling in LTSV "
        "default is 0.01")
    # NOTE(review): chunk is truncated here -- argument parsing and the
    # actual VAD computation continue below this excerpt.
Posterior-formatted posterior: <uttid> [[(0,0.1), (1,0.89), (5,0.01)], [(1,0,9), (5,0.1)], ... [(0,0.8), (1,0.2)]] ... Usage: feat-to-post.py [options] feature_rspecifier posteriors_wspecifier e.g. feat-to-post scp:feats.scp ark:post.ark """ po = ParseOptions(usage) po.register_int("top-n", 10, "only keep highest N posteriors per frame, 10 by default") po.register_bool("rescale", False, "rescale top N posteriors to let summation equals to 1, false by default") opts = po.parse_args() if (po.num_args() != 2): po.print_usage() sys.exit() feature_rspecifier = po.get_arg(1) posterior_wspecifier = po.get_arg(2) isSuccess = feat_to_post(feature_rspecifier, posterior_wspecifier, opts.top_n, opts.rescale) if not isSuccess: sys.exit()
# Configure logging so messages resemble Kaldi's native log format, embedding
# the kaldi package version in the module tag.
logging.basicConfig(
    format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
    "%(filename)s:%(lineno)s) %(message)s".format(__version__),
    level=logging.INFO)

# MiniBatchKMeans-based vector-clustering CLI (scikit-learn, per usage text).
usage = """Use MiniBatchKMeans for vector clustering. It outputs cluster assignments
For the details, Please refer to website:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans

Usage: kmeans-vector.py [options] <vector-rspecifier> <utt2clusterid-rxfilename>
e.g.
    kmeans-vector.py scp:data/train/ivector.scp data/train/utt2clusterid
"""

po = ParseOptions(usage)
po.register_int(
    "n-clusters", 8,
    "The number of clusters to form as well as the number of centroids to generate. default=8"
)
po.register_int(
    "random-state", 0,
    "Determines random number generation for centroid initialization and random reassignment. "
    "Use an int to make the randomness deterministic. ")
po.register_int("batch-size", 6, "Size of the mini batches.")
po.register_int(
    "max-iter", 100,
    "Maximum number of iterations over the complete dataset before stopping independently of "
    "any early stopping criterion heuristics.")
opts = po.parse_args()

# Exactly two positional arguments are required, as named in the usage string.
if (po.num_args() != 2):
    po.print_usage()
    sys.exit()
# NOTE(review): chunk is truncated here -- reading the positional arguments
# and running the clustering continue below this excerpt.
# Configure logging so messages resemble Kaldi's native log format, embedding
# the kaldi package version in the module tag.
logging.basicConfig(
    format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
    "%(filename)s:%(lineno)s) %(message)s".format(__version__),
    level=logging.INFO)

# t-SNE dimensionality-reduction CLI (scikit-learn TSNE, per the usage text).
usage = """Use t-sne (t-distributed Stochastic Neighbor Emedding) for dimension reduction.
For the details, Please refer to website:
https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE

Usage: tsne-vector.py [options] <vector-rspecifier> <vector-wspecifier
e.g.
    tsne-vector.py scp:data/train/ivector.scp ark:data/train/low_dim_vector.ark
"""
# NOTE(review): "Emedding" (-> "Embedding") and the unclosed
# "<vector-wspecifier" look like typos in the usage text; left untouched in
# this documentation-only pass.

po = ParseOptions(usage)
po.register_int(
    "output-dim", 2,
    "dimension of the output vectors."
    " For visualization, only 2 is allowed in this program. (2 by default)"
)
po.register_double(
    "perplexity", 30,
    "The perplexity is related to the number of nearest neighbors that is used"
    " in other mainfold learning algorithms. Large datasets usually require a"
    " large perplexity. Consider selecting a value between 5 and 50. Different"
    " values can result in significantly different results. (30 by default)"
)
# NOTE(review): chunk is truncated mid-string inside the call below; the rest
# of the help text and the remaining CLI setup continue below this excerpt.
po.register_double(
    "learning-rate", 200.0,
    "The learning rate for t-sne is usually in the range [10.0, 1000.0]. If the"
    " learning rate is too high, the data may look like a \'ball\' with any point"
    " approximately equidistant from its nearest neighbors. If the learning rate"
    " is too low, most points may look compressed in a dense cloud with few outliers."
    " If the cost function gets stuck in a bad local minimum increasing the learning"
# Configure log messages to look like Kaldi messages
from kaldi import __version__

logging.addLevelName(20, "LOG")  # relabel INFO (level 20) with Kaldi's "LOG" tag
logging.basicConfig(
    format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
    "%(filename)s:%(lineno)s) %(message)s".format(__version__),
    level=logging.INFO)

# feat-to-post CLI: converts a feature table into Kaldi posterior format.
usage = """Convert features into posterior format, which is the generic
format of NN training target in Karel's nnet1 tools.
(speed is not an issue for reasonably low NN-output dimensions)

Usage: feat-to-post.py [options] feature_rspecifier posteriors_wspecifier
e.g.
    feat-to-post scp:feats.scp ark:post.ark
"""
# FIX: corrected the "spped" typo in the usage text above.

po = ParseOptions(usage)
po.register_int("top-n", 10, "N posteriors per frame, 10 by default")
opts = po.parse_args()

# Exactly two positional arguments are required (input and output specifiers).
if po.num_args() != 2:
    po.print_usage()
    sys.exit()

feature_rspecifier = po.get_arg(1)
posterior_wspecifier = po.get_arg(2)
is_success = feat_to_post(feature_rspecifier, posterior_wspecifier, opts.top_n)
if not is_success:
    # FIX: exit nonzero on failure so callers can detect the error;
    # a bare sys.exit() exits with status 0 (success).
    sys.exit(1)
# NOTE(review): this chunk begins mid-file -- the two lines below are the tail
# of an enclosing function whose `def` is above this excerpt: they close a
# print(..., file=sys.stderr) call and return a success flag (True iff
# num_success is nonzero).
          file=sys.stderr)
    return num_success != 0


if __name__ == '__main__':
    usage = """Create MFCC feature files.
    Usage: compute-mfcc-feats [options...] <wav-rspecifier> <feats-wspecifier>
    """
    po = ParseOptions(usage)
    mfcc_opts = MfccOptions()  # Kaldi MFCC configuration holder
    mfcc_opts.register(po)     # expose all MFCC-specific options on the parser
    po.register_int("sampling-rate", 16000,
                    "Sampling rate of waveforms and labels.")
    po.register_int(
        "signal-window-length", 200,
        "Window length in ms (what will be presented to the network).")
    po.register_int("label-window-length", 25,
                    "Window length of alignments / labels in ms.")
    po.register_int("label-window-shift", 10,
                    "Window shift of alignments / labels in ms.")
    po.register_bool(
        "subtract-mean", False, "Subtract mean of each feature"
        "file [CMS]; not recommended to do it this way.")
    # NOTE(review): "feature" and "file" above concatenate to "featurefile" --
    # a space appears to be missing in the help text.
    po.register_int(
        "channel", -1, "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    po.register_float(
        "min-duration", 0.0, "Minimum duration of segments "
        # NOTE(review): chunk is truncated mid-call here; the help text and
        # the remaining CLI setup continue below this excerpt.