Example #1
    usage = """Compute VAD.

    Usage:  compute-vad [options...] <wav-rspecifier> <feats-wspecifier>
    """

    po = ParseOptions(usage)

    po.register_float(
        "min-duration",
        0.0,
        "Minimum duration of segments to process in seconds (default: 0.0).",
    )
    po.register_int(
        "channel",
        -1,
        "Channel to extract (-1 -> mono (default), 0 -> left, 1 -> right)",
    )
    po.register_int("frame-window", 25,
                    "Length of frame window in ms (default: 25)")
    po.register_int("frame-shift", 10,
                    "Length of frame shift in ms (default: 10)")
    po.register_int("nfft", 512, "Number of DFT points (default: 256)")
    po.register_int(
        "arma-order",
        5,
        "Length of ARMA window that will be applied to the spectrogram",
    )
    po.register_int(
        "ltsv-ctx-window",
        50,
        "Context window for LTSV computation (default: 50)",
    )
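This first snippet is cut off before the options are consumed. As a small self-contained illustration (toy names, not from the original script), this is the pattern the other examples on this page follow: parse_args() returns an object whose attributes are the registered names with dashes turned into underscores, and positional arguments come from get_arg().

import sys

from kaldi.util.options import ParseOptions

usage = """Toy option-parsing sketch (assumed layout, mirroring the examples on this page).

Usage: toy-parse.py [options] <rspecifier> <wspecifier>
"""
po = ParseOptions(usage)
po.register_float("min-duration", 0.0, "Minimum duration of segments (seconds).")
po.register_int("ltsv-ctx-window", 50, "Context window for LTSV computation.")
opts = po.parse_args()

if po.num_args() != 2:
    po.print_usage()
    sys.exit()

rspecifier = po.get_arg(1)
wspecifier = po.get_arg(2)

# Dashes in registered option names become underscores on the returned object.
print(opts.min_duration, opts.ltsv_ctx_window, rspecifier, wspecifier)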
Example #2
        "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)
    usage = """Use Principal component analysis for dimension reduction.
  For details, please refer to:
  https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

  Usage: pca-vector.py [options] <vector-rspecifier> <vector-wspecifier>

  e.g.
      pca-vector.py scp:data/train/ivector.scp ark:data/train/low_dim_vector.ark

  see also: two-dim-vector-visual.py
  """
    po = ParseOptions(usage)
    po.register_int(
        "output-dim", 2, "dimension of the output vectors."
        " For visualization, only 2 is allowed in this program. (2 by default)"
    )
    opts = po.parse_args()
    if po.num_args() != 2:
        po.print_usage()
        sys.exit()

    vector_rspecifier = po.get_arg(1)
    vector_wspecifier = po.get_arg(2)
    isSuccess = pca_vector(vector_rspecifier,
                           vector_wspecifier,
                           output_dim=opts.output_dim)
    if not isSuccess:
        sys.exit()
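pca_vector is defined elsewhere in the original script and is not shown on this page. A minimal sketch of what such a helper might look like, assuming PyKaldi's kaldi.util.table readers/writers and scikit-learn's PCA; the body below is a guess at the structure, not the original implementation.

import numpy as np
from sklearn.decomposition import PCA

from kaldi.matrix import SubVector
from kaldi.util.table import SequentialVectorReader, VectorWriter


def pca_vector(vector_rspecifier, vector_wspecifier, output_dim=2):
    """Assumed re-implementation: project every input vector down to output_dim dims."""
    keys, rows = [], []
    with SequentialVectorReader(vector_rspecifier) as reader:
        for key, vector in reader:
            keys.append(key)
            rows.append(vector.numpy())
    if not rows:
        return False
    low_dim = PCA(n_components=output_dim).fit_transform(np.stack(rows))
    with VectorWriter(vector_wspecifier) as writer:
        for key, row in zip(keys, low_dim):
            # Kaldi vectors are single precision, so cast before wrapping.
            writer[key] = SubVector(row.astype(np.float32))
    return True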
Example #3
    mfcc_opts.register(po)

    po.register_bool(
        "subtract-mean", False, "Subtract mean of each feature"
        "file [CMS]; not recommended to do it this way.")
    po.register_float(
        "vtln-warp", 1.0, "Vtln warp factor (only applicable "
        "if vtln-map not specified)")
    po.register_str(
        "vtln-map", "", "Map from utterance or speaker-id to "
        "vtln warp factor (rspecifier)")
    po.register_str(
        "utt2spk", "", "Utterance to speaker-id map rspecifier"
        "(if doing VTLN and you have warps per speaker)")
    po.register_int(
        "channel", -1, "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    po.register_float(
        "min-duration", 0.0, "Minimum duration of segments "
        "to process (in seconds).")

    opts = po.parse_args()

    if po.num_args() != 2:
        po.print_usage()
        sys.exit()

    wav_rspecifier = po.get_arg(1)
    feats_wspecifier = po.get_arg(2)

    compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts)
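The interplay of the three VTLN options registered above (the fixed --vtln-warp versus a per-utterance or per-speaker --vtln-map, optionally routed through --utt2spk) can be sketched in plain Python. The helper below only illustrates that precedence; ordinary dicts stand in for the Kaldi rspecifiers and none of it comes from the original script.

def select_vtln_warp(utt, opts, vtln_map=None, utt2spk=None):
    """Illustrative helper: pick the VTLN warp factor for one utterance."""
    if not opts.vtln_map:
        # No --vtln-map given: the global --vtln-warp applies everywhere.
        return opts.vtln_warp
    # With --utt2spk the map is indexed by speaker id, otherwise by utterance id.
    key = utt2spk[utt] if opts.utt2spk else utt
    return vtln_map[key]


class _ToyOpts:
    """Stand-in for the object returned by po.parse_args()."""
    vtln_warp = 1.0
    vtln_map = "ark:vtln.map"   # pretend both rspecifier options were set
    utt2spk = "ark:utt2spk"


print(select_vtln_warp("utt1", _ToyOpts(),
                       vtln_map={"spk1": 1.1},
                       utt2spk={"utt1": "spk1"}))   # -> 1.1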
Example #4

if __name__ == "__main__":

    usage = """Compute VAD.

  Usage:  compute-vad [options...] <wav-rspecifier> <feats-wspecifier>
  """

    po = ParseOptions(usage)

    po.register_float(
        "min-duration", 0.0, "Minimum duration of segments "
        "to process (in seconds).")
    po.register_int(
        "channel", -1, "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    po.register_int("frame-window", 25, "Length of frame window in ms "
                    "default is 25ms")
    po.register_int("frame-shift", 10, "Length of frame shift in ms "
                    "default is 10ms")
    po.register_int("nfft", 256, "Number of DFT points " "default is 256")
    po.register_int(
        "arma-order", 5, "Length of ARMA window that will be applied "
        "to the spectrogram")
    po.register_int("ltsv-ctx-window", 50,
                    "Context window for LTSV computation "
                    "default is 50")
    po.register_float(
        "threshold", 0.01, "Parameter for sigmoid scaling in LTSV "
        "default is 0.01")
Example #5
   Posterior-formatted posterior:
     <uttid> [[(0,0.1), (1,0.89), (5,0.01)],
              [(1,0.9), (5,0.1)],
                ...
              [(0,0.8), (1,0.2)]]
       ... 

  Usage: feat-to-post.py [options] feature_rspecifier posteriors_wspecifier

  e.g.
      feat-to-post scp:feats.scp ark:post.ark

  """
  po = ParseOptions(usage)
  po.register_int("top-n", 10,
                  "only keep highest N posteriors per frame, 10 by default")
  po.register_bool("rescale", False,
                   "rescale top N posteriors to let summation equals to 1, false by default")
  opts = po.parse_args()

  if po.num_args() != 2:
    po.print_usage()
    sys.exit()

  feature_rspecifier = po.get_arg(1)
  posterior_wspecifier = po.get_arg(2)
  isSuccess = feat_to_post(feature_rspecifier, posterior_wspecifier,
                           opts.top_n, opts.rescale)
  if not isSuccess:
    sys.exit()
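feat_to_post itself is not shown above. The per-frame effect of the top-n and rescale options can be sketched in plain numpy; the helper below is an assumed illustration of that step, not the original code.

import numpy as np


def frame_to_post(row, top_n=10, rescale=False):
    """Turn one frame of feature values into a sparse (index, value) posterior list."""
    top = np.argsort(row)[::-1][:top_n]          # indices of the top_n largest entries
    values = row[top]
    if rescale:                                   # make the kept posteriors sum to 1
        values = values / values.sum()
    return list(zip(top.tolist(), values.tolist()))


# Toy usage, keeping the 2 largest entries and renormalising them:
print(frame_to_post(np.array([0.1, 0.7, 0.05, 0.15]), top_n=2, rescale=True))
# [(1, 0.8235...), (3, 0.1764...)]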
Example #6
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
        "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)
    usage = """Use MiniBatchKMeans for vector clustering. It outputs cluster assignments
  For the details, Please refer to website:
  https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans

  Usage: kmeans-vector.py [options] <vector-rspecifier> <utt2clusterid-rxfilename>

  e.g.
      kmeans-vector.py scp:data/train/ivector.scp data/train/utt2clusterid
  """
    po = ParseOptions(usage)
    po.register_int(
        "n-clusters", 8,
        "The number of clusters to form as well as the number of centroids to generate. default=8"
    )
    po.register_int(
        "random-state", 0,
        "Determines random number generation for centroid initialization and random reassignment. "
        "Use an int to make the randomness deterministic. ")
    po.register_int("batch-size", 6, "Size of the mini batches.")
    po.register_int(
        "max-iter", 100,
        "Maximum number of iterations over the complete dataset before stopping independently of "
        "any early stopping criterion heuristics.")
    opts = po.parse_args()

    if po.num_args() != 2:
        po.print_usage()
        sys.exit()
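The clustering function that consumes these options is not included in the snippet. A hedged sketch follows, assuming the i-vectors are read with PyKaldi's SequentialVectorReader and that <utt2clusterid-rxfilename> is written as a plain "utt-id cluster-id" text file; both are assumptions.

import numpy as np
from sklearn.cluster import MiniBatchKMeans

from kaldi.util.table import SequentialVectorReader


def kmeans_vector(vector_rspecifier, utt2clusterid_filename, opts):
    """Hypothetical clustering step wiring the options registered above into sklearn."""
    keys, rows = [], []
    with SequentialVectorReader(vector_rspecifier) as reader:
        for key, vector in reader:
            keys.append(key)
            rows.append(vector.numpy())
    if not rows:
        return False
    labels = MiniBatchKMeans(n_clusters=opts.n_clusters,
                             random_state=opts.random_state,
                             batch_size=opts.batch_size,
                             max_iter=opts.max_iter).fit_predict(np.stack(rows))
    with open(utt2clusterid_filename, "w") as out:
        for key, label in zip(keys, labels):
            out.write("%s %d\n" % (key, label))
    return True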
Example #7
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
        "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)
    usage = """Use t-sne (t-distributed Stochastic Neighbor Emedding) for dimension reduction.
  For the details, Please refer to website:
  https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE

  Usage: tsne-vector.py [options] <vector-rspecifier> <vector-wspecifier>

  e.g.
      tsne-vector.py scp:data/train/ivector.scp ark:data/train/low_dim_vector.ark
  """
    po = ParseOptions(usage)
    po.register_int(
        "output-dim", 2, "dimension of the output vectors."
        " For visualization, only 2 is allowed in this program. (2 by default)"
    )
    po.register_double(
        "perplexity", 30,
        "The perplexity is related to the number of nearest neighbors that is used"
        " in other mainfold learning algorithms. Large datasets usually require a"
        " large perplexity. Consider selecting a value between 5 and 50. Different"
        " values can result in significantly different results. (30 by default)"
    )
    po.register_double(
        "learning-rate", 200.0,
        "The learning rate for t-sne is usually in the range [10.0, 1000.0]. If the"
        " learning rate is too high, the data may look like a \'ball\' with any point"
        " approximately equidistant from its nearest neighbors. If the learning rate"
        " is too low, most points may look compressed in a dense cloud with few outliers."
        " If the cost function gets stuck in a bad local minimum increasing the learning"
Example #8
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
        "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)
    usage = """Convert features into posterior format, which is the generic format
  of NN training targets in Karel's nnet1 tools.
  (speed is not an issue for reasonably low NN-output dimensions)
  Usage: feat-to-post.py [options] feature_rspecifier posteriors_wspecifier

  e.g.
      feat-to-post scp:feats.scp ark:post.ark

  """
    po = ParseOptions(usage)
    po.register_int("top-n", 10, "N posteriors per frame, 10 by default")
    opts = po.parse_args()

    if po.num_args() != 2:
        po.print_usage()
        sys.exit()

    feature_rspecifier = po.get_arg(1)
    posterior_wspecifier = po.get_arg(2)
    isSuccess = feat_to_post(feature_rspecifier, posterior_wspecifier,
                             opts.top_n)
    if not isSuccess:
        sys.exit()
Example #9
          file=sys.stderr)

    return num_success != 0


if __name__ == '__main__':
    usage = """Create MFCC feature files.

    Usage:  compute-mfcc-feats [options...] <wav-rspecifier> <feats-wspecifier>
    """
    po = ParseOptions(usage)

    mfcc_opts = MfccOptions()
    mfcc_opts.register(po)

    po.register_int("sampling-rate", 16000,
                    "Sampling rate of waveforms and labels.")
    po.register_int(
        "signal-window-length", 200,
        "Window length in ms (what will be presented to the network).")
    po.register_int("label-window-length", 25,
                    "Window length of alignments / labels in ms.")
    po.register_int("label-window-shift", 10,
                    "Window shift of alignments / labels in ms.")
    po.register_bool(
        "subtract-mean", False, "Subtract mean of each feature"
        "file [CMS]; not recommended to do it this way.")
    po.register_int(
        "channel", -1, "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    po.register_float(
        "min-duration", 0.0, "Minimum duration of segments "