def load_model(config_file,
               online_config,
               models_path='models/',
               beam_size=10,
               frames_per_chunk=50):
    """Create an online nnet3 recognizer from a YAML model description.

    The ``decoder`` section of ``config_file`` names the model files. When
    ``online_config`` does not exist yet it is generated from that section;
    an existing file is used unchanged.

    Args:
        config_file: Path of the YAML file. It must contain a ``decoder``
            mapping with the keys ``model``, ``fst`` and ``word-syms``, plus
            ``mfcc-config``, ``ivector-extraction-config`` and
            ``endpoint-silence-phones`` when ``online_config`` has to be
            generated.
        online_config: Path of the online configuration file to read (and to
            create first if it is missing).
        models_path: Prefix that is string-concatenated in front of every
            file name taken from the YAML file, so it must carry its own
            trailing separator.
        beam_size: Value assigned to the decoder's ``beam`` option.
        frames_per_chunk: Value assigned to the decodable options'
            ``frames_per_chunk``.

    Returns:
        The tuple ``(asr, feat_info, decodable_opts)``.

    Raises:
        KeyError: If a required key is missing from the YAML file. This is
            raised before ``online_config`` is created, so no partial file is
            left behind.
    """
    # Read YAML file
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)

    decoder_yaml_opts = model_yaml['decoder']

    print(decoder_yaml_opts)

    feat_opts = OnlineNnetFeaturePipelineConfig()
    endpoint_opts = OnlineEndpointConfig()

    if not os.path.isfile(online_config):
        print(online_config +
              ' does not exists. Trying to create it from yaml file settings.')
        print(
            'See also online_config_options.info.txt for what possible settings are.'
        )
        # Build the whole file content before opening the file for writing:
        # a missing YAML key then fails here instead of leaving a truncated
        # config that the next call would accept as an existing, valid file.
        online_config_lines = [
            "--add_pitch=False\n",
            "--mfcc_config=" + models_path +
            decoder_yaml_opts['mfcc-config'] + "\n",
            "--feature_type=mfcc\n",
            "--ivector_extraction_config=" + models_path +
            decoder_yaml_opts['ivector-extraction-config'] + '\n',
            "--endpoint.silence-phones=" +
            decoder_yaml_opts['endpoint-silence-phones'] + '\n',
        ]
        with open(online_config, 'w') as online_config_file:
            online_config_file.writelines(online_config_lines)
    else:
        print("Loading online conf from:", online_config)

    # Register both option groups on the parser, then read the config file.
    po = ParseOptions("")
    feat_opts.register(po)
    endpoint_opts.register(po)
    po.read_config_file(online_config)
    feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = beam_size
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleLoopedComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = frames_per_chunk
    asr = NnetLatticeFasterOnlineRecognizer.from_files(
        models_path + decoder_yaml_opts["model"],
        models_path + decoder_yaml_opts["fst"],
        models_path + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts,
        endpoint_opts=endpoint_opts)

    return asr, feat_info, decodable_opts
Exemple #2
0
    def LoadModels(self):
        """Load the online configuration, the acoustic model and the graph.

        Reads ``online.conf`` from ``self.CONFIG_FILES_PATH``, ``final.mdl``
        from ``self.AM_PATH`` and ``HCLG.fst``, ``words.txt`` and
        ``word_boundary.int`` from ``self.LM_PATH``. The resulting objects
        (feature info, models, decoder graph, symbol table, word-boundary
        info and the recognizer ``self.asr``) are stored on the instance.

        Raises:
            ValueError: If any step fails; the underlying exception is passed
                to ``self.log.error`` first.
        """
        try:
            # Define online feature pipeline
            parser = ParseOptions("")

            lattice_opts = LatticeFasterDecoderOptions()
            self.endpoint_opts = OnlineEndpointConfig()
            self.decodable_opts = NnetSimpleLoopedComputationOptions()
            pipeline_cfg = OnlineNnetFeaturePipelineConfig()

            # Register every option group on the parser, then read the file.
            for option_group in (lattice_opts, self.endpoint_opts,
                                 self.decodable_opts, pipeline_cfg):
                option_group.register(parser)

            parser.read_config_file(self.CONFIG_FILES_PATH + "/online.conf")
            self.feat_info = OnlineNnetFeaturePipelineInfo.from_config(
                pipeline_cfg)

            # Set metadata parameters
            frame_opts = self.feat_info.mfcc_opts.frame_opts
            self.samp_freq = frame_opts.samp_freq
            self.frame_shift = frame_opts.frame_shift_ms / 1000
            self.acwt = self.decodable_opts.acoustic_scale

            # Load Acoustic and graph models and other files
            models = NnetRecognizer.read_model(self.AM_PATH + "/final.mdl")
            self.transition_model, self.acoustic_model = models

            hclg = _fst.read_fst_kaldi(self.LM_PATH + "/HCLG.fst")
            self.decoder_graph = LatticeFasterOnlineDecoder(hclg, lattice_opts)

            words_path = self.LM_PATH + "/words.txt"
            self.symbols = _fst.SymbolTable.read_text(words_path)

            boundary_path = self.LM_PATH + "/word_boundary.int"
            self.info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                                   boundary_path)

            self.asr = NnetLatticeFasterOnlineRecognizer(
                self.transition_model,
                self.acoustic_model,
                self.decoder_graph,
                self.symbols,
                decodable_opts=self.decodable_opts,
                endpoint_opts=self.endpoint_opts)

            # Drop the local references to the graph and the decoder options.
            del hclg, lattice_opts
        except Exception as err:
            self.log.error(err)
            raise ValueError(
                "AM and LM loading failed!!! (see logs for more details)")
def extract_spec(filename,
                 samp_freq,
                 frame_length_ms=25,
                 frame_shift_ms=10,
                 round_to_power_of_two=True,
                 snip_edges=True):
    '''
    Extract the spectrogram of one wav file using kaldi.

    Side effects: writes ``wav.scp`` and ``spec.ark`` (text archive) into the
    current working directory, and calls ``ParseOptions.parse_args()`` to
    obtain the ``min-duration`` option.
    NOTE(review): parse_args() presumably reads the command line of the
    running process -- confirm this is intended for a library function.

    args:
        filename: wav file path
        samp_freq: sample frequency the spectrogram is computed at; the wav
            file's own rate must be an integer multiple of it
        frame_length_ms: frame length in milliseconds
        frame_shift_ms: frame shift in milliseconds
        round_to_power_of_two: forwarded to the frame options
        snip_edges: forwarded to the frame options
    return:
        spectrogram: numpy array, (frame, fre)
    raises:
        ValueError: if no spectrogram was produced, e.g. the wav entry was
            skipped by the reader or is shorter than ``min-duration``
        AssertionError: if the wav sample rate is lower than, or not a
            multiple of, ``samp_freq``
    '''
    # get rspec and wspec
    with open('wav.scp', 'w') as scp_file:
        scp_file.write('test1 ' + filename + '\n')
    rspec = 'scp,p:' + 'wav.scp'
    wspec = 'ark,t:' + 'spec.ark'
    # set po
    usage = """Extract MFCC features.Usage: example.py [opts...] <rspec> <wspec>"""
    po = ParseOptions(usage)
    po.register_float("min-duration", 0.0, "minimum segment duration")
    opts = po.parse_args()
    # set options
    spec_opts = SpectrogramOptions()
    spec_opts.frame_opts.samp_freq = samp_freq
    spec_opts.frame_opts.frame_length_ms = frame_length_ms
    spec_opts.frame_opts.frame_shift_ms = frame_shift_ms
    spec_opts.frame_opts.round_to_power_of_two = round_to_power_of_two
    spec_opts.frame_opts.snip_edges = snip_edges
    # Registered after parse_args() has already run, exactly as before.
    spec_opts.register(po)
    spec = Spectrogram(spec_opts)
    sf = spec_opts.frame_opts.samp_freq
    # Stays None when the loop below never computes a spectrogram; previously
    # that case ended in an UnboundLocalError at the return statement.
    f_array = None
    with SequentialWaveReader(rspec) as reader, MatrixWriter(wspec) as writer:
        for key, wav in reader:
            if wav.duration < opts.min_duration:
                continue
            assert (wav.samp_freq >= sf)
            assert (wav.samp_freq % sf == 0)
            samples = wav.data()
            # Decimate by plain sample dropping down to the target rate.
            samples = samples[:, ::int(wav.samp_freq / sf)]
            # Average over axis 0 of the wave data to get a single signal.
            mono = SubVector(mean(samples, axis=0))
            feats = spec.compute_features(mono, sf, 1.0)
            f_array = np.array(feats)
            writer[key] = feats
    if f_array is None:
        raise ValueError(
            'no spectrogram could be extracted from ' + str(filename))
    return f_array
    def compute_feat_KALDI(self, wav):
        """Compute MFCC features for ``wav`` with the Kaldi ``Mfcc`` extractor.

        The extractor is configured from the instance attributes ``sr``,
        ``frame_length_s``, ``frame_shift_s``, ``num_bins``, ``low_freq``,
        ``high_freq`` and ``num_ceps``; energy is not used and downsampling
        is not allowed.

        Args:
            wav: Waveform handed to ``Mfcc.compute_features``.

        Returns:
            The feature object returned by ``Mfcc.compute_features``.

        Raises:
            ValueError: If configuration or extraction fails; the underlying
                exception is passed to ``self.log.error`` first.
        """
        try:
            parser = ParseOptions("")
            options = MfccOptions()

            # Cepstral settings
            options.use_energy = False
            options.num_ceps = self.num_ceps

            # Framing settings (the instance stores durations in seconds)
            options.frame_opts.samp_freq = self.sr
            options.frame_opts.frame_length_ms = self.frame_length_s*1000
            options.frame_opts.frame_shift_ms = self.frame_shift_s*1000
            options.frame_opts.allow_downsample = False

            # Mel filter bank settings
            options.mel_opts.num_bins = self.num_bins
            options.mel_opts.low_freq = self.low_freq
            options.mel_opts.high_freq = self.high_freq

            options.register(parser)

            # Create MFCC object and extract the features at the configured rate
            extractor = Mfcc(options)
            features = extractor.compute_features(wav, self.sr, 1.0)
            return features
        except Exception as err:
            self.log.error(err)
            raise ValueError(
                "Speaker diarization failed while extracting features!!!")
Exemple #5
0
       ...

   Posterior-formatted posterior:
     <uttid> [[(0,0.1), (1,0.89), (5,0.01)],
              [(1,0,9), (5,0.1)],
                ...
              [(0,0.8), (1,0.2)]]
       ... 

  Usage: feat-to-post.py [options] feature_rspecifier posteriors_wspecifier

  e.g.
      feat-to-post scp:feats.scp ark:post.ark

  """
  po = ParseOptions(usage)
  po.register_int("top-n", 10,
                  "only keep highest N posteriors per frame, 10 by default")
  po.register_bool("rescale", False,
                   "rescale top N posteriors to let summation equals to 1, false by default")
  opts = po.parse_args()

  if (po.num_args() != 2):
    po.print_usage()
    sys.exit()

  feature_rspecifier = po.get_arg(1)
  posterior_wspecifier = po.get_arg(2)
  isSuccess = feat_to_post(feature_rspecifier, posterior_wspecifier,
                           opts.top_n, opts.rescale)
  if not isSuccess:
Exemple #6
0
    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)

    if opts.vtln_map:
        vtln_map_reader.close()

    return num_success != 0


if __name__ == '__main__':
    usage = """Create MFCC feature files.

    Usage:  compute-mfcc-feats [options...] <wav-rspecifier> <feats-wspecifier>
    """
    po = ParseOptions(usage)

    mfcc_opts = MfccOptions()
    mfcc_opts.register(po)

    po.register_bool(
        "subtract-mean", False, "Subtract mean of each feature"
        "file [CMS]; not recommended to do it this way.")
    po.register_float(
        "vtln-warp", 1.0, "Vtln warp factor (only applicable "
        "if vtln-map not specified)")
    po.register_str(
        "vtln-map", "", "Map from utterance or speaker-id to "
        "vtln warp factor (rspecifier)")
    po.register_str(
        "utt2spk", "", "Utterance to speaker-id map rspecifier"
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
        "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)
    usage = """Convert features into posterior format, which is the generic format
  of NN training target in Karel's nnet1 tools.
  (spped is not an issue for reasonably low NN-output dimensions)
  Usage: feat-to-post.py [options] feature_rspecifier posteriors_wspecifier

  e.g.
      feat-to-post scp:feats.scp ark:post.ark

  """
    po = ParseOptions(usage)
    po.register_int("top-n", 10, "N posteriors per frame, 10 by default")
    opts = po.parse_args()

    if (po.num_args() != 2):
        po.print_usage()
        sys.exit()

    feature_rspecifier = po.get_arg(1)
    posterior_wspecifier = po.get_arg(2)
    isSuccess = feat_to_post(feature_rspecifier, posterior_wspecifier,
                             opts.top_n)
    if not isSuccess:
        sys.exit()
Exemple #8
0

if __name__ == '__main__':
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, 'LOG')
    logging.basicConfig(
        format='%(levelname)s (%(module)s[{}]:%(funcName)s():'
        '%(filename)s:%(lineno)s) %(message)s'.format(__version__),
        level=logging.INFO)

    usage = """Extract segments from a large audio file in WAV format.
    Usage:
        extract-segments [options] <wav-rspecifier> <segments-file> <wav-wspecifier>
    """
    po = ParseOptions(usage)
    po.register_float(
        "min-segment-length", 0.1, "Minimum segment length "
        "in seconds (reject shorter segments)")
    po.register_float(
        "max_overshoot", 0.5, "End segments overshooting audio "
        "by less than this (in seconds) are truncated, "
        "else rejected.")

    opts = po.parse_args()
    if po.num_args() != 3:
        po.print_usage()
        sys.exit()

    wav_rspecifier = po.get_arg(1)
    segments_rxfilename = po.get_arg(2)
Exemple #9
0
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleLoopedComputationOptions
from kaldi.online2 import (OnlineEndpointConfig,
                           OnlineIvectorExtractorAdaptationState,
                           OnlineNnetFeaturePipelineConfig,
                           OnlineNnetFeaturePipelineInfo,
                           OnlineNnetFeaturePipeline, OnlineSilenceWeighting)
from kaldi.util.options import ParseOptions
from kaldi.util.table import SequentialWaveReader

chunk_size = 1440

# Define online feature pipeline
feat_opts = OnlineNnetFeaturePipelineConfig()
endpoint_opts = OnlineEndpointConfig()
po = ParseOptions("")
feat_opts.register(po)
endpoint_opts.register(po)
po.read_config_file("online.conf")
feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
asr = NnetLatticeFasterOnlineRecognizer.from_files(
    "final.mdl",
Exemple #10
0
    print(
        "Done {} out of {} utterances".format(num_success, num_utts),
        file=sys.stderr,
    )

    return num_success != 0


if __name__ == "__main__":

    usage = """Compute VAD.

    Usage:  compute-vad [options...] <wav-rspecifier> <feats-wspecifier>
    """

    po = ParseOptions(usage)

    po.register_float(
        "min-duration",
        0.0,
        "Minimum duration of segments to process in seconds (default: 0.0).",
    )
    po.register_int(
        "channel",
        -1,
        "Channel to extract (-1 -> mono (default), 0 -> left, 1 -> right)",
    )
    po.register_int("frame-window", 25,
                    "Length of frame window in ms (default: 25)")
    po.register_int("frame-shift", 10,
                    "Length of frame shift in ms (default: 10)")
    side's stats).  Reads a 'reco2file_and_channel' file, normally like
    sw02001-A sw02001 A
    sw02001-B sw02001 B
    sw02005-A sw02005 A
    sw02005-B sw02005 B
    interpreted as <utterance-id> <call-id> <side> and for each <call-id>
    that has two sides, does the 'only-the-louder' computation, else does
    per-utterance stats in the normal way.
    Note: loudness is judged by the first feature component, either energy or c0
    only applicable to MFCCs or PLPs (this code could be modified to handle filterbanks).

    Usage: compute-cmvn-stats-two-channel [options] <reco2file-and-channel> <feats-rspecifier> <stats-wspecifier>
    e.g.: compute-cmvn-stats-two-channel data/train_unseg/reco2file_and_channel scp:data/train_unseg/feats.scp ark,t:-
    """

    po = ParseOptions(usage)

    po.register_float(
        "quieter_channel_weight", 0.01, "For the quieter channel,"
        " apply this weight to the stats, so that we still get "
        "stats if one channel always dominates.")

    opts = po.parse_args()

    if po.num_args() != 3:
        po.print_usage()
        sys.exit(1)

    reco2file_and_channel_rxfilename = po.get_arg(1)
    feats_rspecifier = po.get_arg(2)
    stats_wspecifier = po.get_arg(3)
Exemple #12
0
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
                               "%(filename)s:%(lineno)s) %(message)s"
                               .format(__version__), level=logging.INFO)

    usage = """Decode features using GMM-based model.

    Usage:  gmm-decode-faster.py [options] model-in fst-in features-rspecifier
                words-wspecifier [alignments-wspecifier [lattice-wspecifier]]

    Note: lattices, if output, will just be linear sequences;
          use gmm-latgen-faster if you want "real" lattices.
    """
    po = ParseOptions(usage)
    decoder_opts = FasterDecoderOptions()
    decoder_opts.register(po, True)
    po.register_float("acoustic-scale", 0.1,
                      "Scaling factor for acoustic likelihoods")
    po.register_bool("allow-partial", True,
                     "Produce output even when final state was not reached")
    po.register_str("word-symbol-table", "",
                    "Symbol table for words [for debug output]");
    opts = po.parse_args()

    if po.num_args() < 4 or po.num_args() > 6:
        po.print_usage()
        sys.exit()

    model_rxfilename = po.get_arg(1)
Exemple #13
0
                        help="apply cepstrum mean normalizaiton per utterance")
    parser.add_argument('--sample_rate',
                        type=int,
                        default=16000,
                        help='sample rate of waves')
    parser.add_argument('--feat_config',
                        type=str,
                        default=None,
                        help='feature extraction config file')
    parser.add_argument('--feat_dim',
                        type=int,
                        default=80,
                        help='feature dimension')
    args, unk = parser.parse_known_args()

    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    speed_rate = [0.9, 1.0, 1.1]
    cmvn = Cmvn(args.feat_dim)

    with open(args.data_lst, 'r', encoding='utf-8') as data_lst_f:
        for line in data_lst_f:
            mrk_fn = line.split()[0]
            seq_fn = line.split()[1]
            with open(mrk_fn, 'r', encoding='utf-8') as mrk, \
                 open(seq_fn, 'rb') as seq:
                for mrk_line in mrk:
                    seq.seek(int(mrk_line.split()[1]))
# =============================================================================
# ----------------------------- Model Loading
# =============================================================================
log_file = open(log_filepath, "w")
summ_file = open(summ_filepath, "w")

chunk_size = 1440

# Define online feature pipeline
#feats_args = "--mfcc-config="  + mfcc_hires_path + " " +\
#                    "--ivector-extraction-config=" + ivector_extractor_path +\
#                    "-verbose=1"

feat_opts = OnlineNnetFeaturePipelineConfig()
endpoint_opts = OnlineEndpointConfig()
po = ParseOptions("")
feat_opts.register(po)
endpoint_opts.register(po)
po.read_config_file(online_config_path)
feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150

print('Loading inference model from files\n {} \n {} \n {}\n'\
Exemple #15
0
if __name__ == '__main__':
  # Command-line entry point: parse the three positional arguments and run
  # two_dim_vector_visual() on them.

  # Configure log messages to look like Kaldi messages
  from kaldi import __version__
  logging.addLevelName(20, "LOG")
  logging.basicConfig(format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
                             "%(filename)s:%(lineno)s) %(message)s"
                             .format(__version__), level=logging.INFO)
  usage = """save the visualization plot of 2-dimensional vectors to hardisk.

  Usage: two-dim-vector-visual.py [options] <vector-rspecifier> <utt2spk-rxfilename> <figure-rxfilename>

  e.g.
      two-dim-vector-visual.py scp:data/train/2d_vectors.scp data/train/utt2spk data/train/2d_vectors.png
  """
  po = ParseOptions(usage)
  opts = po.parse_args()

  # A wrong number of positional arguments is a usage error; exit with a
  # non-zero status so calling scripts can detect it (bare sys.exit() is 0).
  if po.num_args() != 3:
    po.print_usage()
    sys.exit(1)

  vector_rspecifier = po.get_arg(1)
  utt2spk_rxfilename = po.get_arg(2)
  figure_rxfilename = po.get_arg(3)
  isSuccess = two_dim_vector_visual(vector_rspecifier,
                                    utt2spk_rxfilename,
                                    figure_rxfilename)
  if not isSuccess:
    # Report the failure to the caller instead of exiting with status 0.
    sys.exit(1)
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
        "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)
    usage = """Use MiniBatchKMeans for vector clustering. It outputs cluster assignments
  For the details, Please refer to website:
  https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans

  Usage: kmeans-vector.py [options] <vector-rspecifier> <utt2clusterid-rxfilename>

  e.g.
      kmeans-vector.py scp:data/train/ivector.scp data/train/utt2clusterid
  """
    po = ParseOptions(usage)
    po.register_int(
        "n-clusters", 8,
        "The number of clusters to form as well as the number of centroids to generate. default=8"
    )
    po.register_int(
        "random-state", 0,
        "Determines random number generation for centroid initialization and random reassignment. "
        "Use an int to make the randomness deterministic. ")
    po.register_int("batch-size", 6, "Size of the mini batches.")
    po.register_int(
        "max-iter", 100,
        "Maximum number of iterations over the complete dataset before stopping independently of "
        "any early stopping criterion heuristics.")
    opts = po.parse_args()
Exemple #17
0
    from kaldi import __version__
    logging.addLevelName(20, "LOG")
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
        "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)
    usage = """Use t-sne (t-distributed Stochastic Neighbor Emedding) for dimension reduction.
  For the details, Please refer to website:
  https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE

  Usage: tsne-vector.py [options] <vector-rspecifier> <vector-wspecifier

  e.g.
      tsne-vector.py scp:data/train/ivector.scp ark:data/train/low_dim_vector.ark
  """
    po = ParseOptions(usage)
    po.register_int(
        "output-dim", 2, "dimension of the output vectors."
        " For visualization, only 2 is allowed in this program. (2 by default)"
    )
    po.register_double(
        "perplexity", 30,
        "The perplexity is related to the number of nearest neighbors that is used"
        " in other mainfold learning algorithms. Large datasets usually require a"
        " large perplexity. Consider selecting a value between 5 and 50. Different"
        " values can result in significantly different results. (30 by default)"
    )
    po.register_double(
        "learning-rate", 200.0,
        "The learning rate for t-sne is usually in the range [10.0, 1000.0]. If the"
        " learning rate is too high, the data may look like a \'ball\' with any point"
Exemple #18
0
def otf_utt_generator(data_triplets, rir, noise, args):
    """
    Generate padded (features, targets) batches with on-the-fly augmentation.

    For every utterance the raw int16 audio is read from the seq file,
    speed- and gain-perturbed, converted to filter-bank features, spliced and
    subsampled, and stored in fixed-size numpy buffers together with its
    alignment. Each time ``args.batch_size`` utterances have been read, one
    batch is yielded.

    Args:
        data_triplets: iterable of triplets; item [0] is the mrk file path,
            item [1] the seq (raw audio) file path and item [2] the rspecifier
            of the alignments (labels)
        rir: list of rir, List[AudioSegment]; only referenced by the
            commented-out example below
        noise: list of noise, List[AudioSegment]; only referenced by the
            commented-out example below
        args: arguments for the loader. Attributes read here: max_len,
            batch_size, speed_rate, gain_range, feat_config, sample_rate,
            reverse_labels, SOS, EOS, stride, TU_limit, lctx, rctx,
            padding_tgt, batch_first (args is also passed to get_inputdim)

    Yields:
        ``(data, target, lens, ali_lens)`` torch tensors for each batch that
        holds at least one accepted utterance;
        ``(None, None, IntTensor([0]), IntTensor([0]))`` for a batch in which
        every utterance was rejected by the T*U limit; and finally a single
        ``None`` once all triplets are consumed.

    Note:
        A batch is only emitted when the read counter reaches batch_size, so
        utterances left over when the input ends are never yielded.
    """
    max_len = args.max_len
    batch_size = args.batch_size
    # Buffers are allocated once and reused for every batch.
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)

    # batch_idx counts every utterance read for the current batch, valid_idx
    # only those that passed the T*U limit and were stored in the buffers.
    batch_idx = 0
    valid_idx = 0
    target_len = 0
    batch_max_len = -1
    target_max_len = -1

    #rates for speed perturbation (comma-separated list in args.speed_rate)
    speed_rate = [float(rate) for rate in args.speed_rate.split(',')]
    #volume level perturbation: exactly two comma-separated values, each negated
    gain_lo, gain_hi = [-float(gain) for gain in args.gain_range.split(',')]
    #snr range for noise perturbation: 0-20db with mean of 10
    #mu, sigma = 10, 10
    #lo, hi = (0 - mu) / sigma, (20 - mu) / sigma
    #Fbank config
    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    #fbank_opt = MfccOptions()
    #fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    #fbank = Mfcc(fbank_opt)

    for data_triplet in data_triplets:
        mrk_fn, seq_fn = data_triplet[0], data_triplet[1]
        ali_rspec = data_triplet[2]
        with open(mrk_fn, 'r', encoding='utf-8') as mrk,\
             open(seq_fn, 'rb') as seq:
            ali_reader = SequentialIntVectorReader(ali_rspec)
            # The mrk file and the alignment archive are walked in lockstep,
            # so both must list the utterances in the same order.
            for line, (uttid1, ali) in zip(mrk, ali_reader):
                # mrk line fields: <uttid> <byte offset in seq> <byte count>
                uttid = line.split()[0]
                assert uttid == uttid1
                seq.seek(int(line.split()[1]))
                num_bytes = int(line.split()[2])
                # Samples are read as int16 below; force an even byte count.
                num_bytes -= num_bytes % 2
                audio_bytes = seq.read(num_bytes)
                audio_np = np.frombuffer(audio_bytes, dtype='int16')
                #data augmentation function goes here
                audio_seg = AudioSegment(audio_np, args.sample_rate)
                #speed perturbation: pick one of the configured rates at random
                # NOTE(review): assumes randint's upper bound is inclusive
                # (random.randint semantics) -- confirm against the import.
                spr = speed_rate[randint(0, len(speed_rate) - 1)]
                audio_seg.change_speed(spr)
                # Normalize to a level drawn uniformly from [gain_lo, gain_hi).
                audio_seg.normalize(np.random.uniform(gain_lo, gain_hi))
                #noise adding example:
                #snr = truncnorm.rvs(lo, hi, scale=sigma, loc=mu, size=1)
                #audio_seg.add_noise(noise[randint(0, len(noise)-1)], snr)
                #rir adding example:
                #audio_seg.convolve_and_normalize(rir[randint(0, len(rir)-1)])
                audio_np = audio_seg._convert_samples_from_float32(\
                                     audio_seg.samples, 'int16')
                wave_1ch = Vector(audio_np)
                feats = fbank.compute_features(wave_1ch,
                                               args.sample_rate,
                                               vtnl_warp=1.0)
                ali = np.array(ali)
                if args.reverse_labels:
                    ali = ali[::-1]
                # A negative SOS / EOS value disables the respective symbol.
                if args.SOS >= 0:
                    ali = np.concatenate(([args.SOS], ali))
                if args.EOS >= 0:
                    ali = np.concatenate((ali, [args.EOS]))
                feats = _matrix_ext.matrix_to_numpy(feats)
                # Number of frames after subsampling: ceil(frames / stride).
                utt_len = feats.shape[0] // args.stride + \
                          int(feats.shape[0] % args.stride != 0)
                #limits on T*U products due to RNNT.
                #this is pretty hacky now
                if ali.shape[0] * utt_len // 3 <= args.TU_limit:
                    ali_len[valid_idx] = ali.shape[0]
                    data_buffer[valid_idx, :utt_len, :] = \
                        splice(feats, args.lctx, args.rctx)[::args.stride]
                    target_buffer[valid_idx, :ali_len[valid_idx]] = ali
                    len_buffer[valid_idx] = utt_len
                    # Track the longest input / target of the current batch.
                    if utt_len > batch_max_len:
                        batch_max_len = utt_len
                    if ali_len[valid_idx] > target_max_len:
                        target_max_len = ali_len[valid_idx]
                    valid_idx += 1

                # Rejected utterances still count towards the batch size.
                batch_idx += 1

                if batch_idx == batch_size:
                    for b in range(valid_idx):
                        utt_len = len_buffer[b]
                        target_len = ali_len[b]
                        #data and target padding: repeat the last valid frame,
                        #fill the target tail with args.padding_tgt
                        if utt_len > 0:
                            data_buffer[b, utt_len:batch_max_len, :] = \
                                data_buffer[b, utt_len-1, :]
                            target_buffer[b, target_len:target_max_len] = \
                                args.padding_tgt

                    # Trim the buffers to the accepted utterances and to the
                    # longest input / target length of this batch.
                    data = data_buffer[:valid_idx, :batch_max_len, :]
                    target = target_buffer[:valid_idx, :target_max_len]

                    if not args.batch_first:
                        # Move the time axis in front of the batch axis.
                        data = np.transpose(data, (1, 0, 2))
                        target = np.transpose(target, (1, 0))

                    # Copy before wrapping: the numpy buffers are reused for
                    # the next batch.
                    data = torch.from_numpy(np.copy(data))
                    target = torch.from_numpy(np.copy(target))
                    lens = torch.from_numpy(np.copy(len_buffer[:valid_idx]))
                    ali_lens = torch.from_numpy(np.copy(ali_len[:valid_idx]))

                    if valid_idx > 0:
                        #not doing cuda() here, in main process instead
                        yield data, target, lens, ali_lens
                    else:
                        # Every utterance of this batch was rejected: yield a
                        # placeholder with zero lengths instead.
                        yield None, None, \
                              torch.IntTensor([0]), torch.IntTensor([0])

                    # Reset the per-batch state.
                    batch_idx = 0
                    valid_idx = 0
                    target_len = 0
                    batch_max_len = -1
                    target_max_len = -1

            ali_reader.close()

    # End marker: a single None (not a tuple) after the last triplet.
    yield None
Exemple #19
0
if __name__ == '__main__':
  # Command-line entry point: parse the options and the two positional
  # arguments, then run post_to_count() on them.

  # Configure log messages to look like Kaldi messages
  from kaldi import __version__
  logging.addLevelName(20, "LOG")
  logging.basicConfig(format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
                             "%(filename)s:%(lineno)s) %(message)s"
                             .format(__version__), level=logging.INFO)
  usage = """Compute the counts of *feature-formatted* posterior for each mixture. 
  If --normalize=True and --per-utt=False, the counts will be averaged by the
    number of utterances.
  Usage: post-count.py [options] feature_rspecifier posteriors_wspecifier

  e.g.
      post-count scp:feats.scp ark,t:count.txt

  """
  po = ParseOptions(usage)
  po.register_bool("normalize", False, "normalize the counts, False by default")
  po.register_bool("per-utt", False, "Count per utterance, False by default")
  opts = po.parse_args()

  # A wrong number of positional arguments is a usage error; exit with a
  # non-zero status so calling scripts can detect it (bare sys.exit() is 0).
  if po.num_args() != 2:
    po.print_usage()
    sys.exit(1)

  feature_rspecifier = po.get_arg(1)
  posterior_wspecifier = po.get_arg(2)
  isSuccess = post_to_count(feature_rspecifier,
                            posterior_wspecifier,
                            normalize=opts.normalize,
                            per_utt=opts.per_utt)
  if not isSuccess:
    # Report the failure to the caller instead of exiting with status 0.
    sys.exit(1)
Exemple #20
0
                      file=sys.stderr)

    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)

    return num_success != 0


if __name__ == "__main__":

    usage = """Compute VAD.

  Usage:  compute-vad [options...] <wav-rspecifier> <feats-wspecifier>
  """

    po = ParseOptions(usage)

    po.register_float(
        "min-duration", 0.0, "Minimum duration of segments "
        "to process (in seconds).")
    po.register_int(
        "channel", -1, "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    po.register_int("frame-window", 25, "Length of frame window in ms "
                    "default is 25ms")
    po.register_int("frame-shift", 10, "Length of frame shift in ms "
                    "default is 10ms")
    po.register_int("nfft", 256, "Number of DFT points " "default is 256")
    po.register_int(
        "arma-order", 5, "Length of ARMA window that will be applied "
        "to the spectrogram")
Exemple #21
0

if __name__ == '__main__':
    usage = """Copy matrices, or archives of matrices (e.g. features or transforms)
    Also see copy-feats which has other format options


    Usage: copy-matrix [options] <matrix-in-rspecifier> <matrix-out-wspecifier>
    or     copy-matrix [options] <matrix-in-rxfilename> <matrix-out-wxfilename>

    e.g.
        copy-matrix --binary=false 1.mat -
        copy-matrix ark:2.trans ark,t:-
    """

    po = ParseOptions(usage)

    po.register_bool(
        "binary", True,
        "Write in binary mode (only relevant if output is a wxfilename)")
    po.register_float(
        "scale", 1.0,
        "This option can be used to scale the matrices being copied.")
    po.register_bool(
        "apply-log", False,
        "This option can be used to apply log on the matrices. Must be avoided if matrix has negative quantities."
    )
    po.register_bool("apply-exp", False,
                     "This option can be used to apply exp on the matrices")
    po.register_float(
        "apply-power", 1.0,
Exemple #22
0
    logging.basicConfig(
        format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
        "%(filename)s:%(lineno)s) %(message)s".format(__version__),
        level=logging.INFO)
    usage = """Use Principal component analysis for dimension reduction.
  For the details, Please refer to website:
  https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

  Usage: pca-vector.py [options] <vector-rspecifier> <vector-wspecifier

  e.g.
      pca-vector.py scp:data/train/ivector.scp ark:data/train/low_dim_vector.ark

  see also: two-dim-vector-visual.py
  """
    po = ParseOptions(usage)
    po.register_int(
        "output-dim", 2, "dimension of the output vectors."
        " For visualization, only 2 is allowed in this program. (2 by default)"
    )
    opts = po.parse_args()
    if (po.num_args() != 2):
        po.print_usage()
        sys.exit()

    vector_rspecifier = po.get_arg(1)
    vector_wspecifier = po.get_arg(2)
    isSuccess = pca_vector(vector_rspecifier,
                           vector_wspecifier,
                           output_dim=opts.output_dim)
    if not isSuccess:
            if num_utts % 10 == 0:
                print("Processed {} utterances".format(num_utts),
                      file=sys.stderr)

    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)

    return num_success != 0


if __name__ == '__main__':
    usage = """Create MFCC feature files.

    Usage:  compute-mfcc-feats [options...] <wav-rspecifier> <feats-wspecifier>
    """
    po = ParseOptions(usage)

    mfcc_opts = MfccOptions()
    mfcc_opts.register(po)

    po.register_int("sampling-rate", 16000,
                    "Sampling rate of waveforms and labels.")
    po.register_int(
        "signal-window-length", 200,
        "Window length in ms (what will be presented to the network).")
    po.register_int("label-window-length", 25,
                    "Window length of alignments / labels in ms.")
    po.register_int("label-window-shift", 10,
                    "Window shift of alignments / labels in ms.")
    po.register_bool(
        "subtract-mean", False, "Subtract mean of each feature"