Example 1
import os

import yaml

from kaldi.asr import NnetLatticeFasterOnlineRecognizer
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleLoopedComputationOptions
from kaldi.online2 import (OnlineEndpointConfig,
                           OnlineNnetFeaturePipelineConfig,
                           OnlineNnetFeaturePipelineInfo)
from kaldi.util.options import ParseOptions


def load_model(config_file,
               online_config,
               models_path='models/',
               beam_size=10,
               frames_per_chunk=50):
    # Read YAML file
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)

    decoder_yaml_opts = model_yaml['decoder']

    print(decoder_yaml_opts)

    feat_opts = OnlineNnetFeaturePipelineConfig()
    endpoint_opts = OnlineEndpointConfig()

    if not os.path.isfile(online_config):
        print(online_config +
              ' does not exist. Trying to create it from yaml file settings.')
        print(
            'See online_config_options.info.txt for a list of possible settings.'
        )
        with open(online_config, 'w') as online_config_file:
            online_config_file.write("--add_pitch=False\n")
            online_config_file.write("--mfcc_config=" + models_path +
                                     decoder_yaml_opts['mfcc-config'] + "\n")
            online_config_file.write("--feature_type=mfcc\n")
            online_config_file.write(
                "--ivector_extraction_config=" + models_path +
                decoder_yaml_opts['ivector-extraction-config'] + '\n')
            online_config_file.write(
                "--endpoint.silence-phones=" +
                decoder_yaml_opts['endpoint-silence-phones'] + '\n')
    else:
        print("Loading online conf from:", online_config)

    po = ParseOptions("")
    feat_opts.register(po)
    endpoint_opts.register(po)
    po.read_config_file(online_config)
    feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = beam_size
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleLoopedComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = frames_per_chunk
    asr = NnetLatticeFasterOnlineRecognizer.from_files(
        models_path + decoder_yaml_opts["model"],
        models_path + decoder_yaml_opts["fst"],
        models_path + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts,
        endpoint_opts=endpoint_opts)

    return asr, feat_info, decodable_opts
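
A minimal usage sketch for the function above (the file names and the wav.scp rspecifier are assumptions, not part of the original code); it feeds each utterance to the recognizer in one pass, following PyKaldi's online ASR API:

from kaldi.online2 import OnlineNnetFeaturePipeline
from kaldi.util.table import SequentialWaveReader

asr, feat_info, decodable_opts = load_model('model.yaml', 'online.conf')

for key, wav in SequentialWaveReader('scp:wav.scp'):
    # One feature pipeline per utterance, attached to the recognizer.
    feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
    asr.set_input_pipeline(feat_pipeline)
    feat_pipeline.accept_waveform(wav.samp_freq, wav.data()[0])
    feat_pipeline.input_finished()
    out = asr.decode()
    print(key, out['text'])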
Example 2
    def LoadModels(self):
        try:
            # Define online feature pipeline
            po = ParseOptions("")

            decoder_opts = LatticeFasterDecoderOptions()
            self.endpoint_opts = OnlineEndpointConfig()
            self.decodable_opts = NnetSimpleLoopedComputationOptions()
            feat_opts = OnlineNnetFeaturePipelineConfig()

            decoder_opts.register(po)
            self.endpoint_opts.register(po)
            self.decodable_opts.register(po)
            feat_opts.register(po)

            po.read_config_file(self.CONFIG_FILES_PATH + "/online.conf")
            self.feat_info = OnlineNnetFeaturePipelineInfo.from_config(
                feat_opts)

            # Set metadata parameters
            self.samp_freq = self.feat_info.mfcc_opts.frame_opts.samp_freq
            self.frame_shift = self.feat_info.mfcc_opts.frame_opts.frame_shift_ms / 1000
            self.acwt = self.decodable_opts.acoustic_scale

            # Load Acoustic and graph models and other files
            self.transition_model, self.acoustic_model = NnetRecognizer.read_model(
                self.AM_PATH + "/final.mdl")
            graph = _fst.read_fst_kaldi(self.LM_PATH + "/HCLG.fst")
            self.decoder_graph = LatticeFasterOnlineDecoder(
                graph, decoder_opts)
            self.symbols = _fst.SymbolTable.read_text(self.LM_PATH +
                                                      "/words.txt")
            self.info = WordBoundaryInfo.from_file(
                WordBoundaryInfoNewOpts(), self.LM_PATH + "/word_boundary.int")

            self.asr = NnetLatticeFasterOnlineRecognizer(
                self.transition_model,
                self.acoustic_model,
                self.decoder_graph,
                self.symbols,
                decodable_opts=self.decodable_opts,
                endpoint_opts=self.endpoint_opts)
            del graph, decoder_opts
        except Exception as e:
            self.log.error(e)
            raise ValueError(
                "AM and LM loading failed! (see logs for more details)") from e
Example 3
from kaldi.asr import NnetLatticeFasterOnlineRecognizer
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleLoopedComputationOptions
from kaldi.online2 import (OnlineEndpointConfig,
                           OnlineIvectorExtractorAdaptationState,
                           OnlineNnetFeaturePipelineConfig,
                           OnlineNnetFeaturePipelineInfo,
                           OnlineNnetFeaturePipeline, OnlineSilenceWeighting)
from kaldi.util.options import ParseOptions
from kaldi.util.table import SequentialWaveReader

chunk_size = 1440

# Define online feature pipeline
feat_opts = OnlineNnetFeaturePipelineConfig()
endpoint_opts = OnlineEndpointConfig()
po = ParseOptions("")
feat_opts.register(po)
endpoint_opts.register(po)
po.read_config_file("online.conf")
feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
asr = NnetLatticeFasterOnlineRecognizer.from_files(
    "final.mdl",
    "HCLG.fst",
    "words.txt",
    decoder_opts=decoder_opts,
    decodable_opts=decodable_opts,
    endpoint_opts=endpoint_opts)
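
The excerpt stops after constructing the recognizer. Given the chunk_size defined above and the SequentialWaveReader import, the decoding loop from PyKaldi's online recognizer example would continue roughly as below (the wav.scp rspecifier is an assumption):

for key, wav in SequentialWaveReader("scp:wav.scp"):
    feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
    asr.set_input_pipeline(feat_pipeline)
    data = wav.data()[0]
    last_chunk = False
    part = 1
    prev_num_frames_decoded = 0
    for i in range(0, len(data), chunk_size):
        if i + chunk_size >= len(data):
            last_chunk = True
        # Push one chunk of samples into the online feature pipeline.
        feat_pipeline.accept_waveform(wav.samp_freq, data[i:i + chunk_size])
        if last_chunk:
            feat_pipeline.input_finished()
        asr.advance_decoding()
        num_frames_decoded = asr.decoder.num_frames_decoded()
        if not last_chunk and num_frames_decoded > prev_num_frames_decoded:
            # Emit a partial hypothesis whenever new frames were decoded.
            prev_num_frames_decoded = num_frames_decoded
            out = asr.get_partial_output()
            print(key + "-part%d" % part, out["text"], flush=True)
            part += 1
    asr.finalize_decoding()
    out = asr.get_output()
    print(key + "-final", out["text"], flush=True)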
Example 4
def otf_utt_generator(data_triplets, rir, noise, args):
    """
    Args:
        data_lst: list of mrk and seq of input audios, and label ark
        rir: list of rir, List[AudioSegment]
        noise: list of noise, List[AudioSegment]
        args: argumnets for loader
    """
    max_len = args.max_len
    batch_size = args.batch_size
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)

    batch_idx = 0
    valid_idx = 0
    target_len = 0
    batch_max_len = -1
    target_max_len = -1

    #rates for speed perturbation
    speed_rate = [float(rate) for rate in args.speed_rate.split(',')]
    #volume level perturbation
    gain_lo, gain_hi = [-float(gain) for gain in args.gain_range.split(',')]
    #snr range for noise perturbation: 0-20db with mean of 10
    #mu, sigma = 10, 10
    #lo, hi = (0 - mu) / sigma, (20 - mu) / sigma
    #Fbank config
    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    #fbank_opt = MfccOptions()
    #fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    #fbank = Mfcc(fbank_opt)

    for data_triplet in data_triplets:
        mrk_fn, seq_fn = data_triplet[0], data_triplet[1]
        ali_rspec = data_triplet[2]
        with open(mrk_fn, 'r', encoding='utf-8') as mrk,\
             open(seq_fn, 'rb') as seq:
            ali_reader = SequentialIntVectorReader(ali_rspec)
            for line, (uttid1, ali) in zip(mrk, ali_reader):
                uttid = line.split()[0]
                assert uttid == uttid1
                seq.seek(int(line.split()[1]))
                num_bytes = int(line.split()[2])
                num_bytes -= num_bytes % 2
                audio_bytes = seq.read(num_bytes)
                audio_np = np.frombuffer(audio_bytes, dtype='int16')
                #data augmentation function goes here
                audio_seg = AudioSegment(audio_np, args.sample_rate)
                #speed perturbation
                spr = speed_rate[randint(0, len(speed_rate) - 1)]
                audio_seg.change_speed(spr)
                audio_seg.normalize(np.random.uniform(gain_lo, gain_hi))
                #noise adding example:
                #snr = truncnorm.rvs(lo, hi, scale=sigma, loc=mu, size=1)
                #audio_seg.add_noise(noise[randint(0, len(noise)-1)], snr)
                #rir adding example:
                #audio_seg.convolve_and_normalize(rir[randint(0, len(rir)-1)])
                audio_np = audio_seg._convert_samples_from_float32(\
                                     audio_seg.samples, 'int16')
                wave_1ch = Vector(audio_np)
                feats = fbank.compute_features(wave_1ch,
                                               args.sample_rate,
                                               vtln_warp=1.0)
                ali = np.array(ali)
                if args.reverse_labels:
                    ali = ali[::-1]
                if args.SOS >= 0:
                    ali = np.concatenate(([args.SOS], ali))
                if args.EOS >= 0:
                    ali = np.concatenate((ali, [args.EOS]))
                feats = _matrix_ext.matrix_to_numpy(feats)
                utt_len = feats.shape[0] // args.stride + \
                          int(feats.shape[0] % args.stride != 0)
                #limit on the T*U product due to RNN-T memory use;
                #this is pretty hacky for now
                if ali.shape[0] * utt_len // 3 <= args.TU_limit:
                    ali_len[valid_idx] = ali.shape[0]
                    data_buffer[valid_idx, :utt_len, :] = \
                        splice(feats, args.lctx, args.rctx)[::args.stride]
                    target_buffer[valid_idx, :ali_len[valid_idx]] = ali
                    len_buffer[valid_idx] = utt_len
                    if utt_len > batch_max_len:
                        batch_max_len = utt_len
                    if ali_len[valid_idx] > target_max_len:
                        target_max_len = ali_len[valid_idx]
                    valid_idx += 1

                batch_idx += 1

                if batch_idx == batch_size:
                    for b in range(valid_idx):
                        utt_len = len_buffer[b]
                        target_len = ali_len[b]
                        #data and target padding
                        if utt_len > 0:
                            data_buffer[b, utt_len:batch_max_len, :] = \
                                data_buffer[b, utt_len-1, :]
                            target_buffer[b, target_len:target_max_len] = \
                                args.padding_tgt

                    data = data_buffer[:valid_idx, :batch_max_len, :]
                    target = target_buffer[:valid_idx, :target_max_len]

                    if not args.batch_first:
                        data = np.transpose(data, (1, 0, 2))
                        target = np.transpose(target, (1, 0))

                    data = torch.from_numpy(np.copy(data))
                    target = torch.from_numpy(np.copy(target))
                    lens = torch.from_numpy(np.copy(len_buffer[:valid_idx]))
                    ali_lens = torch.from_numpy(np.copy(ali_len[:valid_idx]))

                    if valid_idx > 0:
                        #not doing cuda() here, in main process instead
                        yield data, target, lens, ali_lens
                    else:
                        yield None, None, \
                              torch.IntTensor([0]), torch.IntTensor([0])

                    batch_idx = 0
                    valid_idx = 0
                    target_len = 0
                    batch_max_len = -1
                    target_max_len = -1

            ali_reader.close()

    yield None
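
The ParseOptions + FbankOptions pattern at the top of this generator also works standalone; a minimal sketch (the fbank.conf file name and its contents are assumptions):

import numpy as np
from kaldi.feat.fbank import Fbank, FbankOptions
from kaldi.matrix import Vector
from kaldi.util.options import ParseOptions

po = ParseOptions('')
fbank_opts = FbankOptions()
fbank_opts.register(po)
po.read_config_file('fbank.conf')  # e.g. --num-mel-bins=80
fbank = Fbank(fbank_opts)

# One second of fake 16 kHz 16-bit audio.
samples = np.random.randint(-32768, 32767, size=16000).astype(np.float32)
feats = fbank.compute_features(Vector(samples), 16000.0, vtln_warp=1.0)
print(feats.num_rows, feats.num_cols)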
Example 5
                        default=16000,
                        help='sample rate of waves')
    parser.add_argument('--feat_config',
                        type=str,
                        default=None,
                        help='feature extraction config file')
    parser.add_argument('--feat_dim',
                        type=int,
                        default=80,
                        help='feature dimension')
    args, unk = parser.parse_known_args()

    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    speed_rate = [0.9, 1.0, 1.1]
    cmvn = Cmvn(args.feat_dim)

    with open(args.data_lst, 'r', encoding='utf-8') as data_lst_f:
        for line in data_lst_f:
            mrk_fn = line.split()[0]
            seq_fn = line.split()[1]
            with open(mrk_fn, 'r', encoding='utf-8') as mrk, \
                 open(seq_fn, 'rb') as seq:
                for mrk_line in mrk:
                    seq.seek(int(mrk_line.split()[1]))
                    num_bytes = int(mrk_line.split()[2])
                    #this is making sure even number of bytes
                    num_bytes -= num_bytes % 2
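                    # The excerpt ends here; a plausible continuation,
                    # mirroring Example 4's inner loop (hypothetical sketch,
                    # not the original code):
                    audio_bytes = seq.read(num_bytes)
                    audio_np = np.frombuffer(audio_bytes, dtype='int16')
                    feats = fbank.compute_features(Vector(audio_np),
                                                   args.sample_rate,
                                                   vtln_warp=1.0)
                    # Accumulate global CMVN statistics over the corpus.
                    cmvn.accumulate(feats)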
Example 6

log_file = open(log_filepath, "w")
summ_file = open(summ_filepath, "w")

chunk_size = 1440

# Define online feature pipeline
#feats_args = "--mfcc-config=" + mfcc_hires_path + " " +\
#             "--ivector-extraction-config=" + ivector_extractor_path + " " +\
#             "--verbose=1"

feat_opts = OnlineNnetFeaturePipelineConfig()
endpoint_opts = OnlineEndpointConfig()
po = ParseOptions("")
feat_opts.register(po)
endpoint_opts.register(po)
po.read_config_file(online_config_path)
feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150

print('Loading inference model from files\n {} \n {} \n {}\n'\
      .format(model_path, graph_path, symbols_path),
          file=sys.stderr)
log_file.write('Loading inference model from files\n {} \n {} \n {}\n'\