def load_model(config_file, online_config, models_path='models/',
               beam_size=10, frames_per_chunk=50):
    # Read YAML file
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']
    print(decoder_yaml_opts)

    feat_opts = OnlineNnetFeaturePipelineConfig()
    endpoint_opts = OnlineEndpointConfig()

    if not os.path.isfile(online_config):
        print(online_config + ' does not exist. Trying to create it from the YAML file settings.')
        print('See also online_config_options.info.txt for the possible settings.')
        with open(online_config, 'w') as online_config_file:
            online_config_file.write("--add_pitch=False\n")
            online_config_file.write("--mfcc_config=" + models_path
                                     + decoder_yaml_opts['mfcc-config'] + "\n")
            online_config_file.write("--feature_type=mfcc\n")
            online_config_file.write("--ivector_extraction_config=" + models_path
                                     + decoder_yaml_opts['ivector-extraction-config'] + '\n')
            online_config_file.write("--endpoint.silence-phones="
                                     + decoder_yaml_opts['endpoint-silence-phones'] + '\n')
    else:
        print("Loading online conf from:", online_config)

    po = ParseOptions("")
    feat_opts.register(po)
    endpoint_opts.register(po)
    po.read_config_file(online_config)
    feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = beam_size
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleLoopedComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = frames_per_chunk
    asr = NnetLatticeFasterOnlineRecognizer.from_files(
        models_path + decoder_yaml_opts["model"],
        models_path + decoder_yaml_opts["fst"],
        models_path + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts,
        endpoint_opts=endpoint_opts)

    return asr, feat_info, decodable_opts
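# A minimal usage sketch for load_model() above, following PyKaldi's standard
# whole-utterance decoding pattern. The 'models/model.yaml',
# 'models/online.conf' and 'wav.scp' paths are placeholders, not part of the
# original function.
from kaldi.online2 import OnlineNnetFeaturePipeline
from kaldi.util.table import SequentialWaveReader

asr, feat_info, decodable_opts = load_model('models/model.yaml',
                                            'models/online.conf')
for key, wav in SequentialWaveReader("scp:wav.scp"):
    feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
    asr.set_input_pipeline(feat_pipeline)
    # Feed the whole utterance at once; chunked feeding works the same way.
    feat_pipeline.accept_waveform(wav.samp_freq, wav.data()[0])
    feat_pipeline.input_finished()
    out = asr.decode()
    print(key, out["text"])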
def LoadModels(self):
    try:
        # Define online feature pipeline
        po = ParseOptions("")
        decoder_opts = LatticeFasterDecoderOptions()
        self.endpoint_opts = OnlineEndpointConfig()
        self.decodable_opts = NnetSimpleLoopedComputationOptions()
        feat_opts = OnlineNnetFeaturePipelineConfig()

        decoder_opts.register(po)
        self.endpoint_opts.register(po)
        self.decodable_opts.register(po)
        feat_opts.register(po)
        po.read_config_file(self.CONFIG_FILES_PATH + "/online.conf")
        self.feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

        # Set metadata parameters
        self.samp_freq = self.feat_info.mfcc_opts.frame_opts.samp_freq
        self.frame_shift = self.feat_info.mfcc_opts.frame_opts.frame_shift_ms / 1000
        self.acwt = self.decodable_opts.acoustic_scale

        # Load the acoustic model, decoding graph, and other files
        self.transition_model, self.acoustic_model = NnetRecognizer.read_model(
            self.AM_PATH + "/final.mdl")
        graph = _fst.read_fst_kaldi(self.LM_PATH + "/HCLG.fst")
        self.decoder_graph = LatticeFasterOnlineDecoder(graph, decoder_opts)
        self.symbols = _fst.SymbolTable.read_text(self.LM_PATH + "/words.txt")
        self.info = WordBoundaryInfo.from_file(
            WordBoundaryInfoNewOpts(), self.LM_PATH + "/word_boundary.int")
        self.asr = NnetLatticeFasterOnlineRecognizer(
            self.transition_model, self.acoustic_model, self.decoder_graph,
            self.symbols,
            decodable_opts=self.decodable_opts,
            endpoint_opts=self.endpoint_opts)
        del graph, decoder_opts
    except Exception as e:
        self.log.error(e)
        raise ValueError("AM and LM loading failed! (see logs for more details)")
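# A hedged sketch of how the recognizer loaded above might be driven for a
# single utterance. The method name decode_wav is hypothetical and not part
# of the original class; wav_data is assumed to be a 1-D waveform sampled at
# self.samp_freq.
from kaldi.online2 import OnlineNnetFeaturePipeline

def decode_wav(self, wav_data):
    feat_pipeline = OnlineNnetFeaturePipeline(self.feat_info)
    self.asr.set_input_pipeline(feat_pipeline)
    feat_pipeline.accept_waveform(self.samp_freq, wav_data)
    feat_pipeline.input_finished()
    out = self.asr.decode()
    return out["text"]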
from kaldi.asr import NnetLatticeFasterOnlineRecognizer
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleLoopedComputationOptions
from kaldi.online2 import (OnlineEndpointConfig,
                           OnlineIvectorExtractorAdaptationState,
                           OnlineNnetFeaturePipelineConfig,
                           OnlineNnetFeaturePipelineInfo,
                           OnlineNnetFeaturePipeline,
                           OnlineSilenceWeighting)
from kaldi.util.options import ParseOptions
from kaldi.util.table import SequentialWaveReader

chunk_size = 1440

# Define online feature pipeline
feat_opts = OnlineNnetFeaturePipelineConfig()
endpoint_opts = OnlineEndpointConfig()
po = ParseOptions("")
feat_opts.register(po)
endpoint_opts.register(po)
po.read_config_file("online.conf")
feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
asr = NnetLatticeFasterOnlineRecognizer.from_files(
    "final.mdl", "HCLG.fst", "words.txt",
    decoder_opts=decoder_opts,
    decodable_opts=decodable_opts,
    endpoint_opts=endpoint_opts)
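# The snippet above defines chunk_size but stops before using it. A chunked
# decoding loop in the style of PyKaldi's online-decoding examples would
# typically follow; "scp:wav.scp" is a placeholder rspecifier.
for key, wav in SequentialWaveReader("scp:wav.scp"):
    feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
    asr.set_input_pipeline(feat_pipeline)
    asr.init_decoding()
    data = wav.data()[0]
    last_chunk = False
    for i in range(0, len(data), chunk_size):
        if i + chunk_size >= len(data):
            last_chunk = True
        feat_pipeline.accept_waveform(wav.samp_freq, data[i:i + chunk_size])
        if last_chunk:
            feat_pipeline.input_finished()
        asr.advance_decoding()
    asr.finalize_decoding()
    out = asr.get_output()
    print(key, out["text"])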
def otf_utt_generator(data_triplets, rir, noise, args):
    """On-the-fly utterance generator with data augmentation.

    Args:
        data_triplets: list of (mrk, seq, ali) triplets: mrk and seq files
            of the input audio, plus the label ark rspecifier
        rir: list of room impulse responses, List[AudioSegment]
        noise: list of noise segments, List[AudioSegment]
        args: arguments for the loader
    """
    max_len = args.max_len
    batch_size = args.batch_size
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)

    batch_idx = 0
    valid_idx = 0
    target_len = 0
    batch_max_len = -1
    target_max_len = -1

    # rates for speed perturbation
    speed_rate = [float(rate) for rate in args.speed_rate.split(',')]
    # volume level perturbation
    gain_lo, gain_hi = [-float(gain) for gain in args.gain_range.split(',')]
    # snr range for noise perturbation: 0-20 dB with a mean of 10
    #mu, sigma = 10, 10
    #lo, hi = (0 - mu) / sigma, (20 - mu) / sigma

    # Fbank config
    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    #fbank_opt = MfccOptions()
    #fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    #fbank = Mfcc(fbank_opt)

    for data_triplet in data_triplets:
        mrk_fn, seq_fn = data_triplet[0], data_triplet[1]
        ali_rspec = data_triplet[2]
        with open(mrk_fn, 'r', encoding='utf-8') as mrk, \
                open(seq_fn, 'rb') as seq:
            ali_reader = SequentialIntVectorReader(ali_rspec)
            for line, (uttid1, ali) in zip(mrk, ali_reader):
                uttid = line.split()[0]
                assert uttid == uttid1
                seq.seek(int(line.split()[1]))
                num_bytes = int(line.split()[2])
                # make sure we read an even number of bytes (int16 samples)
                num_bytes -= num_bytes % 2
                audio_bytes = seq.read(num_bytes)
                audio_np = np.frombuffer(audio_bytes, dtype='int16')

                # data augmentation goes here
                audio_seg = AudioSegment(audio_np, args.sample_rate)
                # speed perturbation
                spr = speed_rate[randint(0, len(speed_rate) - 1)]
                audio_seg.change_speed(spr)
                audio_seg.normalize(np.random.uniform(gain_lo, gain_hi))
                # noise adding example:
                #snr = truncnorm.rvs(lo, hi, scale=sigma, loc=mu, size=1)
                #audio_seg.add_noise(noise[randint(0, len(noise)-1)], snr)
                # rir adding example:
                #audio_seg.convolve_and_normalize(rir[randint(0, len(rir)-1)])
                audio_np = audio_seg._convert_samples_from_float32(
                    audio_seg.samples, 'int16')

                wave_1ch = Vector(audio_np)
                feats = fbank.compute_features(wave_1ch, args.sample_rate,
                                               vtln_warp=1.0)
                ali = np.array(ali)
                if args.reverse_labels:
                    ali = ali[::-1]
                if args.SOS >= 0:
                    ali = np.concatenate(([args.SOS], ali))
                if args.EOS >= 0:
                    ali = np.concatenate((ali, [args.EOS]))
                feats = _matrix_ext.matrix_to_numpy(feats)
                utt_len = feats.shape[0] // args.stride + \
                    int(feats.shape[0] % args.stride != 0)
                # limit on T*U products due to RNN-T memory use
                # this is pretty hacky for now
                if ali.shape[0] * utt_len // 3 <= args.TU_limit:
                    ali_len[valid_idx] = ali.shape[0]
                    data_buffer[valid_idx, :utt_len, :] = \
                        splice(feats, args.lctx, args.rctx)[::args.stride]
                    target_buffer[valid_idx, :ali_len[valid_idx]] = ali
                    len_buffer[valid_idx] = utt_len
                    if utt_len > batch_max_len:
                        batch_max_len = utt_len
                    if ali_len[valid_idx] > target_max_len:
                        target_max_len = ali_len[valid_idx]
                    valid_idx += 1
                batch_idx += 1

                if batch_idx == batch_size:
                    for b in range(valid_idx):
                        utt_len = len_buffer[b]
                        target_len = ali_len[b]
                        # pad data with the last frame and targets with
                        # the padding symbol
                        if utt_len > 0:
                            data_buffer[b, utt_len:batch_max_len, :] = \
                                data_buffer[b, utt_len - 1, :]
                            target_buffer[b, target_len:target_max_len] = \
                                args.padding_tgt

                    data = data_buffer[:valid_idx, :batch_max_len, :]
                    target = target_buffer[:valid_idx, :target_max_len]
                    if not args.batch_first:
                        data = np.transpose(data, (1, 0, 2))
                        target = np.transpose(target, (1, 0))
                    data = torch.from_numpy(np.copy(data))
                    target = torch.from_numpy(np.copy(target))
                    lens = torch.from_numpy(np.copy(len_buffer[:valid_idx]))
                    ali_lens = torch.from_numpy(np.copy(ali_len[:valid_idx]))

                    if valid_idx > 0:
                        # not calling cuda() here; that is done in the
                        # main process instead
                        yield data, target, lens, ali_lens
                    else:
                        yield None, None, \
                            torch.IntTensor([0]), torch.IntTensor([0])

                    batch_idx = 0
                    valid_idx = 0
                    target_len = 0
                    batch_max_len = -1
                    target_max_len = -1
            ali_reader.close()
    yield None
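# Hypothetical consumption loop for the generator above. 'triplets', 'rirs',
# 'noises' and 'args' stand in for objects built elsewhere in the training
# script; they are not defined in this excerpt.
for batch in otf_utt_generator(triplets, rirs, noises, args):
    if batch is None:
        break  # end-of-data sentinel
    data, target, lens, ali_lens = batch
    if data is None:
        continue  # a flushed batch that contained no valid utterances
    # .cuda() is deliberately left to the main process (see comment above),
    # e.g. data, target = data.cuda(), target.cuda()
    ...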
                    default=16000,
                    help='sample rate of waves')
parser.add_argument('--feat_config', type=str, default=None,
                    help='feature extraction config file')
parser.add_argument('--feat_dim', type=int, default=80,
                    help='feature dimension')
args, unk = parser.parse_known_args()

po = ParseOptions('')
fbank_opt = FbankOptions()
fbank_opt.register(po)
po.read_config_file(args.feat_config)
fbank = Fbank(fbank_opt)
speed_rate = [0.9, 1.0, 1.1]
cmvn = Cmvn(args.feat_dim)

with open(args.data_lst, 'r', encoding='utf-8') as data_lst_f:
    for line in data_lst_f:
        mrk_fn = line.split()[0]
        seq_fn = line.split()[1]
        with open(mrk_fn, 'r', encoding='utf-8') as mrk, \
                open(seq_fn, 'rb') as seq:
            for mrk_line in mrk:
                seq.seek(int(mrk_line.split()[1]))
                num_bytes = int(mrk_line.split()[2])
                # make sure we read an even number of bytes (int16 samples)
                num_bytes -= num_bytes % 2
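                # The excerpt cuts off here. A sketch of how the loop
                # presumably continues, mirroring the feature extraction in
                # otf_utt_generator above; the accumulate() call assumes
                # PyKaldi's Cmvn API.
                audio_bytes = seq.read(num_bytes)
                audio_np = np.frombuffer(audio_bytes, dtype='int16')
                wave = Vector(audio_np)
                feats = fbank.compute_features(wave, args.sample_rate,
                                               vtln_warp=1.0)
                cmvn.accumulate(feats)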
log_file = open(log_filepath, "w")
summ_file = open(summ_filepath, "w")

chunk_size = 1440

# Define online feature pipeline
#feats_args = "--mfcc-config=" + mfcc_hires_path + " " + \
#             "--ivector-extraction-config=" + ivector_extractor_path + \
#             "-verbose=1"
feat_opts = OnlineNnetFeaturePipelineConfig()
endpoint_opts = OnlineEndpointConfig()
po = ParseOptions("")
feat_opts.register(po)
endpoint_opts.register(po)
po.read_config_file(online_config_path)
feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150

print('Loading inference model from files\n {} \n {} \n {}\n'
      .format(model_path, graph_path, symbols_path), file=sys.stderr)
log_file.write('Loading inference model from files\n {} \n {} \n {}\n'
               .format(model_path, graph_path, symbols_path))