def compute_full_ppg(nnet: Nnet, feats: Matrix) -> Matrix:
    """Run the acoustic model over ``feats`` and collect its per-frame outputs.

    Batch-norm and dropout test mode are enabled on ``nnet`` and the model is
    collapsed before decoding; these calls are applied to the ``nnet`` object
    that was passed in.

    Args:
        nnet: A neural network AM.
        feats: Suitable T*D input feature matrix.

    Returns:
        raw_ppgs: T*K raw PPGs, K is the number of senones.
    """
    # Switch the model to inference behaviour before building the computer.
    nnet3.set_batchnorm_test_mode(True, nnet)
    nnet3.set_dropout_test_mode(True, nnet)
    nnet3.collapse_model(nnet3.CollapseModelConfig(), nnet)

    # Per the original author: for some unknown reason the nnet computer has
    # to be constructed inside this function.
    decode_opts = nnet3.NnetSimpleComputationOptions()
    decode_opts.acoustic_scale = 1.0
    compiler = nnet3.CachingOptimizingCompiler.new_with_optimize_opts(
        nnet, decode_opts.optimize_config)
    no_priors = Vector()  # Priors are not needed here.
    decodable = nnet3.DecodableNnetSimple(decode_opts, nnet, no_priors, feats,
                                          compiler)

    # Copy the network output frame by frame into the result matrix.
    num_frames = decodable.num_frames()
    out_dim = decodable.output_dim()
    raw_ppgs = Matrix(num_frames, out_dim)
    for frame_idx in range(num_frames):
        frame_out = Vector(out_dim)
        decodable.get_output_for_frame(frame_idx, frame_out)
        raw_ppgs.copy_row_from_vec_(frame_out, frame_idx)
    return raw_ppgs
def computeVAD_KALDI(self, feats):
    """Build a smoothed speech/non-speech mask from energy-based VAD decisions.

    The frame decisions returned by ``compute_vad_energy`` are run-length
    encoded, short runs are merged into their left neighbour in two passes,
    and the remaining runs whose value is 1.0 are marked in the mask.

    Args:
        feats: Features handed to ``compute_vad_energy`` together with
            ``self.vad_ops``.

    Returns:
        A numpy array of shape (1, number_of_frames) holding 1 for speech
        frames and 0 otherwise. ``None`` is returned implicitly when a
        ``ValueError`` is caught and logged.

    Raises:
        ValueError: When any other exception occurs during the computation.
    """
    try:
        decisions = Vector(compute_vad_energy(self.vad_ops, feats)).numpy()

        # Run-length encode the decisions: run_values[k] repeats
        # run_lengths[k] times.
        run_values = [decisions[0]]
        run_lengths = [1]
        for decision in decisions[1:]:
            if run_values[-1] == decision:
                run_lengths[-1] += 1
            else:
                run_lengths.append(1)
                run_values.append(decision)

        # Pass 1: merge every run shorter than 30 frames (or equal in value
        # to its left neighbour) into that neighbour.
        idx = 0
        while idx < len(run_lengths):
            if idx == 0 or not (run_lengths[idx] < 30
                                or run_values[idx - 1] == run_values[idx]):
                idx += 1
                continue
            run_lengths[idx - 1] += run_lengths[idx]
            del run_values[idx]
            del run_lengths[idx]

        # Pass 2: merge short silence runs (value 0.0) and equal neighbours.
        # NOTE(review): the original comment spoke of 50 frames, but the
        # threshold used by the code is 30 - confirm which one is intended.
        idx = 0
        while idx < len(run_lengths):
            short_silence = run_lengths[idx] < 30 and run_values[idx] == 0.0
            if idx == 0 or not (short_silence
                                or run_values[idx - 1] == run_values[idx]):
                idx += 1
                continue
            run_lengths[idx - 1] += run_lengths[idx]
            del run_values[idx]
            del run_lengths[idx]

        # Paint the speech runs into the mask.
        maskSAD = np.zeros(len(decisions))
        start = 0
        for run_value, run_length in zip(run_values, run_lengths):
            if run_value == 1.0:
                maskSAD[start:start + run_length] = 1
            start += run_length

        maskSAD = np.expand_dims(maskSAD, axis=0)
    except ValueError as v:
        self.log.error(v)
    except Exception as e:
        self.log.error(e)
        raise ValueError(
            "Speaker diarization failed while voice activity detection!!!")
    else:
        return maskSAD
def feat_pipeline(vec, freq):
    """Extract delta + sliding-window-CMN features and keep the voiced frames.

    Args:
        vec: Waveform passed to ``base.compute_features``.
        freq: Sampling frequency passed to ``base.compute_features``.

    Returns:
        A Matrix holding the normalized feature rows whose VAD decision
        equals 1, or ``False`` when the VAD decisions sum to zero.
    """
    base_feats = base.compute_features(vec, freq, 1.0)
    # The VAD decisions are computed on the base features, before the deltas
    # and the mean normalization are applied.
    vad_decisions = Vector(compute_vad_energy(vad_opts, base_feats))

    with_deltas = compute_deltas(delta_opts, base_feats)
    normalized = Matrix(with_deltas.num_rows, with_deltas.num_cols)
    sliding_window_cmn(sliding_opts, with_deltas, normalized)

    num_voiced = vad_decisions.sum()
    if not num_voiced:
        LOG.warning('No features were judged as voiced for utterance')
        return False

    voice_feats = Matrix(int(num_voiced), with_deltas.num_cols)
    source = kaldi_Matrix(normalized)
    out_row = 0
    for frame_idx, _ in enumerate(source):
        if vad_decisions[frame_idx] == 1:
            voice_feats.row(out_row).copy_row_from_mat_(source, frame_idx)
            out_row += 1
    LOG.debug('Feats extract successed')
    return voice_feats
def testCuVectorSwap(self):
    """Check ``CuVector.swap`` with a populated and with an empty Vector."""
    values = [2, 3, 5, 7, 13]
    host_vec = Vector(values).clone()
    cu_vec = CuVector.new_from_size(5)
    cu_vec.swap(host_vec)  # Swap *is* destructive
    # sqrt(4 + 9 + 25 + 49 + 169) == 16
    self.assertEqual(16.0, cu_vec.norm(2))

    empty_host = Vector()
    empty_cu = CuVector.new_from_size(0)
    empty_cu.swap(empty_host)
    self.assertEqual(0.0, empty_cu.norm(2))
def test_nnet_decodable(self):
    """Compare DecodableNnetSimple and DecodableNnetSimpleLooped outputs.

    A network is built from a generated config sequence and fed random input.
    The per-frame outputs of both decodables are asserted to be approximately
    equal unless the network is recurrent or its info string mentions
    "statistics-extraction" or "TimeHeightConvolutionComponent".
    """
    gen_config = NnetGenerationOptions()
    configs = generate_config_sequence(gen_config)
    nnet = Nnet()
    for j, config in enumerate(configs):
        print("Input config[{}]:".format(j))
        print(config)
        nnet.read_config(istringstream.from_str(config))

    num_frames = 5 + random.randint(1, 100)
    input_dim = nnet.input_dim("input")
    output_dim = nnet.output_dim("output")
    ivector_dim = max(0, nnet.input_dim("ivector"))
    features = Matrix(num_frames, input_dim)

    set_batchnorm_test_mode(True, nnet)
    set_dropout_test_mode(True, nnet)

    features.set_randn_()
    ivector = Vector(ivector_dim)
    ivector.set_randn_()
    # The i-vector is only handed over when the network has such an input.
    online_ivector = ivector if ivector_dim else None

    # Randomly test with or without priors.
    priors = Vector(output_dim if random.choice([True, False]) else 0)
    if len(priors) != 0:
        priors.set_randn_()
        priors.apply_exp_()

    simple_out = Matrix(num_frames, output_dim)
    looped_out = Matrix(num_frames, output_dim)

    simple_opts = NnetSimpleComputationOptions()
    simple_opts.frames_per_chunk = random.randint(5, 25)
    compiler = CachingOptimizingCompiler(nnet)
    simple_decodable = DecodableNnetSimple(simple_opts, nnet, priors,
                                           features, compiler, online_ivector)
    for t in range(num_frames):
        simple_decodable.get_output_for_frame(t, simple_out[t])

    looped_opts = NnetSimpleLoopedComputationOptions()
    info = DecodableNnetSimpleLoopedInfo.new_from_priors(
        looped_opts, priors, nnet)
    looped_decodable = DecodableNnetSimpleLooped(info, features,
                                                 online_ivector)
    for t in range(num_frames):
        looped_decodable.get_output_for_frame(t, looped_out[t])

    comparable = (
        not nnet_is_recurrent(nnet)
        and nnet.info().find("statistics-extraction") == -1
        and nnet.info().find("TimeHeightConvolutionComponent") == -1)
    if comparable:
        for t in range(num_frames):
            self.assertTrue(approx_equal(simple_out[t], looped_out[t]))
def init_rand_diag_gmm(gmm):
    """Fill ``gmm`` with random weights, means and inverse variances.

    The weights are drawn uniformly and divided by their sum, the means are
    Gaussian samples, and the variances are exponentials of Gaussian samples
    that are inverted element-wise before being set. The model is then
    perturbed and its gconsts are computed.

    Args:
        gmm: The model to initialize; its ``num_gauss()`` and ``dim()`` give
            the sizes of the generated parameters.
    """
    n_mix, n_dim = gmm.num_gauss(), gmm.dim()

    weights = Vector([kaldi_math.rand_uniform() for _ in range(n_mix)])
    weight_sum = weights.sum()
    for idx, weight in enumerate(weights):
        weights[idx] = weight / weight_sum

    means = Matrix([[kaldi_math.rand_gauss() for _ in range(n_dim)]
                    for _ in range(n_mix)])
    inv_vars = Matrix([[kaldi_math.exp(kaldi_math.rand_gauss())
                        for _ in range(n_dim)]
                       for _ in range(n_mix)])
    inv_vars.invert_elements_()

    gmm.set_weights(weights)
    gmm.set_inv_vars_and_means(inv_vars, means)
    gmm.perturb(0.5 * kaldi_math.rand_uniform())
    gmm.compute_gconsts()
def read_wav_kaldi_internal(wav, fs) -> WaveData:
    """Wrap the first channel of a waveform in a Kaldi ``WaveData`` object.

    Args:
        wav: S*C ndarray. S is number of samples and C is number of channels.
        fs: Sampling frequency.

    Returns:
        wd: A Kaldi-readable WaveData object, or ``None`` (after logging an
            error) when ``WaveData`` offers neither a ``new`` nor a
            ``from_data`` constructor.
    """
    # Only the first channel is kept when there is more than one.
    mono = wav[:, 0] if wav.ndim >= 2 else wav

    # The samples are stored as a 1 x S Kaldi matrix.
    wav_kaldi = Matrix(1, len(mono))
    wav_kaldi.copy_rows_from_vec_(Vector(mono))

    # The constructor name depends on which attribute WaveData exposes.
    for ctor_name in ('new', 'from_data'):
        if hasattr(WaveData, ctor_name):
            return getattr(WaveData, ctor_name)(fs, wav_kaldi)

    logging.error('Unknown Pykaldi package.')
    return None
def compute_pitch_feats_and_post(data):
    """Compute processed Kaldi pitch features with default options.

    Args:
        data: Waveform samples; wrapped in a ``Vector`` before the call.

    Returns:
        The result of ``compute_and_process_kaldi_pitch`` as a numpy array.
    """
    feats = compute_and_process_kaldi_pitch(
        PitchExtractionOptions(), ProcessPitchOptions(), Vector(data))
    return feats.numpy()
def post_to_count(feature_rspecifier, cnt_wspecifier, normalize=False,
                  per_utt=False):
    """Write per-column means of the matrices read from a table.

    Args:
        feature_rspecifier: Read specifier for the input matrices.
        cnt_wspecifier: Write specifier for the output vectors.
        normalize: When true (and ``per_utt`` is false), divide the
            accumulated sum of per-utterance means by the number of
            utterances before writing it.
        per_utt: When true, write one mean vector per utterance under the
            utterance id. Otherwise a single vector accumulated over all
            utterances is written under the key ``str(number_of_utterances)``.

    Returns:
        Always ``True``.
    """
    with SequentialMatrixReader(feature_rspecifier) as feature_reader, \
            VectorWriter(cnt_wspecifier) as cnt_writer:
        if per_utt:
            for uttid, feat in feature_reader:
                cnt_writer[uttid] = Vector(feat.numpy().mean(axis=0))
        else:
            total = 0
            num_done = 0
            for _, feat in feature_reader:
                total = total + feat.numpy().mean(axis=0)
                num_done += 1
            # An empty table leaves total == 0 and num_done == 0; dividing
            # would raise ZeroDivisionError, so normalization is skipped and
            # the output matches the non-normalized path.
            if normalize and num_done:
                total = total / num_done
            cnt_writer[str(num_done)] = Vector(total)
    return True
def add_vec_(self):
    """Exercise ``add_vec_`` with a zero, a random and a negated operand."""
    vec = self.vector_class(5).set_randn_()

    # Adding a zero vector.
    operand = self.vector_class(5).set_zero_()
    self.assertEqual(vec, vec.add_vec_(operand))

    # Adding a random vector.
    operand = operand.set_randn_()
    self.assertNotEqual(vec, vec.add_vec_(operand))

    # Adding the negated copy is expected to give a vector equal to Vector(5).
    operand = vec.clone()
    operand = operand.scale_(-1.0)
    self.assertEqual(Vector(5), vec.add_vec_(1.0, operand))
def feat_to_count(feature_rspecifier, cnt_wspecifier, normalize=False,
                  per_utt=False):
    """Write per-column means of the feature matrices read from a table.

    Args:
        feature_rspecifier: Read specifier for the input matrices.
        cnt_wspecifier: Write specifier for the output vectors.
        normalize: When true (and ``per_utt`` is false), divide the
            accumulated sum of per-utterance means by the number of
            utterances before writing it.
        per_utt: When true, write one mean vector per utterance under the
            utterance id. Otherwise a single vector accumulated over all
            utterances is written under the key ``str(number_of_utterances)``.

    Returns:
        Always ``True``.
    """
    with SequentialMatrixReader(feature_rspecifier) as feature_reader, \
            VectorWriter(cnt_wspecifier) as cnt_writer:
        if per_utt:
            for uttid, feat in feature_reader:
                cnt_writer[uttid] = Vector(feat.numpy().mean(axis=0))
        else:
            total = 0
            num_done = 0
            for _, feat in feature_reader:
                total = total + feat.numpy().mean(axis=0)
                num_done += 1
            # An empty table leaves total == 0 and num_done == 0; dividing
            # would raise ZeroDivisionError, so normalization is skipped and
            # the output matches the non-normalized path.
            if normalize and num_done:
                total = total / num_done
            cnt_writer[str(num_done)] = Vector(total)
    return True
def write(self, key, value):
    """Write the `(key, value)` pair to the table.

    This method exists for compatibility with the C++ API only; most users
    should use the Pythonic API. The value is passed through ``Vector(...)``
    before it is handed to the parent writer, so that both Vector and
    SubVector are accepted.

    Args:
        key (str): The key.
        value: The value.
    """
    as_vector = Vector(value)
    super(VectorWriter, self).write(key, as_vector)
def apply_feat_transform(feats: Matrix, transform: Matrix) -> Matrix:
    """Apply an LDA/fMLLR transform on the input features.

    For a transform with as many columns as the feature dimension the result
    is ``feats * transform'`` (' is transpose). A transform with one extra
    column is treated as affine: its last column is added to every output
    row. For fMLLR, please see
    http://kaldi-asr.org/doc/transform.html#transform_cmllr_global

    This is an extremely simplified version of
    https://github.com/kaldi-asr/kaldi/blob/5.3/src/featbin/transform-feats.cc

    Args:
        feats: A T*D feature matrix.
        transform: A D'*D (or D'*(D+1)) matrix, where D' is the output
            feature dim.

    Returns:
        feats_out: A T*D' matrix. When the transform has any other number of
            columns an error is logged and the freshly constructed matrix is
            returned without any transform applied.
    """
    in_dim = feats.num_cols
    out_dim, transform_cols = transform.num_rows, transform.num_cols
    feats_out = Matrix(feats.num_rows, out_dim)

    if transform_cols not in (in_dim, in_dim + 1):
        logging.error(("Transform matrix has bad dimension %dx%d versus feat "
                       "dim %d") % (out_dim, transform_cols, in_dim))
        return feats_out

    if transform_cols == in_dim:
        # Purely linear transform.
        feats_out.add_mat_mat_(feats, transform,
                               MatrixTransposeType.NO_TRANS,
                               MatrixTransposeType.TRANS, 1.0, 0.0)
        return feats_out

    # Affine transform: equivalent to appending an implicit 1.0 to every
    # input frame, so the last column acts as an offset.
    linear_part = SubMatrix(transform, 0, out_dim, 0, in_dim)
    feats_out.add_mat_mat_(feats, linear_part,
                           MatrixTransposeType.NO_TRANS,
                           MatrixTransposeType.TRANS, 1.0, 0.0)
    offset = Vector(out_dim)
    offset.copy_col_from_mat_(transform, in_dim)
    feats_out.add_vec_to_rows_(1.0, offset)
    return feats_out
def advance_mic_decoding(adaptation_state, asr, asr_client, block,
                         chunks_decoded, feat_info, feat_pipeline, key,
                         last_chunk, part, prev_num_frames_decoded, samp_freq,
                         sil_weighting, speaker, utt):
    """Feed one audio block into the pipeline and advance the decoder.

    Args:
        adaptation_state: Not used in this function.
        asr: Recognizer; provides ``advance_decoding``, ``decoder``,
            ``endpoint_detected`` and ``get_partial_output``.
        asr_client: Optional client that receives partial utterances.
        block: Audio samples of the current chunk (wrapped in a ``Vector``).
        chunks_decoded: Chunk counter; it is incremented locally only and
            not returned.
        feat_info: Not used in this function.
        feat_pipeline: Feature pipeline that accepts the waveform.
        key: Prefix of the key sent with partial utterances.
        last_chunk: Whether this block is the last one of the utterance.
        part: Index of the current partial result.
        prev_num_frames_decoded: Number of decoded frames after the
            previous call.
        samp_freq: Sampling frequency of ``block``.
        sil_weighting: Silence weighting helper for the i-vector features.
        speaker: Speaker label sent with partial utterances.
        utt: Index of the current utterance.

    Returns:
        Tuple ``(need_endpoint_finalize, num_frames_decoded, part, utt)``.
    """
    chunks_decoded += 1

    # Hand the block (numpy array) to the feature pipeline as a Kaldi Vector.
    feat_pipeline.accept_waveform(samp_freq, Vector(block))

    # On the last chunk the pipeline is told that no more input follows.
    if last_chunk:
        feat_pipeline.input_finished()

    # Inform the i-vector feature computation about the current silence
    # weighting.
    if sil_weighting.active():
        sil_weighting.compute_current_traceback(asr.decoder)
        feat_pipeline.ivector_feature().update_frame_weights(
            sil_weighting.get_delta_weights(feat_pipeline.num_frames_ready()))

    # Advance the decoding by one step for the input chunk.
    asr.advance_decoding()
    num_frames_decoded = asr.decoder.num_frames_decoded()

    # No endpoint handling or partial output on the last chunk.
    if last_chunk:
        return False, num_frames_decoded, part, utt

    if asr.endpoint_detected():
        # An endpoint only counts when at least one frame has been decoded.
        return num_frames_decoded > 0, num_frames_decoded, part, utt

    if num_frames_decoded > prev_num_frames_decoded:
        # New frames were decoded: publish the current best path.
        partial = asr.get_partial_output()
        if asr_client is not None:
            asr_client.partialUtterance(
                utterance=partial["text"],
                key=key + "-utt%d-part%d" % (utt, part),
                speaker=speaker)
        part += 1

    return False, num_frames_decoded, part, utt
def testCuVectorInverElements(self):
    """Check ``CuVector.invert_elements`` on empty, random and known data."""
    # Inverting an empty vector must not crash.
    empty = CuVector.new_from_size(0)
    empty.invert_elements()

    # Neither must inverting random values.
    noise = CuVector.new_from_size(10)
    noise.set_randn()
    noise.invert_elements()

    # Inverted powers of two form a geometric series with r = 1/2, a = 1/2.
    powers = Vector([2, 4, 8, 16, 32, 64])
    n_terms = len(powers)
    series = CuVector.new_from_size(n_terms)
    series.swap(powers)
    series.invert_elements()
    expected = 0.5 * (1 - 0.5**n_terms) / (1 - 0.5)
    self.assertAlmostEqual(expected, series.sum())
def write(self, utt_id, counts, posteriors, indices):
    """Write posteriors to disk in KALDI format.

    The data is packed into one flat vector laid out as
    ``[number of entries in counts, counts..., posteriors..., indices...]``.

    Arguments:
        utt_id {string} -- Utterance ID to be written to scp file
        counts {Tensor} -- Tensor containing the numbers of selected
            posteriors for each frame
        posteriors {Tensor} -- Flattened Tensor containing all posteriors
        indices {Tensor} -- Flattened Tensor containing all Gaussian indices
    """
    counts_np = counts.numpy()
    posteriors_np = posteriors.numpy()
    indices_np = indices.numpy()

    header = np.atleast_1d(np.array([counts_np.size]))
    packed = np.hstack([header, counts_np, posteriors_np, indices_np])
    self.posterior_writer.write(utt_id, Vector(packed))
def read_audio(self, file, sample_rate):
    """Store an uploaded audio file and load it as 16-bit samples.

    The file is saved under ``self.TEMP_FILE_PATH``, loaded with librosa,
    resampled to ``sample_rate`` when needed and scaled to int16. On success
    ``self.dur`` holds the duration in seconds and ``self.data`` the samples
    as a ``Vector``.

    Args:
        file: Uploaded file object providing ``filename`` and ``save(path)``.
        sample_rate: Target sampling rate in Hz.

    Raises:
        ValueError: If the file name is unusable or the audio cannot be
            loaded or converted.
    """
    # The file name is supplied by the client: keep only its last path
    # component so that names such as "../../x" cannot escape the temp dir.
    safe_name = os.path.basename(file.filename.replace("\\", "/")).lower()
    if safe_name in ("", ".", ".."):
        raise ValueError("The uploaded file format is not supported!!!")
    file_path = self.TEMP_FILE_PATH + safe_name
    file.save(file_path)
    try:
        data, sr = librosa.load(file_path, sr=None)
        if sr != sample_rate:
            self.log.info('Resample audio file: ' + str(sr) + 'Hz -> ' +
                          str(sample_rate) + 'Hz')
            # Keyword arguments work with both older and newer librosa
            # releases (newer ones reject positional sampling rates).
            data = librosa.resample(data, orig_sr=sr, target_sr=sample_rate)
        data = (data * 32767).astype(np.int16)
        self.dur = len(data) / sample_rate
        self.data = Vector(data)
    except Exception as e:
        self.log.error(e)
        raise ValueError(
            "The uploaded file format is not supported!!!") from e
    finally:
        # Remove the temporary copy on failure as well, so that rejected
        # uploads do not pile up on disk.
        if not self.SAVE_AUDIO and os.path.exists(file_path):
            os.remove(file_path)
def _get_kaldi_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]: """Get mel-filter bank features via PyKaldi.""" try: from kaldi.feat.mel import MelBanksOptions from kaldi.feat.fbank import FbankOptions, Fbank from kaldi.feat.window import FrameExtractionOptions from kaldi.matrix import Vector mel_opts = MelBanksOptions() mel_opts.num_bins = n_bins frame_opts = FrameExtractionOptions() frame_opts.samp_freq = sample_rate opts = FbankOptions() opts.mel_opts = mel_opts opts.frame_opts = frame_opts fbank = Fbank(opts=opts) features = fbank.compute(Vector(waveform), 1.0).numpy() return features except ImportError: return None
def testCuVectorCopyFromVec(self):
    """Check ``CuVector.copy_from_vec`` for empty and random vectors."""
    # Copying between two empty vectors should not crash.
    empty_host = Vector()
    empty_cu = CuVector.new_from_size(0)
    empty_cu.copy_from_vec(empty_host)

    # FIXME: mismatching dimensions are not tested; per the original note,
    # copying a size-10 Vector into a size-0 CuVector hard-crashes.

    for dim in range(0, 100, 10):
        host = Vector(dim)
        host.set_randn_()
        device = CuVector.new_from_size(dim)
        device.copy_from_vec(host)
        self.assertEqual(host.sum(), device.sum())
def apply_cepstral_mean_norm(feats: Matrix) -> Matrix:
    """Apply cepstral mean normalization to MFCCs, in place.

    The per-column mean over all frames is subtracted from every row. No
    variance normalization is done, which is enough for GZ's purposes.

    Args:
        feats: A T*D MFCC feature matrix; it is modified in place.

    Returns:
        feats: The same matrix object, now holding T*D normalized MFCCs. A
            matrix without rows is returned unchanged.
    """
    num_frames = feats.num_rows
    if num_frames == 0:
        # Nothing to normalize; also avoids dividing by zero below.
        return feats

    mean = Vector(feats.num_cols)
    mean.add_row_sum_mat_(1.0, feats)
    mean.scale_(1.0 / num_frames)
    # Subtract the mean from all rows in a single call instead of looping
    # over the rows in Python.
    feats.add_vec_to_rows_(-1.0, mean)
    return feats
def gmm_pipeline(feats, utt, min_post=0.025):
    """Compute pruned per-frame Gaussian posteriors for one utterance.

    For every frame the log-likelihoods of the pre-selected Gaussians are
    turned into posteriors with a softmax. When ``min_post`` is non-zero,
    posteriors below it are set to zero and the rest is renormalized (if
    everything was zeroed, the originally largest entry is set to 1.0).

    Args:
        feats: Feature matrix of the utterance.
        utt: Utterance id, used for logging only.
        min_post: Pruning threshold for the posteriors.

    Returns:
        A list with one entry per frame, each a list of
        ``(gaussian_index, posterior)`` pairs with non-zero posterior, or
        ``False`` when the posteriors of some frame do not sum to roughly 1.
    """
    gselect = gmm.gaussian_selection_matrix(feats, 20)[1]
    num_frames = feats.num_rows
    utt_ok = True
    post = [[] for _ in range(num_frames)]
    tot_loglike = 0

    for frame_idx, frame_post in enumerate(post):
        frame = SubVector(feats.row(frame_idx))
        selected = gselect[frame_idx]
        posteriors = Vector(fgmm.log_likelihoods_preselect(frame, selected))
        # apply_softmax_ normalizes in place; its return value is accumulated
        # as the frame log-likelihood.
        tot_loglike += posteriors.apply_softmax_()

        if abs(posteriors.sum() - 1.0) > 0.01:
            utt_ok = False
            continue

        if min_post != 0:
            max_index = posteriors.max_index()[1]
            for x in range(posteriors.dim):
                if posteriors[x] < min_post:
                    posteriors[x] = 0.0
            remaining = sum(posteriors)
            if remaining == 0:
                posteriors[max_index] = 1.0
            else:
                posteriors.scale_(1.0 / remaining)

        frame_post.extend((selected[x], posteriors[x])
                          for x in range(posteriors.dim)
                          if posteriors[x] != 0)

    if not utt_ok:
        LOG.warning(
            "Skipping utterance because bad posterior-sum encountered (NaN?)"
        )
        return False

    LOG.debug(
        'Like/frame for utt {} was {} perframe over {} frames.'.format(
            utt, tot_loglike / num_frames, num_frames))
    return post
def getExampleObj(self):
    """Return a Vector and a SubVector built from the values 1 to 5."""
    plain = Vector([1, 2, 3, 4, 5])
    wrapped = SubVector(Vector([1, 2, 3, 4, 5]))
    return [plain, wrapped]
def decode_chunked_partial_endpointing_mic(
        asr,
        feat_info,
        decodable_opts,
        paudio,
        input_microphone_id,
        channels=1,
        samp_freq=16000,
        record_samplerate=16000,
        chunk_size=1024,
        wait_for_start_command=False,
        record_message_history=False,
        compute_confidences=True,
        asr_client=None,
        speaker_str="Speaker",
        resample_algorithm="sinc_best",
        save_debug_wav=False,
        use_threads=False,
        minimum_num_frames_decoded_per_speaker=5,
        mic_vol_cutoff=0.5,
        use_local_mic=True,
        decode_control_channel='asr_control',
        audio_data_channel='asr_audio'):
    """Decode audio chunk by chunk with partial results and endpointing.

    Audio blocks are read from a local microphone stream or from a redis
    audio channel, optionally resampled, and fed to ``advance_mic_decoding``
    either inline or through a single-worker thread pool. Control commands
    (start, stop, shutdown, status, reset_timer) are polled from a redis
    channel on every iteration. An utterance is finalized when an endpoint
    is detected, when decoding is stopped, or when the loudest channel (and
    thus the speaker) changes. The loop ends after a shutdown command.

    Args:
        asr: Recognizer object.
        feat_info: Feature pipeline info.
        decodable_opts: Options handed to ``initNnetFeatPipeline``.
        paudio: Audio backend used to open the microphone stream.
        input_microphone_id: Index of the input device.
        channels: Number of channels in the incoming audio.
        samp_freq: Sampling rate used for decoding.
        record_samplerate: Sampling rate of the incoming audio.
        chunk_size: Number of frames read from the microphone per block.
        wait_for_start_command: If true, decoding only starts after a
            "start" command.
        record_message_history: If true, ``asr_client.message_trace`` is
            written to ``message_history_replay.py`` after shutdown.
        compute_confidences: Not used in this function.
        asr_client: Client that receives status and utterances.
            NOTE(review): most calls on it are not guarded against ``None``
            although ``None`` is the default - confirm callers always pass
            a client.
        speaker_str: Speaker label; "#c#" is replaced by the channel index.
        resample_algorithm: Converter type for the resampler.
        save_debug_wav: If true, write ``debug.wav`` / ``debugraw.wav``
            after shutdown.
        use_threads: If true, decoding steps run in a worker thread.
        minimum_num_frames_decoded_per_speaker: Minimum number of decoded
            frames before a speaker change is accepted.
        mic_vol_cutoff: Channel volume norms below this value count as 0.
        use_local_mic: Read from the microphone (true) or from redis.
        decode_control_channel: Name of the redis control channel.
        audio_data_channel: Name of the redis audio channel.
    """
    # Subscribe to command and control redis channel
    p = red.pubsub()
    p.subscribe(decode_control_channel)

    if not use_local_mic:
        pa = red.pubsub()
        pa.subscribe(audio_data_channel)

    # Figure out if we need to resample (TODO: channels does not seem to work)
    need_resample = False
    if record_samplerate != samp_freq:
        print(
            "Activating resampler since record and decode samplerate are different:",
            record_samplerate, "->", samp_freq)
        resampler = samplerate.Resampler(resample_algorithm,
                                         channels=channels)
        need_resample = True
        ratio = samp_freq / record_samplerate
        print("Resample ratio:", ratio)

    # Initialize Python/Kaldi bridge
    print("Constructing decoding pipeline")
    adaptation_state = OnlineIvectorExtractorAdaptationState.from_info(
        feat_info.ivector_extractor_info)
    key = 'mic' + str(input_microphone_id)
    feat_pipeline, sil_weighting = initNnetFeatPipeline(
        adaptation_state, asr, decodable_opts, feat_info)
    print("Done")

    speaker = speaker_str.replace("#c#", "0")
    last_chunk = False
    utt, part = 1, 1
    prev_num_frames_decoded, offset_complete = 0, 0
    chunks_decoded = 0
    num_chunks = 0
    blocks = []
    rawblocks = []

    if use_local_mic:
        # Open microphone channel
        print("Open microphone stream with id" + str(input_microphone_id) +
              "...")
        stream = paudio.open(format=pyaudio.paInt16,
                             channels=channels,
                             rate=record_samplerate,
                             input=True,
                             frames_per_buffer=chunk_size,
                             input_device_index=input_microphone_id)
        print("Done!")

    do_decode = not wait_for_start_command
    need_finalize = False
    # Must exist before the loop: a finalize triggered by the "stop" command
    # reads this flag even if no endpoint or speaker change ever set it.
    resend_previous_waveform = False
    block, previous_block = None, None
    decode_future = None

    # Send event (with redis) to the front that ASR session is ready
    asr_client.asr_ready(speaker=speaker)

    # The executor is created regardless of whether threads are used later.
    # At the end of the loop there are two code paths, one that uses a
    # computation future (use_threads) and one without it.
    with ThreadPoolExecutor(max_workers=1) as executor:
        while not last_chunk:
            # Check (non-blocking) for a control message from the redis
            # server; msg is None when there is no new message.
            msg = p.get_message()

            # Handle externally sent control commands
            if msg is not None:
                print('msg:', msg)
                if msg['data'] == b"start":
                    print('Start command received!')
                    do_decode = True
                    asr_client.sendstatus(isDecoding=do_decode)
                elif msg['data'] == b"stop":
                    print('Stop command received!')
                    if do_decode and prev_num_frames_decoded > 0:
                        need_finalize = True
                    do_decode = False
                    asr_client.sendstatus(isDecoding=do_decode)
                elif msg['data'] == b"shutdown":
                    print('Shutdown command received!')
                    last_chunk = True
                elif msg['data'] == b"status":
                    print('Status command received!')
                    asr_client.sendstatus(isDecoding=do_decode)
                elif msg['data'] == b"reset_timer":
                    print('Reset time command received!')
                    asr_client.resetTimer()

            if use_local_mic:
                # We always consume from the microphone stream, even if we
                # do not decode
                block_raw = stream.read(chunk_size,
                                        exception_on_overflow=False)
                npblock = np.frombuffer(block_raw, dtype=np.int16)
            else:
                block_audio_redis_msg = next(pa.listen())
                if block_audio_redis_msg[
                        'type'] == "subscribe" and block_audio_redis_msg[
                            "data"] == 1:
                    print('audio msg:', block_audio_redis_msg)
                    print("Successfully connected to redis audio stream!")
                    continue
                else:
                    npblock = np.frombuffer(block_audio_redis_msg['data'],
                                            dtype=np.int16)

            # Resample the block if necessary, e.g. 48kHz -> 16kHz
            if need_resample:
                block = resampler.process(np.array(npblock, copy=True), ratio)
                block = np.array(block, dtype=np.int16)
            else:
                block = npblock

            # Only keep the audio if the save_debug flag is enabled (TODO:
            # investigate: does not seem to work with multiple channels)
            if save_debug_wav:
                blocks.append(block)
                rawblocks.append(npblock)

            # Block on the result of the decode if one is pending
            if use_threads and do_decode and block is not None and decode_future is not None:
                # This call blocks until the result is ready
                need_endpoint_finalize, prev_num_frames_decoded, part, utt = decode_future.result(
                )

                # Check if we need to finalize, disallow endpoint without a
                # single decoded frame
                if need_endpoint_finalize and prev_num_frames_decoded > 0:
                    need_finalize = True
                    resend_previous_waveform = True
                    print("prev_num_frames_decoded:", prev_num_frames_decoded)

                if need_endpoint_finalize and prev_num_frames_decoded == 0:
                    print(
                        "WARN need_endpoint_finalize and prev_num_frames_decoded == 0"
                    )

            # Finalize the decoding here, if endpointing signalled that we
            # should start a new utterance. We might also need to finalize
            # if we switch from do_decode=True to do_decode=False (user
            # starts/stops decoding from frontend).
            if need_finalize and block is not None and prev_num_frames_decoded > 0:
                print("prev_num_frames_decoded:", prev_num_frames_decoded)
                out, confd = finalize_decode(asr, asr_client, key, part,
                                             speaker, utt)
                feat_pipeline, sil_weighting = reinitialize_asr(
                    adaptation_state, asr, feat_info, feat_pipeline)
                utt += 1
                part = 1

                if resend_previous_waveform and previous_block is not None:
                    # The last block is resent for the new utterance (we
                    # only know that the endpoint is inside of a chunk, but
                    # not where exactly)
                    feat_pipeline.accept_waveform(samp_freq,
                                                  Vector(previous_block))
                    resend_previous_waveform = False

                need_finalize = False
                prev_num_frames_decoded = 0

            # For multichannel data, select the channel with the highest
            # volume (with an added heuristic: only change the speaker if
            # the previous speaker was active for at least
            # minimum_num_frames_decoded_per_speaker frames)
            if channels > 1:
                block = np.reshape(block, (-1, channels))

                # Select loudest channel
                volume_norms = []
                for i in range(channels):
                    # Simplified concept of loudness: the L2 norm of the
                    # chunk interpreted as a vector (sqrt of the sum of
                    # squares). This has nothing to do with the physical
                    # loudness.
                    volume_norms.append(
                        np.linalg.norm(block[:, i] / 65536.0) * 10.0)

                volume_norms = [
                    0.0 if elem < mic_vol_cutoff else elem
                    for elem in volume_norms
                ]

                volume_norm = max(volume_norms)
                max_channel = volume_norms.index(volume_norm)
                block = block[:, max_channel]

                new_speaker = speaker_str.replace("#c#", str(max_channel))

                if sum(volume_norms) > 1e-10 and new_speaker != speaker \
                        and prev_num_frames_decoded >= minimum_num_frames_decoded_per_speaker:
                    print(
                        "Speaker change! Number of frames decoded for previous speaker:",
                        str(prev_num_frames_decoded))

                    speaker = new_speaker

                    need_finalize = True
                    resend_previous_waveform = True
            else:
                volume_norm = np.linalg.norm(block / 65536.0) * 10.0

            num_chunks += 1

            # Send status beacon periodically (to frontend, so it knows we
            # are alive)
            if num_chunks % 50 == 0:
                asr_client.sendstatus(isDecoding=do_decode)

            if do_decode:
                # In unthreaded mode, the decoding step runs right here and
                # blocks this loop
                if not use_threads:
                    need_endpoint_finalize, prev_num_frames_decoded, part, utt = advance_mic_decoding(
                        adaptation_state, asr, asr_client, block,
                        chunks_decoded, feat_info, feat_pipeline, key,
                        last_chunk, part, prev_num_frames_decoded, samp_freq,
                        sil_weighting, speaker, utt)

                    # Check if we need to finalize, disallow endpoint
                    # without a single decoded frame
                    if need_endpoint_finalize and prev_num_frames_decoded > 0:
                        need_finalize = True
                        resend_previous_waveform = True
                        print("prev_num_frames_decoded:",
                              prev_num_frames_decoded)
                else:
                    # In threaded mode, we submit a non blocking computation
                    # request to the thread executor
                    decode_future = executor.submit(
                        advance_mic_decoding, adaptation_state, asr,
                        asr_client, block, chunks_decoded, feat_info,
                        feat_pipeline, key, last_chunk, part,
                        prev_num_frames_decoded, samp_freq, sil_weighting,
                        speaker, utt)
            else:
                time.sleep(0.001)

            previous_block = block

    # Record message history as an integrated Python file, that can be used
    # as a standalone replay
    if record_message_history:
        with open('message_history_replay.py', 'w') as message_history_out:
            message_history_out.write(asr_client.message_trace)
    else:
        print(
            "Not writing record message history since --record_message_history is not set."
        )

    # Write debug wav as output file (will only be executed after shutdown)
    if save_debug_wav:
        print("Saving debug output...")
        wavefile.write("debug.wav", samp_freq,
                       np.concatenate(blocks, axis=None))
        wavefile.write("debugraw.wav", record_samplerate,
                       np.concatenate(rawblocks, axis=None))
    else:
        print(
            "Not writing debug wav output since --save_debug_wav is not set.")

    # Now shutting down the pipeline, compute MBR for the final utterance
    # and complete it.
    print("Shutdown: finalizing ASR output...")
    asr.finalize_decoding()
    out = asr.get_output()
    mbr = MinimumBayesRisk(out["lattice"])
    confd = mbr.get_one_best_confidences()
    print(out)

    if asr_client is not None:
        asr_client.completeUtterance(utterance=out["text"],
                                     key=key + "-utt%d-part%d" % (utt, part),
                                     confidences=confd,
                                     speaker=speaker)
        asr_client.sendstatus(isDecoding=False, shutdown=True)
    print("Done, will exit now.")
def compute_vad(wav_rspecifier, feats_wspecifier, opts):
    """Compute VAD features based on LTSV for every wave file in a table.

    For each utterance a spectrogram of the selected channel is computed,
    passed through ARMA smoothing, LTSV and a DCT, and the result is written
    to ``feats_wspecifier``. If the ``test_plot`` option is set, a plot is
    produced as well.

    Args:
        wav_rspecifier: Kaldi specifier for reading wav files.
        feats_wspecifier: Kaldi wspecifier for writing feature files.
        opts: Options. See main function for list of options

    Returns:
        True if computation was successful for at least one file.
        False otherwise.
    """
    num_utts, num_success = 0, 0
    with SequentialWaveReader(wav_rspecifier) as reader, \
            VectorWriter(feats_wspecifier) as writer:

        for num_utts, (key, wave) in enumerate(reader, 1):
            if wave.duration < opts.min_duration:
                print(
                    "File: {} is too short ({} sec): "
                    "producing no output.".format(key, wave.duration),
                    file=sys.stderr,
                )
                continue

            num_chan = wave.data().num_rows
            if opts.channel >= num_chan:
                # The placeholders used to be printed verbatim because the
                # format() call was missing.
                print(
                    "File with id {} has {} channels but you specified "
                    "channel {}, producing no output.".format(
                        key, num_chan, opts.channel),
                    file=sys.stderr,
                )
                continue
            # A channel of -1 selects the first channel.
            channel = 0 if opts.channel == -1 else opts.channel

            # Window length and shift are multiplied by 1e-3 times the
            # sampling frequency to obtain a number of samples.
            fr_length_samples = int(opts.frame_window * wave.samp_freq *
                                    (10**(-3)))
            fr_shift_samples = int(opts.frame_shift * wave.samp_freq *
                                   (10**(-3)))

            assert opts.nfft >= fr_length_samples

            wav_data = np.squeeze(wave.data()[channel].numpy())

            sample_freqs, segment_times, spec = signal.spectrogram(
                wav_data,
                fs=wave.samp_freq,
                nperseg=fr_length_samples,
                nfft=opts.nfft,
                noverlap=fr_length_samples - fr_shift_samples,
                scaling="spectrum",
                mode="psd",
            )

            specT = np.transpose(spec)

            spect_n = ARMA.ApplyARMA(specT, opts.arma_order)

            ltsv_f = LTSV.ApplyLTSV(
                spect_n,
                opts.ltsv_ctx_window,
                opts.threshold,
                opts.slope,
                opts.sigmoid_scale,
            )

            vad_feat = DCTF.ApplyDCT(opts.dct_num_cep, opts.dct_ctx_window,
                                     ltsv_f)

            if opts.test_plot:
                show_plot(
                    key,
                    segment_times,
                    sample_freqs,
                    spec,
                    wave.duration,
                    wav_data,
                    vad_feat,
                )

            writer[key] = Vector(vad_feat)
            num_success += 1

            if num_utts % 10 == 0:
                print("Processed {} utterances".format(num_utts),
                      file=sys.stderr)

    print(
        "Done {} out of {} utterances".format(num_success, num_utts),
        file=sys.stderr,
    )

    return num_success != 0
def testFullGmm(self):
    """Check FullGmm likelihoods against a direct computation and copies.

    A random full-covariance mixture is built, its log-likelihood for a
    random feature vector is computed by hand, and the result is compared
    with ``FullGmm`` models populated through the different setters,
    accessors and copy routines.
    """
    dim = 1 + np.random.randint(low=0, high=9)
    nMix = 1 + np.random.randint(low=0, high=9)

    print("Testing NumGauss: {}, Dim: {}".format(nMix, dim))

    feat = Vector([kaldi_math.rand_gauss() for _ in range(dim)])
    weights = Vector([kaldi_math.rand_uniform() for _ in range(nMix)])
    weight_sum = weights.sum()
    for idx, weight in enumerate(weights):
        weights[idx] = weight / weight_sum

    means = Matrix([[kaldi_math.rand_gauss() for _ in range(dim)]
                    for _ in range(nMix)])

    invcovars = [SpMatrix(dim) for _ in range(nMix)]
    covars_logdet = []
    for mix in range(nMix):
        covar, matrix_sqrt, logdet_out = RandPosdefSpMatrix(dim)
        invcovars[mix].copy_from_sp_(covar)
        invcovars[mix].invert_double_()
        covars_logdet.append(logdet_out)

    # Calculate loglike for feature Vector
    def auxLogLike(w, logdet, mean_row, invcovar):
        return -0.5 * ( kaldi_math.M_LOG_2PI * dim \
                      + logdet \
                      + vec_mat_vec(mean_row, invcovar, mean_row) \
                      + vec_mat_vec(feat, invcovar, feat)) \
                + vec_mat_vec(mean_row, invcovar, feat) \
                + np.log(w)

    component_loglikes = [
        auxLogLike(weights[m], covars_logdet[m], means[m, :], invcovars[m])
        for m in range(nMix)
    ]
    loglike = Vector(component_loglikes).log_sum_exp()

    # new Gmm
    gmm = FullGmm(nMix, dim)
    gmm.set_weights(weights)
    gmm.set_inv_covars_and_means(invcovars, means)
    gmm.compute_gconsts()

    loglike1, posterior1 = gmm.component_posteriors(feat)

    self.assertAlmostEqual(loglike, loglike1, delta=0.01)
    self.assertAlmostEqual(1.0, posterior1.sum(), delta=0.01)

    weights_bak = gmm.weights()
    means_bak = gmm.means()
    invcovars_bak = gmm.covars()
    # The covariances are inverted in place to obtain inverse covariances.
    for mix in range(nMix):
        invcovars_bak[mix].invert_double_()

    # Set all params one-by-one to new model
    gmm2 = FullGmm(gmm.num_gauss(), gmm.dim())
    gmm2.set_weights(weights_bak)
    gmm2.set_means(means_bak)
    gmm2.inv_covars_ = invcovars_bak
    gmm2.compute_gconsts()

    loglike_gmm2 = gmm2.log_likelihood(feat)
    self.assertAlmostEqual(loglike1, loglike_gmm2, delta=0.01)

    all_loglikes = gmm2.log_likelihoods(feat)
    self.assertAlmostEqual(all_loglikes.log_sum_exp(), loglike_gmm2)

    indices = list(range(gmm2.num_gauss()))
    preselected = gmm2.log_likelihoods_preselect(feat, indices)
    self.assertAlmostEqual(preselected.log_sum_exp(), loglike_gmm2)

    # Simple component mean accessor + mutator
    gmm3 = FullGmm(gmm.num_gauss(), gmm.dim())
    gmm3.set_weights(weights_bak)
    means_bak.set_zero_()
    for mix in range(nMix):
        gmm.get_component_mean(mix, means_bak[mix, :])
    gmm3.set_means(means_bak)
    gmm3.inv_covars_ = invcovars_bak
    gmm3.compute_gconsts()

    loglike_gmm3 = gmm3.log_likelihood(feat)
    self.assertAlmostEqual(loglike1, loglike_gmm3, delta=0.01)

    # Covariances and means fetched together, then inverted again.
    gmm4 = FullGmm(gmm.num_gauss(), gmm.dim())
    gmm4.set_weights(weights_bak)
    invcovars_bak, means_bak = gmm.get_covars_and_means()
    for mix in range(nMix):
        invcovars_bak[mix].invert_double_()
    gmm4.set_inv_covars_and_means(invcovars_bak, means_bak)
    gmm4.compute_gconsts()
    loglike_gmm4 = gmm4.log_likelihood(feat)
    self.assertAlmostEqual(loglike1, loglike_gmm4, delta=0.01)

    # TODO: I/O tests

    # CopyFromFullGmm
    gmm4 = FullGmm()
    gmm4.copy_from_full(gmm)
    loglike5, _ = gmm4.component_posteriors(feat)
    self.assertAlmostEqual(loglike, loglike5, delta=0.01)

    # CopyFromDiag
    gmm_diag = DiagGmm(nMix, dim)
    init_rand_diag_gmm(gmm_diag)
    loglike_diag = gmm_diag.log_likelihood(feat)

    gmm_full = FullGmm().copy(gmm_diag)
    loglike_full = gmm_full.log_likelihood(feat)

    gmm_diag2 = DiagGmm().copy(gmm_full)
    loglike_diag2 = gmm_diag2.log_likelihood(feat)

    self.assertAlmostEqual(loglike_diag, loglike_full, delta=0.01)
    self.assertAlmostEqual(loglike_diag, loglike_diag2, delta=0.01)
def compute_vad(wav_rspecifier, feats_wspecifier, opts):
    """Compute LTSV-based VAD features for every utterance of a wave table.

    Each utterance is read from ``wav_rspecifier``; a spectrogram of the
    selected channel is computed and passed through ``ARMA.ApplyARMA``,
    ``LTSV.ApplyLTSV`` and ``DCTF.ApplyDCT``.  The resulting vector is written
    to ``feats_wspecifier`` under the utterance key.  If ``opts.test_plot`` is
    set, ``show_plot`` is called for the utterance as well.

    Utterances shorter than ``opts.min_duration``, utterances that do not have
    the requested channel, and utterances whose feature computation raises
    are reported on stderr and skipped.

    Args:
        wav_rspecifier: Kaldi rspecifier (ark or scp) of the input audio.
        feats_wspecifier: Kaldi wspecifier (ark or scp) that receives the
            computed VAD feature vectors.
        opts: Options object.  Reads ``min_duration``, ``channel`` (-1 means
            channel 0), ``frame_window`` and ``frame_shift`` (presumably in
            milliseconds, they are scaled by ``samp_freq * 1e-3``), ``nfft``,
            ``arma_order``, ``ltsv_ctx_window``, ``threshold``, ``slope``,
            ``sigmoid_scale``, ``dct_num_cep``, ``dct_ctx_window`` and
            ``test_plot``.

    Returns:
        True if at least one utterance was processed successfully,
        False otherwise.
    """
    num_utts, num_success = 0, 0
    with SequentialWaveReader(wav_rspecifier) as reader, \
            VectorWriter(feats_wspecifier) as writer:
        for num_utts, (key, wave) in enumerate(reader, 1):
            if wave.duration < opts.min_duration:
                print("File: {} is too short ({} sec): producing no output."
                      .format(key, wave.duration), file=sys.stderr)
                continue

            num_chan = wave.data().num_rows
            if opts.channel >= num_chan:
                # The placeholders were previously printed verbatim because
                # the message was never formatted.
                print("File with id {} has {} channels but you specified "
                      "channel {}, producing no output."
                      .format(key, num_chan, opts.channel), file=sys.stderr)
                continue
            channel = 0 if opts.channel == -1 else opts.channel

            # Frame length / shift expressed in samples.
            fr_length_samples = int(opts.frame_window * wave.samp_freq
                                    * (10 ** (-3)))
            fr_shift_samples = int(opts.frame_shift * wave.samp_freq
                                   * (10 ** (-3)))

            try:
                wav_data = np.squeeze(wave.data()[channel].numpy())

                sample_freqs, segment_times, spec = signal.spectrogram(
                    wav_data,
                    fs=wave.samp_freq,
                    nperseg=fr_length_samples,
                    nfft=opts.nfft,
                    noverlap=fr_length_samples - fr_shift_samples,
                    scaling='spectrum',
                    mode='psd')

                # The spectrogram is transposed before the ARMA step.
                specT = np.transpose(spec)

                spect_n = ARMA.ApplyARMA(specT, opts.arma_order)

                ltsv_f = LTSV.ApplyLTSV(spect_n, opts.ltsv_ctx_window,
                                        opts.threshold, opts.slope,
                                        opts.sigmoid_scale)

                vad_feat = DCTF.ApplyDCT(opts.dct_num_cep,
                                         opts.dct_ctx_window, ltsv_f)

                feats = Vector(vad_feat)

                if opts.test_plot:
                    show_plot(segment_times, sample_freqs, spec, wave,
                              wav_data, vad_feat)
            except Exception:
                # Best effort: report and move on to the next utterance, but
                # let KeyboardInterrupt / SystemExit propagate.
                print("Failed to compute features for utterance", key,
                      file=sys.stderr)
                continue

            writer[key] = feats
            num_success += 1

            if num_utts % 10 == 0:
                print("Processed {} utterances".format(num_utts),
                      file=sys.stderr)

    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)

    return num_success != 0
def getExampleObj(self):
    """Return a sample Vector built from the values 1 through 5."""
    sample_values = [1, 2, 3, 4, 5]
    return Vector(sample_values)
# NOTE(review): headerless fragment -- the enclosing definition and the loop
# that provides `line` are not visible here, and the original line breaks /
# indentation have been lost, so the nesting of the statements below
# (in particular where `cmvn.write_stats` sits) must be confirmed against the
# original file before this is reformatted.
#
# What the visible statements do:
#   * take the first two whitespace-separated fields of `line` as the paths of
#     a text marker file and a binary "seq" file and open both;
#   * for each marker line, seek to the offset in field 1, read the byte count
#     in field 2 (rounded down to an even number) and view it as int16 samples;
#   * wrap the samples in an AudioSegment, apply a randomly chosen entry of
#     `speed_rate`, normalise to a level drawn uniformly from [-55, -10), and
#     convert back to int16;
#   * compute features with `fbank.compute_features`; when `args.cmn` is set,
#     subtract the mean over axis 0 before accumulating into `cmvn`;
#   * write the accumulated statistics to `args.cmvn_stats` (binary=False).
mrk_fn = line.split()[0] seq_fn = line.split()[1] with open(mrk_fn, 'r', encoding='utf-8') as mrk, \ open(seq_fn, 'rb') as seq: for mrk_line in mrk: seq.seek(int(mrk_line.split()[1])) num_bytes = int(mrk_line.split()[2]) #this is making sure even number of bytes num_bytes -= num_bytes % 2 audio_bytes = seq.read(num_bytes) audio_np = np.frombuffer(audio_bytes, dtype='int16') audio_seg = AudioSegment(audio_np, args.sample_rate) spr = speed_rate[randint(0, len(speed_rate) - 1)] audio_seg.change_speed(spr) #-55 to -10 db audio_seg.normalize(np.random.uniform(-55, -10)) audio_np = audio_seg._convert_samples_from_float32(\ audio_seg.samples, 'int16') wave_1ch = Vector(audio_np) feats = fbank.compute_features(wave_1ch, args.sample_rate, vtnl_warp=1.0) if args.cmn: feats = _matrix_ext.matrix_to_numpy(feats) feats -= np.mean(feats, axis=0) feats = Matrix(feats) cmvn.accumulate(feats) cmvn.write_stats(args.cmvn_stats, binary=False)
def otf_utt_generator(data_triplets, rir, noise, args):
    """Yield padded mini-batches of on-the-fly augmented features and labels.

    For every triplet, the marker file and the alignment table are read in
    lock-step.  Each utterance's raw int16 audio is cut out of the seq file,
    speed- and gain-perturbed, turned into filterbank features, spliced and
    sub-sampled by ``args.stride``.  Utterances whose (label length * frame
    count) // 3 exceeds ``args.TU_limit`` are dropped but still count towards
    the batch size.

    Args:
        data_triplets: iterable of indexable triplets: item 0 is the path of
            the text marker ("mrk") file, item 1 the path of the binary "seq"
            audio file and item 2 the rspecifier of the label alignments.
        rir: list of rir, List[AudioSegment].  Not used by the current code;
            see the commented example in the loop body.
        noise: list of noise, List[AudioSegment].  Not used by the current
            code; see the commented example in the loop body.
        args: arguments for the loader.  Reads ``max_len``, ``batch_size``,
            ``speed_rate`` and ``gain_range`` (comma-separated strings),
            ``feat_config``, ``sample_rate``, ``reverse_labels``, ``SOS``,
            ``EOS``, ``stride``, ``lctx``, ``rctx``, ``TU_limit``,
            ``padding_tgt`` and ``batch_first``.

    Yields:
        After every ``args.batch_size`` consumed utterances, a tuple
        ``(data, target, lens, ali_lens)`` of torch tensors holding the
        utterances that passed the T*U limit; if none passed, the tuple
        ``(None, None, IntTensor([0]), IntTensor([0]))``.  A partially filled
        batch carries over to the next triplet; whatever is left after the
        last triplet is not yielded.  A single ``None`` is yielded at the end.
    """
    max_len = args.max_len
    batch_size = args.batch_size

    # Reusable batch buffers; only the first `valid_idx` rows are meaningful.
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)

    batch_idx = 0        # utterances consumed for the current batch
    valid_idx = 0        # utterances actually stored in the buffers
    batch_max_len = -1
    target_max_len = -1

    # rates for speed perturbation
    speed_rate = [float(rate) for rate in args.speed_rate.split(',')]

    # volume level perturbation (the configured values are negated)
    gain_lo, gain_hi = [-float(gain) for gain in args.gain_range.split(',')]

    # Fbank config
    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)

    for data_triplet in data_triplets:
        mrk_fn, seq_fn = data_triplet[0], data_triplet[1]
        ali_rspec = data_triplet[2]

        # The alignment reader is part of the `with` so that it is closed
        # even when an exception is raised or the generator is closed early.
        with open(mrk_fn, 'r', encoding='utf-8') as mrk, \
                open(seq_fn, 'rb') as seq, \
                SequentialIntVectorReader(ali_rspec) as ali_reader:
            for line, (uttid1, ali) in zip(mrk, ali_reader):
                fields = line.split()
                uttid = fields[0]
                assert uttid == uttid1

                seq.seek(int(fields[1]))
                num_bytes = int(fields[2])
                # int16 samples: keep an even number of bytes
                num_bytes -= num_bytes % 2
                audio_bytes = seq.read(num_bytes)
                audio_np = np.frombuffer(audio_bytes, dtype='int16')

                # data augmentation function goes here
                audio_seg = AudioSegment(audio_np, args.sample_rate)
                # speed perturbation
                spr = speed_rate[randint(0, len(speed_rate) - 1)]
                audio_seg.change_speed(spr)
                audio_seg.normalize(np.random.uniform(gain_lo, gain_hi))
                # noise adding example:
                #   snr = truncnorm.rvs(lo, hi, scale=sigma, loc=mu, size=1)
                #   audio_seg.add_noise(noise[randint(0, len(noise)-1)], snr)
                # rir adding example:
                #   audio_seg.convolve_and_normalize(rir[randint(0, len(rir)-1)])
                audio_np = audio_seg._convert_samples_from_float32(
                    audio_seg.samples, 'int16')
                wave_1ch = Vector(audio_np)
                feats = fbank.compute_features(wave_1ch, args.sample_rate,
                                               vtnl_warp=1.0)

                ali = np.array(ali)
                if args.reverse_labels:
                    ali = ali[::-1]
                if args.SOS >= 0:
                    ali = np.concatenate(([args.SOS], ali))
                if args.EOS >= 0:
                    ali = np.concatenate((ali, [args.EOS]))

                feats = _matrix_ext.matrix_to_numpy(feats)
                # Number of frames left after striding, rounded up.
                num_frames = feats.shape[0]
                utt_len = num_frames // args.stride + \
                    int(num_frames % args.stride != 0)

                # limits on T*U products due to RNNT.
                # this is pretty hacky now
                if ali.shape[0] * utt_len // 3 <= args.TU_limit:
                    ali_len[valid_idx] = ali.shape[0]
                    data_buffer[valid_idx, :utt_len, :] = \
                        splice(feats, args.lctx, args.rctx)[::args.stride]
                    target_buffer[valid_idx, :ali_len[valid_idx]] = ali
                    len_buffer[valid_idx] = utt_len
                    if utt_len > batch_max_len:
                        batch_max_len = utt_len
                    if ali_len[valid_idx] > target_max_len:
                        target_max_len = ali_len[valid_idx]
                    valid_idx += 1

                batch_idx += 1

                if batch_idx != batch_size:
                    continue

                # data and target padding
                for b in range(valid_idx):
                    utt_len = len_buffer[b]
                    if utt_len > 0:
                        # Repeat the last real frame up to the batch length.
                        data_buffer[b, utt_len:batch_max_len, :] = \
                            data_buffer[b, utt_len - 1, :]
                    # Label padding does not depend on the frame count, so
                    # stale labels from a previous batch are always cleared.
                    target_buffer[b, ali_len[b]:target_max_len] = \
                        args.padding_tgt

                data = data_buffer[:valid_idx, :batch_max_len, :]
                target = target_buffer[:valid_idx, :target_max_len]

                if not args.batch_first:
                    data = np.transpose(data, (1, 0, 2))
                    target = np.transpose(target, (1, 0))

                # Copies detach the tensors from the reusable buffers.
                data = torch.from_numpy(np.copy(data))
                target = torch.from_numpy(np.copy(target))
                lens = torch.from_numpy(np.copy(len_buffer[:valid_idx]))
                ali_lens = torch.from_numpy(np.copy(ali_len[:valid_idx]))

                if valid_idx > 0:
                    # not doing cuda() here, in main process instead
                    yield data, target, lens, ali_lens
                else:
                    yield None, None, \
                        torch.IntTensor([0]), torch.IntTensor([0])

                batch_idx = 0
                valid_idx = 0
                batch_max_len = -1
                target_max_len = -1

    yield None
def compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts):
    """Compute MFCC features for every utterance of a wave table.

    Each utterance is read from ``wav_rspecifier``, features are computed for
    the selected channel with an ``Mfcc`` built from ``mfcc_opts``, optionally
    mean-normalised, and written to ``feats_wspecifier`` under the utterance
    key.  Utterances that are too short, lack the requested channel, have no
    vtln-map entry (when a map is used) or fail during feature computation
    are reported on stderr and skipped.

    Args:
        wav_rspecifier: Kaldi rspecifier of the input audio.
        feats_wspecifier: Kaldi wspecifier that receives the feature matrices.
        opts: Options object.  Reads ``vtln_map``, ``utt2spk``,
            ``min_duration``, ``channel`` (-1 means channel 0), ``vtln_warp``
            and ``subtract_mean``.
        mfcc_opts: Options passed to the ``Mfcc`` constructor.

    Returns:
        True if at least one utterance was processed successfully,
        False otherwise.
    """
    mfcc = Mfcc(mfcc_opts)

    vtln_map_reader = None
    if opts.vtln_map:
        vtln_map_reader = RandomAccessFloatReaderMapped(opts.vtln_map,
                                                        opts.utt2spk)
    elif opts.utt2spk:
        print("utt2spk option is needed only if vtln-map option is specified.",
              file=sys.stderr)

    num_utts, num_success = 0, 0
    try:
        with SequentialWaveReader(wav_rspecifier) as reader, \
                MatrixWriter(feats_wspecifier) as writer:
            for num_utts, (key, wave) in enumerate(reader, 1):
                if wave.duration < opts.min_duration:
                    print("File: {} is too short ({} sec): "
                          "producing no output."
                          .format(key, wave.duration), file=sys.stderr)
                    continue

                num_chan = wave.data().num_rows
                if opts.channel >= num_chan:
                    # The placeholders were previously printed verbatim
                    # because the message was never formatted.
                    print("File with id {} has {} channels but you specified "
                          "channel {}, producing no output."
                          .format(key, num_chan, opts.channel),
                          file=sys.stderr)
                    continue
                channel = 0 if opts.channel == -1 else opts.channel

                if vtln_map_reader is not None:
                    if key not in vtln_map_reader:
                        print("No vtln-map entry for utterance-id "
                              "(or speaker-id)", key, file=sys.stderr)
                        continue
                    vtln_warp = vtln_map_reader[key]
                else:
                    vtln_warp = opts.vtln_warp

                try:
                    feats = mfcc.compute_features(wave.data()[channel],
                                                  wave.samp_freq, vtln_warp)
                except Exception:
                    # Best effort: report and move on, but let
                    # KeyboardInterrupt / SystemExit propagate.
                    print("Failed to compute features for utterance", key,
                          file=sys.stderr)
                    continue

                # Guard against an empty feature matrix, which would
                # otherwise divide by zero when averaging.
                if opts.subtract_mean and feats.num_rows > 0:
                    mean = Vector(feats.num_cols)
                    mean.add_row_sum_mat_(1.0, feats)
                    mean.scale_(1.0 / feats.num_rows)
                    for i in range(feats.num_rows):
                        feats[i].add_vec_(-1.0, mean)

                writer[key] = feats
                num_success += 1

                if num_utts % 10 == 0:
                    print("Processed {} utterances".format(num_utts),
                          file=sys.stderr)

        print("Done {} out of {} utterances".format(num_success, num_utts),
              file=sys.stderr)
    finally:
        # Close the vtln map even when an exception escapes the loop.
        if vtln_map_reader is not None:
            vtln_map_reader.close()

    return num_success != 0