Python Vector Exemples, kaldi.matrix.Vector Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : compute_ppg.py Projet : taalua/Adversarial-Many-to-Many-VC

def compute_full_ppg(nnet: Nnet, feats: Matrix) -> Matrix:
    """Run the acoustic model over `feats` and collect its per-frame outputs.

    The network is switched to batch-norm/dropout test mode and collapsed
    before decoding; these calls are made on the `nnet` object passed in.

    Args:
        nnet: The nnet3 acoustic model.
        feats: T*D input feature matrix suitable for the model.

    Returns:
        raw_ppgs: T*K matrix with one network output row per frame
            (K is the number of senones).
    """
    # NOTE: the decodable object has to be built inside this function; the
    # original author found it does not work otherwise (reason unknown).
    nnet3.set_batchnorm_test_mode(True, nnet)
    nnet3.set_dropout_test_mode(True, nnet)
    nnet3.collapse_model(nnet3.CollapseModelConfig(), nnet)

    decode_opts = nnet3.NnetSimpleComputationOptions()
    decode_opts.acoustic_scale = 1.0
    compiler = nnet3.CachingOptimizingCompiler.new_with_optimize_opts(
        nnet, decode_opts.optimize_config)
    empty_priors = Vector()  # Priors are not needed here.
    decodable = nnet3.DecodableNnetSimple(decode_opts, nnet, empty_priors,
                                          feats, compiler)

    # Copy the network output frame by frame into the result matrix.
    frame_count = decodable.num_frames()
    senone_count = decodable.output_dim()
    raw_ppgs = Matrix(frame_count, senone_count)
    for frame in range(frame_count):
        frame_out = Vector(senone_count)
        decodable.get_output_for_frame(frame, frame_out)
        raw_ppgs.copy_row_from_vec_(frame_out, frame)
    return raw_ppgs

Exemple #2

0

Afficher le fichier

Fichier : tools.py Projet : thomascherickal/linto-platform-stt-standalone-worker

    def computeVAD_KALDI(self, feats):
        """Build a smoothed speech/non-speech frame mask from energy-based VAD.

        The per-frame VAD decisions are run-length encoded, short runs are
        merged into their predecessor, and the surviving speech runs are
        written into a 0/1 mask.

        Args:
            feats: Feature matrix handed to `compute_vad_energy`.

        Returns:
            maskSAD: array of shape (1, number of frames) holding 1 for
                frames inside a run whose value is 1.0 and 0 elsewhere.
                NOTE(review): if a ValueError is raised inside the try body
                it is only logged, and the method then returns None.

        Raises:
            ValueError: if any exception other than ValueError occurs.
        """
        try:
            vadStream = compute_vad_energy(self.vad_ops, feats)
            vad = Vector(vadStream)
            VAD = vad.numpy()

            # Run-length encoding: `value[k]` is the VAD decision of run k and
            # `occurence[k]` is how many consecutive frames it spans.
            # NOTE(review): VAD[0] presumably fails on an empty VAD vector.
            occurence = []
            value = []
            occurence.append(1)
            value.append(VAD[0])

            # compute the speech and non-speech frames
            for i in range(1, len(VAD)):
                if value[-1] == VAD[i]:
                    occurence[-1] += 1
                else:
                    occurence.append(1)
                    value.append(VAD[i])

            # Merge into the previous run every run (except the first) that is
            # shorter than 30 frames or has the same value as its predecessor.
            # The index only advances when nothing was merged, because the
            # deletion shifts the next run into position i.
            i = 0
            while(i < len(occurence)):
                if i != 0 and (occurence[i] < 30 or value[i-1] == value[i]):
                    occurence[i-1] += occurence[i]
                    del value[i]
                    del occurence[i]
                else:
                    i += 1

            # Second pass: merge silence runs (value 0.0) shorter than 30
            # frames, and equal-valued neighbours, into the previous run.
            # NOTE(review): the original comment here said "split if and only
            # if the silence is above 50 frames", but the code uses 30.
            i = 0
            while(i < len(occurence)):
                if i != 0 and ((occurence[i] < 30 and value[i] == 0.0) or value[i-1] == value[i]):
                    occurence[i-1] += occurence[i]
                    del value[i]
                    del occurence[i]
                else:
                    i += 1

            # compute VAD mask: walk the runs, marking speech runs with 1 and
            # skipping over the others.
            maskSAD = np.zeros(len(VAD))
            start = 0
            for i in range(len(occurence)):
                if value[i] == 1.0:
                    end = start+occurence[i]
                    maskSAD[start:end] = 1
                    start = end
                else:
                    start += occurence[i]

            # Add a leading axis so the mask has shape (1, num_frames).
            maskSAD = np.expand_dims(maskSAD, axis=0)
        except ValueError as v:
            # Logged only; execution falls through and None is returned.
            self.log.error(v)
        except Exception as e:
            self.log.error(e)
            raise ValueError(
                "Speaker diarization failed while voice activity detection!!!")
        else:
            return maskSAD

Exemple #3

0

Afficher le fichier

Fichier : __init__.py Projet : BRASLab/VAWSR_backend

    def feat_pipeline(vec, freq):
        """Extract delta + sliding-window-CMN features for voiced frames only.

        Args:
            vec: Waveform passed to `base.compute_features`.
            freq: Sampling frequency passed to `base.compute_features`.

        Returns:
            A Matrix with one row per frame whose VAD decision equals 1, or
            False when the VAD decisions sum to zero.
        """
        raw_feats = base.compute_features(vec, freq, 1.0)

        # The VAD decision is computed on the original features.
        voice = Vector(compute_vad_energy(vad_opts, raw_feats))

        delta_feats = compute_deltas(delta_opts, raw_feats)
        n_cols = delta_feats.num_cols

        sliding_feats = Matrix(delta_feats.num_rows, n_cols)
        sliding_window_cmn(sliding_opts, delta_feats, sliding_feats)

        voiced_total = voice.sum()
        if not voiced_total:
            LOG.warning('No features were judged as voiced for utterance')
            return False

        voice_feats = Matrix(int(voiced_total), n_cols)
        normalized = kaldi_Matrix(sliding_feats)

        # Copy every voiced frame into the next free output row.
        out_row = 0
        for frame_idx, _ in enumerate(normalized):
            if voice[frame_idx] == 1:
                voice_feats.row(out_row).copy_row_from_mat_(normalized,
                                                            frame_idx)
                out_row += 1

        LOG.debug('Feats extract successed')
        return voice_feats

Exemple #4

0

Afficher le fichier

    def testCuVectorSwap(self):
        """Check CuVector.swap against a populated and an empty Vector."""
        values = [2, 3, 5, 7, 13]
        host_vec = Vector(values).clone()
        device_vec = CuVector.new_from_size(5)
        # Swap *is* destructive: the two vectors exchange their contents.
        device_vec.swap(host_vec)

        # sqrt(4 + 9 + 25 + 49 + 169) == 16
        self.assertEqual(16.0, device_vec.norm(2))

        empty_host = Vector()
        empty_device = CuVector.new_from_size(0)
        empty_device.swap(empty_host)
        self.assertEqual(0.0, empty_device.norm(2))

Exemple #5

0

Afficher le fichier

    def test_nnet_decodable(self):
        """Compare DecodableNnetSimple with DecodableNnetSimpleLooped.

        A random network is generated, both decodables are run over the same
        random input, and their per-frame outputs are required to be
        approximately equal unless the network is recurrent or its info
        string mentions statistics extraction or TimeHeightConvolution.
        """
        nnet = Nnet()
        config_sequence = generate_config_sequence(NnetGenerationOptions())
        for idx, config_text in enumerate(config_sequence):
            print("Input config[{}]:".format(idx))
            print(config_text)
            nnet.read_config(istringstream.from_str(config_text))

        num_frames = 5 + random.randint(1, 100)
        input_dim = nnet.input_dim("input")
        output_dim = nnet.output_dim("output")
        # A negative dimension is clamped to 0 (no i-vector input).
        ivector_dim = max(0, nnet.input_dim("ivector"))
        feats = Matrix(num_frames, input_dim)

        set_batchnorm_test_mode(True, nnet)
        set_dropout_test_mode(True, nnet)

        feats.set_randn_()
        ivector = Vector(ivector_dim)
        ivector.set_randn_()
        ivector_arg = ivector if ivector_dim else None

        # Randomly run with or without priors.
        priors = Vector(output_dim if random.choice([True, False]) else 0)
        if len(priors):
            priors.set_randn_()
            priors.apply_exp_()

        simple_out = Matrix(num_frames, output_dim)
        looped_out = Matrix(num_frames, output_dim)

        simple_opts = NnetSimpleComputationOptions()
        simple_opts.frames_per_chunk = random.randint(5, 25)
        compiler = CachingOptimizingCompiler(nnet)
        simple_decodable = DecodableNnetSimple(simple_opts, nnet, priors,
                                               feats, compiler, ivector_arg)
        for t in range(num_frames):
            simple_decodable.get_output_for_frame(t, simple_out[t])

        looped_opts = NnetSimpleLoopedComputationOptions()
        looped_info = DecodableNnetSimpleLoopedInfo.new_from_priors(
            looped_opts, priors, nnet)
        looped_decodable = DecodableNnetSimpleLooped(looped_info, feats,
                                                     ivector_arg)
        for t in range(num_frames):
            looped_decodable.get_output_for_frame(t, looped_out[t])

        # Only compare outputs for networks where both decodables are
        # expected to agree.
        comparable = not nnet_is_recurrent(nnet)
        if comparable:
            info_text = nnet.info()
            comparable = (
                info_text.find("statistics-extraction") == -1
                and info_text.find("TimeHeightConvolutionComponent") == -1)
        if comparable:
            for t in range(num_frames):
                self.assertTrue(approx_equal(simple_out[t], looped_out[t]))

Exemple #6

0

Afficher le fichier

def init_rand_diag_gmm(gmm):
    """Fill `gmm` with random weights, means and inverse variances.

    Weights are uniform draws normalised to sum to one, means are Gaussian
    draws, and variances are exp(Gaussian) draws that are inverted before
    being handed to the model. The model is then perturbed and its
    g-constants recomputed.

    Args:
        gmm: The model to initialise; it is modified through its setters.
    """
    num_comp = gmm.num_gauss()
    dim = gmm.dim()

    weights = Vector([kaldi_math.rand_uniform() for _ in range(num_comp)])
    weight_total = weights.sum()
    for comp in range(num_comp):
        weights[comp] = weights[comp] / weight_total

    mean_rows = []
    for _ in range(num_comp):
        mean_rows.append([kaldi_math.rand_gauss() for _ in range(dim)])
    means = Matrix(mean_rows)

    var_rows = []
    for _ in range(num_comp):
        var_rows.append(
            [kaldi_math.exp(kaldi_math.rand_gauss()) for _ in range(dim)])
    inv_vars = Matrix(var_rows)
    inv_vars.invert_elements_()

    gmm.set_weights(weights)
    gmm.set_inv_vars_and_means(inv_vars, means)
    gmm.perturb(0.5 * kaldi_math.rand_uniform())
    gmm.compute_gconsts()

Exemple #7

0

Afficher le fichier

Fichier : feat.py Projet : narendranp/fac-via-ppg-incomplete

def read_wav_kaldi_internal(wav, fs) -> WaveData:
    """Wrap raw samples in a Kaldi WaveData object (first channel only).

    Args:
        wav: S*C ndarray. S is number of samples and C is number of channels.
        fs: Sampling frequency.

    Returns:
        wd: A WaveData object, or None (after logging an error) when
            WaveData offers neither a `new` nor a `from_data` constructor.
    """
    # Drop every channel but the first when the input is multi-channel.
    mono = wav if wav.ndim < 2 else wav[:, 0]

    # WaveData takes a matrix, so store the samples as a single row.
    sample_row = Matrix(1, len(mono))
    sample_row.copy_rows_from_vec_(Vector(mono))

    # The constructor name differs between PyKaldi versions; use whichever
    # one this build exposes, preferring `new`.
    for ctor_name in ('new', 'from_data'):
        if hasattr(WaveData, ctor_name):
            return getattr(WaveData, ctor_name)(fs, sample_row)

    logging.error('Unknown Pykaldi package.')
    return None

Exemple #8

0

Afficher le fichier

Fichier : trace_old.py Projet : shahinkl/espnet

def compute_pitch_feats_and_post(data):
    """Compute processed Kaldi pitch features with default options.

    Args:
        data: Waveform samples; converted to a Kaldi Vector.

    Returns:
        The features returned by `compute_and_process_kaldi_pitch`,
        converted with `.numpy()`.
    """
    pitch_feats = compute_and_process_kaldi_pitch(
        PitchExtractionOptions(), ProcessPitchOptions(), Vector(data))
    return pitch_feats.numpy()

Exemple #9

0

Afficher le fichier

def post_to_count(feature_rspecifier, cnt_wspecifier, normalize=False, per_utt=False):
    """Write frame-averaged feature vectors to a Kaldi vector table.

    Args:
        feature_rspecifier: Kaldi rspecifier of the input matrices.
        cnt_wspecifier: Kaldi wspecifier for the output vectors.
        normalize: Only used when `per_utt` is False. If True, the summed
            per-utterance means are divided by the number of utterances.
        per_utt: If True, write one vector per utterance (the column-wise
            mean of its matrix) under the utterance id. Otherwise write a
            single vector, keyed by the number of utterances read.

    Returns:
        Always True.
    """
    with SequentialMatrixReader(feature_rspecifier) as feature_reader, \
            VectorWriter(cnt_wspecifier) as cnt_writer:
        if per_utt:
            for uttid, feat in feature_reader:
                cnt_writer[uttid] = Vector(feat.numpy().mean(axis=0))
        else:
            vec = 0
            num_done = 0
            for _, feat in feature_reader:
                vec = vec + feat.numpy().mean(axis=0)
                num_done += 1
            # Guard against an empty input table: 0 / 0 would raise
            # ZeroDivisionError, whereas the un-normalised path succeeds.
            if normalize and num_done:
                vec = vec / num_done
            cnt_writer[str(num_done)] = Vector(vec)
    return True

Exemple #10

0

Afficher le fichier

Fichier : vector-test.py Projet : prismdata/kor_asr

    def add_vec_(self):
        """Exercise `add_vec_` on 5-element vectors of `self.vector_class`.

        NOTE(review): the name lacks a `test` prefix, so presumably a
        standard unittest loader does not collect it — confirm.
        NOTE(review): the first two calls pass only a vector while the last
        passes `(1.0, v1)`; confirm which signature `add_vec_` supports.
        """
        v = self.vector_class(5).set_randn_()
        v1 = self.vector_class(5).set_zero_()
        # Adding an all-zero vector is expected to leave `v` equal to itself.
        self.assertEqual(v, v.add_vec_(v1))

        v1 = v1.set_randn_()
        # NOTE(review): if add_vec_ returns `v` itself, this compares an
        # object with itself — verify the assertion is meaningful.
        self.assertNotEqual(v, v.add_vec_(v1))

        # Adding the negated copy is expected to give a freshly built
        # Vector(5), presumably all zeros.
        v1 = v.clone()
        v1 = v1.scale_(-1.0)
        self.assertEqual(Vector(5), v.add_vec_(1.0, v1))

Exemple #11

0

Afficher le fichier

Fichier : post-count.py Projet : JerryPeng21cuhk/mFVAE

def feat_to_count(feature_rspecifier,
                  cnt_wspecifier,
                  normalize=False,
                  per_utt=False):
    """Write frame-averaged feature vectors to a Kaldi vector table.

    Args:
        feature_rspecifier: Kaldi rspecifier of the input matrices.
        cnt_wspecifier: Kaldi wspecifier for the output vectors.
        normalize: Only used when `per_utt` is False. If True, the summed
            per-utterance means are divided by the number of utterances.
        per_utt: If True, write one vector per utterance (the column-wise
            mean of its matrix) under the utterance id. Otherwise write a
            single vector, keyed by the number of utterances read.

    Returns:
        Always True.
    """
    with SequentialMatrixReader(feature_rspecifier) as feature_reader, \
            VectorWriter(cnt_wspecifier) as cnt_writer:
        if per_utt:
            for uttid, feat in feature_reader:
                cnt_writer[uttid] = Vector(feat.numpy().mean(axis=0))
        else:
            vec = 0
            num_done = 0
            for _, feat in feature_reader:
                vec = vec + feat.numpy().mean(axis=0)
                num_done += 1
            # Guard against an empty input table: 0 / 0 would raise
            # ZeroDivisionError, whereas the un-normalised path succeeds.
            if normalize and num_done:
                vec = vec / num_done
            cnt_writer[str(num_done)] = Vector(vec)
    return True

Exemple #12

0

Afficher le fichier

    def write(self, key, value):
        """Writes the `(key, value)` pair to the table.

        This method is provided for compatibility with the C++ API only;
        most users should use the Pythonic API.

        Overrides write to accept both Vector and SubVector: the value is
        passed through `Vector(...)` before being handed to the parent
        class's `write`.

        Args:
            key (str): The key.
            value: The value to write; anything `Vector(...)` accepts.
        """
        super(VectorWriter, self).write(key, Vector(value))

Exemple #13

0

Afficher le fichier

Fichier : feat.py Projet : narendranp/fac-via-ppg-incomplete

def apply_feat_transform(feats: Matrix, transform: Matrix) -> Matrix:
    """Apply an LDA/fMLLR transform on the input features.

    For a D'*D transform the output is F * T' (' is transpose). For a
    D'*(D+1) transform the first D columns are used as the linear part and
    the last column is added to every output row as an offset; see
    http://kaldi-asr.org/doc/transform.html#transform_cmllr_global
    This is a heavily simplified version of
    https://github.com/kaldi-asr/kaldi/blob/5.3/src/featbin/transform-feats.cc

    Args:
        feats: A T*D feature matrix.
        transform: A D'*D or D'*(D+1) matrix, where D' is the output
            feature dim.

    Returns:
        feats_out: A T*D' matrix. If the transform has any other number of
            columns an error is logged and the freshly constructed,
            untouched output matrix is returned.
    """
    out_dim, in_dim = transform.num_rows, transform.num_cols
    feat_dim = feats.num_cols
    feats_out = Matrix(feats.num_rows, out_dim)

    has_offset = in_dim == feat_dim + 1
    if in_dim != feat_dim and not has_offset:
        logging.error(("Transform matrix has bad dimension %dx%d versus feat "
                       "dim %d") % (out_dim, in_dim, feat_dim))
        return feats_out

    # With an offset column present, only the first feat_dim columns form
    # the linear part; the implicit 1.0 appended to each feature selects
    # the last column.
    if has_offset:
        linear = SubMatrix(transform, 0, out_dim, 0, feat_dim)
    else:
        linear = transform
    feats_out.add_mat_mat_(feats, linear, MatrixTransposeType.NO_TRANS,
                           MatrixTransposeType.TRANS, 1.0, 0.0)

    if has_offset:
        bias = Vector(out_dim)
        bias.copy_col_from_mat_(transform, feat_dim)
        feats_out.add_vec_to_rows_(1.0, bias)
    return feats_out

Exemple #14

0

Afficher le fichier

Fichier : nnet3_model.py Projet : 3wille/kaldi-model-server

def advance_mic_decoding(adaptation_state, asr, asr_client, block,
                         chunks_decoded, feat_info, feat_pipeline, key,
                         last_chunk, part, prev_num_frames_decoded, samp_freq,
                         sil_weighting, speaker, utt):
    """Feed one audio block to the pipeline and advance the decoder.

    Args:
        asr: Recognizer; its decoder is advanced and queried.
        asr_client: Optional frontend client that receives partial results.
        block: Audio samples (numpy array) of the current chunk.
        feat_pipeline: Online feature pipeline that accepts the waveform.
        key, utt, part, speaker: Used to label partial utterances.
        last_chunk: Whether this is the final chunk of the utterance.
        prev_num_frames_decoded: Frame count after the previous call.
        samp_freq: Sampling frequency of `block`.
        sil_weighting: Silence weighting object for the i-vector features.
        adaptation_state, chunks_decoded, feat_info: Not used for the result.

    Returns:
        Tuple `(need_endpoint_finalize, num_frames_decoded, part, utt)`.
    """
    chunks_decoded += 1

    # Hand the block (converted to a Kaldi Vector) to the feature pipeline.
    feat_pipeline.accept_waveform(samp_freq, Vector(block))

    # On the final chunk, tell the pipeline that no more input will come.
    if last_chunk:
        feat_pipeline.input_finished()

    if sil_weighting.active():
        sil_weighting.compute_current_traceback(asr.decoder)
        # Pass the current silence weighting on to the i-vector features.
        ivector_feature = feat_pipeline.ivector_feature()
        frames_ready = feat_pipeline.num_frames_ready()
        ivector_feature.update_frame_weights(
            sil_weighting.get_delta_weights(frames_ready))

    # Advance the decoder over the newly available input.
    asr.advance_decoding()
    num_frames_decoded = asr.decoder.num_frames_decoded()

    # No endpoint or partial-result handling on the final chunk.
    if last_chunk:
        return False, num_frames_decoded, part, utt

    # An endpoint only counts once at least one frame has been decoded.
    if asr.endpoint_detected():
        return num_frames_decoded > 0, num_frames_decoded, part, utt

    # No endpoint: publish a partial result if new frames were decoded.
    if num_frames_decoded > prev_num_frames_decoded:
        partial = asr.get_partial_output()
        if asr_client is not None:
            part_key = key + "-utt%d-part%d" % (utt, part)
            asr_client.partialUtterance(utterance=partial["text"],
                                        key=part_key,
                                        speaker=speaker)
        part += 1
    return False, num_frames_decoded, part, utt

Exemple #15

0

Afficher le fichier

    def testCuVectorInverElements(self):
        """Check CuVector.invert_elements on empty, random and known data."""
        # These two cases only need to run without crashing.
        empty_vec = CuVector.new_from_size(0)
        empty_vec.invert_elements()

        random_vec = CuVector.new_from_size(10)
        random_vec.set_randn()
        random_vec.invert_elements()

        # Inverting powers of two gives a geometric series with
        # r = 1/2, a = 1/2.
        host_vec = Vector([2, 4, 8, 16, 32, 64])
        device_vec = CuVector.new_from_size(len(host_vec))
        device_vec.swap(host_vec)
        device_vec.invert_elements()

        total = device_vec.sum()
        expected = 0.5 * (1 - 0.5**len(host_vec)) / (1 - 0.5)
        self.assertAlmostEqual(expected, total)

Exemple #16

0

Afficher le fichier

    def write(self, utt_id, counts, posteriors, indices):
        """Write posteriors to disk as one flat Kaldi vector.

        The vector is laid out as
        `[number of frames, counts..., posteriors..., indices...]`.

        Arguments:
            utt_id {string} -- Utterance ID to be written to scp file
            counts {Tensor} -- Number of selected posteriors for each frame
            posteriors {Tensor} -- Flattened Tensor containing all posteriors
            indices {Tensor} -- Flattened Tensor containing all Gaussian indices
        """
        counts_np, post_np, idx_np = (
            tensor.numpy() for tensor in (counts, posteriors, indices))
        # The frame count goes first.
        header = np.array([counts_np.size])
        packed = np.hstack((header, counts_np, post_np, idx_np))
        self.posterior_writer.write(utt_id, Vector(packed))

Exemple #17

0

Afficher le fichier

    def read_audio(self, file, sample_rate):
        """Load an uploaded audio file into `self.data` as 16-bit samples.

        The upload is saved under `self.TEMP_FILE_PATH`, loaded with librosa
        at its native rate, resampled to `sample_rate` if needed, scaled to
        the int16 range and stored as a Kaldi Vector. `self.dur` is set to
        the duration in seconds.

        Args:
            file: Uploaded file object providing `filename` and `save(path)`.
            sample_rate: Target sampling rate in Hz.

        Raises:
            ValueError: if the file cannot be loaded or converted.
        """
        # Keep only the final path component so a crafted upload name cannot
        # escape the temporary directory.
        file_path = self.TEMP_FILE_PATH + os.path.basename(
            file.filename.lower())
        file.save(file_path)
        try:
            data, sr = librosa.load(file_path, sr=None)
            if sr != sample_rate:
                self.log.info('Resample audio file: ' + str(sr) + 'Hz -> ' +
                              str(sample_rate) + 'Hz')
                # Keyword arguments work whether or not the installed
                # librosa makes these parameters keyword-only.
                data = librosa.resample(data, orig_sr=sr,
                                        target_sr=sample_rate)
            data = (data * 32767).astype(np.int16)
            self.dur = len(data) / sample_rate
            self.data = Vector(data)
        except Exception as e:
            self.log.error(e)
            raise ValueError(
                "The uploaded file format is not supported!!!") from e
        finally:
            # Remove the temporary copy on failure as well as on success.
            if not self.SAVE_AUDIO:
                try:
                    os.remove(file_path)
                except OSError as cleanup_error:
                    self.log.error(cleanup_error)

Exemple #18

0

Afficher le fichier

Fichier : audio_utils.py Projet : veralily/fairseq

def _get_kaldi_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]:
    """Get mel-filter bank features via PyKaldi."""
    try:
        from kaldi.feat.mel import MelBanksOptions
        from kaldi.feat.fbank import FbankOptions, Fbank
        from kaldi.feat.window import FrameExtractionOptions
        from kaldi.matrix import Vector

        mel_opts = MelBanksOptions()
        mel_opts.num_bins = n_bins
        frame_opts = FrameExtractionOptions()
        frame_opts.samp_freq = sample_rate
        opts = FbankOptions()
        opts.mel_opts = mel_opts
        opts.frame_opts = frame_opts
        fbank = Fbank(opts=opts)
        features = fbank.compute(Vector(waveform), 1.0).numpy()
        return features
    except ImportError:
        return None

Exemple #19

0

Afficher le fichier

    def testCuVectorCopyFromVec(self):
        """Check CuVector.copy_from_vec for empty and matching-size vectors."""
        # A zero-length copy must not crash.
        empty_host = Vector()
        empty_device = CuVector.new_from_size(0)
        empty_device.copy_from_vec(empty_host)

        # FIXME: mismatched dimensions hard-crash (e.g. copying a 10-element
        # random Vector into CuVector.new_from_size(0)), so that case is
        # left untested.

        for dim in range(0, 100, 10):
            host = Vector(dim)
            host.set_randn_()
            device = CuVector.new_from_size(dim)
            device.copy_from_vec(host)
            self.assertEqual(host.sum(), device.sum())

Exemple #20

0

Afficher le fichier

Fichier : feat.py Projet : narendranp/fac-via-ppg-incomplete

def apply_cepstral_mean_norm(feats: Matrix) -> Matrix:
    """Apply cepstral mean normalization to MFCCs, in place.

    The per-column mean over all frames is subtracted from every row of
    `feats`. No variance normalization is done.

    Args:
        feats: A T*D MFCC feature matrix; it is modified in place.

    Returns:
        feats: The same matrix object, now mean-normalized. A matrix with
            zero rows is returned unchanged.
    """
    num_frames = feats.num_rows
    if num_frames == 0:
        # Nothing to normalize; 1.0 / 0 below would raise ZeroDivisionError.
        return feats

    mean = Vector(feats.num_cols)
    mean.add_row_sum_mat_(1.0, feats)
    mean.scale_(1.0 / num_frames)
    for i in range(num_frames):
        feats[i].add_vec_(-1.0, mean)
    return feats

Exemple #21

0

Afficher le fichier

Fichier : __init__.py Projet : BRASLab/VAWSR_backend

    def gmm_pipeline(feats, utt, min_post=0.025):
        """Compute pruned per-frame Gaussian posteriors for an utterance.

        Args:
            feats: Feature matrix; one posterior list is built per row.
            utt: Utterance id, used only in the debug log message.
            min_post: Posteriors below this value are zeroed before the
                remainder is renormalised; 0 disables pruning.

        Returns:
            A list with one entry per frame, each a list of
            `(gaussian_index, posterior)` pairs with non-zero posterior, or
            False if any frame's posteriors did not sum to roughly 1.
        """
        # Element [1] of the selection result is indexed per frame below;
        # presumably it holds the 20 selected Gaussian indices per frame.
        gselect = gmm.gaussian_selection_matrix(feats, 20)[1]
        num_frames = feats.num_rows
        utt_ok = True
        post = [[] for i in range(num_frames)]
        tot_loglike = 0
        for i in range(num_frames):
            frame = SubVector(feats.row(i))
            this_gselect = gselect[i]
            log_likes = Vector(
                fgmm.log_likelihoods_preselect(frame, this_gselect))
            # The softmax is applied in place, so from here on `log_likes`
            # holds posteriors; the call's return value is accumulated.
            tot_loglike += log_likes.apply_softmax_()
            if (abs(log_likes.sum() - 1.0) > 0.01):
                # Bad posterior sum: flag the utterance, which is rejected
                # after the loop.
                utt_ok = False
            else:
                if min_post != 0:
                    # Remember the best index before pruning so it can be
                    # kept if every posterior falls below the threshold.
                    max_index = log_likes.max_index()[1]
                    for x in range(log_likes.dim):
                        if log_likes[x] < min_post:
                            log_likes[x] = 0.0
                    if sum(log_likes) == 0:
                        log_likes[max_index] = 1.0
                    else:
                        # Renormalise the surviving posteriors to sum to 1.
                        log_likes.scale_(1.0 / sum(log_likes))
            # Keep only the non-zero posteriors, paired with the index of
            # the Gaussian they belong to.
            for x in range(log_likes.dim):
                if log_likes[x] != 0:
                    post[i].append((this_gselect[x], log_likes[x]))

        if not utt_ok:
            LOG.warning(
                "Skipping utterance because bad posterior-sum encountered (NaN?)"
            )
            return False
        else:
            # NOTE(review): with zero frames this division would raise
            # ZeroDivisionError — confirm callers never pass empty feats.
            LOG.debug(
                'Like/frame for utt {} was {} perframe over {} frames.'.format(
                    utt, tot_loglike / num_frames, num_frames))

        return post

Exemple #22

0

Afficher le fichier

 def getExampleObj(self):
     """Return sample objects: a Vector and a SubVector over a Vector."""
     values = [1, 2, 3, 4, 5]
     plain = Vector(values)
     view = SubVector(Vector(values))
     return [plain, view]

Exemple #23

0

Afficher le fichier

Fichier : nnet3_model.py Projet : 3wille/kaldi-model-server

def decode_chunked_partial_endpointing_mic(
        asr,
        feat_info,
        decodable_opts,
        paudio,
        input_microphone_id,
        channels=1,
        samp_freq=16000,
        record_samplerate=16000,
        chunk_size=1024,
        wait_for_start_command=False,
        record_message_history=False,
        compute_confidences=True,
        asr_client=None,
        speaker_str="Speaker",
        resample_algorithm="sinc_best",
        save_debug_wav=False,
        use_threads=False,
        minimum_num_frames_decoded_per_speaker=5,
        mic_vol_cutoff=0.5,
        use_local_mic=True,
        decode_control_channel='asr_control',
        audio_data_channel='asr_audio'):
    """Decode live audio chunk by chunk with endpointing until shutdown.

    Audio is read either from a local microphone stream or from a redis
    audio channel, optionally resampled, and fed to the decoder one chunk
    at a time. Control commands ("start", "stop", "shutdown", "status",
    "reset_timer") are polled from the redis control channel. Utterances
    are finalized on a detected endpoint, on a speaker change (multi-channel
    input) or when decoding is stopped. After shutdown the last utterance
    is finalized and, if requested, the message history and debug wav files
    are written.

    Args:
        asr: Recognizer used for decoding.
        feat_info: Feature pipeline info.
        decodable_opts: Options passed to `initNnetFeatPipeline`.
        paudio: PyAudio instance used to open the microphone stream.
        input_microphone_id: Device index of the microphone.
        channels: Number of recorded channels.
        samp_freq: Sampling rate expected by the decoder.
        record_samplerate: Sampling rate of the recorded audio.
        chunk_size: Number of frames read per chunk.
        wait_for_start_command: If True, do not decode until "start" arrives.
        record_message_history: If True, write the client's message trace.
        compute_confidences: Not used in this function.
        asr_client: Frontend client.
            NOTE(review): despite the None default it is called without a
            None check in several places (e.g. `asr_ready`) — confirm that
            callers always pass a client.
        speaker_str: Speaker label; "#c#" is replaced by the channel index.
        resample_algorithm: Converter type for the resampler.
        save_debug_wav: If True, write debug.wav and debugraw.wav at exit.
        use_threads: If True, decode each chunk in a worker thread.
        minimum_num_frames_decoded_per_speaker: Minimum number of decoded
            frames before a speaker change is accepted.
        mic_vol_cutoff: Channels quieter than this are treated as silent.
        use_local_mic: Read from the microphone (True) or redis (False).
        decode_control_channel: Redis channel for control commands.
        audio_data_channel: Redis channel carrying audio data.
    """

    # Subscribe to command and control redis channel
    p = red.pubsub()
    p.subscribe(decode_control_channel)

    if not use_local_mic:
        pa = red.pubsub()
        pa.subscribe(audio_data_channel)

    # Figure out if we need to resample (TODO: channels does not seem to work)
    need_resample = False
    if record_samplerate != samp_freq:
        print(
            "Activating resampler since record and decode samplerate are different:",
            record_samplerate, "->", samp_freq)
        resampler = samplerate.Resampler(resample_algorithm, channels=channels)
        need_resample = True
        ratio = samp_freq / record_samplerate
        print("Resample ratio:", ratio)

    # Initialize Python/Kaldi bridge
    print("Constructing decoding pipeline")
    adaptation_state = OnlineIvectorExtractorAdaptationState.from_info(
        feat_info.ivector_extractor_info)
    key = 'mic' + str(input_microphone_id)
    feat_pipeline, sil_weighting = initNnetFeatPipeline(
        adaptation_state, asr, decodable_opts, feat_info)
    print("Done")

    speaker = speaker_str.replace("#c#", "0")
    last_chunk = False
    utt, part = 1, 1
    prev_num_frames_decoded = 0
    chunks_decoded = 0
    num_chunks = 0
    blocks = []
    rawblocks = []

    if use_local_mic:
        # Open microphone channel
        print("Open microphone stream with id" + str(input_microphone_id) +
              "...")
        stream = paudio.open(format=pyaudio.paInt16,
                             channels=channels,
                             rate=record_samplerate,
                             input=True,
                             frames_per_buffer=chunk_size,
                             input_device_index=input_microphone_id)
        print("Done!")

    do_decode = not wait_for_start_command
    need_finalize = False
    # Must be initialized here: finalization can be triggered by a "stop"
    # command before any endpoint or speaker change has set this flag, and
    # reading it unassigned would raise UnboundLocalError.
    resend_previous_waveform = False
    block, previous_block = None, None
    decode_future = None

    # Send event (with redis) to the front that ASR session is ready
    asr_client.asr_ready(speaker=speaker)

    # Initialize a ThreadPoolExecutor.
    # Note that we initialize the thread executor independently of whether we actually use it later (the -t option).
    # At the end of this loop we have two code paths, one that uses a computation future (with -t) and one without it.
    with ThreadPoolExecutor(max_workers=1) as executor:
        while not last_chunk:
            # Check if there is a message from the redis server first (non-blocking!), if there is no new message msg is simply None.
            msg = p.get_message()

            # We check if there are externally sent control commands
            if msg is not None:
                print('msg:', msg)
                if msg['data'] == b"start":
                    print('Start command received!')
                    do_decode = True
                    asr_client.sendstatus(isDecoding=do_decode)

                elif msg['data'] == b"stop":
                    print('Stop command received!')
                    if do_decode and prev_num_frames_decoded > 0:
                        need_finalize = True
                    do_decode = False
                    asr_client.sendstatus(isDecoding=do_decode)

                elif msg['data'] == b"shutdown":
                    print('Shutdown command received!')
                    last_chunk = True

                elif msg['data'] == b"status":
                    print('Status command received!')
                    asr_client.sendstatus(isDecoding=do_decode)

                elif msg['data'] == b"reset_timer":
                    print('Reset time command received!')
                    asr_client.resetTimer()

            if use_local_mic:
                # We always consume from the microphone stream, even if we do not decode
                block_raw = stream.read(chunk_size,
                                        exception_on_overflow=False)
                npblock = np.frombuffer(block_raw, dtype=np.int16)
            else:
                block_audio_redis_msg = next(pa.listen())
                if block_audio_redis_msg[
                        'type'] == "subscribe" and block_audio_redis_msg[
                            "data"] == 1:
                    print('audio msg:', block_audio_redis_msg)
                    print("Successfully connected to redis audio stream!")
                    continue
                else:
                    npblock = np.frombuffer(block_audio_redis_msg['data'],
                                            dtype=np.int16)
                    # print("audio data: ", npblock)

            # Resample the block if necessary, e.g. 48kHz -> 16kHz
            if need_resample:
                block = resampler.process(np.array(npblock, copy=True), ratio)
                block = np.array(block, dtype=np.int16)
            else:
                block = npblock

            # Only save the wav, if the save_debug flag is enabled (TODO: investigate: does not seem to work with multiple channels)
            if save_debug_wav:
                blocks.append(block)
                rawblocks.append(npblock)

            # Block on the result of the decode if one is pending
            if use_threads and do_decode and block is not None and decode_future is not None:

                # This call blocks until the result is ready
                need_endpoint_finalize, prev_num_frames_decoded, part, utt = decode_future.result(
                )

                # Check if we need to finalize, disallow endpoint without a single decoded frame
                if need_endpoint_finalize and prev_num_frames_decoded > 0:
                    need_finalize = True
                    resend_previous_waveform = True
                    print("prev_num_frames_decoded:", prev_num_frames_decoded)

                if need_endpoint_finalize and prev_num_frames_decoded == 0:
                    print(
                        "WARN need_endpoint_finalize and prev_num_frames_decoded == 0"
                    )

            # Finalize the decoding here, if endpointing signaled that we should start a new utterance.
            # We might also need to finalize if we switch from do_decode=True to do_decode=False (user starts/stops decoding from frontend).
            if need_finalize and block is not None and prev_num_frames_decoded > 0:
                print("prev_num_frames_decoded:", prev_num_frames_decoded)
                out, confd = finalize_decode(asr, asr_client, key, part,
                                             speaker, utt)
                feat_pipeline, sil_weighting = reinitialize_asr(
                    adaptation_state, asr, feat_info, feat_pipeline)
                utt += 1
                part = 1

                if resend_previous_waveform and previous_block is not None:
                    # We always resend the last block for the new utterance (we only know that the endpoint is inside of a chunk, but not where exactly)
                    feat_pipeline.accept_waveform(samp_freq,
                                                  Vector(previous_block))
                    resend_previous_waveform = False

                need_finalize = False

                prev_num_frames_decoded = 0

            # If we operate on multichannel data, select the channel here that has the highest volume
            # (with some added heuristic, only change the speaker if the previous speaker was active for minimum_num_frames_decoded_per_speaker many frames)
            if channels > 1:
                block = np.reshape(block, (-1, channels))

                # Select loudest channel
                volume_norms = []
                for i in range(channels):
                    # We have a simplified concept of loudness, it is simply the L2 of the chunk interpreted as a vector (sqrt of the sum of squares):
                    # This has nothing to do with the physical loudness.

                    volume_norms.append(
                        np.linalg.norm(block[:, i] / 65536.0) * 10.0)

                # Channels below the cutoff count as silent.
                volume_norms = [
                    0.0 if elem < mic_vol_cutoff else elem
                    for elem in volume_norms
                ]

                volume_norm = max(volume_norms)
                max_channel = volume_norms.index(volume_norm)
                block = block[:, max_channel]

                new_speaker = speaker_str.replace("#c#", str(max_channel))

                if sum(volume_norms) > 1e-10 and new_speaker != speaker \
                        and prev_num_frames_decoded >= minimum_num_frames_decoded_per_speaker:
                    print(
                        "Speaker change! Number of frames decoded for previous speaker:",
                        str(prev_num_frames_decoded))

                    speaker = new_speaker

                    need_finalize = True
                    resend_previous_waveform = True
            else:
                volume_norm = np.linalg.norm(block / 65536.0) * 10.0

            num_chunks += 1

            # Send status beacon periodically (to frontend, so it knows we are alive)
            if num_chunks % 50 == 0:
                asr_client.sendstatus(isDecoding=do_decode)

            if do_decode:
                # If we use the unthreaded mode, we block until the computation here in this loop
                if not use_threads:
                    need_endpoint_finalize, prev_num_frames_decoded, part, utt = advance_mic_decoding(
                        adaptation_state, asr, asr_client, block,
                        chunks_decoded, feat_info, feat_pipeline, key,
                        last_chunk, part, prev_num_frames_decoded, samp_freq,
                        sil_weighting, speaker, utt)
                    # Check if we need to finalize, disallow endpoint without a single decoded frame
                    if need_endpoint_finalize and prev_num_frames_decoded > 0:
                        need_finalize = True
                        resend_previous_waveform = True
                        print("prev_num_frames_decoded:",
                              prev_num_frames_decoded)

                else:
                    # In threaded mode, we submit a non blocking computation request to the thread executor
                    decode_future = executor.submit(
                        advance_mic_decoding, adaptation_state, asr,
                        asr_client, block, chunks_decoded, feat_info,
                        feat_pipeline, key, last_chunk, part,
                        prev_num_frames_decoded, samp_freq, sil_weighting,
                        speaker, utt)
            else:
                time.sleep(0.001)

            previous_block = block

    # Record message history as an integrated Python file, that can be used as a standalone replay
    if record_message_history:
        with open('message_history_replay.py', 'w') as message_history_out:
            message_history_out.write(asr_client.message_trace)
    else:
        print(
            "Not writing record message history since --record_message_history is not set."
        )

    # Write debug wav as output file (will only be executed after shutdown)
    if save_debug_wav:
        print("Saving debug output...")
        wavefile.write("debug.wav", samp_freq, np.concatenate(blocks,
                                                              axis=None))
        wavefile.write("debugraw.wav", record_samplerate,
                       np.concatenate(rawblocks, axis=None))
    else:
        print(
            "Not writing debug wav output since --save_debug_wav is not set.")

    # Now shutting down pipeline, compute MBR for the final utterance and complete it.
    print("Shutdown: finalizing ASR output...")
    asr.finalize_decoding()
    out = asr.get_output()
    mbr = MinimumBayesRisk(out["lattice"])
    confd = mbr.get_one_best_confidences()
    print(out)
    if asr_client is not None:
        asr_client.completeUtterance(utterance=out["text"],
                                     key=key + "-utt%d-part%d" % (utt, part),
                                     confidences=confd,
                                     speaker=speaker)
        asr_client.sendstatus(isDecoding=False, shutdown=True)
    print("Done, will exit now.")

Exemple #24

0

Afficher le fichier

def compute_vad(wav_rspecifier, feats_wspecifier, opts):
    """Compute LTSV-based VAD features for every utterance of a wav table.

    For each utterance the selected channel is turned into a spectrogram,
    passed through ``ARMA.ApplyARMA``, ``LTSV.ApplyLTSV`` and
    ``DCTF.ApplyDCT``, and the result is written as a ``Vector`` under the
    utterance key. If ``opts.test_plot`` is set, ``show_plot`` is called for
    each processed utterance.

    Args:
        wav_rspecifier: Kaldi rspecifier for reading wav files.
        feats_wspecifier: Kaldi wspecifier for writing the feature vectors.
        opts: Options object. The fields read here are ``min_duration``,
            ``channel``, ``frame_window``, ``frame_shift``, ``nfft``,
            ``arma_order``, ``ltsv_ctx_window``, ``threshold``, ``slope``,
            ``sigmoid_scale``, ``dct_num_cep``, ``dct_ctx_window`` and
            ``test_plot``.

    Returns:
        True if features were written for at least one utterance,
        False otherwise.

    Raises:
        AssertionError: If ``opts.nfft`` is smaller than the frame length
            in samples.
    """

    num_utts, num_success = 0, 0
    with SequentialWaveReader(wav_rspecifier) as reader, \
         VectorWriter(feats_wspecifier) as writer:

        for num_utts, (key, wave) in enumerate(reader, 1):
            if wave.duration < opts.min_duration:
                print(
                    "File: {} is too short ({} sec): "
                    "producing no output.".format(key, wave.duration),
                    file=sys.stderr,
                )
                continue

            num_chan = wave.data().num_rows
            if opts.channel >= num_chan:
                # The placeholders were previously printed verbatim because
                # the template was never formatted.
                print(
                    "File with id {} has {} channels but you specified "
                    "channel {}, producing no output.".format(
                        key, num_chan, opts.channel),
                    file=sys.stderr,
                )
                continue

            # A channel of -1 selects the first channel.
            channel = 0 if opts.channel == -1 else opts.channel

            # frame_window / frame_shift are scaled by 1e-3 before being
            # multiplied by the sampling frequency (presumably they are given
            # in milliseconds -- TODO confirm against the option parser).
            fr_length_samples = int(opts.frame_window * wave.samp_freq *
                                    (10**(-3)))
            fr_shift_samples = int(opts.frame_shift * wave.samp_freq *
                                   (10**(-3)))

            assert opts.nfft >= fr_length_samples

            wav_data = np.squeeze(wave.data()[channel].numpy())

            sample_freqs, segment_times, spec = signal.spectrogram(
                wav_data,
                fs=wave.samp_freq,
                nperseg=fr_length_samples,
                nfft=opts.nfft,
                noverlap=fr_length_samples - fr_shift_samples,
                scaling="spectrum",
                mode="psd",
            )

            # The ARMA step receives the transposed spectrogram.
            specT = np.transpose(spec)

            spect_n = ARMA.ApplyARMA(specT, opts.arma_order)

            ltsv_f = LTSV.ApplyLTSV(
                spect_n,
                opts.ltsv_ctx_window,
                opts.threshold,
                opts.slope,
                opts.sigmoid_scale,
            )

            vad_feat = DCTF.ApplyDCT(opts.dct_num_cep, opts.dct_ctx_window,
                                     ltsv_f)

            if opts.test_plot:
                show_plot(
                    key,
                    segment_times,
                    sample_freqs,
                    spec,
                    wave.duration,
                    wav_data,
                    vad_feat,
                )

            writer[key] = Vector(vad_feat)
            num_success += 1

            if num_utts % 10 == 0:
                print("Processed {} utterances".format(num_utts),
                      file=sys.stderr)

    print(
        "Done {} out of {} utterances".format(num_success, num_utts),
        file=sys.stderr,
    )

    return num_success != 0

Exemple #25

0

Afficher le fichier

Fichier : full-gmm-test.py Projet : yongxuUSTC/pykaldi

    def testFullGmm(self):
        """Check FullGmm likelihoods against a hand-computed reference.

        A random mixture is generated, its log-likelihood for a random
        feature vector is computed directly from the weights, means and
        inverse covariances, and the result is compared with what FullGmm
        returns. The same likelihood is then re-checked on models rebuilt
        through the different setters/getters and through copies from/to
        DiagGmm.
        """
        # Both the dimension and the number of mixtures are drawn from 1..9
        # (randint's upper bound is exclusive).
        dim = 1 + np.random.randint(low=0, high=9)
        nMix = 1 + np.random.randint(low=0, high=9)

        print("Testing NumGauss: {}, Dim: {}".format(nMix, dim))

        feat = Vector([kaldi_math.rand_gauss() for _ in range(dim)])
        weights = Vector([kaldi_math.rand_uniform() for _ in range(nMix)])
        tot_weigth = weights.sum()

        # Normalise the mixture weights so that they sum to one.
        for i, m in enumerate(weights):
            weights[i] = m / tot_weigth

        means = Matrix([[kaldi_math.rand_gauss() for _ in range(dim)]
                        for _ in range(nMix)])

        invcovars = [SpMatrix(dim) for _ in range(nMix)]
        covars_logdet = []
        # NOTE: `_` is used as a real loop index here, not as a throwaway.
        for _ in range(nMix):
            c, matrix_sqrt, logdet_out = RandPosdefSpMatrix(dim)
            # Copy the random matrix, then invert the copy in place; the
            # log-determinant reported by RandPosdefSpMatrix is kept.
            invcovars[_].copy_from_sp_(c)
            invcovars[_].invert_double_()
            covars_logdet.append(logdet_out)

        # Calculate loglike for feature Vector
        def auxLogLike(w, logdet, mean_row, invcovar):
            """Return the reference log-likelihood term of one component.

            Uses `feat` and `dim` from the enclosing scope.
            """
            return -0.5 * ( kaldi_math.M_LOG_2PI * dim \
                          + logdet \
                          + vec_mat_vec(mean_row, invcovar, mean_row) \
                          + vec_mat_vec(feat, invcovar, feat)) \
                    + vec_mat_vec(mean_row, invcovar, feat) \
                    + np.log(w)

        loglikes = [
            auxLogLike(weights[m], covars_logdet[m], means[m, :], invcovars[m])
            for m in range(nMix)
        ]
        # Reference total log-likelihood: log-sum-exp over the components.
        loglike = Vector(loglikes).log_sum_exp()

        # new Gmm
        gmm = FullGmm(nMix, dim)
        gmm.set_weights(weights)
        gmm.set_inv_covars_and_means(invcovars, means)
        gmm.compute_gconsts()

        loglike1, posterior1 = gmm.component_posteriors(feat)

        self.assertAlmostEqual(loglike, loglike1, delta=0.01)
        self.assertAlmostEqual(1.0, posterior1.sum(), delta=0.01)

        weights_bak = gmm.weights()
        means_bak = gmm.means()
        # covars() results are inverted in place below, so that the
        # variable name (inverse covariances) matches its content.
        invcovars_bak = gmm.covars()
        for i in range(nMix):
            invcovars_bak[i].invert_double_()

        # Set all params one-by-one to new model
        gmm2 = FullGmm(gmm.num_gauss(), gmm.dim())
        gmm2.set_weights(weights_bak)
        gmm2.set_means(means_bak)
        # NOTE(review): this assigns an attribute directly instead of going
        # through a setter -- confirm it actually updates the model.
        gmm2.inv_covars_ = invcovars_bak
        gmm2.compute_gconsts()

        loglike_gmm2 = gmm2.log_likelihood(feat)
        self.assertAlmostEqual(loglike1, loglike_gmm2, delta=0.01)

        # Per-component log-likelihoods must sum (in the log domain) to the
        # total log-likelihood.
        loglikes = gmm2.log_likelihoods(feat)
        self.assertAlmostEqual(loglikes.log_sum_exp(), loglike_gmm2)

        # Preselecting every component must give the same result.
        indices = list(range(gmm2.num_gauss()))
        loglikes = gmm2.log_likelihoods_preselect(feat, indices)
        self.assertAlmostEqual(loglikes.log_sum_exp(), loglike_gmm2)

        # Simple component mean accessor + mutator
        gmm3 = FullGmm(gmm.num_gauss(), gmm.dim())
        gmm3.set_weights(weights_bak)
        # Clear the means and refill them row by row from the original
        # model through get_component_mean.
        means_bak.set_zero_()
        for i in range(nMix):
            gmm.get_component_mean(i, means_bak[i, :])
        gmm3.set_means(means_bak)
        gmm3.inv_covars_ = invcovars_bak
        gmm3.compute_gconsts()

        loglike_gmm3 = gmm3.log_likelihood(feat)
        self.assertAlmostEqual(loglike1, loglike_gmm3, delta=0.01)

        # Rebuild a model from the combined getter; the returned covariances
        # are inverted in place before being handed to the combined setter.
        gmm4 = FullGmm(gmm.num_gauss(), gmm.dim())
        gmm4.set_weights(weights_bak)
        invcovars_bak, means_bak = gmm.get_covars_and_means()
        for i in range(nMix):
            invcovars_bak[i].invert_double_()
        gmm4.set_inv_covars_and_means(invcovars_bak, means_bak)
        gmm4.compute_gconsts()
        loglike_gmm4 = gmm4.log_likelihood(feat)
        self.assertAlmostEqual(loglike1, loglike_gmm4, delta=0.01)

        # TODO: I/O tests

        # CopyFromFullGmm
        # (gmm4 is rebound to a fresh, empty model here.)
        gmm4 = FullGmm()
        gmm4.copy_from_full(gmm)
        loglike5, _ = gmm4.component_posteriors(feat)
        self.assertAlmostEqual(loglike, loglike5, delta=0.01)

        # CopyFromDiag
        gmm_diag = DiagGmm(nMix, dim)
        init_rand_diag_gmm(gmm_diag)
        loglike_diag = gmm_diag.log_likelihood(feat)

        # Round trip diag -> full -> diag; all three likelihoods must agree.
        gmm_full = FullGmm().copy(gmm_diag)
        loglike_full = gmm_full.log_likelihood(feat)

        gmm_diag2 = DiagGmm().copy(gmm_full)
        loglike_diag2 = gmm_diag2.log_likelihood(feat)

        self.assertAlmostEqual(loglike_diag, loglike_full, delta=0.01)
        self.assertAlmostEqual(loglike_diag, loglike_diag2, delta=0.01)

Exemple #26

0

Afficher le fichier

def compute_vad(wav_rspecifier, feats_wspecifier, opts):
    """Compute LTSV-based VAD features for every utterance of a wav table.

    The features are written to the table denoted by feats_wspecifier,
    and if the test_plot flag is set, a plot is produced for each utterance.
    Utterances whose feature computation raises are reported on stderr and
    skipped.

    Args:
        wav_rspecifier: Kaldi rspecifier (ark or scp) of the input audio.
        feats_wspecifier: Kaldi wspecifier (ark or scp) that receives the
            computed feature vectors.
        opts: Options. See main function for list of options.

    Returns:
        True if features were written for at least one utterance,
        False otherwise.
    """

    num_utts, num_success = 0, 0
    with SequentialWaveReader(wav_rspecifier) as reader, \
           VectorWriter(feats_wspecifier) as writer:

        for num_utts, (key, wave) in enumerate(reader, 1):
            if wave.duration < opts.min_duration:
                print("File: {} is too short ({} sec): producing no output.".
                      format(key, wave.duration),
                      file=sys.stderr)
                continue

            num_chan = wave.data().num_rows
            if opts.channel >= num_chan:
                # The placeholders were previously printed verbatim because
                # the template was never formatted.
                print(
                    "File with id {} has {} channels but you specified "
                    "channel {}, producing no output.".format(
                        key, num_chan, opts.channel),
                    file=sys.stderr)
                continue
            # A channel of -1 selects the first channel.
            channel = 0 if opts.channel == -1 else opts.channel

            # frame_window / frame_shift are scaled by 1e-3 before being
            # multiplied by the sampling frequency (presumably they are given
            # in milliseconds -- TODO confirm against the option parser).
            fr_length_samples = int(opts.frame_window * wave.samp_freq *
                                    (10**(-3)))
            fr_shift_samples = int(opts.frame_shift * wave.samp_freq *
                                   (10**(-3)))

            try:

                wav_data = np.squeeze(wave.data()[channel].numpy())

                sample_freqs, segment_times, spec = signal.spectrogram(
                    wav_data,
                    fs=wave.samp_freq,
                    nperseg=fr_length_samples,
                    nfft=opts.nfft,
                    noverlap=fr_length_samples - fr_shift_samples,
                    scaling='spectrum',
                    mode='psd')

                # The ARMA step receives the transposed spectrogram.
                specT = np.transpose(spec)

                spect_n = ARMA.ApplyARMA(specT, opts.arma_order)

                ltsv_f = LTSV.ApplyLTSV(spect_n, opts.ltsv_ctx_window,
                                        opts.threshold, opts.slope,
                                        opts.sigmoid_scale)

                vad_feat = DCTF.ApplyDCT(opts.dct_num_cep, opts.dct_ctx_window,
                                         ltsv_f)

                feats = Vector(vad_feat)

                if opts.test_plot:
                    show_plot(segment_times, sample_freqs, spec, wave,
                              wav_data, vad_feat)

            except Exception:
                # Best effort per utterance: report and move on. Only
                # Exception is caught so that Ctrl-C / SystemExit still stop
                # the run.
                print("Failed to compute features for utterance",
                      key,
                      file=sys.stderr)
                continue

            writer[key] = feats
            num_success += 1

            if num_utts % 10 == 0:
                print("Processed {} utterances".format(num_utts),
                      file=sys.stderr)

    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)

    return num_success != 0

Exemple #27

0

Afficher le fichier

Fichier : test-writers.py Projet : ruchirtravadi/pykaldi

 def getExampleObj(self):
     """Return the example Vector holding the values 1, 2, 3, 4, 5."""
     sample_values = [1, 2, 3, 4, 5]
     return Vector(sample_values)

Exemple #28

0

Afficher le fichier

            mrk_fn = line.split()[0]
            seq_fn = line.split()[1]
            with open(mrk_fn, 'r', encoding='utf-8') as mrk, \
                 open(seq_fn, 'rb') as seq:
                for mrk_line in mrk:
                    seq.seek(int(mrk_line.split()[1]))
                    num_bytes = int(mrk_line.split()[2])
                    #this is making sure even number of bytes
                    num_bytes -= num_bytes % 2
                    audio_bytes = seq.read(num_bytes)
                    audio_np = np.frombuffer(audio_bytes, dtype='int16')
                    audio_seg = AudioSegment(audio_np, args.sample_rate)
                    spr = speed_rate[randint(0, len(speed_rate) - 1)]
                    audio_seg.change_speed(spr)
                    #-55 to -10 db
                    audio_seg.normalize(np.random.uniform(-55, -10))
                    audio_np = audio_seg._convert_samples_from_float32(\
                                         audio_seg.samples, 'int16')
                    wave_1ch = Vector(audio_np)
                    feats = fbank.compute_features(wave_1ch,
                                                   args.sample_rate,
                                                   vtnl_warp=1.0)
                    if args.cmn:
                        feats = _matrix_ext.matrix_to_numpy(feats)
                        feats -= np.mean(feats, axis=0)
                        feats = Matrix(feats)

                    cmvn.accumulate(feats)

    cmvn.write_stats(args.cmvn_stats, binary=False)

Exemple #29

0

Afficher le fichier

def otf_utt_generator(data_triplets, rir, noise, args):
    """Yield padded mini-batches of on-the-fly augmented features and labels.

    Every utterance is read from the seq file at the offset given by the
    mrk file, speed- and gain-perturbed, converted to fbank features, and
    stored in a batch buffer together with its label sequence. A batch is
    emitted each time ``args.batch_size`` utterances have been read.

    Args:
        data_triplets: iterable of triplets whose items are, in order, the
            mrk file name, the seq file name and the rspecifier of the
            label alignments.
        rir: list of rir, List[AudioSegment]. Not used by the active code
            (only by the commented-out example).
        noise: list of noise, List[AudioSegment]. Not used by the active
            code (only by the commented-out example).
        args: arguments for the loader.

    Yields:
        ``(data, target, lens, ali_lens)`` torch tensors for every full
        batch in which at least one utterance passed the ``args.TU_limit``
        check; ``(None, None, IntTensor([0]), IntTensor([0]))`` for a full
        batch in which none did; and finally a single ``None``.
        Utterances left in a partially filled batch at the end of the data
        are not yielded.
    """
    max_len = args.max_len
    batch_size = args.batch_size
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)

    # batch_idx counts utterances read for the current batch, valid_idx
    # counts those that were actually stored in the buffers.
    batch_idx = 0
    valid_idx = 0
    batch_max_len = -1
    target_max_len = -1

    #rates for speed perturbation
    speed_rate = [float(rate) for rate in args.speed_rate.split(',')]
    #volume level perturbation
    gain_lo, gain_hi = [-float(gain) for gain in args.gain_range.split(',')]
    #snr range for noise perturbation: 0-20db with mean of 10
    #mu, sigma = 10, 10
    #lo, hi = (0 - mu) / sigma, (20 - mu) / sigma
    #Fbank config
    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    #fbank_opt = MfccOptions()
    #fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    #fbank = Mfcc(fbank_opt)

    for data_triplet in data_triplets:
        mrk_fn, seq_fn = data_triplet[0], data_triplet[1]
        ali_rspec = data_triplet[2]
        with open(mrk_fn, 'r', encoding='utf-8') as mrk,\
             open(seq_fn, 'rb') as seq:
            ali_reader = SequentialIntVectorReader(ali_rspec)
            # try/finally so the reader is also closed when the consumer
            # stops the generator early or an utterance raises.
            try:
                for line, (uttid1, ali) in zip(mrk, ali_reader):
                    fields = line.split()
                    uttid = fields[0]
                    assert uttid == uttid1
                    seq.seek(int(fields[1]))
                    num_bytes = int(fields[2])
                    # samples are int16: keep an even number of bytes
                    num_bytes -= num_bytes % 2
                    audio_bytes = seq.read(num_bytes)
                    audio_np = np.frombuffer(audio_bytes, dtype='int16')
                    #data augmentation function goes here
                    audio_seg = AudioSegment(audio_np, args.sample_rate)
                    #speed perturbation
                    spr = speed_rate[randint(0, len(speed_rate) - 1)]
                    audio_seg.change_speed(spr)
                    audio_seg.normalize(np.random.uniform(gain_lo, gain_hi))
                    #noise adding example:
                    #snr = truncnorm.rvs(lo, hi, scale=sigma, loc=mu, size=1)
                    #audio_seg.add_noise(noise[randint(0, len(noise)-1)], snr)
                    #rir adding example:
                    #audio_seg.convolve_and_normalize(rir[randint(0, len(rir)-1)])
                    audio_np = audio_seg._convert_samples_from_float32(\
                                         audio_seg.samples, 'int16')
                    wave_1ch = Vector(audio_np)
                    feats = fbank.compute_features(wave_1ch,
                                                   args.sample_rate,
                                                   vtnl_warp=1.0)
                    ali = np.array(ali)
                    if args.reverse_labels:
                        ali = ali[::-1]
                    if args.SOS >= 0:
                        ali = np.concatenate(([args.SOS], ali))
                    if args.EOS >= 0:
                        ali = np.concatenate((ali, [args.EOS]))
                    feats = _matrix_ext.matrix_to_numpy(feats)
                    # number of frames after striding, rounded up
                    utt_len = feats.shape[0] // args.stride + \
                              int(feats.shape[0] % args.stride != 0)
                    #limits on T*U products due to RNNT.
                    #this is pretty hacky now
                    if ali.shape[0] * utt_len // 3 <= args.TU_limit:
                        ali_len[valid_idx] = ali.shape[0]
                        data_buffer[valid_idx, :utt_len, :] = \
                            splice(feats, args.lctx, args.rctx)[::args.stride]
                        target_buffer[valid_idx, :ali_len[valid_idx]] = ali
                        len_buffer[valid_idx] = utt_len
                        if utt_len > batch_max_len:
                            batch_max_len = utt_len
                        if ali_len[valid_idx] > target_max_len:
                            target_max_len = ali_len[valid_idx]
                        valid_idx += 1

                    batch_idx += 1

                    if batch_idx == batch_size:
                        for b in range(valid_idx):
                            utt_len = len_buffer[b]
                            target_len = ali_len[b]
                            #data and target padding
                            if utt_len > 0:
                                # pad features by repeating the last frame
                                data_buffer[b, utt_len:batch_max_len, :] = \
                                    data_buffer[b, utt_len-1, :]
                                target_buffer[b, target_len:target_max_len] = \
                                    args.padding_tgt

                        data = data_buffer[:valid_idx, :batch_max_len, :]
                        target = target_buffer[:valid_idx, :target_max_len]

                        if not args.batch_first:
                            data = np.transpose(data, (1, 0, 2))
                            target = np.transpose(target, (1, 0))

                        # copies, because the buffers are reused for the
                        # next batch
                        data = torch.from_numpy(np.copy(data))
                        target = torch.from_numpy(np.copy(target))
                        lens = torch.from_numpy(
                            np.copy(len_buffer[:valid_idx]))
                        ali_lens = torch.from_numpy(
                            np.copy(ali_len[:valid_idx]))

                        if valid_idx > 0:
                            #not doing cuda() here, in main process instead
                            yield data, target, lens, ali_lens
                        else:
                            yield None, None, \
                                  torch.IntTensor([0]), torch.IntTensor([0])

                        batch_idx = 0
                        valid_idx = 0
                        batch_max_len = -1
                        target_max_len = -1
            finally:
                ali_reader.close()

    # sentinel: no more batches
    yield None

Exemple #30

0

Afficher le fichier

def compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts):
    """Compute MFCC features for every utterance of a wav table.

    Utterances that are too short, lack the requested channel, have no
    vtln-map entry (when a map is given) or whose feature computation
    raises are reported on stderr and skipped.

    Args:
        wav_rspecifier: Kaldi rspecifier for reading wav files.
        feats_wspecifier: Kaldi wspecifier for writing the feature matrices.
        opts: Options object. The fields read here are ``vtln_map``,
            ``utt2spk``, ``min_duration``, ``channel``, ``vtln_warp`` and
            ``subtract_mean``.
        mfcc_opts: Options passed to the ``Mfcc`` constructor.

    Returns:
        True if features were written for at least one utterance,
        False otherwise.
    """
    mfcc = Mfcc(mfcc_opts)

    vtln_map_reader = None
    if opts.vtln_map:
        vtln_map_reader = RandomAccessFloatReaderMapped(
            opts.vtln_map, opts.utt2spk)
    elif opts.utt2spk:
        print("utt2spk option is needed only if vtln-map option is specified.",
              file=sys.stderr)

    num_utts, num_success = 0, 0
    # try/finally so the vtln-map reader is closed even if the loop raises.
    try:
        with SequentialWaveReader(wav_rspecifier) as reader, \
             MatrixWriter(feats_wspecifier) as writer:
            for num_utts, (key, wave) in enumerate(reader, 1):
                if wave.duration < opts.min_duration:
                    print("File: {} is too short ({} sec): "
                          "producing no output.".format(key, wave.duration),
                          file=sys.stderr)
                    continue

                num_chan = wave.data().num_rows
                if opts.channel >= num_chan:
                    # The placeholders were previously printed verbatim
                    # because the template was never formatted.
                    print(
                        "File with id {} has {} channels but you specified "
                        "channel {}, producing no output.".format(
                            key, num_chan, opts.channel),
                        file=sys.stderr)
                    continue
                # A channel of -1 selects the first channel.
                channel = 0 if opts.channel == -1 else opts.channel

                if vtln_map_reader is not None:
                    if key not in vtln_map_reader:
                        print(
                            "No vtln-map entry for utterance-id "
                            "(or speaker-id)",
                            key,
                            file=sys.stderr)
                        continue
                    vtln_warp = vtln_map_reader[key]
                else:
                    vtln_warp = opts.vtln_warp

                try:
                    feats = mfcc.compute_features(wave.data()[channel],
                                                  wave.samp_freq, vtln_warp)
                except Exception:
                    # Best effort per utterance: report and move on. Only
                    # Exception is caught so that Ctrl-C / SystemExit still
                    # stop the run.
                    print("Failed to compute features for utterance",
                          key,
                          file=sys.stderr)
                    continue

                if opts.subtract_mean:
                    # Per-utterance mean normalisation: average the rows,
                    # then subtract that mean from every row in place.
                    mean = Vector(feats.num_cols)
                    mean.add_row_sum_mat_(1.0, feats)
                    mean.scale_(1.0 / feats.num_rows)
                    for i in range(feats.num_rows):
                        feats[i].add_vec_(-1.0, mean)

                writer[key] = feats
                num_success += 1

                if num_utts % 10 == 0:
                    print("Processed {} utterances".format(num_utts),
                          file=sys.stderr)

        print("Done {} out of {} utterances".format(num_success, num_utts),
              file=sys.stderr)
    finally:
        if vtln_map_reader is not None:
            vtln_map_reader.close()

    return num_success != 0