Example #1
    def call(self, audio_data, sample_rate=None):
        """
    Caculate fbank && pitch(concat) features of wav.
    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: the samplerate of the signal we working with.
    :return: A tensor with shape (num_frames, dim_features), containing
            fbank && pitch feature of every frame in speech.
    """

        p = self.config
        with tf.name_scope('fbank_pitch'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
                pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
                fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)

                return fbank_pitch_feats
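
The concat on axis 1 places each frame's fbank vector and its pitch feature side by side, so the final width is the sum of the two per-frame dimensions. A minimal sketch of that shape arithmetic with dummy tensors (the 40 filterbank channels and 2-dimensional pitch feature are purely illustrative), assuming TensorFlow 1.x graph mode:

import tensorflow as tf

fbank_feats = tf.zeros([100, 40])   # (num_frames, filterbank_channel_count), dummy values
pitch_feats = tf.zeros([100, 2])    # (num_frames, pitch_dim), dummy values
fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)
with tf.Session() as sess:
    print(sess.run(tf.shape(fbank_pitch_feats)))  # [100 42]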
Example #2
def compute_mfcc():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['upper_frequency_limit'] = float(args.upper_frequency_limit)
    config['lower_frequency_limit'] = float(args.lower_frequency_limit)
    config['filterbank_channel_count'] = float(args.filterbank_channel_count)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['output_type'] = args.output_type
    config['window_type'] = args.window_type
    config['snip_edges'] = args.snip_edges
    config['preeph_coeff'] = args.preeph_coeff
    config['remove_dc_offset'] = args.remove_dc_offset
    config['is_fbank'] = args.is_fbank
    config['cepstral_lifter'] = args.cepstral_lifter
    config['coefficient_count'] = args.coefficient_count

    mfcc = Mfcc.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            mfcc_test = tf.squeeze(mfcc(audio_data, args.sample_rate))
            sess = tf.Session()
            mfcc_feats = mfcc_test.eval(session=sess)
            writer[utt_id] = mfcc_feats
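
As written, every pass through the loop adds new nodes to the default graph and opens a fresh tf.Session, so the graph and the per-utterance cost keep growing. A possible restructuring (a sketch only, assuming the same mfcc extractor, kaldiio reader and KaldiWriter as above, TensorFlow 1.x graph mode, and that every utterance matches the configured sample rate) builds the feature op once on a placeholder and reuses one session; the same pattern applies to compute_fbank and compute_pitch below:

import numpy as np
import tensorflow as tf

# Build the feature graph once, outside the utterance loop.
audio_ph = tf.placeholder(tf.float32, shape=[None], name='audio')
mfcc_op = tf.squeeze(mfcc(audio_ph, args.sample_rate))

with tf.Session() as sess:
    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
         KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                     compress=args.compress,
                     compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            feats = sess.run(mfcc_op,
                             feed_dict={audio_ph: array.astype(np.float32)})
            writer[utt_id] = feats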
Example #3
  def call(self, audio_data, sample_rate=None):
    """
        Caculate power spectrum or log power spectrum of audio data.
        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
        :return: A float tensor of size N containing add-noise audio.
        """

    p = self.config
    with tf.name_scope('add_rir_noise_aecres'):
      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
      with tf.control_dependencies([assert_op]):
        sample_rate = tf.cast(sample_rate, dtype=float)
        add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
            audio_data,
            sample_rate,
            if_add_rir=p.if_add_rir,
            rir_filelist=p.rir_filelist,
            if_add_noise=p.if_add_noise,
            snr_min=p.snr_min,
            snr_max=p.snr_max,
            noise_filelist=p.noise_filelist,
            if_add_aecres=p.if_add_aecres,
            aecres_filelist=p.aecres_filelist)

        return tf.squeeze(add_rir_noise_aecres_out)
Example #4
def compute_fbank():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = float(args.sample_rate)
    config['upper_frequency_limit'] = float(args.upper_frequency_limit)
    config['lower_frequency_limit'] = float(args.lower_frequency_limit)
    config['filterbank_channel_count'] = float(args.filterbank_channel_count)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['output_type'] = args.output_type

    fbank = Fbank.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            fbank_test = tf.squeeze(fbank(audio_data, args.sample_rate))
            sess = tf.compat.v1.Session()
            fbank_feats = fbank_test.eval(session=sess)
            writer[utt_id] = fbank_feats
Example #5
def split_one_doc_to_true_len_sens(doc_t, split_token, padding_token,
                                   max_doc_len, max_sen_len):
    """
  Split a document to sentences with true sentence lengths.
  doc_t: [doc_word_len]
  out_t: [max_doc_len, max_sen_len]
  """
    if len(doc_t.get_shape()) == 1:
        split_token_index = tf.squeeze(tf.where(tf.equal(doc_t, split_token)),
                                       axis=1)
        split_token_index.set_shape([None])
        split_len_part_1 = split_token_index[:1] + 1
        split_len_part_2 = split_token_index[1:] - split_token_index[:-1]
        split_lens = tf.concat([split_len_part_1, split_len_part_2], axis=0)
        split_lens = cut_or_padding(split_lens,
                                    max_doc_len,
                                    padding_token=padding_token)
        new_doc_len = tf.reduce_sum(split_lens)
        split_sentences = tf.split(doc_t[:new_doc_len], split_lens)
        split_sentences = [
            cut_or_padding(s, max_sen_len) for s in split_sentences
        ]
        out_t = tf.stack(split_sentences)
        padding_tokens = tf.multiply(tf.ones_like(out_t, dtype=tf.int32),
                                     padding_token)
        out_t = tf.where(tf.equal(out_t, split_token), padding_tokens, out_t)
        return out_t

    raise ValueError("doc_t should be a tensor with rank 1.")
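
A small self-contained sketch of the core mechanism above (find the split tokens with tf.where, derive per-sentence lengths from consecutive split positions, then tf.split), assuming TensorFlow 1.x graph mode; token id 5 plays the role of the split token and all values are purely illustrative:

import tensorflow as tf

doc_t = tf.constant([7, 8, 5, 9, 5, 3, 4, 4, 5])  # three "sentences", each ending in split token 5
split_token = 5
idx = tf.squeeze(tf.where(tf.equal(doc_t, split_token)), axis=1)       # positions of the split token
lens = tf.cast(tf.concat([idx[:1] + 1, idx[1:] - idx[:-1]], axis=0),
               tf.int32)                                               # per-sentence lengths
# num=3 is needed here because lens has a dynamic length; the function above
# avoids this by padding split_lens to the static max_doc_len.
sentences = tf.split(doc_t[:tf.reduce_sum(lens)], lens, num=3)
with tf.Session() as sess:
    print(sess.run(lens))       # [3 2 4]
    print(sess.run(sentences))  # [array([7, 8, 5]), array([9, 5]), array([3, 4, 4, 5])]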
Example #6
  def call(self, audio_data, sample_rate=None):
    """
    Caculate power of every frame in speech.
    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with,
                        default is 16kHz.
    :return:A float tensor of size (1 * num_frames) containing power of every
            frame in speech.
    """

    p = self.config
    with tf.name_scope('framepow'):

      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
      with tf.control_dependencies([assert_op]):

        sample_rate = tf.cast(sample_rate, dtype=float)
        framepow = py_x_ops.frame_pow(
            audio_data,
            sample_rate,
            snip_edges=p.snip_edges,
            remove_dc_offset=p.remove_dc_offset,
            window_length=p.window_length,
            frame_length=p.frame_length)

        return tf.squeeze(framepow)
Example #7
    def call(self, audio_data, sample_rate=None):
        """
    Caculate pitch features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (1, num_frames) containing pitch features of every frame in speech.
    """

        p = self.config
        with tf.name_scope('pitch'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=float)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=float))
            with tf.control_dependencies([assert_op]):

                pitch = py_x_ops.pitch(audio_data,
                                       sample_rate,
                                       window_length=p.window_length,
                                       frame_length=p.frame_length,
                                       thres_autoc=p.thres_autoc)

                pitch = tf.squeeze(pitch)
                pitch = tf.transpose(pitch[None, :])
                return pitch
Example #8
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Attention layer."""
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    time_size = inputs.shape[1].value  # T value - time size of the RNN layer
    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    W_omega = tf.get_variable(name='W_omega',
                              initializer=tf.random_normal(
                                  [hidden_size, attention_size], stddev=0.1))
    b_omega = tf.get_variable(name='b_omega',
                              initializer=tf.random_normal([attention_size],
                                                           stddev=0.1))
    u_omega = tf.get_variable(name='u_omega',
                              initializer=tf.random_normal([attention_size, 1],
                                                           stddev=0.1))

    # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
    #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
    #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    #v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    # (B, T, D) dot (D, Atten)

    logging.info('attention inputs: {}'.format(inputs.shape))
    inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
    dot = tf.matmul(inputs_reshaped, W_omega)
    dot = tf.reshape(dot, [-1, time_size, attention_size])
    v = tf.sigmoid(dot + b_omega)
    logging.info(f'attention vector: {v.shape}')
    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    # (B, T, Atten) dot (Atten)
    #vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
    v = tf.reshape(v, [-1, attention_size])
    vu = tf.matmul(v, u_omega)  # (B,T) shape
    vu = tf.squeeze(vu, axis=-1)
    vu = tf.reshape(vu, [-1, time_size])
    logging.info(f'attention energy: {vu.shape}')
    alphas = tf.nn.softmax(vu)  # (B,T) shape also

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    # [batch, time] -> [batch, time, 1]
    alphas = tf.expand_dims(alphas, -1)
    # [batch, time, dim] -> [batch, dim]
    output = tf.reduce_sum(inputs * alphas, 1)

    if not return_alphas:
        return output

    return output, alphas
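
A minimal usage sketch for the layer above, assuming the attention function as defined here (together with its logging import) is available and TensorFlow 1.x graph mode is in use; the batch, time and hidden sizes are illustrative:

import tensorflow as tf

rnn_outputs = tf.random_normal([8, 20, 64])   # dummy (B, T, D) RNN outputs
with tf.variable_scope('attn_demo'):
    context, alphas = attention(rnn_outputs, attention_size=32, return_alphas=True)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ctx, a = sess.run([context, alphas])
    print(ctx.shape, a.shape)   # (8, 64) (8, 20, 1)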
Example #9
def read_wav(wavfile, params):
    ''' samples of shape [nsample] '''
    contents = tf.read_file(wavfile)
    #pylint: disable=no-member
    waveforms = tf.audio.decode_wav(
        contents,
        desired_channels=params.audio_channels,
        #desired_samples=params.audio_sample_rate,
    )
    return tf.squeeze(waveforms.audio, axis=-1)
Example #10
def ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0):
  '''
  CTC loss function.
  :param logits: (B, T, D)
  :param input_length: (B, 1), input length of the encoder
  :param labels: (B, T)
  :param label_length: (B, 1), label length used to convert dense labels to sparse
  :returns: loss, scalar
  '''
  ilen = tf.cond(
      pred=tf.equal(tf.rank(input_length), 1),
      true_fn=lambda: input_length,
      false_fn=lambda: tf.squeeze(input_length),
  )
  ilen = tf.cast(ilen, tf.int32)

  olen = tf.cond(
      pred=tf.equal(tf.rank(label_length), 1),
      true_fn=lambda: label_length,
      false_fn=lambda: tf.squeeze(label_length))
  olen = tf.cast(olen, tf.int32)

  deps = [
      tf.assert_rank(labels, 2, name='label_rank_check'),
      tf.assert_rank(logits, 3, name='logits_rank_check'),
      tf.assert_rank(ilen, 1, name='src_len_rank_check'),  # input_length
      tf.assert_rank(olen, 1, name='tgt_len_rank_check'),  # output_length
  ]

  labels, logits = ctc_data_transform(labels, logits, blank_index)

  with tf.control_dependencies(deps):
    # (B, 1)
    # blank index is consistent with Espnet, zero
    batch_loss = tf.nn.ctc_loss(
        labels=labels,
        inputs=logits,
        sequence_length=ilen,
        time_major=False,
        preprocess_collapse_repeated=False,
        ctc_merge_repeated=True,
        ignore_longer_outputs_than_inputs=False)
  return batch_loss
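
The heavy lifting is done by tf.nn.ctc_loss; ctc_data_transform (defined elsewhere in this module) is assumed to turn the dense labels into the sparse form that op expects and to move the blank index to match ESPnet. A standalone sketch of the underlying call with hand-built sparse labels (TensorFlow 1.x graph mode; here the blank stays at TensorFlow's default, the last class, rather than zero):

import tensorflow as tf

batch, max_time, num_classes = 2, 10, 5                   # illustrative sizes only
logits = tf.random_normal([batch, max_time, num_classes])
labels = tf.SparseTensor(
    indices=[[0, 0], [0, 1], [1, 0]],                     # label sequences [1, 2] and [3]
    values=tf.constant([1, 2, 3], dtype=tf.int32),
    dense_shape=[batch, 3])
ilen = tf.constant([max_time, max_time], dtype=tf.int32)  # encoder output lengths

batch_loss = tf.nn.ctc_loss(
    labels=labels,
    inputs=logits,
    sequence_length=ilen,
    time_major=False)                                      # shape (B,), one loss per utterance

with tf.Session() as sess:
    print(sess.run(batch_loss))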
Example #11
def load_textline_dataset(paths, column_num):
    """Load raw data for text task."""
    ds = tf.data.TextLineDataset(paths)
    ds = ds.map(lambda x: tf.squeeze(
        tf.strings.split(x, sep="\t", result_type="RaggedTensor"), axis=0))
    ds = ds.filter(lambda line: tf.equal(tf.size(line), column_num))
    ds_list = []
    for i in range(column_num):
        ds_list.append(ds.map(lambda x: x[i]))

    return tuple(ds_list)
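
A minimal usage sketch, assuming the function above is importable and TensorFlow 1.x graph mode (make_one_shot_iterator); the toy two-column tab-separated file is created on the fly:

import os
import tempfile
import tensorflow as tf

path = os.path.join(tempfile.mkdtemp(), 'toy.tsv')
with open(path, 'w') as f:
    f.write("pos\tgreat movie\n")
    f.write("neg\tterrible plot\n")

label_ds, text_ds = load_textline_dataset([path], column_num=2)
label_t = label_ds.make_one_shot_iterator().get_next()
text_t = text_ds.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    print(sess.run([label_t, text_t]))  # [b'pos', b'great movie']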
Example #12
 def call(self, wavfile):
     """
 Get audio data and sample rate from a wavfile.
 :param wavfile: filepath of wav
 :return: 2 values. The first is a Tensor of audio data. The second return value is the sample rate of the input wav
     file, which is a tensor with float dtype.
 """
     p = self.config
     contents = tf.io.read_file(wavfile)
     audio_data, sample_rate = tf.audio.decode_wav(
         contents, desired_channels=p.audio_channels)
     assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                 tf.cast(sample_rate, dtype=float))
     with tf.control_dependencies([assert_op]):
         return tf.squeeze(audio_data, axis=-1), tf.cast(sample_rate,
                                                         dtype=float)
Example #13
def read_wav(wavfile, params):
    ''' samples of shape [nsample] '''
    contents = tf.read_file(wavfile)
    #pylint: disable=no-member
    waveforms = tf.audio.decode_wav(
        contents,
        desired_channels=params.audio_channels,
        #desired_samples=params.audio_sample_rate,
    )
    #waveforms = tf.contrib.ffmpeg.decode_audio(
    #  contents,
    #  file_format='wav',
    #  samples_per_second = params.audio_sample_rate,
    #  channel_count=params.audio_channels,
    #)
    #return waveforms[:, 0]
    return tf.squeeze(waveforms.audio, axis=-1)
Example #14
    def call(self, tensors):
        """Attention layer."""
        left, right = tensors

        len_left = left.shape[1]
        len_right = right.shape[1]
        tensor_left = tf.expand_dims(left, axis=2)
        tensor_right = tf.expand_dims(right, axis=1)
        tensor_left = tf.tile(tensor_left, [1, 1, len_right, 1])
        tensor_right = tf.tile(tensor_right, [1, len_left, 1, 1])
        tensor_merged = tf.concat([tensor_left, tensor_right], axis=-1)
        middle_output = self.middle_layer(tensor_merged)
        attn_scores = self.attn(middle_output)
        attn_scores = tf.squeeze(attn_scores, axis=3)
        exp_attn_scores = tf.exp(
            attn_scores - tf.reduce_max(attn_scores, axis=-1, keepdims=True))
        exp_sum = tf.reduce_sum(exp_attn_scores, axis=-1, keepdims=True)
        attention_weights = exp_attn_scores / exp_sum
        return tf.matmul(attention_weights, right)
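
The exp/normalize pair above is a hand-written, numerically stable softmax over the last axis: subtracting the per-row maximum before exponentiating prevents overflow. A minimal sketch showing it matches tf.nn.softmax, assuming TensorFlow 1.x graph mode:

import tensorflow as tf

scores = tf.constant([[1.0, 2.0, 3.0], [0.0, 0.0, 1000.0]])
exp_scores = tf.exp(scores - tf.reduce_max(scores, axis=-1, keepdims=True))
manual = exp_scores / tf.reduce_sum(exp_scores, axis=-1, keepdims=True)
builtin = tf.nn.softmax(scores, axis=-1)
with tf.Session() as sess:
    m, b = sess.run([manual, builtin])
    print(m)  # identical to b; a naive exp would overflow on the 1000.0 row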
Example #15
def compute_pitch():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['snip_edges'] = args.snip_edges
    config['preemph_coeff'] = args.preemph_coeff
    config['min_f0'] = args.min_f0
    config['max_f0'] = args.max_f0
    config['soft_min_f0'] = args.soft_min_f0
    config['penalty_factor'] = args.penalty_factor
    config['lowpass_cutoff'] = args.lowpass_cutoff
    config['resample_freq'] = args.resample_freq
    config['delta_pitch'] = args.delta_pitch
    config['nccf_ballast'] = args.nccf_ballast
    config['lowpass_filter_width'] = args.lowpass_filter_width
    config['upsample_filter_width'] = args.upsample_filter_width
    config['max_frames_latency'] = args.max_frames_latency
    config['frames_per_chunk'] = args.frames_per_chunk
    config['simulate_first_pass_online'] = args.simulate_first_pass_online
    config['recompute_frame'] = args.recompute_frame
    config['nccf_ballast_online'] = args.nccf_ballast_online

    pitch = Pitch.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            pitch_test = tf.squeeze(pitch(audio_data, args.sample_rate))
            sess = tf.Session()
            pitch_feats = pitch_test.eval(session=sess)
            writer[utt_id] = pitch_feats