def compute_fbank_pitch():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = float(args.sample_rate)
    config['upper_frequency_limit'] = float(args.upper_frequency_limit)
    config['lower_frequency_limit'] = float(args.lower_frequency_limit)
    config['filterbank_channel_count'] = float(args.filterbank_channel_count)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['thres_autoc'] = args.thres_autoc
    config['output_type'] = args.output_type

    fbank_pitch = FbankPitch.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            fbank_pitch_test = fbank_pitch(audio_data, args.sample_rate)
            sess = tf.Session()
            fbank_pitch_feats = fbank_pitch_test.eval(session=sess)
            writer[utt_id] = fbank_pitch_feats
Exemple #2
0
def compute_spectrum():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = float(args.sample_rate)
    config['output_type'] = int(args.output_type)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length

    spectrum = Spectrum.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            spectrum_test = spectrum(audio_data, args.sample_rate)
            sess = tf.compat.v1.Session()
            spectrum_feats = spectrum_test.eval(session=sess)
            writer[utt_id] = spectrum_feats
def compute_mfcc():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['upper_frequency_limit'] = float(args.upper_frequency_limit)
    config['lower_frequency_limit'] = float(args.lower_frequency_limit)
    config['filterbank_channel_count'] = float(args.filterbank_channel_count)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['output_type'] = args.output_type
    config['window_type'] = args.window_type
    config['snip_edges'] = args.snip_edges
    config['preeph_coeff'] = args.preeph_coeff
    config['remove_dc_offset'] = args.remove_dc_offset
    config['is_fbank'] = args.is_fbank
    config['cepstral_lifter'] = args.cepstral_lifter
    config['coefficient_count'] = args.coefficient_count

    mfcc = Mfcc.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            mfcc_test = tf.squeeze(mfcc(audio_data, args.sample_rate))
            sess = tf.Session()
            mfcc_feats = mfcc_test.eval(session=sess)
            writer[utt_id] = mfcc_feats
def compute_stft():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length

    stft = Analyfiltbank.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            power_spectrum, phase_spectrum = stft(audio_data, args.sample_rate)
            sess = tf.Session()
            if args.output_type == 1:
                out_feats = power_spectrum.eval(session=sess)
            else:
                out_feats = phase_spectrum.eval(session=sess)
            writer[utt_id] = out_feats
Exemple #5
0
def apply_cmvn():
    args = get_parser().parse_args()

    if ':' in args.stats_rspecifier_or_rxfilename:
        is_rspcifier = True
        stats_filetype = 'ark'
        stats_dict = dict(KaldiReader(args.stats_rspecifier_or_rxfilename))
    else:
        is_rspcifier = False
        stats_filetype = 'mat'
        stats = kaldiio.load_mat(args.stats_rspecifier_or_rxfilename)
        stats_dict = {None: stats}

    config = {}
    config['norm_means'] = args.norm_means
    config['norm_vars'] = args.norm_vars
    config['utt2spk'] = args.utt2spk
    config['spk2utt'] = args.spk2utt
    config['reverse'] = args.reverse
    config['std_floor'] = args.std_floor
    config['filetype'] = stats_filetype

    cmvn = CMVN.params(config).instantiate()
    cmvn.call(stats_dict)

    with KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                  compress=args.compress, compression_method=args.compression_method) as writer, \
      kaldiio.ReadHelper(args.rspecifier) as reader:
        for utt, mat in reader:
            mat_new = cmvn.apply_cmvn(mat, utt)
            writer[utt] = mat_new
def compute_pitch():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['snip_edges'] = args.snip_edges
    config['preemph_coeff'] = args.preemph_coeff
    config['min_f0'] = args.min_f0
    config['max_f0'] = args.max_f0
    config['soft_min_f0'] = args.soft_min_f0
    config['penalty_factor'] = args.penalty_factor
    config['lowpass_cutoff'] = args.lowpass_cutoff
    config['resample_freq'] = args.resample_freq
    config['delta_pitch'] = args.delta_pitch
    config['nccf_ballast'] = args.nccf_ballast
    config['lowpass_filter_width'] = args.lowpass_filter_width
    config['upsample_filter_width'] = args.upsample_filter_width
    config['max_frames_latency'] = args.max_frames_latency
    config['frames_per_chunk'] = args.frames_per_chunk
    config['simulate_first_pass_online'] = args.simulate_first_pass_online
    config['recompute_frame'] = args.recompute_frame
    config['nccf_ballast_online'] = args.nccf_ballast_online

    pitch = Pitch.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            pitch_test = tf.squeeze(pitch(audio_data, args.sample_rate))
            sess = tf.Session()
            pitch_feats = pitch_test.eval(session=sess)
            writer[utt_id] = pitch_feats
Exemple #7
0
def compute_cmvn_stats():
    """
  e.g. compute_cmvn_stats.py scp:data/train/feats.scp data/train/cmvn.ark # compute global cmvn
  """
    args = get_parser().parse_args()

    is_wspecifier = ':' in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    spk, utts = line.rstrip().split(None, 1)
                    for utt in utts.split():
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]
        else:
            logging.info('Performing as utterance CMVN mode')

            def utt2spk(x):
                return x

    else:
        logging.info('Performing as gloabl CMVN model')
        if args.spk2utt is not None:
            logging.warning('spk2utt is not used for global CMVN mode')

        def utt2spk(x):
            return None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    idx = 0
    for idx, (utt, matrix) in enumerate(KaldiReader(args.rspecifier), 1):
        spk = utt2spk(utt)

        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float)

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix**2).sum(axis=0)

    assert idx > 0, idx

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.

        cmvn_stats[spk] = _cmvn_stats

    if is_wspecifier:
        with KaldiWriter(args.wspecifier_or_wxfilename) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat
    else:
        matrix = cmvn_stats[None]
        kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)