コード例 #1
0
def main():
    """Compute log-mel spectrogram features for each utterance and write them.

    Command-line arguments come from ``get_parser()``. Utterances are read
    as ``(utt_id, (rate, array))`` items via ``kaldiio.ReadHelper`` and the
    resulting features are stored through ``file_writer_helper`` under the
    same utterance id.

    Raises:
        ValueError: If an utterance's sampling rate differs from ``args.fs``.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
            file_writer_helper(args.wspecifier,
                               filetype=args.filetype,
                               write_num_frames=args.write_num_frames,
                               compress=args.compress,
                               compression_method=args.compression_method
                               ) as writer:
        for utt_id, (rate, array) in reader:
            # Explicit check instead of ``assert`` so that the validation
            # still runs when Python is started with -O.
            if rate != args.fs:
                raise ValueError(
                    "Sampling rate mismatch for utt_id=%s: got %s, "
                    "expected %s" % (utt_id, rate, args.fs))
            array = array.astype(numpy.float32)
            # NOTE(review): args.normalize is presumably the PCM bit depth;
            # the samples are divided by 2 ** (normalize - 1).
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))

            lmspc = logmelspectrogram(x=array,
                                      fs=args.fs,
                                      n_mels=args.n_mels,
                                      n_fft=args.n_fft,
                                      n_shift=args.n_shift,
                                      win_length=args.win_length,
                                      window=args.window,
                                      fmin=args.fmin,
                                      fmax=args.fmax)
            writer[utt_id] = lmspc
コード例 #2
0
ファイル: test_transform.py プロジェクト: siddalmia/espnet
def test_preprocessing(tmpdir):
    """Compare ``Transformation`` with applying fbank, CMVN and deltas by hand.

    A CMVN statistics matrix is written to a temporary ark file, a random
    signal is run through the configured ``Transformation``, and the result
    is checked against calling ``logmelspectrogram``, ``CMVN`` and
    ``add_deltas`` one after another with the same options.
    """
    cmvn_ark = str(tmpdir.join("cmvn.ark"))

    fbank_step = {
        "type": "fbank",
        "n_mels": 80,
        "fs": 16000,
        "n_fft": 1024,
        "n_shift": 512,
    }
    cmvn_step = {"type": "cmvn", "stats": cmvn_ark, "norm_vars": True}
    delta_step = {"type": "delta", "window": 2, "order": 2}
    kwargs = {
        "process": [fbank_step, cmvn_step, delta_step],
        "mode": "sequential",
    }

    # Build the statistics matrix and store it at cmvn_ark: row 0 holds the
    # per-dimension sums, row 1 the sums of squares; the last column holds
    # the count (row 0) and a zero (row 1).
    n_frames, n_dims = 100, 80
    samples = np.random.randn(n_frames, n_dims)
    stats = np.empty((2, n_dims + 1), dtype=np.float32)
    stats[0, :n_dims] = samples.sum(axis=0)
    stats[1, :n_dims] = np.square(samples).sum(axis=0)
    stats[0, -1] = float(n_frames)
    stats[1, -1] = 0.0
    kaldiio.save_mat(cmvn_ark, stats)

    batch_size = 1
    xs = [
        np.random.randn(1000).astype(np.float32) for _ in range(batch_size)
    ]
    pipeline = Transformation(kwargs)
    processed_xs = pipeline(xs)

    def step_options(position):
        """Return a copy of the configured step without its "type" entry."""
        options = dict(kwargs["process"][position])
        options.pop("type")
        return options

    for idx, signal in enumerate(xs):
        expected = logmelspectrogram(signal, **step_options(0))
        expected = CMVN(**step_options(1))(expected)
        expected = add_deltas(expected, **step_options(2))

        np.testing.assert_allclose(processed_xs[idx], expected)
コード例 #3
0
def main():
    """Compute log-mel spectrogram features, logging progress per utterance.

    Command-line arguments come from ``get_parser()``. The number of lines
    in ``args.segments`` is used as the utterance count for the progress
    messages. Utterances are read via ``kaldiio.ReadHelper`` and features
    are written through ``file_writer_helper``. A failure on one utterance
    is logged as a warning (with traceback) and processing continues with
    the next one.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # Find the number of utterances; the context manager makes sure the
    # segments file handle is closed again.
    with open(args.segments) as segments_file:
        n_utt = sum(1 for _ in segments_file)
    logging.info("%d utterances found to be processed." % n_utt)

    # Compute fbank features
    with kaldiio.ReadHelper(
            args.rspecifier,
            segments=args.segments) as reader, file_writer_helper(
                args.wspecifier,
                filetype=args.filetype,
                write_num_frames=args.write_num_frames,
                compress=args.compress,
                compression_method=args.compression_method,
            ) as writer:
        for i, struct in enumerate(reader, start=1):
            logging.info("processing %d/%d(%.2f%%)" %
                         (i, n_utt, 100 * i / n_utt))
            utt_id, (rate, array) = struct
            try:
                # Explicit check instead of ``assert`` so that the
                # validation still runs when Python is started with -O.
                if rate != args.fs:
                    raise ValueError(
                        "sampling rate mismatch: got %s, expected %s" %
                        (rate, args.fs))
                array = array.astype(numpy.float32)
                # NOTE(review): args.normalize is presumably the PCM bit
                # depth; samples are divided by 2 ** (normalize - 1).
                if args.normalize is not None and args.normalize != 1:
                    array = array / (1 << (args.normalize - 1))

                lmspc = logmelspectrogram(
                    x=array,
                    fs=args.fs,
                    n_mels=args.n_mels,
                    n_fft=args.n_fft,
                    n_shift=args.n_shift,
                    win_length=args.win_length,
                    window=args.window,
                    fmin=args.fmin,
                    fmax=args.fmax,
                )
                writer[utt_id] = lmspc
            except Exception:
                # Best effort: skip the utterance but keep the cause in the
                # log. KeyboardInterrupt/SystemExit are no longer swallowed.
                logging.warning("failed to compute fbank for utt_id=`%s`" %
                                utt_id, exc_info=True)
コード例 #4
0
def test_compatible_with_espnet1():
    """Check that ``LogMelFbank`` and ``logmelspectrogram`` agree.

    Both are run on the same random 100-sample signal with matching
    settings; the outputs must be equal within an absolute tolerance of 1e-5.
    """
    # Settings that carry the same keyword name in both implementations.
    shared = dict(n_fft=16, n_mels=4, fmin=80, fmax=7600)

    frontend = LogMelFbank(hop_length=4, fs="16k", **shared)
    x = torch.randn(1, 100)
    lengths = torch.LongTensor([100])
    feats, _ = frontend(x, lengths)
    actual = feats.numpy()[0]

    expected = logmelspectrogram(x[0].numpy(),
                                 n_shift=4,
                                 fs=16000,
                                 **shared)
    np.testing.assert_allclose(actual, expected, rtol=0, atol=1e-5)
コード例 #5
0
ファイル: preprocess.py プロジェクト: Wendison/FCL-taco2
def acoustic_features_process_one_utterance(wav_path, args, utt2dur_phn):
    """Extract and save mel, phone-level log-F0 and energy for one wav file.

    Args:
        wav_path: Path of the audio file. The utterance id is the file's
            base name up to its first ``.``.
        args: Namespace providing ``set_fs``, ``n_mels``, ``n_fft``,
            ``n_shifts``, ``win_length``, ``windows``, ``fmin``, ``fmax``
            and ``feature_root``.
        utt2dur_phn: Mapping from utterance id to a sequence whose first
            item holds the per-phone durations. They are used as frame
            counts below. NOTE: the last duration is adjusted in place, so
            the caller's data is modified when that item is a mutable list.

    Returns:
        Tuple ``(uttid, mel, f0, energy)`` where ``f0`` and ``energy`` hold
        one averaged value per phone.

    Side effects:
        Writes ``.npy`` files below ``args.feature_root`` in the
        ``durations_MFA``, ``mels-ori``, ``f0-ori`` and ``en-ori``
        directories, creating them when needed.
    """
    uttid = os.path.basename(wav_path).split('.')[0]
    # extract mel-spectrogram (log)
    wav, fs = sf.read(wav_path)
    # Scale down only when the signal exceeds the [-1, 1] range.
    peak = np.abs(wav).max()
    if peak > 1.0:
        wav /= peak
    if fs != args.set_fs:
        wav = resampy.resample(wav, fs, args.set_fs, axis=0)
        fs = args.set_fs
    mel = logmelspectrogram(
        x=wav,
        fs=fs,
        n_mels=args.n_mels,
        n_fft=args.n_fft,
        n_shift=args.n_shifts,
        win_length=args.win_length,
        window=args.windows,
        fmin=args.fmin,
        fmax=args.fmax,
    )

    # make sum(dur) = mel length & save durations
    durations = utt2dur_phn[uttid][0]
    durations[-1] += mel.shape[0] - sum(durations)
    durations = np.array(durations, dtype=float).reshape(-1, 1)
    dur_save_root = f'{args.feature_root}/durations_MFA'
    os.makedirs(dur_save_root, exist_ok=True)
    dur_save_path = f'{dur_save_root}/{uttid}.npy'
    np.save(dur_save_path, durations)

    # extract phn-level F0 & energy
    tlen = mel.shape[0]
    # Hop size converted from samples to milliseconds.
    frame_period = args.n_shifts / fs * 1000
    f0, timeaxis = pw.dio(wav.astype('float64'), fs, frame_period=frame_period)
    f0 = pw.stonemask(wav.astype('float64'), f0, timeaxis, fs)
    f0 = f0[:tlen].reshape(-1).astype('float32')
    # Take the log only where f0 is non-zero; zero entries stay zero.
    nonzeros_indices = np.nonzero(f0)
    lf0 = f0.copy()
    lf0[nonzeros_indices] = np.log(
        f0[nonzeros_indices])  # for f0(Hz), lf0 > 0 when f0 != 0

    x_mag = np.abs(
        stft(wav,
             args.n_fft,
             args.n_shifts,
             win_length=args.win_length,
             window=args.windows))  # T x F
    energy = np.linalg.norm(x_mag, axis=1).reshape(-1)
    assert len(energy) == tlen

    # Average frame-level values over each phone's frame span.
    durs = durations.reshape(-1)
    durs_cum = np.cumsum(np.pad(durs, (1, 0)))
    # ``np.float`` was removed in NumPy 1.24; ``np.float64`` is the dtype it
    # used to alias.
    pitch_phn = np.zeros((durs.shape[0], ), dtype=np.float64)
    energy_phn = np.zeros((durs.shape[0], ), dtype=np.float64)
    for idx, (a, b) in enumerate(zip(durs_cum[:-1], durs_cum[1:])):
        a = int(a)
        b = int(b)
        values = lf0[a:b][np.where(
            f0[a:b] != 0.0)[0]]  # use avg-lf0 instead of avg-f0
        pitch_phn[idx] = np.mean(values) if len(values) > 0 else 0.0
        values = energy[a:b]
        energy_phn[idx] = np.mean(values) if len(values) > 0 else 0.0

    f0 = pitch_phn
    energy = energy_phn

    mel_save_path = f'{args.feature_root}/mels-ori/{uttid}.npy'
    f0_save_path = f'{args.feature_root}/f0-ori/{uttid}.npy'
    en_save_path = f'{args.feature_root}/en-ori/{uttid}.npy'
    os.makedirs(os.path.dirname(mel_save_path), exist_ok=True)
    os.makedirs(os.path.dirname(f0_save_path), exist_ok=True)
    os.makedirs(os.path.dirname(en_save_path), exist_ok=True)
    np.save(mel_save_path, mel)
    np.save(f0_save_path, f0)
    np.save(en_save_path, energy)

    return uttid, mel, f0, energy
コード例 #6
0
def main():
    """Parse the command line and compute log-mel spectrogram features.

    Utterances are read as ``(utt_id, (rate, array))`` items via
    ``kaldiio.ReadHelper`` and the features are written through
    ``FileWriterWrapper`` under the same utterance id.

    Raises:
        ValueError: If an utterance's sampling rate differs from ``--fs``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fs', type=int, help='Sampling frequency')
    parser.add_argument('--fmax',
                        type=int,
                        default=None,
                        nargs='?',
                        help='Maximum frequency')
    parser.add_argument('--fmin',
                        type=int,
                        default=None,
                        nargs='?',
                        help='Minimum frequency')
    parser.add_argument('--n_mels',
                        type=int,
                        default=80,
                        help='Number of mel basis')
    parser.add_argument('--n_fft',
                        type=int,
                        default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift',
                        type=int,
                        default=512,
                        help='Shift length in point')
    parser.add_argument('--win_length',
                        type=int,
                        default=None,
                        nargs='?',
                        help='Analysis window length in point')
    parser.add_argument('--window',
                        type=str,
                        default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('--write-num-frames',
                        type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--filetype',
                        type=str,
                        default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                        '"mat" is the matrix format in kaldi')
    parser.add_argument('--compress',
                        type=strtobool,
                        default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method',
        type=int,
        default=2,
        help='Specify the method(if mat) or gzip-level(if hdf5)')
    parser.add_argument('--verbose',
                        '-V',
                        default=0,
                        type=int,
                        help='Verbose option')
    parser.add_argument('--normalize',
                        choices=[1, 16, 24, 32],
                        type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                        'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    # Adjacent string literals are concatenated verbatim, so each piece
    # carries its own separating space.
    parser.add_argument('--segments',
                        type=str,
                        help='segments-file format: each line is either '
                        '<segment-id> <recording-id> <start-time> <end-time> '
                        'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
            FileWriterWrapper(args.wspecifier,
                              filetype=args.filetype,
                              write_num_frames=args.write_num_frames,
                              compress=args.compress,
                              compression_method=args.compression_method
                              ) as writer:
        for utt_id, (rate, array) in reader:
            # Explicit check instead of ``assert`` so that the validation
            # still runs when Python is started with -O.
            if rate != args.fs:
                raise ValueError(
                    "Sampling rate mismatch for utt_id=%s: got %s, "
                    "expected %s" % (utt_id, rate, args.fs))
            array = array.astype(numpy.float32)
            # Scale PCM samples of the given bit depth by
            # 2 ** (bit_depth - 1); a value of 1 leaves the data untouched.
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))

            lmspc = logmelspectrogram(x=array,
                                      fs=args.fs,
                                      n_mels=args.n_mels,
                                      n_fft=args.n_fft,
                                      n_shift=args.n_shift,
                                      win_length=args.win_length,
                                      window=args.window,
                                      fmin=args.fmin,
                                      fmax=args.fmax)
            writer[utt_id] = lmspc