Example no. 1
import pickle

import numpy as np

# melspectrogram is a project-local helper; a sketch follows this example.

def main():
    # Load all data and labels (the pickle file stores two objects back to back)
    with open('music_genres_dataset.pkl', 'rb') as f:
        train_set = pickle.load(f)
        test_set = pickle.load(f)

    train_set_data = train_set['data']
    train_set_labels = train_set['labels']
    train_set_id = train_set['track_id']

    test_set_data = test_set['data']
    test_set_labels = test_set['labels']
    test_set_id = test_set['track_id']

    train_mel = []
    # Convert each audio clip to a mel spectrogram
    for i in range(np.shape(train_set_data)[0]):
        train_mel.append(melspectrogram(train_set_data[i][:]))
        print(i / np.shape(train_set_data)[0])  # progress fraction
    test_mel = []
    for i in range(np.shape(test_set_data)[0]):
        test_mel.append(melspectrogram(test_set_data[i][:]))
        print(i / np.shape(test_set_data)[0])  # progress fraction

    return train_mel, test_mel, train_set_labels, test_set_labels, test_set_id
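The melspectrogram helper used throughout these examples is project-local and not shown. A minimal sketch of what it might look like, assuming librosa and the usual log-mel convention (all defaults below are illustrative, not the authors' values):

import librosa
import numpy as np

def melspectrogram_sketch(wav, sample_rate=22050, num_mels=80,
                          num_fft=2048, hop_length=256, win_length=1024):
    # Hypothetical stand-in for the project-local melspectrogram().
    mel = librosa.feature.melspectrogram(y=wav, sr=sample_rate,
                                         n_fft=num_fft, hop_length=hop_length,
                                         win_length=win_length, n_mels=num_mels)
    # Log-compress with a floor, as most neural vocoder pipelines do.
    return np.log(np.maximum(1e-5, mel)).T  # (frames, num_mels)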
Example no. 2
def process_wav(wav_path, audio_path, mel_path, params):
    wav = load_wav(wav_path,
                   sample_rate=params["preprocessing"]["sample_rate"])
    wav = wav / np.abs(wav).max() * 0.999  # peak-normalize to 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         preemph=params["preprocessing"]["preemph"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])

    # Pad the waveform so it covers exactly len(mel) frames of hop_length samples.
    length_diff = len(mel) * params["preprocessing"]["hop_length"] - len(wav)
    wav = np.pad(wav, (0, length_diff), "constant")

    # Symmetric context padding so every audio slice has full mel context.
    pad = (params["vocoder"]["sample_frames"] -
           params["vocoder"]["audio_slice_frames"]) // 2
    mel = np.pad(mel, ((pad,), (0,)), "constant")
    wav = np.pad(wav, (pad * params["preprocessing"]["hop_length"],),
                 "constant")
    wav = mulaw_encode(wav, mu=2**params["preprocessing"]["bits"])

    # Speaker ID is the file-name prefix, e.g. "spk_001.wav" -> "spk".
    speaker = os.path.splitext(os.path.split(wav_path)[-1])[0].split("_")[0]
    np.save(audio_path, wav)
    np.save(mel_path, mel)
    return speaker, audio_path, mel_path, len(mel)
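mulaw_encode is also project-local. A minimal sketch of standard mu-law companding plus quantization, which is what the call above appears to expect (assumption: it returns integer class indices in [0, mu)):

import numpy as np

def mulaw_encode_sketch(wav, mu):
    # Compand [-1, 1] floats non-linearly (standard mu-law)...
    wav = np.clip(wav, -1.0, 1.0)
    fx = np.sign(wav) * np.log1p((mu - 1) * np.abs(wav)) / np.log1p(mu - 1)
    # ...then quantize to integer classes 0..mu-1.
    return np.floor((fx + 1) / 2 * (mu - 1) + 0.5).astype(np.int64)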
Example no. 3
def process_wav(dataset, wav_path, audio_path, mel_path, params):
    """Convert wav_path into speaker_id and internally save processed data in arg's pathes.
    """
    # auto resample based on params (internally, librosa)
    wav = load_wav(wav_path, sample_rate=params["preprocessing"]["sample_rate"])
    wav = wav / np.abs(wav).max() * 0.999  # peak-normalize to 0.999
    mel = melspectrogram(wav, sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])

    length_diff = len(mel) * params["preprocessing"]["hop_length"] - len(wav)
    wav = np.pad(wav, (0, length_diff), "constant")

    pad = (params["vocoder"]["sample_frames"] - params["vocoder"]["audio_slice_frames"]) // 2
    mel = np.pad(mel, ((pad,), (0,)), "constant")
    wav = np.pad(wav, (pad * params["preprocessing"]["hop_length"],), "constant")
    wav = mulaw_encode(wav, mu=2 ** params["preprocessing"]["bits"])

    # speaker ID acquisition
    speaker = get_speakerid(wav_path, dataset)

    # save processed data
    np.save(audio_path, wav)
    np.save(mel_path, mel)
    
    return speaker, audio_path, mel_path, len(mel)
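get_speakerid is not shown. A plausible sketch, assuming speaker IDs are encoded in the file name in a dataset-specific way (the dataset names and parsing rules below are hypothetical illustrations, not the authors' actual mapping):

import os

def get_speakerid_sketch(wav_path, dataset):
    # Hypothetical helper; adapt the parsing rule per corpus layout.
    name = os.path.splitext(os.path.basename(wav_path))[0]
    if dataset == "vctk":
        return name.split("_")[0]  # e.g. "p225_001" -> "p225"
    if dataset == "ljspeech":
        return "LJ"                # single-speaker corpus
    raise ValueError("unknown dataset: {}".format(dataset))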
Example no. 4
def _process_utterance(output_dir, chunk, wav_name, target_class, index):
    mel_spectrogram = utils.melspectrogram(chunk).astype(np.float32)
    mel_filename = "mel-%s-%s.npy" % (wav_name.split('.')[0], index)
    trimmed_wav_name = "%s-%s.wav" % (wav_name.split('.')[0], index)
    utils.save_feature(mel_spectrogram.T, os.path.join(output_dir,
                                                       mel_filename))
    # utils.save_wav(chunk, os.path.join(output_dir, trimmed_wav_name))  # optional: also keep the raw chunk
    return (trimmed_wav_name, mel_filename, target_class)
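A sketch of how _process_utterance might be driven chunk by chunk (the windows helper and hparams values are assumptions; compare Example no. 5 below):

# Hypothetical driver loop around _process_utterance.
records = []
for index, (start, end) in enumerate(utils.windows(wav, hparams.window_size)):
    chunk = wav[start:end]
    records.append(
        _process_utterance(output_dir, chunk, wav_name, target_class, index))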
Example no. 5
def preprocess_one(wav):
    """
    Devide wav to chunks
    """
    chunked_features = []
    for (start, end) in utils.windows(wav, hparams.window_size):
        chunk = wav[start:end]
        if len(chunk) != hparams.window_size:  # last chunk may be short
            chunk = utils.pad_chunk(chunk, wav)
        mel_spectrogram = utils.melspectrogram(chunk).astype(np.float32)
        chunked_features.append(mel_spectrogram.T)
    return np.array(chunked_features)
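utils.windows and utils.pad_chunk are not shown. A minimal sketch of a fixed-hop sliding window, assuming a hop of half the window size (the real hop may differ):

def windows_sketch(wav, window_size):
    # Yield (start, end) index pairs across the signal; half-window hop assumed.
    hop = window_size // 2
    start = 0
    while start < len(wav):
        yield start, start + window_size
        start += hop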
Example no. 6
def gen_from_wav(model, wav_path, output):
    wav = load_wav(wav_path, params["preprocessing"]["sample_rate"], trim=False)
    wav = wav / np.abs(wav).max() * 0.999  # peak-normalize to 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         ref_level_db=params["preprocessing"]["ref_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         fmin=params["preprocessing"]["fmin"],
                         fmax=params["preprocessing"]["fmax"])
    gen_from_mel(model, mel, output)
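load_wav is another project-local helper. A minimal sketch using librosa (the trim keyword is modeled on the call above; the silence-trimming threshold is an assumption):

import librosa

def load_wav_sketch(path, sample_rate, trim=True):
    # Load and resample; optionally trim leading/trailing silence.
    wav, _ = librosa.load(path, sr=sample_rate)
    if trim:
        wav, _ = librosa.effects.trim(wav, top_db=60)  # threshold assumed
    return wav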
Example no. 7
def main():
    parser = argparse.ArgumentParser('PreprocessingParser')
    parser.add_argument('--data_dir', type=str, help='data root directory')
    parser.add_argument('--save_dir',
                        type=str,
                        help='extracted feature save directory')
    parser.add_argument('--dev_rate',
                        type=float,
                        help='dev set rate',
                        default=0.05)
    parser.add_argument('--test_rate',
                        type=float,
                        help='test set rate',
                        default=0.05)
    args = parser.parse_args()
    # args validation
    if args.dev_rate < 0 or args.dev_rate >= 1:
        raise ValueError('dev rate should be in [0, 1)')
    if args.test_rate < 0 or args.test_rate >= 1:
        raise ValueError('test rate should be in [0, 1)')
    if args.test_rate + args.dev_rate >= 1:
        raise ValueError('dev rate + test rate should be < 1.')
    if not os.path.isdir(args.data_dir):
        raise FileNotFoundError('Directory {} not found!'.format(
            args.data_dir))
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    mel_dir = os.path.join(args.save_dir, 'mels')
    os.makedirs(mel_dir, exist_ok=True)
    linear_dir = os.path.join(args.save_dir, 'linears')
    os.makedirs(linear_dir, exist_ok=True)
    f0_dir = os.path.join(args.save_dir, 'f0s')
    os.makedirs(f0_dir, exist_ok=True)
    ppg_dir = os.path.join(args.save_dir, 'ppgs')
    os.makedirs(ppg_dir, exist_ok=True)
    for mode in ['train', 'dev', 'test']:
        if os.path.isfile(
                os.path.join(args.save_dir, "{}_meta.csv".format(mode))):
            os.remove(os.path.join(args.save_dir, "{}_meta.csv".format(mode)))
    wav_files = []
    for rootdir, subdir, files in os.walk(args.data_dir):
        for f in files:
            if f.endswith('.wav'):
                wav_files.append(os.path.join(rootdir, f))
    random.shuffle(wav_files)

    print('Set up PPGs extraction network')
    # Set up network
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(
        out_dims=hps.Audio.ppg_dim,
        n_cnn=ppg_extractor_hps.n_cnn,
        cnn_hidden=ppg_extractor_hps.cnn_hidden,
        cnn_kernel=ppg_extractor_hps.cnn_kernel,
        n_blstm=ppg_extractor_hps.n_blstm,
        lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']

    # set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # load saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)
    print('Extracting mel-spectrograms, spectrograms and log-f0s...')
    train_set = []
    dev_set = []
    test_set = []
    dev_start_idx = int(len(wav_files) * (1 - args.dev_rate - args.test_rate))
    test_start_idx = int(len(wav_files) * (1 - args.test_rate))
    for i, wav_f in tqdm(enumerate(wav_files), total=len(wav_files)):
        try:
            wav_arr = load_wav(wav_f)
        except Exception:  # skip unreadable files
            continue
        pre_emphasized_wav = _preemphasize(wav_arr)
        # File ID derived from dataset-layout-specific path components.
        fid = '{}_{}'.format(
            wav_f.split('/')[-3].split('_')[2],
            wav_f.split('/')[-1].split('.')[0].split('_')[1])
        # extract mel-spectrograms
        mel_fn = os.path.join(mel_dir, '{}.npy'.format(fid))
        try:
            mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T
        except Exception:
            continue
        # extract spectrograms
        linear_fn = os.path.join(linear_dir, '{}.npy'.format(fid))
        try:
            linear_spec = spectrogram(pre_emphasized_wav).astype(np.float32).T
        except Exception:
            continue
        # extract log-f0s
        f0_fn = os.path.join(f0_dir, '{}.npy'.format(fid))
        log_f0 = logf0(wav_f)
        try:
            log_f0 = lf0_normailze(log_f0)
        except Exception:
            continue
        # extract ppgs
        mfcc_feats = wav2unnormalized_mfcc(wav_arr)
        ppg = sess.run(predicted_ppgs,
                       feed_dict={mfcc_pl: np.expand_dims(mfcc_feats, axis=0)})
        ppg = softmax(np.squeeze(ppg, axis=0))
        ppg_fn = os.path.join(ppg_dir, '{}.npy'.format(fid))

        # save features to respective directory
        mel_spec, linear_spec, log_f0, ppg = length_validate(
            (mel_spec, linear_spec, log_f0, ppg))
        np.save(mel_fn, mel_spec)
        np.save(linear_fn, linear_spec)
        np.save(f0_fn, log_f0)
        np.save(ppg_fn, ppg)

        # write one metadata row to the subset's csv
        if i < dev_start_idx:
            subset, split = train_set, 'train'
        elif i < test_start_idx:
            subset, split = dev_set, 'dev'
        else:
            subset, split = test_set, 'test'
        subset.append(fid)
        with open(os.path.join(args.save_dir, '{}_meta.csv'.format(split)),
                  'a',
                  encoding='utf-8') as meta_f:
            meta_f.write(
                '{0}|ppgs/{0}.npy|mels/{0}.npy|linears/{0}.npy|f0s/{0}.npy\n'.
                format(fid))
    print('Done extracting features!')
    return
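softmax here turns the PPG extractor's per-frame logits into phone posteriors. A minimal numerically-stable sketch, assuming it operates over the last axis:

import numpy as np

def softmax_sketch(logits, axis=-1):
    # Subtract the per-frame max before exponentiating for numerical stability.
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)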
Example no. 8
        bits=params["preprocessing"]["bits"],
        hop_length=params["preprocessing"]["hop_length"],
        nc=args.nc,
        device=device)
    model.to(device)

    print("Load checkpoint from: {}:".format(args.checkpoint))
    checkpoint = torch.load(args.checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["model"])
    model_step = checkpoint["step"]

    wav = load_wav(args.wav_path, params["preprocessing"]["sample_rate"])
    utterance_id = os.path.basename(args.wav_path).split(".")[0]
    wav = wav / np.abs(wav).max() * 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])
    mel = torch.FloatTensor(mel).unsqueeze(0).to(device)  # add batch dimension
    output = model.generate(mel)
    path = os.path.join(
        args.gen_dir,
        "gen_{}_model_steps_{}.wav".format(utterance_id, model_step))
    save_wav(path, output, params["preprocessing"]["sample_rate"])
Example no. 9
parser.add_argument(
    'weight_path',
    help="Path of checkpoint (ex:./result/weights/wavenet_0800)")
args = parser.parse_args()


def synthesize(mel_sp, save_path, weight_path):
    wavenet = WaveNet(hparams.num_mels, hparams.upsample_scales)
    wavenet.load_weights(weight_path)
    mel_sp = tf.expand_dims(mel_sp, axis=0)

    outputs = wavenet.synthesis(mel_sp)
    outputs = np.squeeze(outputs)
    outputs = inv_mulaw_quantize(outputs)

    save_wav(outputs, save_path, hparams.sampling_rate)


if __name__ == '__main__':
    wav = load_wav(args.input_path, hparams.sampling_rate)
    wav = normalize(wav) * 0.95

    mel_sp = melspectrogram(wav,
                            hparams.sampling_rate,
                            hparams.num_mels,
                            n_fft=hparams.n_fft,
                            hop_size=hparams.hop_size,
                            win_size=hparams.win_size)

    synthesize(mel_sp, args.output_path, args.weight_path)
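inv_mulaw_quantize undoes the mu-law quantization applied during preprocessing. A minimal sketch of the standard inverse (the exact mu convention in this codebase is an assumption):

import numpy as np

def inv_mulaw_quantize_sketch(quantized, mu=255):
    # Map integer classes 0..mu back to [-1, 1], then expand the companding.
    x = 2.0 * quantized.astype(np.float64) / mu - 1.0
    return np.sign(x) * ((1.0 + mu) ** np.abs(x) - 1.0) / mu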
Example no. 10
def main():
    hps = Hparams
    parser = argparse.ArgumentParser('VC inference')
    parser.add_argument('--src_wav', type=str, help='source wav file path')
    parser.add_argument('--ckpt', type=str, help='model ckpt path')
    parser.add_argument('--save_dir', type=str, help='synthesized wav save directory')
    args = parser.parse_args()
    # 0. load and pre-emphasize the source wav
    src_wav_arr = load_wav(args.src_wav)
    pre_emphasized_wav = _preemphasize(src_wav_arr)
    # 1. extract ppgs
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(out_dims=hps.Audio.ppg_dim,
                                       n_cnn=ppg_extractor_hps.n_cnn,
                                       cnn_hidden=ppg_extractor_hps.cnn_hidden,
                                       cnn_kernel=ppg_extractor_hps.cnn_kernel,
                                       n_blstm=ppg_extractor_hps.n_blstm,
                                       lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']
    # set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # load saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)
    mfcc_feats = wav2unnormalized_mfcc(src_wav_arr)
    ppg = sess.run(predicted_ppgs,
                   feed_dict={mfcc_pl: np.expand_dims(mfcc_feats, axis=0)})
    sess.close()
    ppg = softmax(np.squeeze(ppg, axis=0))

    # 2. extract lf0, mel-spectrogram
    log_f0 = logf0(args.src_wav)
    log_f0 = lf0_normailze(log_f0)
    # mel-spectrogram is extracted for comparison
    mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T

    # 3. prepare inputs
    min_len = min(log_f0.shape[0], ppg.shape[0])
    vc_inputs = np.concatenate([ppg[:min_len, :], log_f0[:min_len, :]], axis=1)
    vc_inputs = np.expand_dims(vc_inputs, axis=1)  # [time, batch, dim]

    # 4. setup vc model and do the inference
    # in_channels = ppg_dim + 2: PPGs plus the 2-dim lf0 block concatenated above
    model = BLSTMConversionModel(in_channels=hps.Audio.ppg_dim + 2,
                                 out_channels=hps.Audio.num_mels,
                                 lstm_hidden=hps.BLSTMConversionModel.lstm_hidden)
    device = torch.device('cpu')
    model.load_state_dict(torch.load(args.ckpt, map_location=device))
    model.eval()
    predicted_mels = model(torch.tensor(vc_inputs))
    predicted_mels = np.squeeze(predicted_mels.detach().numpy(), axis=1)

    # 5. synthesize wav
    synthesized_wav = inv_preemphasize(inv_mel_spectrogram(predicted_mels.T))
    resynthesized_wav = inv_preemphasize(inv_mel_spectrogram(mel_spec.T))
    ckpt_name = args.ckpt.split('/')[-1].split('.')[0]
    wav_name = args.src_wav.split('/')[-1].split('.')[0]
    save_wav(synthesized_wav, os.path.join(args.save_dir, '{}-{}-converted.wav'.format(wav_name, ckpt_name)))
    save_wav(resynthesized_wav, os.path.join(args.save_dir, '{}-{}-src-resyn.wav'.format(wav_name, ckpt_name)))
    return
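_preemphasize and inv_preemphasize implement the usual first-order pre-emphasis filter and its inverse. A minimal sketch, assuming a coefficient of 0.97 (the project's actual value is not shown):

from scipy import signal

def preemphasize_sketch(wav, coef=0.97):
    # y[n] = x[n] - coef * x[n-1]
    return signal.lfilter([1.0, -coef], [1.0], wav)

def inv_preemphasize_sketch(wav, coef=0.97):
    # Inverse filter: y[n] = x[n] + coef * y[n-1]
    return signal.lfilter([1.0], [1.0, -coef], wav)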