Example #1
def run(content_fname,
        style_fname,
        output_fname,
        n_fft=2048,
        hop_length=256,
        alpha=0.02,
        n_layers=1,
        n_filters=8192,
        k_w=15,
        stride=1,
        iterations=300,
        phase_iterations=500,
        sr=22050,
        signal_length=1,  # second
        block_length=1024):

    content, sr = read_audio_spectum(
        content_fname, n_fft=n_fft, hop_length=hop_length, sr=sr)
    style, sr = read_audio_spectum(
        style_fname, n_fft=n_fft, hop_length=hop_length, sr=sr)

    n_frames = min(content.shape[0], style.shape[0])
    n_samples = content.shape[1]
    content = content[:n_frames, :]
    style = style[:n_frames, :]

    content_features, style_features, kernels = compute_features(
        content=content,
        style=style,
        stride=stride,
        n_layers=n_layers,
        n_filters=n_filters,
        k_w=k_w)

    result = compute_stylization(
        kernels=kernels,
        n_samples=n_samples,
        n_frames=n_frames,
        content_features=content_features,
        style_features=style_features,
        stride=stride,
        n_layers=n_layers,
        alpha=alpha,
        iterations=iterations)

    mags = np.zeros_like(content.T)
    mags[:, :n_frames] = np.exp(result[0, 0].T) - 1

    # Griffin-Lim style phase recovery: start from random phase and refine it
    # by repeatedly resynthesizing and re-analyzing while keeping |S| fixed.
    p = 2 * np.pi * np.random.random_sample(mags.shape) - np.pi
    for i in range(phase_iterations):
        S = mags * np.exp(1j * p)
        x = librosa.istft(S, hop_length=hop_length)
        p = np.angle(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))

    # Note: librosa.output was removed in librosa 0.8; on newer versions,
    # write with soundfile.write(fname, data, sr) instead.
    librosa.output.write_wav('prelimiter.wav', x, sr)
    limited = utils.limiter(x)
    librosa.output.write_wav(output_fname, limited, sr)
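
Example #1 depends on a read_audio_spectum helper that is not shown. Judging from the np.exp(result) - 1 inversion at the end of run, it most likely returns a log1p-compressed magnitude spectrogram transposed to (frames, bins); a minimal sketch under that assumption:

import librosa
import numpy as np

def read_audio_spectum(fname, n_fft=2048, hop_length=256, sr=22050):
    # Hypothetical reconstruction: load audio, take the STFT magnitude,
    # and log-compress it so np.exp(...) - 1 recovers linear magnitudes.
    y, sr = librosa.load(fname, sr=sr)
    S = np.log1p(np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length)))
    return S.T, sr  # (frames, bins), matching content.shape[0] == n_frames above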
Example #2
def run(content_fname,
        style_fname,
        output_fname,
        n_fft=4096,
        n_layers=1,
        n_filters=4096,
        hop_length=256,
        alpha=0.05,
        k_w=15,
        k_h=3,
        optimizer='bfgs',
        stride=1,
        iterations=300,
        sr=22050):

    frame_size = n_fft // 2

    audio, fs = librosa.load(content_fname, sr=sr)
    content = chop(audio, hop_size=hop_length, frame_size=frame_size)
    audio, fs = librosa.load(style_fname, sr=sr)
    style = chop(audio, hop_size=hop_length, frame_size=frame_size)

    n_frames = min(content.shape[0], style.shape[0])
    n_samples = min(content.shape[1], style.shape[1])
    content = content[:n_frames, :n_samples]
    style = style[:n_frames, :n_samples]

    content_features, style_gram, kernels, freqs = compute_features(
        content=content,
        style=style,
        stride=stride,
        n_fft=n_fft,
        n_layers=n_layers,
        n_filters=n_filters,
        k_w=k_w,
        k_h=k_h)

    result = compute_stylization(kernels=kernels,
                                 freqs=freqs,
                                 n_samples=n_samples,
                                 n_frames=n_frames,
                                 n_fft=n_fft,
                                 content_features=content_features,
                                 style_gram=style_gram,
                                 stride=stride,
                                 n_layers=n_layers,
                                 alpha=alpha,
                                 optimizer=optimizer,
                                 iterations=iterations)

    s = unchop(result, hop_size=hop_length, frame_size=frame_size)
    librosa.output.write_wav(output_fname, s, sr=sr)
    # also write a peak-limited copy alongside the raw stylization
    s = utils.limiter(s)
    librosa.output.write_wav(output_fname + '.limiter.wav', s, sr=sr)
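
Examples #2 and #3 frame the signal with chop and reassemble it with unchop; neither helper is shown. A plain rectangular-window sketch of what they plausibly do (the real helpers may apply a synthesis window and normalize the overlap-add):

import numpy as np

def chop(signal, hop_size=256, frame_size=512):
    # Slice a 1-D signal into overlapping frames, shaped (n_frames, frame_size).
    n_frames = (len(signal) - frame_size) // hop_size
    return np.array([signal[i * hop_size:i * hop_size + frame_size]
                     for i in range(n_frames)])

def unchop(frames, hop_size=256, frame_size=512):
    # Overlap-add the frames back into a 1-D signal. Without a synthesis
    # window, overlapping regions are amplified by roughly frame_size / hop_size.
    out = np.zeros((len(frames) - 1) * hop_size + frame_size)
    for i, frame in enumerate(frames):
        out[i * hop_size:i * hop_size + frame_size] += frame
    return out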
Example #3
def run(content_fname,
        style_fname,
        output_path,
        model,
        iterations=100,
        sr=16000,
        hop_size=512,
        frame_size=2048,
        alpha=1e-3):

    content, fs = librosa.load(content_fname, sr=sr)
    style, fs = librosa.load(style_fname, sr=sr)
    # truncate both signals to their common length, rounded to a whole number of hops
    n_samples = (min(content.shape[0], style.shape[0]) // hop_size) * hop_size
    content = utils.chop(content[:n_samples], hop_size, frame_size)
    style = utils.chop(style[:n_samples], hop_size, frame_size)

    if model == 'encoder':
        content_features, style_features = compute_wavenet_encoder_features(
            content=content, style=style)
        result = compute_wavenet_encoder_stylization(
            n_frames=content_features[0].shape[0],
            n_samples=frame_size,
            alpha=alpha,
            content_features=content_features,
            style_features=style_features,
            iterations=iterations)
    elif model == 'decoder':
        content_features, style_features = compute_wavenet_decoder_features(
            content=content, style=style)
        result = compute_wavenet_decoder_stylization(
            n_frames=content_features[0].shape[0],
            n_samples=frame_size,
            alpha=alpha,
            content_features=content_features,
            style_features=style_features,
            iterations=iterations)
    else:
        raise ValueError('Unsupported model type: {}.'.format(model))

    x = utils.unchop(result, hop_size, frame_size)
    librosa.output.write_wav('prelimiter.wav', x, sr)

    limited = utils.limiter(x)
    output_fname = '{}/{}+{}.wav'.format(output_path,
                                         content_fname.split('/')[-1],
                                         style_fname.split('/')[-1])
    librosa.output.write_wav(output_fname, limited, sr=sr)
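
All three examples pass the result through utils.limiter before the final write so the stylized audio does not clip. Its implementation is not shown; a minimal stand-in is a peak normalizer (a real limiter would track an attack/release gain envelope rather than rescaling globally):

import numpy as np

def limiter(signal, threshold=0.99):
    # Hypothetical stand-in: rescale the whole signal only if it would clip.
    peak = np.max(np.abs(signal))
    if peak > threshold:
        signal = signal * (threshold / peak)
    return signal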