Example #1
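This example assumes the imports below and a module-level BPM constant; the aliases (ar, th, rosa, np) match how the code uses them, but the exact module paths and the BPM value are assumptions and must be adapted to the project and track.

import argparse
import gc
import os
import random
import time
import uuid
import warnings

import numpy as np
import torch as th
import librosa as rosa

import audioreactive as ar  # project helpers: laplacian_segmentation, spline_loops, gaussian_filter, ... (path assumed)
import render               # project rendering module (path assumed)
# get_noise_range() and load_generator() are further project helpers defined elsewhere.

BPM = 130  # placeholder tempo; must match the track being rendered
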
def get_latents(selection, args):
    # expand envelopes to latent shape
    rms = args.rms[:, None, None]
    low_onsets = args.kick_onsets[:, None, None]
    high_onsets = args.snare_onsets[:, None, None]

    # get timestamps and labels with laplacian segmentation
    # k is the number of labels the algorithm may use
    # try multiple values with plot=True to see which value correlates best with the sections of the song
    timestamps, labels = ar.laplacian_segmentation(args.audio, args.sr, k=7)

    # a second set of latents for the drop section; 'selection' holds the set used for the intro
    drop_selection = ar.load_latents("workspace/cyphept_kelp_drop_latents.npy")
    color_layer = 9

    latents = []
    for (start, stop), l in zip(zip(timestamps, timestamps[1:]), labels):
        start_frame = int(round(start / args.duration * args.n_frames))
        stop_frame = int(round(stop / args.duration * args.n_frames))
        section_frames = stop_frame - start_frame
        section_bars = (stop - start) * (BPM / 60) / 4

        # get portion of latent selection (wrapping around to start)
        latent_selection_slice = ar.wrapping_slice(selection, l, 4)
        # spline interpolation loops through selection slice
        latent_section = ar.spline_loops(latent_selection_slice,
                                         n_frames=section_frames,
                                         n_loops=section_bars / 4)
        # set the color using the laplacian segmentation label (one latent repeated across the section's upper layers)
        latent_section[:, color_layer:] = th.cat(
            [selection[[l], color_layer:]] * section_frames)

        # same as above but for the drop latents (with faster loops)
        drop_selection_slice = ar.wrapping_slice(drop_selection, l, 4)
        drop_section = ar.spline_loops(drop_selection_slice,
                                       n_frames=section_frames,
                                       n_loops=section_bars / 2)
        drop_section[:, color_layer:] = th.cat(
            [drop_selection[[l], color_layer:]] * section_frames)

        # blend the two versions based on RMS: louder parts lean towards the drop latents
        latents.append((1 - rms[start_frame:stop_frame]) * latent_section +
                       rms[start_frame:stop_frame] * drop_section)

    # pad the latents to the correct total length, concatenate, and smooth over the section boundaries
    len_latents = sum([len(l) for l in latents])
    if len_latents != args.n_frames:
        latents.append(
            th.cat([latents[-1][[-1]]] * (args.n_frames - len_latents)))
    latents = th.cat(latents).float()
    latents = ar.gaussian_filter(latents, 3)

    # use drum onsets to pull the latents towards fixed latent vectors on kicks and snares
    latents = 0.666 * low_onsets * selection[[2]] + (
        1 - 0.666 * low_onsets) * latents
    latents = 0.666 * high_onsets * selection[[1]] + (
        1 - 0.666 * high_onsets) * latents

    latents = ar.gaussian_filter(latents, 1, causal=0.2)
    return latents
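
get_latents() reads per-frame envelopes from args.rms, args.kick_onsets, and args.snare_onsets, which are expected to be prepared beforehand (for example in the initialize() callback that generate() accepts). A minimal sketch of such an initialize() is shown below, using librosa directly rather than the project's ar.* helpers; the envelope() helper and the 200 Hz / 500 Hz band limits are illustrative assumptions.

def initialize(args):
    def envelope(feature):
        # resample an audio-rate feature to one value per video frame,
        # clip negatives, and normalize to [0, 1]
        frames = np.interp(np.linspace(0, 1, args.n_frames),
                           np.linspace(0, 1, len(feature)), feature)
        frames = np.clip(frames, 0, None)
        return th.from_numpy(frames / (frames.max() + 1e-8)).float()

    # loudness envelope used to blend between the intro and drop latents
    args.rms = envelope(rosa.feature.rms(y=args.audio, hop_length=512)[0])

    # onset strength restricted to low/high frequency bands as rough kick/snare proxies
    args.kick_onsets = envelope(rosa.onset.onset_strength(y=args.audio, sr=args.sr, fmax=200))
    args.snare_onsets = envelope(rosa.onset.onset_strength(y=args.audio, sr=args.sr, fmin=500))

    return args
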
def generate(
    ckpt,
    audio_file,
    initialize=None,
    get_latents=None,
    get_noise=None,
    get_bends=None,
    get_rewrites=None,
    get_truncation=None,
    output_dir="./output",
    audioreactive_file="audioreactive/examples/default.py",
    offset=0,
    duration=-1,
    latent_file=None,
    shuffle_latents=False,
    G_res=1024,
    out_size=1024,
    fps=30,
    batch=8,
    dataparallel=False,
    truncation=1.0,
    stylegan1=False,
    noconst=False,
    latent_dim=512,
    n_mlp=8,
    channel_multiplier=2,
    randomize_noise=False,
    ffmpeg_preset="slow",
    base_res_factor=1,
    output_file=None,
    args=None,
):
    # if args is empty (i.e. generate() called directly instead of through __main__)
    # create args Namespace with all locally available variables
    if args is None:
        kwargs = locals()
        args = argparse.Namespace()
        for k, v in kwargs.items():
            setattr(args, k, v)

    # ensures smoothing is independent of frame rate
    ar.set_SMF(args.fps / 30)

    time_taken = time.time()
    th.set_grad_enabled(False)

    audio_dur = rosa.get_duration(filename=audio_file)
    if duration == -1 or audio_dur < duration:
        duration = audio_dur

    n_frames = int(round(duration * fps))
    args.duration = duration
    args.n_frames = n_frames

    if not os.path.exists(f"{audio_file}.npy"):
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="PySoundFile failed. Trying audioread instead.")
            audio, sr = rosa.load(audio_file, offset=offset, duration=duration)
        print(f"sample rate: {sr}")
        np.save(f"{audio_file}.npy", audio)
    else:
        audio = np.load(f"{audio_file}.npy")
        sr = 22050  # librosa's default sample rate, matching the cached load above
    args.audio = audio
    args.sr = sr

    if initialize is not None:
        args = initialize(args)

    # ====================================================================================
    # =========================== generate audiovisual latents ===========================
    # ====================================================================================
    print("\ngenerating latents...")
    if get_latents is None:
        from audioreactive.default import get_latents

    if latent_file is not None:
        latent_selection = ar.load_latents(latent_file)
    else:
        latent_selection = ar.generate_latents(12, ckpt, G_res, noconst,
                                               latent_dim, n_mlp,
                                               channel_multiplier)
    if shuffle_latents:
        random_indices = random.sample(range(len(latent_selection)),
                                       len(latent_selection))
        latent_selection = latent_selection[random_indices]
    np.save("workspace/last-latents.npy", latent_selection.numpy())

    latents = get_latents(selection=latent_selection, args=args).cpu()

    print(f"{list(latents.shape)} amplitude={latents.std()}\n")

    # ====================================================================================
    # ============================ generate audiovisual noise ============================
    # ====================================================================================
    print("generating noise...")
    if get_noise is None:
        from audioreactive.default import get_noise

    noise = []
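    # get_noise_range() is a project helper defined elsewhere; judging from its use
    # below, it returns the first and last noise scale indices for the generator and
    # a function mapping a scale index to the exponent of its resolution.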
    range_min, range_max, exponent = get_noise_range(out_size, G_res,
                                                     stylegan1)
    for scale in range(range_min, range_max):
        h = (2 if out_size == 1080 else 1) * 2**exponent(scale)
        w = (2 if out_size == 1920 else 1) * 2**exponent(scale)

        noise.append(
            get_noise(height=h,
                      width=w,
                      scale=scale - range_min,
                      num_scales=range_max - range_min,
                      args=args))

        if noise[-1] is not None:
            print(list(noise[-1].shape), f"amplitude={noise[-1].std()}")
        gc.collect()
        th.cuda.empty_cache()
    print()

    # ====================================================================================
    # ================ generate audiovisual network bending manipulations ================
    # ====================================================================================
    if get_bends is not None:
        print("generating network bends...")
        bends = get_bends(args=args)
    else:
        bends = []

    # ====================================================================================
    # ================ generate audiovisual model rewriting manipulations ================
    # ====================================================================================
    if get_rewrites is not None:
        print("generating model rewrites...")
        rewrites = get_rewrites(args=args)
    else:
        rewrites = {}

    # ====================================================================================
    # ========================== generate audiovisual truncation =========================
    # ====================================================================================
    if get_truncation is not None:
        print("generating truncation...")
        truncation = get_truncation(args=args)
    else:
        truncation = float(truncation)

    # ====================================================================================
    # ==== render the given (latent, noise, bends, rewrites, truncation) interpolation ===
    # ====================================================================================
    gc.collect()
    th.cuda.empty_cache()

    generator = load_generator(
        ckpt=ckpt,
        is_stylegan1=stylegan1,
        G_res=G_res,
        out_size=out_size,
        noconst=noconst,
        latent_dim=latent_dim,
        n_mlp=n_mlp,
        channel_multiplier=channel_multiplier,
        dataparallel=dataparallel,
        base_res_factor=base_res_factor,
    )

    print(f"\npreprocessing took {time.time() - time_taken:.2f}s\n")

    print(f"rendering {n_frames} frames...")
    if output_file is None:
        checkpoint_title = ckpt.split("/")[-1].split(".")[0].lower()
        track_title = audio_file.split("/")[-1].split(".")[0].lower()
        output_file = f"{output_dir}/{track_title}_{checkpoint_title}_{uuid.uuid4().hex[:8]}.mp4"
    render.render(
        generator=generator,
        latents=latents,
        noise=noise,
        audio_file=audio_file,
        offset=offset,
        duration=duration,
        batch_size=batch,
        truncation=truncation,
        bends=bends,
        rewrites=rewrites,
        out_size=out_size,
        output_file=output_file,
        randomize_noise=randomize_noise,
        ffmpeg_preset=ffmpeg_preset,
    )

    print(f"\ntotal time taken: {(time.time() - time_taken)/60:.2f} minutes")
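
For completeness, a hypothetical invocation wiring the custom get_latents() (and the initialize() sketched above) into generate(); the checkpoint and audio paths are placeholders.

if __name__ == "__main__":
    generate(
        ckpt="checkpoints/cyphept.pt",        # placeholder checkpoint path
        audio_file="audio/cyphept_kelp.wav",  # placeholder audio file
        initialize=initialize,
        get_latents=get_latents,
        fps=30,
        out_size=1024,
        output_dir="./output",
    )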