def generate_world_features(filenames, data_dir):
    """Code for creating and saving world features and sample labels"""

    world_dir = os.path.join(data_dir, 'world')
    f0_dir = os.path.join(data_dir, 'f0')
    labels_dir = os.path.join(data_dir, "labels")

    for d in (world_dir, f0_dir, labels_dir):
        if not os.path.exists(d):
            os.mkdir(d)

    MIN_LENGTH = 0  # actual is 59
    MAX_LENGTH = 1719
    worlds_made = 0

    for i, f in enumerate(filenames):

        wav, labels = get_wav_and_labels(f, data_dir)
        wav = np.array(wav, dtype=np.float64)
        labels = np.array(labels)

        coded_sp_name = os.path.join(world_dir, f[:-4] + ".npy")
        label_name = os.path.join(labels_dir, f[:-4] + ".npy")
        f0_name = os.path.join(f0_dir, f[:-4] + ".npy")
        if os.path.exists(coded_sp_name) and os.path.exists(
                label_name) and os.path.exists(f0_name):
            worlds_made += 1
            continue

        # Skip the sample if its emotion label is invalid (-1)
        if labels[0] != -1:
            f0, ap, sp, coded_sp = cal_mcep(wav)

            # Skip the sample if the sequence is too long
            if coded_sp.shape[1] < MAX_LENGTH:

                np.save(coded_sp_name, coded_sp)
                np.save(label_name, labels)
                np.save(f0_name, f0)

                worlds_made += 1

        if i % 10 == 0:
            print(i, " complete.")
            print(worlds_made, "worlds made.")
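

# A minimal usage sketch (not part of the original script): build the WORLD
# feature cache for a dataset. The 'processed_data' root and its audio/
# subfolder are assumptions about the directory layout.
#
# if __name__ == '__main__':
#     data_dir = 'processed_data'
#     audio_dir = os.path.join(data_dir, 'audio')
#     wav_files = [f for f in os.listdir(audio_dir) if f.endswith('.wav')]
#     generate_world_features(wav_files, data_dir)
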
def _single_conversion(filename, model, one_hot_emo):
    """
    THIS WON'T WORK RIGHT NOW, USE THE WORLD CONVERSION LOOP IN MAIN.

    Call only from the __main__ section of this module. Generates the given
    sample converted into each emotion.

    (str) filename - name of the .wav file to be converted
    (StarGAN-emo-VC1) model - pretrained model used to perform the conversion
    (torch.Tensor(long)) one_hot_emo - one-hot encoding of the target emotion
    """
    wav, labels = pp.get_wav_and_labels(filename,
                                        config['data']['dataset_dir'])
    wav = np.array(wav, dtype=np.float64)

    f0, ap, sp, coded_sp = pw.cal_mcep(wav)

    coded_sp = coded_sp.T

    coded_sp_torch = torch.Tensor(coded_sp).unsqueeze(0).unsqueeze(0).to(
        device=device)

    fake = model.G(coded_sp_torch, one_hot_emo.unsqueeze(0))
    fake = fake.squeeze()

    print("Sampled size = ", fake.size())

    converted_sp = fake.cpu().detach().numpy()
    converted_sp = np.array(converted_sp, dtype=np.float64)

    # If the generator changed the sequence length, crop ap and f0 to match
    sample_length = converted_sp.shape[0]
    if sample_length != ap.shape[0]:
        ap = np.ascontiguousarray(ap[0:sample_length, :], dtype=np.float64)
        f0 = np.ascontiguousarray(f0[0:sample_length], dtype=np.float64)

    # Trim edge frames equally from every stream so they stay time-aligned
    f0 = np.ascontiguousarray(f0[20:-20], dtype=np.float64)
    ap = np.ascontiguousarray(ap[20:-20, :], dtype=np.float64)
    converted_sp = np.ascontiguousarray(converted_sp[20:-20, :],
                                        dtype=np.float64)

    coded_sp = np.ascontiguousarray(coded_sp[20:-20, :], dtype=np.float64)

    target = int(np.argmax(one_hot_emo))
    out_name = filename[:-4] + str(labels[1]) + "to" + str(target) + ".wav"

    audio_utils.save_world_wav([f0, ap, sp, converted_sp], out_name)
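

# Hypothetical smoke test for this helper ('example.wav' and the loaded
# `model` are placeholders): convert a single file into emotion index 2.
#
# one_hot = torch.zeros(config['model']['num_classes'])
# one_hot[2] = 1.0
# _single_conversion('example.wav', model, one_hot)
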
def generate_f0_stats(filenames, data_dir):
    """Compute absolute f0 statistics per (emotion, speaker) pair and the
    relative emotion-to-emotion differences, then pickle both dictionaries."""

    NUM_SPEAKERS = 10
    NUM_EMOTIONS = 4
    f0_dir = os.path.join(data_dir, 'f0')

    # CALCULATE ABSOLUTE F0 STATS

    emo_stats = {}
    for e in range(NUM_EMOTIONS):
        spk_dict = {}
        for s in range(NUM_SPEAKERS):
            f0s = []
            for f in filenames:
                # Only the labels are needed here; the audio is discarded
                _, labels = get_wav_and_labels(f, data_dir)
                labels = np.array(labels)
                if labels[0] == e and labels[1] == s:
                    f0_file = os.path.join(f0_dir, f[:-4] + ".npy")
                    if os.path.exists(f0_file):
                        f0 = np.load(f0_file)
                        f0s.append(f0)

            log_f0_mean, f0_std = get_f0_stats(f0s)
            spk_dict[s] = (log_f0_mean, f0_std)
            print(f"Done emotion {e}, speaker {s}.")
        emo_stats[e] = spk_dict

    with open('f0_dict.pkl', 'wb') as absolute_file:
        pickle.dump(emo_stats, absolute_file, pickle.HIGHEST_PROTOCOL)

    print(" ---- Absolute f0 stats completed ----")

    for emo, spk_stats in emo_stats.items():
        print(f'Emotion {emo} stats:')
        for spk, (mean, std) in spk_stats.items():
            print(f'Speaker {spk}: log-f0 mean = {mean}, std = {std}')
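
    # Hypothetical consumer sketch: reload the pickled stats and look up the
    # (log-f0 mean, f0 std) pair for one emotion/speaker combination.
    # emotion_idx and speaker_idx are placeholder names.
    #
    # with open('f0_dict.pkl', 'rb') as stats_file:
    #     stats = pickle.load(stats_file)
    # log_f0_mean, f0_std = stats[emotion_idx][speaker_idx]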

    # CALCULATE RELATIVE F0 STATS

    emo2emo_dict = {}

    for e1 in range(NUM_EMOTIONS):

        emo2emo_dict[e1] = {}

        for e2 in range(NUM_EMOTIONS):

            mean_list = []
            std_list = []

            for s in range(NUM_SPEAKERS):
                mean_diff = emo_stats[e2][s][0] - emo_stats[e1][s][0]
                std_diff = emo_stats[e2][s][1] - emo_stats[e1][s][1]
                mean_list.append(mean_diff)
                std_list.append(std_diff)

            mean_mean = np.mean(mean_list)
            std_mean = np.mean(std_list)
            emo2emo_dict[e1][e2] = (mean_mean, std_mean)

    print(" ---- Relative f0 stats completed ----")
    for emo, pair_stats in emo2emo_dict.items():
        print(f'Emotion {emo} stats:')
        for trg, (mean_diff, std_diff) in pair_stats.items():
            print(f'-> Emotion {trg}: mean diff = {mean_diff}, std diff = {std_diff}')

    with open('f0_relative_dict.pkl', 'wb') as relative_file:
        pickle.dump(emo2emo_dict, relative_file, pickle.HIGHEST_PROTOCOL)
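

# A hedged sketch (not shown in this module) of how the relative stats could
# drive f0 conversion at inference time: shift the source log-f0 by the
# stored source->target mean offset. The src_emo/trg_emo indices, the
# voiced-frame masking, and ignoring the std term are all assumptions.
#
# with open('f0_relative_dict.pkl', 'rb') as rel_file:
#     emo2emo_dict = pickle.load(rel_file)
# mean_shift, std_shift = emo2emo_dict[src_emo][trg_emo]
# voiced = f0 > 0
# f0_converted = f0.copy()
# f0_converted[voiced] = np.exp(np.log(f0[voiced]) + mean_shift)
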
            filenames.append(f)

        print("Converting sample set.")
    else:

        data_dir = os.path.join(config['data']['dataset_dir'], "audio")

        print("Data directory = ", data_dir)
        files = find_files(data_dir, ext='.wav')

        label_dir = os.path.join(config['data']['dataset_dir'], 'labels')
        num_emos = config['model']['num_classes']

        # filenames = [f + ".wav" for f in files]
        # NOTE: the filtered list built here is immediately superseded by the
        # shuffled train/test split below
        filenames = [
            f for f in files if -1 < pp.get_wav_and_labels(
                f, config['data']['dataset_dir'])[1][0] < num_emos
        ]
        filenames = [
            os.path.join(config['data']['dataset_dir'], f) for f in filenames
        ][:10]

        files = my_dataset.shuffle(files)

        train_test_split = config['data']['train_test_split']
        split_index = int(len(files) * train_test_split)
        filenames = files[split_index:]

        print("Converting 10 random test set samples.")
        print(filenames)
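
    # emo_targets is not defined in this snippet; a plausible construction
    # (an assumption) is one one-hot row per emotion class:
    # emo_targets = torch.eye(config['model']['num_classes'])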
    # for one_hot in emo_targets:
    #     _single_conversion(filenames[0], model, one_hot)
Example 5
    #
    # n, bins, patches = plt.hist(lengths, bins = 22)
    # plt.xlabel('Sequence length')
    # plt.ylabel('Count')
    # plt.title(r'New histogram of sequence lengths for 4 emotional categories')
    # plt.show()
    ############################################
    #            Generate f0_dict              #
    ############################################
    emo_stats = {}
    for e in range(4):  # emotion categories
        spk_dict = {}
        for s in range(10):  # speakers
            f0s = []
            for f in filenames:
                # Only the labels are needed here; the audio is discarded
                _, labels = pp.get_wav_and_labels(f, data_dir)
                labels = np.array(labels)
                if labels[0] == e and labels[1] == s:
                    f0_dir = data_dir + "/f0/" + f[:-4] + ".npy"
                    f0 = np.load(f0_dir)
                    f0s.append(f0)

            log_f0_mean, f0_std = get_f0_stats(f0s)
            spk_dict[s] = (log_f0_mean, f0_std)
            print(f"Done emotion {e}, speaker {s}.")
        emo_stats[e] = spk_dict

    with open('f0_dict.pkl', 'wb') as dict_file:
        pickle.dump(emo_stats, dict_file, pickle.HIGHEST_PROTOCOL)