def __init__(self, config):

        self.config = config

        self.filenames = librosa.util.find_files(config['data']['sample_set_dir'])

        self.set = {}

        for file in self.filenames:

            filename = os.path.basename(file)
            if filename[-1] == '/':
                filename = filename[0:-1]

            audio, labels = pp.get_samples_and_labels(filename, config)

            if config['data']['type'] == 'mel':
                # Mel features: store the mel-spectrogram, labels and the
                # linear spectrogram for each file.
                spec = torch.Tensor(audio_utils.wav2spectrogram(audio)).t()
                melspec = torch.Tensor(audio_utils.wav2melspectrogram(audio)).t()

                self.set[filename] = [melspec, labels, spec]
            else:
                # WORLD features: fundamental frequency (f0), aperiodicity (ap),
                # spectral envelope (sp) and mel-cepstral coefficients (coded_sp).
                audio = np.array(audio, dtype=np.float64)

                f0, ap, sp, coded_sp = pw.cal_mcep(audio)
                coded_sp = torch.Tensor(coded_sp.T)
                self.set[filename] = [f0, ap, sp, coded_sp, labels]
def _single_conversion(filename, model, one_hot_emo):
    '''
    THIS WON'T WORK RIGHT NOW, USE THE WORLD CONVERSION LOOP IN MAIN
    
    Call only from __main__ section in this module. Generates sample converted
    into each emotion.

    (str) filename - name.wav file to be converted
    (StarGAN-emo-VC1) model - pretrained model to perform conversion
    (torch.Tensor(long)) one_hot_emo - one hot encoding of emotion to convert to
    '''
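    # Hedged usage sketch (from this module's __main__ block): assumes the
    # module-level `model`, `config` and `device` are already initialised, and
    # the wav name and four-class emotion count below are illustrative only.
    #
    #   one_hot = torch.zeros(4)   # four emotion classes assumed
    #   one_hot[1] = 1.0           # convert into emotion index 1
    #   _single_conversion('example_utterance.wav', model, one_hot)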
    wav, labels = pp.get_wav_and_labels(filename,
                                        config['data']['dataset_dir'])
    wav = np.array(wav, dtype=np.float64)

    f0, ap, sp, coded_sp = preprocess_world.cal_mcep(wav)

    coded_sp = coded_sp.T

    # Add batch and channel dimensions expected by the generator.
    coded_sp_torch = torch.Tensor(coded_sp).unsqueeze(0).unsqueeze(0).to(
        device=device)

    fake = model.G(coded_sp_torch, one_hot_emo.unsqueeze(0))
    fake = fake.squeeze()

    print("Sampled size = ", fake.size())

    converted_sp = fake.cpu().detach().numpy()
    converted_sp = np.array(converted_sp, dtype=np.float64)

    # Crop ap and f0 to the converted length if the generator changed the
    # number of frames.
    sample_length = converted_sp.shape[0]
    if sample_length != ap.shape[0]:
        ap = np.ascontiguousarray(ap[0:sample_length, :], dtype=np.float64)
        f0 = np.ascontiguousarray(f0[0:sample_length], dtype=np.float64)

    # Trim a fixed number of frames from each end before re-synthesis
    # (converted_sp is trimmed by 40 frames against 20 for the other features).
    f0 = np.ascontiguousarray(f0[20:-20], dtype=np.float64)
    ap = np.ascontiguousarray(ap[20:-20, :], dtype=np.float64)
    converted_sp = np.ascontiguousarray(converted_sp[40:-40, :],
                                        dtype=np.float64)

    coded_sp = np.ascontiguousarray(coded_sp[20:-20, :], dtype=np.float64)

    target = int(np.argmax(one_hot_emo))
    out_name = filename[:-4] + str(labels[1]) + "to" + str(target) + ".wav"

    audio_utils.save_world_wav([f0, ap, sp, converted_sp],
                               model.name + '_converted', out_name)

    ########################################
    #        WORLD CONVERSION LOOP         #
    ########################################
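    # For each evaluation file: extract WORLD features, then run the generator
    # once per target emotion (loop over emo_targets below). The saving /
    # re-synthesis step presumably mirrors audio_utils.save_world_wav as used
    # in _single_conversion above.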
    for file_num, f in enumerate(filenames):

        wav, labels = pp.get_wav_and_labels(f, config['data']['dataset_dir'])
        wav = np.array(wav, dtype=np.float64)
        labels = np.array(labels)
        f0_real, ap_real, sp, coded_sp = preprocess_world.cal_mcep(wav)

        # Add batch and channel dimensions expected by the generator.
        coded_sp = coded_sp.T
        coded_sp = torch.Tensor(coded_sp).unsqueeze(0).unsqueeze(0).to(device=device)

        with torch.no_grad():
            for i in range(emo_targets.size(0)):

                f0 = np.copy(f0_real)
                ap = np.copy(ap_real)
                f0 = audio_utils.f0_pitch_conversion(f0, (labels[0],labels[1]),