def infer_random(self, gender=False):
        # Sample a random source/target pair; the fixed IDs below override it.

        if gender:
            source_sp = random.sample(self.speakers_man, 1)[0]
            target_sp = random.sample(self.speakers_woman, 1)[0]
        else:
            source_sp, target_sp = random.sample(self.speakers, 2)

        source_utt = random.sample(self.indexes[source_sp], 1)[0]
        target_utt = random.sample(self.indexes[target_sp], 1)[0]

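        # Override the random picks with a fixed speaker/utterance pair so
        # repeated runs convert the same utterances.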
        source_sp = '3436'
        target_sp = '6064'
        source_utt = '172162_000012_000003'
        target_utt = '300880_000029_000002'

        # source_sp = '226'
        # target_sp = '238'
        # source_utt = '001'
        # target_utt = '001'
        display = False

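        # Outputs are grouped by validation set, checkpoint iteration, and the
        # source/target pair being converted.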
        output_image_path = os.path.join(
            self.args.output_image_path, self.args.val_set,
            self.args.load_iteration,
            's{}_{}---t{}_{}'.format(source_sp, source_utt, target_sp,
                                     target_utt))
        output_audio_path = os.path.join(
            self.args.output_audio_path, self.args.val_set,
            self.args.load_iteration,
            's{}_{}---t{}_{}'.format(source_sp, source_utt, target_sp,
                                     target_utt))

        os.makedirs(output_image_path, exist_ok=True)
        os.makedirs(output_audio_path, exist_ok=True)

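        # Load the original (unpadded) mel-spectrograms for the chosen
        # utterances from the preprocessed dataset.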
        source_spec_ori = self.dataset[f'{source_sp}/{source_utt}/mel'][:]
        target_spec_ori = self.dataset[f'{target_sp}/{target_utt}/mel'][:]

        print('source time {} | target time {}'.format(source_spec_ori.shape,
                                                       target_spec_ori.shape))

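        # pad_seq pads each mel to a multiple of `b` frames and returns the pad
        # length (presumably so the model's temporal down/upsampling lengths match).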
        b = 8
        source_spec, source_len_pad = self.pad_seq(source_spec_ori, base=b)
        target_spec, target_len_pad = self.pad_seq(target_spec_ori, base=b)

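        # cc() is assumed to move tensors onto the configured device (e.g. GPU).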
        source_spec = cc(torch.from_numpy(source_spec))
        target_spec = cc(torch.from_numpy(target_spec))

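        # In this variant the speaker-reference input is simply the same padded mel.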
        source_dspec = source_spec
        target_dspec = target_spec

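        # Keep copies of the original waveforms for listening comparison
        # (wav2wav is assumed to read the source file and rewrite it to the output path).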
        wav2wav(self.speaker2filenames[source_sp][source_utt],
                os.path.join(output_audio_path, 'src_ori.wav'))
        wav2wav(self.speaker2filenames[target_sp][target_utt],
                os.path.join(output_audio_path, 'tar_ori.wav'))
        # synthesis from the original mels (copy-synthesis reference)
        self.plot_spectrograms(source_spec_ori,
                               os.path.join(output_image_path, 'src_sync.png'))
        self.wave_generate(source_spec_ori,
                           os.path.join(output_audio_path, 'src_sync'))

        self.plot_spectrograms(target_spec_ori,
                               os.path.join(output_image_path, 'tar_sync.png'))
        self.wave_generate(target_spec_ori,
                           os.path.join(output_audio_path, 'tar_sync'))

        # self-reconstruction: each utterance decoded with its own speaker reference
        source_spec_rec = self.inference_one_utterance(source_spec,
                                                       source_dspec,
                                                       source_len_pad,
                                                       pic_path=os.path.join(
                                                           output_image_path,
                                                           's_indices.txt'))
        self.plot_spectrograms(source_spec_rec,
                               os.path.join(output_image_path, 'src_rec.png'))
        self.wave_generate(source_spec_rec,
                           os.path.join(output_audio_path, 'src_rec'))

        target_spec_rec = self.inference_one_utterance(target_spec,
                                                       target_dspec,
                                                       target_len_pad,
                                                       pic_path=os.path.join(
                                                           output_image_path,
                                                           't_indices.txt'))
        self.plot_spectrograms(target_spec_rec,
                               os.path.join(output_image_path, 'tar_rec.png'))
        self.wave_generate(target_spec_rec,
                           os.path.join(output_audio_path, 'tar_rec'))

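        # L1 distance between each reconstruction and its original mel, as a
        # quick sanity check on reconstruction quality.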
        criterion = nn.L1Loss()
        loss_src_rec = criterion(torch.from_numpy(source_spec_rec),
                                 torch.from_numpy(source_spec_ori))
        loss_trg_rec = criterion(torch.from_numpy(target_spec_rec),
                                 torch.from_numpy(target_spec_ori))
        print('Source Rec Loss: {} | Target Rec Loss: {}'.format(
            loss_src_rec, loss_trg_rec))

        # cross-speaker conversion: source content with the target's reference, and vice versa
        s2t_spec = self.inference_one_utterance(source_spec,
                                                target_dspec,
                                                len_pad=source_len_pad)
        self.plot_spectrograms(s2t_spec,
                               os.path.join(output_image_path, 's2t.png'))
        self.wave_generate(s2t_spec, os.path.join(output_audio_path, 's2t'))

        t2s_spec = self.inference_one_utterance(target_spec,
                                                source_dspec,
                                                len_pad=target_len_pad)
        self.plot_spectrograms(t2s_spec,
                               os.path.join(output_image_path, 't2s.png'))
        self.wave_generate(t2s_spec, os.path.join(output_audio_path, 't2s'))

        print('Complete...')

        return
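
    # NOTE: if both definitions live in the same class, this second
    # infer_random shadows the one above at class-definition time.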
    def infer_random(self, gender=False):
        # Sample a random source/target pair; for the out-of-set split it is
        # overridden with a fixed pair below.

        if gender:
            source_sp = random.sample(self.speakers_man, 1)[0]
            target_sp = random.sample(self.speakers_woman, 1)[0]
        else:
            source_sp, target_sp = random.sample(self.speakers, 2)
        
        source_utt = random.sample(self.indexes[source_sp], 1)[0]
        target_utt = random.sample(self.indexes[target_sp], 1)[0]

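        # For the out-of-set split, always evaluate the same fixed
        # speaker/utterance pair for reproducibility.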
        if self.args.val_set == 'out_test':
            source_sp = '237'
            target_sp = '225'
            source_utt = '007'
            target_utt = '008'
        # source_sp = '227'
        # target_sp = '312'
        # source_utt = '099'
        # target_utt = '285'
        display = False

        output_image_path = os.path.join(
            self.args.output_image_path, self.args.val_set, self.args.load_iteration,
            's{}_{}---t{}_{}'.format(source_sp, source_utt, target_sp, target_utt))
        output_audio_path = os.path.join(
            self.args.output_audio_path, self.args.val_set, self.args.load_iteration,
            's{}_{}---t{}_{}'.format(source_sp, source_utt, target_sp, target_utt))

        os.makedirs(output_image_path, exist_ok=True)
        os.makedirs(output_audio_path, exist_ok=True)
        
        source_spec_ori = self.dataset[f'{source_sp}/{source_utt}/mel'][:]
        target_spec_ori = self.dataset[f'{target_sp}/{target_utt}/mel'][:]

        print('source time {} | target time {}'.format(source_spec_ori.shape, target_spec_ori.shape))
        
        
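        # AutoVC is assumed to require lengths padded to a multiple of 32
        # frames; the other models use 8.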
        b = 32 if self.args.model_type == 'AutoVC' else 8
        source_spec, source_len_pad = self.pad_seq(source_spec_ori, base=b)
        target_spec, target_len_pad = self.pad_seq(target_spec_ori, base=b)

        source_spec = cc(torch.from_numpy(source_spec))
        target_spec = cc(torch.from_numpy(target_spec))

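        # AdaVAEd is assumed to take segment-level d-mel slices as its speaker
        # reference; the other models reuse the padded mel.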
        if self.args.model_type == 'AdaVAEd':
            target_slices = self.dataset[f'{target_sp}/{target_utt}/dmels'][:]
            source_slices = self.dataset[f'{source_sp}/{source_utt}/dmels'][:]
            target_dspec = self.dataset[f'{target_sp}/{target_utt}/dmel'][:]
            source_dspec = self.dataset[f'{source_sp}/{source_utt}/dmel'][:]
            target_dspec = cc(torch.from_numpy(
                np.array([target_dspec[t0:t1] for t0, t1 in target_slices])))
            source_dspec = cc(torch.from_numpy(
                np.array([source_dspec[t0:t1] for t0, t1 in source_slices])))
        else:
            source_dspec = source_spec
            target_dspec = target_spec

        wav2wav(self.speaker2filenames[source_sp][source_utt], os.path.join(output_audio_path, 'src_ori.wav'))
        wav2wav(self.speaker2filenames[target_sp][target_utt], os.path.join(output_audio_path, 'tar_ori.wav'))
        # synthesis from the original mels (copy-synthesis reference)
        self.plot_spectrograms(source_spec_ori, os.path.join(output_image_path, 'src_sync.png'))
        self.wave_generate(source_spec_ori, os.path.join(output_audio_path, 'src_sync'))

        self.plot_spectrograms(target_spec_ori, os.path.join(output_image_path, 'tar_sync.png'))
        self.wave_generate(target_spec_ori, os.path.join(output_audio_path, 'tar_sync'))

        # self-reconstruction: each utterance decoded with its own speaker reference
        source_spec_rec = self.inference_one_utterance(source_spec, source_dspec, source_len_pad)
        self.plot_spectrograms(source_spec_rec, os.path.join(output_image_path, 'src_rec.png'))
        self.wave_generate(source_spec_rec, os.path.join(output_audio_path, 'src_rec'))

        target_spec_rec = self.inference_one_utterance(target_spec, target_dspec, target_len_pad)
        self.plot_spectrograms(target_spec_rec, os.path.join(output_image_path, 'tar_rec.png'))
        self.wave_generate(target_spec_rec, os.path.join(output_audio_path, 'tar_rec'))

        criterion = nn.L1Loss()
        loss_src_rec = criterion(torch.from_numpy(source_spec_rec), torch.from_numpy(source_spec_ori))
        loss_trg_rec = criterion(torch.from_numpy(target_spec_rec), torch.from_numpy(target_spec_ori))
        print('Source Rec Loss: {} | Target Rec Loss: {}'.format(loss_src_rec, loss_trg_rec))
        
        # cross-speaker conversion: source content with the target's reference, and vice versa
        s2t_spec = self.inference_one_utterance(source_spec, target_dspec, len_pad=source_len_pad)
        self.plot_spectrograms(s2t_spec, os.path.join(output_image_path, 's2t.png'))
        self.wave_generate(s2t_spec, os.path.join(output_audio_path, 's2t'))

        t2s_spec = self.inference_one_utterance(target_spec, source_dspec, len_pad=target_len_pad)
        self.plot_spectrograms(t2s_spec, os.path.join(output_image_path, 't2s.png'))
        self.wave_generate(t2s_spec, os.path.join(output_audio_path, 't2s'))
        
        print('Complete...')

        return