def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--feats", default=None, required=True,
                        help="name of the list of hdf5 files")
    parser.add_argument("--stats", default=None, required=True,
                        help="filename of hdf5 format")
    args = parser.parse_args()

    # read list and define scaler
    filenames = read_txt(args.feats)
    scaler = StandardScaler()
    print("number of training utterances =", len(filenames))

    # process over all of data
    for filename in filenames:
        feat = read_hdf5(filename, "/feat_org")
        scaler.partial_fit(feat[:, 1:])

    # add uv term
    mean = np.zeros((feat.shape[1]))
    scale = np.ones((feat.shape[1]))
    mean[1:] = scaler.mean_
    scale[1:] = scaler.scale_

    # write to hdf5
    write_hdf5(args.stats, "/mean", mean)
    write_hdf5(args.stats, "/scale", scale)
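# read_txt above is a small project helper that loads the utterance list; the
# following is only a minimal sketch of its assumed behavior (one HDF5 path per
# non-empty line of a plain-text file), not the project's actual implementation.
def read_txt(list_path):
    """Assumed behavior: return the non-empty lines of a plain-text list file."""
    with open(list_path) as f:
        return [line.strip() for line in f if line.strip()]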
def melcepstrum_extract(wav_list, args):
    """EXTRACT MEL CEPSTRUM"""
    # define feature extractor
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile and apply low cut filter
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # check sampling frequency
        if not fs == args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # extract features
        shiftl = int(args.shiftms * fs * 0.001)
        mcep = stft_mcep(x, args.fftl, shiftl, args.mcep_dim, args.mcep_alpha)

        # save to hdf5
        hdf5name = args.hdf5dir + "/" + os.path.basename(wav_name).replace(".wav", ".h5")
        write_hdf5(hdf5name, "/mcep", np.float32(mcep))

        # overwrite wav file
        if args.highpass_cutoff != 0 and args.save_wav:
            wavfile.write(args.wavdir + "/" + os.path.basename(wav_name), fs, np.int16(x))
def world_feature_extract(wav_list, args):
    """EXTRACT WORLD FEATURE VECTOR"""
    # define feature extractor
    feature_extractor = FeatureExtractor(analyzer="world",
                                         fs=args.fs,
                                         shiftms=args.shiftms,
                                         minf0=args.minf0,
                                         maxf0=args.maxf0,
                                         fftl=args.fftl)
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile and apply low cut filter
        fs, x = wavfile.read(wav_name)
        x = np.array(x, dtype=np.float32)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # check sampling frequency
        if not fs == args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # extract features
        f0, _, _ = feature_extractor.analyze(x)
        uv, cont_f0 = convert_continuos_f0(f0)
        cont_f0_lpf = low_pass_filter(cont_f0, int(1.0 / (args.shiftms * 0.001)), cutoff=20)
        codeap = feature_extractor.codeap()
        mcep = feature_extractor.mcep(dim=args.mcep_dim, alpha=args.mcep_alpha)

        # concatenate
        cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
        uv = np.expand_dims(uv, axis=-1)
        feats = np.concatenate([uv, cont_f0_lpf, mcep, codeap], axis=1)

        # save to hdf5
        hdf5name = args.hdf5dir + "/" + os.path.basename(wav_name).replace(".wav", ".h5")
        write_hdf5(hdf5name, "/feat_org", feats)
        if args.save_extended:
            # extend time resolution
            upsampling_factor = int(args.shiftms * fs * 0.001)
            feats_extended = extend_time(feats, upsampling_factor)
            feats_extended = feats_extended.astype(np.float32)
            write_hdf5(hdf5name, "/feat", feats_extended)

        # overwrite wav file
        if args.highpass_cutoff != 0:
            wavfile.write(args.wavdir + "/" + os.path.basename(wav_name), fs, np.int16(x))
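# The "/feat_org" matrix written above stores one row per frame, with columns
# ordered as in the np.concatenate call: uv flag, low-pass-filtered continuous
# F0, mel-cepstrum, coded aperiodicity. Below is only a minimal read-back
# sketch using plain h5py; the mcep width is a placeholder assumption (it
# depends on args.mcep_dim), not a value taken from the original script.
import h5py

def split_world_feats(h5_path, mcep_width=25):
    # column layout: [uv | cont_f0_lpf | mcep (mcep_width cols) | codeap (rest)]
    with h5py.File(h5_path, "r") as f:
        feats = f["/feat_org"][()]
    uv = feats[:, 0:1]
    cont_f0_lpf = feats[:, 1:2]
    mcep = feats[:, 2:2 + mcep_width]
    codeap = feats[:, 2 + mcep_width:]
    return uv, cont_f0_lpf, mcep, codeap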
def feature_extract(wav_list):
    # NOTE: this variant relies on module-level `args` and `feature_extractor`
    # being defined before it is called.
    for wav_name in wav_list:
        # load wavfile and apply low cut filter
        fs, x = wavfile.read(wav_name)
        x = np.array(x, dtype=np.float32)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # check sampling frequency
        if not fs == args.fs:
            print("ERROR: sampling frequency is not matched.")
            sys.exit(1)

        # extract features
        f0, spc, ap = feature_extractor.analyze(x)
        uv, cont_f0 = convert_continuos_f0(f0)
        cont_f0_lpf = low_pass_filter(cont_f0, int(1.0 / (args.shiftms * 0.001)), cutoff=20)
        codeap = feature_extractor.codeap()
        mcep = feature_extractor.mcep(dim=args.mcep_dim, alpha=args.mcep_alpha)

        # concatenate
        cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
        uv = np.expand_dims(uv, axis=-1)
        feats = np.concatenate([uv, cont_f0_lpf, mcep, codeap], axis=1)

        # extend time resolution
        upsampling_factor = int(args.shiftms * fs * 0.001)
        feats_extended = extend_time(feats, upsampling_factor)

        # save to hdf5
        feats_extended = feats_extended.astype(np.float32)
        hdf5name = args.hdf5dir + "/" + os.path.basename(wav_name).replace(".wav", ".h5")
        write_hdf5(hdf5name, "/feat_org", feats)
        write_hdf5(hdf5name, "/feat", feats_extended)

        # overwrite wav file
        if args.highpass_cutoff != 0:
            wavfile.write(args.wavdir + "/" + os.path.basename(wav_name), fs, np.int16(x))
def calc_stats(file_list, args):
    """CALCULATE STATISTICS"""
    scaler = StandardScaler()

    # process over all of data
    for i, filename in enumerate(file_list):
        logging.info("now processing %s (%d/%d)" % (filename, i + 1, len(file_list)))
        feat = read_hdf5(filename, "/" + args.feature_type)
        scaler.partial_fit(feat)

    # keep the uv term unnormalized for world features (mean 0, scale 1)
    mean = scaler.mean_
    scale = scaler.scale_
    if args.feature_type == "world":
        mean[0] = 0.0
        scale[0] = 1.0

    # write to hdf5
    write_hdf5(args.stats, "/" + args.feature_type + "/mean", np.float32(mean))
    write_hdf5(args.stats, "/" + args.feature_type + "/scale", np.float32(scale))
def calc_stats(file_list, args):
    """CALCULATE STATISTICS"""
    scaler = StandardScaler()

    # process over all of data
    for i, filename in enumerate(file_list):
        logging.info("now processing %s (%d/%d)" % (filename, i + 1, len(file_list)))
        feat = read_hdf5(filename, "/%s" % args.feature_type)
        scaler.partial_fit(feat[:, 1:])

    # add uv term
    mean = np.zeros((feat.shape[1]))
    scale = np.ones((feat.shape[1]))
    mean[1:] = scaler.mean_
    scale[1:] = scaler.scale_

    # write to hdf5
    write_hdf5(args.stats, "/%s/mean" % args.feature_type, mean)
    write_hdf5(args.stats, "/%s/scale" % args.feature_type, scale)
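# A minimal usage sketch (not part of the original scripts) showing how the
# statistics written by calc_stats would typically be applied to normalize a
# feature matrix, reading the datasets back with plain h5py.
import h5py

def normalize_feat(feat, stats_path, feature_type="world"):
    with h5py.File(stats_path, "r") as f:
        mean = f["/%s/mean" % feature_type][()]
        scale = f["/%s/scale" % feature_type][()]
    # the uv column keeps mean 0 / scale 1, so it passes through unchanged
    return (feat - mean) / scale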
def melspectrogram_extract(wav_list, args):
    """EXTRACT MEL SPECTROGRAM"""
    # define feature extractor
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile and apply low cut filter
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # check sampling frequency
        if not fs == args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # extract features
        x_norm = x / (np.iinfo(np.int16).max + 1)
        shiftl = int(args.shiftms * fs * 0.001)
        mspc = librosa.feature.melspectrogram(x_norm, fs,
                                              n_fft=args.fftl,
                                              hop_length=shiftl,
                                              n_mels=args.mspc_dim,
                                              power=1.0)
        mspc = np.log10(np.maximum(EPS, mspc.T))

        # save to hdf5
        hdf5name = args.hdf5dir + "/" + os.path.basename(wav_name).replace(".wav", ".h5")
        write_hdf5(hdf5name, "/melspc", np.float32(mspc))

        # overwrite wav file
        if args.highpass_cutoff != 0 and args.save_wav:
            wavfile.write(args.wavdir + "/" + os.path.basename(wav_name), fs, np.int16(x))
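# Minimal sketch (illustration only, not from the original script): loading the
# "/melspc" dataset written above with plain h5py and undoing the log10
# compression. Values clipped at EPS are not exactly recoverable.
import h5py
import numpy as np

def load_melspc(h5_path):
    with h5py.File(h5_path, "r") as f:
        log_mspc = f["/melspc"][()]   # (n_frames, n_mels), log10 mel magnitude
    return np.power(10.0, log_mspc)   # back to linear mel magnitude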
def save_DRIVE_to_h5py(train_test, num, config):
    images_path = config.get('DRIVE', train_test + '_images_path')
    labels_path = config.get('DRIVE', train_test + '_labels_path')
    masks_path = config.get('DRIVE', train_test + '_masks_path')
    height = int(config.get('DRIVE', 'height'))
    width = int(config.get('DRIVE', 'width'))

    images = np.empty((num, height, width, 3), dtype=np.float32)
    labels = np.empty((num, height, width, 1), dtype=np.float32)
    masks = np.empty((num, height, width, 1), dtype=np.float32)

    files = os.listdir(images_path)
    for i in range(len(files)):
        # read i-th image.
        images[i] = np.asarray(Image.open(images_path + files[i]))
        # read corresponding label.
        label_name = files[i][0:2] + "_manual1.gif"
        labels[i] = np.asarray(Image.open(labels_path + label_name)).reshape((height, width, 1))
        # read corresponding mask.
        mask_name = ""
        if train_test == "train":
            mask_name = files[i][0:2] + "_training_mask.gif"
        elif train_test == "test":
            mask_name = files[i][0:2] + "_test_mask.gif"
        masks[i] = np.asarray(Image.open(masks_path + mask_name)).reshape((height, width, 1))

    # Check data.
    print('DRIVE', train_test)
    print('images', images.shape, images.dtype, np.min(images), np.max(images))
    print('labels', labels.shape, labels.dtype, np.min(labels), np.max(labels))
    print('masks', masks.shape, masks.dtype, np.min(masks), np.max(masks))

    # Visualize datasets to check integrity (disabled).
    # visualize(group_images(images, 5), save_path='./logs/DRIVE_' + train_test + '_images.png')
    # visualize(group_images(labels, 5), save_path='./logs/DRIVE_' + train_test + '_labels.png')
    # visualize(group_images(masks, 5), save_path='./logs/DRIVE_' + train_test + '_masks.png')
    # visualize(group_images(images, 4)).show()
    # visualize(group_images(labels, 4)).show()
    # visualize(group_images(masks, 4)).show()

    save_path = config.get('DRIVE', 'h5py_save_path')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    write_hdf5(images, save_path + train_test + '_images' + '.hdf5')
    write_hdf5(labels, save_path + train_test + '_labels' + '.hdf5')
    write_hdf5(masks, save_path + train_test + '_masks' + '.hdf5')
def save_IVDM_to_h5py(train_test, num, config):
    images_path = config.get('IVDM', train_test + '_images_path')
    labels_path = config.get('IVDM', train_test + '_labels_path')
    masks_path = config.get('IVDM', train_test + '_masks_path')
    height = int(config.get('IVDM', 'height'))
    width = int(config.get('IVDM', 'width'))

    # create empty arrays of the given shape and dtype
    images = np.empty((num, height, width, 1), dtype=np.float32)
    labels = np.empty((num, height, width, 1), dtype=np.float32)
    masks = np.empty((num, height, width, 1), dtype=np.float32)

    files = os.listdir(images_path)
    for i in range(len(files)):
        # read i-th lumbar spine image
        images[i] = np.asarray(Image.open(images_path + files[i])).reshape((height, width, 1))
        # read corresponding label
        label_name = files[i][0:3] + "_manual.png"
        labels[i] = np.asarray(Image.open(labels_path + label_name)).reshape((height, width, 1))
        # read corresponding mask
        mask_name = ""
        if train_test == "train":
            mask_name = files[i][0:3] + "_training_mask.png"
        elif train_test == "test":
            mask_name = files[i][0:3] + "_test_mask.png"
        masks[i] = np.asarray(Image.open(masks_path + mask_name)).reshape((height, width, 1))

    # print data info
    print('IVDM', train_test)
    print('images', images.shape, images.dtype, np.min(images), np.max(images))
    print('labels', labels.shape, labels.dtype, np.min(labels), np.max(labels))
    print('masks', masks.shape, masks.dtype, np.min(masks), np.max(masks))

    # save as .hdf5 files
    save_path = config.get('IVDM', 'h5py_save_path')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    write_hdf5(images, save_path + train_test + '_images' + '.hdf5')
    write_hdf5(labels, save_path + train_test + '_labels' + '.hdf5')
    write_hdf5(masks, save_path + train_test + '_masks' + '.hdf5')
def save_CHASEDB_to_h5py(train_test, num, config):
    images_path = config.get('CHASEDB', train_test + '_images_path')
    labels_path = config.get('CHASEDB', train_test + '_labels_path')
    masks_path = config.get('CHASEDB', train_test + '_masks_path')
    height = int(config.get('CHASEDB', 'height'))
    width = int(config.get('CHASEDB', 'width'))

    images = np.empty((num, height, width, 3), dtype=np.float32)
    labels = np.empty((num, height, width, 1), dtype=np.float32)
    masks = np.empty((num, height, width, 1), dtype=np.float32)

    files = os.listdir(images_path)
    for i in range(len(files)):
        # read i-th image.
        images[i] = np.asarray(Image.open(images_path + files[i]))
        # read corresponding label.
        label_name = files[i][:9] + "_1stHO.png"
        labels[i] = np.asarray(Image.open(labels_path + label_name)).reshape((height, width, 1))
        # read corresponding mask.
        mask_name = 'mask_' + files[i][6:9] + '.png'
        masks[i] = np.asarray(Image.open(masks_path + mask_name)).reshape((height, width, 1))

    # Check data.
    print('CHASEDB', train_test)
    print('images', images.shape, images.dtype, np.min(images), np.max(images))
    print('labels', labels.shape, labels.dtype, np.min(labels), np.max(labels))
    print('masks', masks.shape, masks.dtype, np.min(masks), np.max(masks))

    save_path = config.get('CHASEDB', 'h5py_save_path')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    write_hdf5(images, save_path + train_test + '_images' + '.hdf5')
    write_hdf5(labels, save_path + train_test + '_labels' + '.hdf5')
    write_hdf5(masks, save_path + train_test + '_masks' + '.hdf5')
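# The write_hdf5 helper used by these dataset scripts takes (data, filename)
# and its internal dataset key is not shown here, so a quick integrity check
# can simply enumerate whatever keys each file contains. Minimal sketch with
# plain h5py (illustration only).
import h5py

def inspect_h5(path):
    with h5py.File(path, "r") as f:
        for key in f.keys():
            data = f[key][()]
            print(key, data.shape, data.dtype, data.min(), data.max())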
# draw random x/y rotations within the configured range (degrees -> radians)
rotations[n, k, 0] = (np.random.random()
                      * (config['render_max_x_rotation'] + abs(config['render_min_x_rotation']))
                      - abs(config['render_min_x_rotation'])) / 180. * math.pi
rotations[n, k, 1] = (np.random.random()
                      * (config['render_max_y_rotation'] + abs(config['render_min_y_rotation']))
                      - abs(config['render_min_y_rotation'])) / 180. * math.pi

mesh.rotate(rotations[n, k])
mesh.translate(mesh_center)

np_vertices = mesh.vertices.astype(np.float64)
np_faces = mesh.faces.astype(np.float64)
np_faces += 1

depth_map, mask, img = pyrender.render(np_vertices.T.copy(), np_faces.T.copy(), intrinsics, znf, size)
depth_maps[n][k] = depth_map

print('[Data] rendered %s %d/%d' % (off_files[n], (n + 1), n_files))

utils.write_hdf5(angles_file, rotations)
print('[Data] wrote %s' % angles_file)
utils.write_hdf5(depth_file, depth_maps)
print('[Data] wrote %s' % depth_file)
waited = True
print('[Data] waiting for %s' % depth_file)
time.sleep(10)

# Wait for synchronization.
if waited:
    time.sleep(10)

try:
    # Sometimes signature of HDF5 files is still not available.
    depths = utils.read_hdf5(depth_file)
except IOError:
    print('[Data] could not read %s' % depth_file)
    time.sleep(5)
    # Try again, now it can really fail if file is not there.
    depths = utils.read_hdf5(depth_file)

timer.reset()
tsdf = fusion(depths, Rs)
tsdf = tsdf[0]

utils.write_hdf5(tsdf_file, tsdf)
print('[Data] wrote %s (%f seconds)' % (tsdf_file, timer.elapsed()))

vertices, triangles = mcubes.marching_cubes(-tsdf, 0)
vertices /= config['watertight_fusion']['resolution']
vertices -= 0.5
mcubes.export_off(vertices, triangles, off_file)
print('[Data] wrote %s (%f seconds)' % (off_file, timer.elapsed()))
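# The read-with-retry pattern above could be wrapped in a small helper; this is
# only a minimal sketch assuming the same utils.read_hdf5 interface, not code
# from the original project.
import time

def read_hdf5_with_retry(path, retries=2, delay=5):
    for _ in range(retries):
        try:
            return utils.read_hdf5(path)
        except IOError:
            print('[Data] could not read %s, retrying' % path)
            time.sleep(delay)
    # final attempt; raises if the file really is unreadable
    return utils.read_hdf5(path)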
def decode_RNN(wav_list, gpu, cvlist=None, cvlist_src=None, \ mcd_cvlist_src=None, mcdstd_cvlist_src=None, mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None): with torch.cuda.device(gpu): mean_trg = torch.FloatTensor( read_hdf5(args.stats_jnt, "/mean_feat_org_lf0")[config.stdim:]).cuda() std_trg = torch.FloatTensor( read_hdf5(args.stats_jnt, "/scale_feat_org_lf0")[config.stdim:]).cuda() # define model and load parameters logging.info(config) logging.info("model") with torch.no_grad(): model_encoder = GRU_RNN_STOCHASTIC( in_dim=config.in_dim, out_dim=config.lat_dim, hidden_layers=config.hidden_layers, hidden_units=config.hidden_units, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, arparam=config.arparam, spk_dim=n_spk, causal_conv=config.causal_conv, scale_out_flag=False) model_decoder = GRU_RNN(in_dim=config.lat_dim + n_spk, out_dim=config.out_dim, hidden_layers=config.hidden_layers, hidden_units=config.hidden_units, kernel_size=config.kernel_size_dec, dilation_size=config.dilation_size_dec, causal_conv=config.causal_conv, scale_in_flag=False) logging.info(model_encoder) logging.info(model_decoder) model_encoder.load_state_dict( torch.load(args.model)["model_encoder"]) model_decoder.load_state_dict( torch.load(args.model)["model_decoder"]) model_encoder.cuda() model_decoder.cuda() model_encoder.eval() model_decoder.eval() for param in model_encoder.parameters(): param.requires_grad = False for param in model_decoder.parameters(): param.requires_grad = False if config.arparam: init_pp = np.zeros((1, 1, config.lat_dim * 2 + n_spk)) else: init_pp = np.zeros((1, 1, config.lat_dim + n_spk)) y_in_pp = torch.FloatTensor(init_pp).cuda() y_in_src = y_in_trg = torch.unsqueeze( torch.unsqueeze((0 - mean_trg) / std_trg, 0), 0) fs = args.fs fft_size = args.fftl mcep_dim = model_decoder.out_dim - 1 for wav_file in wav_list: # convert mcep feat_file = os.path.join( args.h5outdir, os.path.basename(wav_file).replace(".wav", ".h5")) logging.info("cvmcep " + feat_file + " " + wav_file) fs, x = read_wav(wav_file, cutoff=70) time_axis, f0, sp, ap = analyze_range(x, fs=fs, minf0=args.minf0, maxf0=args.maxf0, \ fperiod=args.shiftms, fftl=args.fftl) logging.info(sp.shape) mcep = ps.sp2mc(sp, mcep_dim, args.mcep_alpha) logging.info(mcep.shape) codeap = pw.code_aperiodicity(ap, fs) logging.info(codeap.shape) npow = spc2npow(sp) logging.info(npow.shape) _, spcidx = extfrm(mcep, npow, power_threshold=args.pow) spcidx = spcidx[0] logging.info(spcidx.shape) uv, contf0 = convert_continuos_f0(np.array(f0)) uv = np.expand_dims(uv, axis=-1) logging.info(uv.shape) cont_f0_lpf = low_pass_filter(contf0, int(1.0 / (args.shiftms * 0.001)), cutoff=LP_CUTOFF) logcontf0 = np.expand_dims(np.log(cont_f0_lpf), axis=-1) logging.info(logcontf0.shape) feat = np.c_[uv, logcontf0, codeap, mcep] logging.info(feat.shape) logging.info("generate") with torch.no_grad(): lat_feat_src, _, _, _, _ = \ model_encoder(torch.FloatTensor(feat).cuda(), y_in_pp, sampling=False) src_code = np.zeros((lat_feat_src.shape[0], n_spk)) src_code[:, src_code_idx] = 1 src_code = torch.FloatTensor(src_code).cuda() trg_code = np.zeros((lat_feat_src.shape[0], n_spk)) trg_code[:, trg_code_idx] = 1 trg_code = torch.FloatTensor(trg_code).cuda() cvmcep_src, _, _ = model_decoder( torch.cat((src_code, lat_feat_src), 1), y_in_src) cvmcep_src = np.array(cvmcep_src.cpu().data.numpy(), dtype=np.float64) cvmcep, _, _ = model_decoder( torch.cat((trg_code, lat_feat_src), 1), y_in_trg) cvmcep = np.array(cvmcep.cpu().data.numpy(), dtype=np.float64) 
logging.info(lat_feat_src.shape) logging.info(cvmcep_src.shape) logging.info(cvmcep.shape) cvf0 = convert_f0(f0, f0_range_mean_src, f0_range_std_src, f0_range_mean_trg, f0_range_std_trg) uv_cv, contf0_cv = convert_continuos_f0(np.array(cvf0)) uv_cv = np.expand_dims(uv_cv, axis=-1) logging.info(uv_cv.shape) cont_f0_lpf_cv = low_pass_filter(contf0_cv, int(1.0 / (args.shiftms * 0.001)), cutoff=LP_CUTOFF) logcontf0_cv = np.expand_dims(np.log(cont_f0_lpf_cv), axis=-1) logging.info(logcontf0_cv.shape) feat_cv = np.c_[uv_cv, logcontf0_cv, codeap] logging.info(feat_cv.shape) feat_cvmcep = np.c_[feat_cv, cvmcep] logging.info(feat_cvmcep.shape) write_path = '/feat_cvmcep_cycvae-' + model_epoch logging.info(feat_file + ' ' + write_path) write_hdf5(feat_file, write_path, feat_cvmcep) cvlist.append(np.var(cvmcep[:, 1:], axis=0)) _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[np.array(spcidx),:], dtype=np.float64), \ np.array(cvmcep_src[np.array(spcidx),:], dtype=np.float64)) _, mcd_arr = dtw.calc_mcd(np.array(mcep[np.array(spcidx),1:], dtype=np.float64), \ np.array(cvmcep_src[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_src_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_src_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) mcdpow_cvlist_src.append(mcdpow_mean) mcdpowstd_cvlist_src.append(mcdpow_std) mcd_cvlist_src.append(mcd_mean) mcdstd_cvlist_src.append(mcd_std) cvlist_src.append(np.var(cvmcep_src[:, 1:], axis=0)) logging.info("synth voco") cvsp = ps.mc2sp(np.ascontiguousarray(cvmcep), args.mcep_alpha, fft_size) logging.info(cvsp.shape) wav = np.clip( pw.synthesize(cvf0, cvsp, ap, fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join( args.outdir, os.path.basename(wav_file).replace(".wav", "_cv.wav")) sf.write(wavpath, wav, fs, 'PCM_16') logging.info(wavpath) logging.info("synth anasyn") wav = np.clip( pw.synthesize(f0, sp, ap, fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join( args.outdir, os.path.basename(wav_file).replace(".wav", "_anasyn.wav")) sf.write(wavpath, wav, fs, 'PCM_16') logging.info(wavpath)
def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None, \ mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None, \ mcd_cvlist_dv=None, mcdstd_cvlist_dv=None, \ f0rmse_cvlist=None, f0corr_cvlist=None, caprmse_cvlist=None, \ f0rmse_cvlist_dv=None, f0corr_cvlist_dv=None, caprmse_cvlist_dv=None, \ cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None, \ mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None, \ mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None, \ f0rmse_cvlist_cyc=None, f0corr_cvlist_cyc=None, caprmse_cvlist_cyc=None, \ f0rmse_cvlist_cyc_dv=None, f0corr_cvlist_cyc_dv=None, caprmse_cvlist_cyc_dv=None): with torch.cuda.device(gpu): # define model and load parameters with torch.no_grad(): model_encoder_mcep = GRU_VAE_ENCODER( in_dim=config.mcep_dim+config.excit_dim, n_spk=n_spk, lat_dim=config.lat_dim, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=config.bi_enc, cont=False, pad_first=True, right_size=config.right_size, ar=config.ar_enc) logging.info(model_encoder_mcep) model_decoder_mcep = GRU_SPEC_DECODER( feat_dim=config.lat_dim, out_dim=config.mcep_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_dec, hidden_units=config.hidden_units_dec, kernel_size=config.kernel_size_dec, dilation_size=config.dilation_size_dec, causal_conv=config.causal_conv_dec, bi=config.bi_dec, spkidtr_dim=config.spkidtr_dim, pad_first=True, ar=config.ar_dec) logging.info(model_decoder_mcep) model_encoder_excit = GRU_VAE_ENCODER( in_dim=config.mcep_dim+config.excit_dim, n_spk=n_spk, lat_dim=config.lat_dim_e, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=config.bi_enc, cont=False, pad_first=True, right_size=config.right_size, ar=config.ar_enc) logging.info(model_encoder_excit) model_decoder_excit = GRU_EXCIT_DECODER( feat_dim=config.lat_dim_e, cap_dim=config.cap_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_lf0, hidden_units=config.hidden_units_lf0, kernel_size=config.kernel_size_lf0, dilation_size=config.dilation_size_lf0, causal_conv=config.causal_conv_lf0, bi=config.bi_lf0, spkidtr_dim=config.spkidtr_dim, pad_first=True, ar=config.ar_f0) logging.info(model_decoder_excit) model_vq = torch.nn.Embedding(config.ctr_size, config.lat_dim) logging.info(model_vq) model_encoder_mcep.load_state_dict(torch.load(args.model)["model_encoder_mcep"]) model_decoder_mcep.load_state_dict(torch.load(args.model)["model_decoder_mcep"]) model_encoder_excit.load_state_dict(torch.load(args.model)["model_encoder_excit"]) model_decoder_excit.load_state_dict(torch.load(args.model)["model_decoder_excit"]) model_vq.load_state_dict(torch.load(args.model)["model_vq"]) model_encoder_mcep.cuda() model_decoder_mcep.cuda() model_encoder_excit.cuda() model_decoder_excit.cuda() model_vq.cuda() model_encoder_mcep.eval() model_decoder_mcep.eval() model_encoder_excit.eval() model_decoder_excit.eval() model_vq.eval() for param in model_encoder_mcep.parameters(): param.requires_grad = False for param in model_decoder_mcep.parameters(): param.requires_grad = False for param in model_encoder_excit.parameters(): param.requires_grad = False for param in model_decoder_excit.parameters(): 
param.requires_grad = False for param in model_vq.parameters(): param.requires_grad = False if config.ar_enc: yz_in = torch.zeros((1, 1, n_spk+config.lat_dim)).cuda() yz_in_e = torch.zeros((1, 1, n_spk+config.lat_dim_e)).cuda() if config.ar_dec or config.ar_f0: mean_stats = torch.FloatTensor(read_hdf5(config.stats, "/mean_"+config.string_path.replace("/",""))) scale_stats = torch.FloatTensor(read_hdf5(config.stats, "/scale_"+config.string_path.replace("/",""))) if config.ar_dec: x_in = ((torch.zeros((1, 1, config.mcep_dim))-mean_stats[config.excit_dim:])/scale_stats[config.excit_dim:]).cuda() if config.ar_f0: e_in = torch.cat((torch.zeros(1,1,1), (torch.zeros(1,1,1)-mean_stats[1:2])/scale_stats[1:2], \ torch.zeros(1,1,1), (torch.zeros(1,1,config.cap_dim)-mean_stats[3:config.excit_dim])/scale_stats[3:config.excit_dim]), 2).cuda() count = 0 pad_left = (model_encoder_mcep.pad_left + model_decoder_mcep.pad_left)*2 pad_right = (model_encoder_mcep.pad_right + model_decoder_mcep.pad_right)*2 outpad_lefts = [None]*3 outpad_rights = [None]*3 outpad_lefts[0] = pad_left-model_encoder_mcep.pad_left outpad_rights[0] = pad_right-model_encoder_mcep.pad_right outpad_lefts[1] = outpad_lefts[0]-model_decoder_mcep.pad_left outpad_rights[1] = outpad_rights[0]-model_decoder_mcep.pad_right outpad_lefts[2] = outpad_lefts[1]-model_encoder_mcep.pad_left outpad_rights[2] = outpad_rights[1]-model_encoder_mcep.pad_right for feat_file in feat_list: # convert mcep logging.info("recmcep " + feat_file) feat_org = read_hdf5(feat_file, "/feat_mceplf0cap") logging.info(feat_org.shape) with torch.no_grad(): feat = F.pad(torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1,2), (pad_left,pad_right), "replicate").transpose(1,2) if config.ar_enc: spk_logits, lat_src, _, _ = model_encoder_mcep(feat, yz_in=yz_in) spk_logits_e, lat_src_e, _, _ = model_encoder_excit(feat, yz_in=yz_in) else: spk_logits, lat_src, _ = model_encoder_mcep(feat) spk_logits_e, lat_src_e, _ = model_encoder_excit(feat) idx_vq = nn_search_batch(lat_src, model_vq.weight) lat_src = model_vq(idx_vq) if outpad_rights[0] > 0: unique, counts = np.unique(idx_vq[:,outpad_lefts[0]:-outpad_rights[0]].cpu().data.numpy(), return_counts=True) else: unique, counts = np.unique(idx_vq[:,outpad_lefts[0]:].cpu().data.numpy(), return_counts=True) logging.info("input vq") logging.info(dict(zip(unique, counts))) idx_vq_e = nn_search_batch(lat_src_e, model_vq.weight) lat_src_e = model_vq(idx_vq_e) if outpad_rights[0] > 0: unique, counts = np.unique(idx_vq_e[:,outpad_lefts[0]:-outpad_rights[0]].cpu().data.numpy(), return_counts=True) else: unique, counts = np.unique(idx_vq_e[:,outpad_lefts[0]:].cpu().data.numpy(), return_counts=True) logging.info("input vq_e") logging.info(dict(zip(unique, counts))) logging.info('input spkpost') if outpad_rights[0] > 0: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:], dim=-1), 1)) logging.info('input spkpost_e') if outpad_rights[0] > 0: logging.info(torch.mean(F.softmax(spk_logits_e[:,outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits_e[:,outpad_lefts[0]:], dim=-1), 1)) src_code = (torch.ones((1, lat_src.shape[1]))*spk_idx).cuda().long() if config.ar_dec: cvmcep_src, _, _ = model_decoder_mcep(src_code, lat_src, x_in=x_in) else: cvmcep_src, _ = model_decoder_mcep(src_code, lat_src) if config.ar_f0: cvlf0_src, _, _ = model_decoder_excit(src_code, lat_src_e, e_in=e_in) else: 
cvlf0_src, _ = model_decoder_excit(src_code, lat_src_e) cv_feat = torch.cat((cvlf0_src, cvmcep_src), 2) if config.ar_enc: spk_logits, lat_rec, _, _ = model_encoder_mcep(cv_feat, yz_in=yz_in) spk_logits_e, lat_rec_e, _, _ = model_encoder_excit(cv_feat, yz_in=yz_in) else: spk_logits, lat_rec, _ = model_encoder_mcep(cv_feat) spk_logits_e, lat_rec_e, _ = model_encoder_excit(cv_feat) idx_vq = nn_search_batch(lat_rec, model_vq.weight) lat_rec = model_vq(idx_vq) if outpad_rights[2] > 0: unique, counts = np.unique(idx_vq[:,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), return_counts=True) else: unique, counts = np.unique(idx_vq[:,outpad_lefts[2]:].cpu().data.numpy(), return_counts=True) logging.info("input vq") logging.info(dict(zip(unique, counts))) idx_vq_e = nn_search_batch(lat_rec_e, model_vq.weight) lat_rec_e = model_vq(idx_vq_e) if outpad_rights[2] > 0: unique, counts = np.unique(idx_vq_e[:,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), return_counts=True) else: unique, counts = np.unique(idx_vq_e[:,outpad_lefts[2]:].cpu().data.numpy(), return_counts=True) logging.info("input vq_e") logging.info(dict(zip(unique, counts))) logging.info('rec spkpost') if outpad_rights[2] > 0: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[2]:-outpad_rights[2]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[2]:], dim=-1), 1)) logging.info('rec spkpost_e') if outpad_rights[2] > 0: logging.info(torch.mean(F.softmax(spk_logits_e[:,outpad_lefts[2]:-outpad_rights[2]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits_e[:,outpad_lefts[2]:], dim=-1), 1)) src_code = (torch.ones((1, lat_rec.shape[1]))*spk_idx).cuda().long() if config.ar_dec: cvmcep_cyc, _, _ = model_decoder_mcep(src_code, lat_rec, x_in=x_in) else: cvmcep_cyc, _ = model_decoder_mcep(src_code, lat_rec) if config.ar_f0: cvlf0_cyc, _, _ = model_decoder_excit(src_code, lat_rec_e, e_in=e_in) else: cvlf0_cyc, _ = model_decoder_excit(src_code, lat_rec_e) if outpad_rights[1] > 0: cvmcep_src = cvmcep_src[:,outpad_lefts[1]:-outpad_rights[1]] cvlf0_src = cvlf0_src[:,outpad_lefts[1]:-outpad_rights[1]] else: cvmcep_src = cvmcep_src[:,outpad_lefts[1]:] cvlf0_src = cvlf0_src[:,outpad_lefts[1]:] feat_rec = torch.cat((torch.round(cvlf0_src[:,:,:1]), cvlf0_src[:,:,1:2], \ torch.round(cvlf0_src[:,:,2:3]), cvlf0_src[:,:,3:], cvmcep_src), \ 2)[0].cpu().data.numpy() feat_cyc = torch.cat((torch.round(cvlf0_cyc[:,:,:1]), cvlf0_cyc[:,:,1:2], \ torch.round(cvlf0_cyc[:,:,2:3]), cvlf0_cyc[:,:,3:], cvmcep_cyc), \ 2)[0].cpu().data.numpy() cvmcep_src = np.array(cvmcep_src[0].cpu().data.numpy(), dtype=np.float64) cvlf0_src = np.array(cvlf0_src[0].cpu().data.numpy(), dtype=np.float64) cvmcep_cyc = np.array(cvmcep_cyc[0].cpu().data.numpy(), dtype=np.float64) cvlf0_cyc = np.array(cvlf0_cyc[0].cpu().data.numpy(), dtype=np.float64) logging.info(cvlf0_src.shape) logging.info(cvmcep_src.shape) logging.info(cvlf0_cyc.shape) logging.info(cvmcep_cyc.shape) mcep = np.array(feat_org[:,-model_decoder_mcep.out_dim:]) f0 = np.array(np.rint(feat_org[:,0])*np.exp(feat_org[:,1])) codeap = np.array(np.rint(feat_org[:,2:3])*(-np.exp(feat_org[:,3:feat_org.shape[-1]-model_decoder_mcep.out_dim]))) cvf0_src = np.array(np.rint(cvlf0_src[:,0])*np.exp(cvlf0_src[:,1])) cvcodeap_src = np.array(np.rint(cvlf0_src[:,2:3])*(-np.exp(cvlf0_src[:,3:]))) f0_rmse = np.sqrt(np.mean((cvf0_src-f0)**2)) logging.info('F0_rmse_rec: %lf Hz' % (f0_rmse)) cvf0_src_mean = np.mean(cvf0_src) f0_mean = np.mean(f0) f0_corr = 
np.sum((cvf0_src-cvf0_src_mean)*(f0-f0_mean))/\ (np.sqrt(np.sum((cvf0_src-cvf0_src_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2))) logging.info('F0_corr_rec: %lf' % (f0_corr)) codeap_rmse = np.sqrt(np.mean((cvcodeap_src-codeap)**2, axis=0)) for i in range(codeap_rmse.shape[-1]): logging.info('codeap-%d_rmse_rec: %lf dB' % (i+1, codeap_rmse[i])) cvf0_cyc = np.array(np.rint(cvlf0_cyc[:,0])*np.exp(cvlf0_cyc[:,1])) cvcodeap_cyc = np.array(np.rint(cvlf0_cyc[:,2:3])*(-np.exp(cvlf0_cyc[:,3:]))) f0_rmse_cyc = np.sqrt(np.mean((cvf0_cyc-f0)**2)) logging.info('F0_rmse_cyc: %lf Hz' % (f0_rmse_cyc)) cvf0_cyc_mean = np.mean(cvf0_cyc) f0_mean = np.mean(f0) f0_corr_cyc = np.sum((cvf0_cyc-cvf0_cyc_mean)*(f0-f0_mean))/\ (np.sqrt(np.sum((cvf0_cyc-cvf0_cyc_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2))) logging.info('F0_corr_cyc: %lf' % (f0_corr_cyc)) codeap_rmse_cyc = np.sqrt(np.mean((cvcodeap_cyc-codeap)**2, axis=0)) for i in range(codeap_rmse_cyc.shape[-1]): logging.info('codeap-%d_rmse_cyc: %lf dB' % (i+1, codeap_rmse_cyc[i])) spcidx = read_hdf5(feat_file, "/spcidx_range")[0] _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),1:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),1:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean_cyc = np.mean(mcdpow_arr) mcdpow_std_cyc = np.std(mcdpow_arr) mcd_mean_cyc = np.mean(mcd_arr) mcd_std_cyc = np.std(mcd_arr) logging.info("mcdpow_cyc: %.6f dB +- %.6f" % (mcdpow_mean_cyc, mcdpow_std_cyc)) logging.info("mcd_cyc: %.6f dB +- %.6f" % (mcd_mean_cyc, mcd_std_cyc)) logging.info('org f0') logging.info(f0[10:15]) logging.info('rec f0') logging.info(cvf0_src[10:15]) logging.info('cyc f0') logging.info(cvf0_cyc[10:15]) logging.info('org cap') logging.info(codeap[10:15]) logging.info('rec cap') logging.info(cvcodeap_src[10:15]) logging.info('cyc cap') logging.info(cvcodeap_cyc[10:15]) dataset = feat_file.split('/')[1].split('_')[0] if 'tr' in dataset: logging.info('trn') f0rmse_cvlist.append(f0_rmse) f0corr_cvlist.append(f0_corr) caprmse_cvlist.append(codeap_rmse) mcdpow_cvlist.append(mcdpow_mean) mcdpow_cvlist.append(mcdpow_mean) mcdpowstd_cvlist.append(mcdpow_std) mcd_cvlist.append(mcd_mean) mcdstd_cvlist.append(mcd_std) cvlist.append(np.var(cvmcep_src[:,1:], axis=0)) logging.info(len(cvlist)) f0rmse_cvlist_cyc.append(f0_rmse_cyc) f0corr_cvlist_cyc.append(f0_corr_cyc) caprmse_cvlist_cyc.append(codeap_rmse_cyc) mcdpow_cvlist_cyc.append(mcdpow_mean_cyc) mcdpow_cvlist_cyc.append(mcdpow_mean_cyc) mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc) mcd_cvlist_cyc.append(mcd_mean_cyc) mcdstd_cvlist_cyc.append(mcd_std_cyc) cvlist_cyc.append(np.var(cvmcep_cyc[:,1:], axis=0)) elif 'dv' in dataset: logging.info('dev') f0rmse_cvlist_dv.append(f0_rmse) f0corr_cvlist_dv.append(f0_corr) caprmse_cvlist_dv.append(codeap_rmse) mcdpow_cvlist_dv.append(mcdpow_mean) 
mcdpowstd_cvlist_dv.append(mcdpow_std) mcd_cvlist_dv.append(mcd_mean) mcdstd_cvlist_dv.append(mcd_std) cvlist_dv.append(np.var(cvmcep_src[:,1:], axis=0)) logging.info(len(cvlist_dv)) f0rmse_cvlist_cyc_dv.append(f0_rmse_cyc) f0corr_cvlist_cyc_dv.append(f0_corr_cyc) caprmse_cvlist_cyc_dv.append(codeap_rmse_cyc) mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc) mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc) mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc) mcd_cvlist_cyc_dv.append(mcd_mean_cyc) mcdstd_cvlist_cyc_dv.append(mcd_std_cyc) cvlist_cyc_dv.append(np.var(cvmcep_cyc[:,1:], axis=0)) logging.info('write rec to h5') outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk+"-"+args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_rec.shape) write_hdf5(feat_file, args.string_path, feat_rec) logging.info('write cyc to h5') outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk+"-"+args.spk+"-"+args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_cyc.shape) write_hdf5(feat_file, args.string_path, feat_cyc) count += 1
print('[Data] reading ' + config_folder + config_file)
config = utils.read_json(config_folder + config_file)

height = config['height']
width = config['width']
depth = config['depth']

space_file = filename('space_file')
space = utils.read_hdf5(space_file)

input_file = filename('input_file')
input = utils.read_hdf5(input_file)

space[input == 1] = 0
if len(space.shape) < 5:
    space = np.expand_dims(space, axis=1)
utils.write_hdf5(space_file, space)
print('[Data] wrote ' + space_file)

for key in ['input', 'space', 'output', 'sdf', 'input_sdf']:
    file = filename(key + '_file')
    volumes = utils.read_hdf5(file)
    volumes = np.squeeze(volumes)
    if len(volumes.shape) < 4:
        volumes = np.expand_dims(volumes, axis=0)
    if len(volumes.shape) < 5:
        utils.write_hdf5(file, np.expand_dims(volumes, axis=1))
        print('[Data] wrote ' + file)
def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None, \ mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None, \ mcd_cvlist_dv=None, mcdstd_cvlist_dv=None, \ cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None, \ mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None, \ mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None): with torch.cuda.device(gpu): # define model and load parameters with torch.no_grad(): model_encoder = GRU_VAE_ENCODER( in_dim=config.mcep_dim + config.excit_dim, n_spk=n_spk, lat_dim=config.lat_dim, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=config.bi_enc, cont=False, pad_first=True, right_size=config.right_size, ar=config.ar_enc) logging.info(model_encoder) model_decoder = GRU_SPEC_DECODER( feat_dim=config.lat_dim, out_dim=config.mcep_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_dec, hidden_units=config.hidden_units_dec, kernel_size=config.kernel_size_dec, dilation_size=config.dilation_size_dec, causal_conv=config.causal_conv_dec, bi=config.bi_dec, pad_first=True, ar=config.ar_dec) logging.info(model_decoder) model_vq = torch.nn.Embedding(config.ctr_size, config.lat_dim) logging.info(model_vq) model_encoder.load_state_dict( torch.load(args.model)["model_encoder"]) model_decoder.load_state_dict( torch.load(args.model)["model_decoder"]) model_vq.load_state_dict(torch.load(args.model)["model_vq"]) model_encoder.cuda() model_decoder.cuda() model_vq.cuda() model_encoder.eval() model_decoder.eval() model_vq.eval() for param in model_encoder.parameters(): param.requires_grad = False for param in model_decoder.parameters(): param.requires_grad = False for param in model_vq.parameters(): param.requires_grad = False if config.ar_enc: yz_in = torch.zeros((1, 1, n_spk + config.lat_dim)).cuda() if config.ar_dec: mean_stats = torch.FloatTensor( read_hdf5( config.stats, "/mean_" + config.string_path.replace("/", ""))) scale_stats = torch.FloatTensor( read_hdf5( config.stats, "/scale_" + config.string_path.replace("/", ""))) x_in = ((torch.zeros((1, 1, config.mcep_dim)) - mean_stats[config.excit_dim:]) / scale_stats[config.excit_dim:]).cuda() count = 0 pad_left = (model_encoder.pad_left + model_decoder.pad_left) * 2 pad_right = (model_encoder.pad_right + model_decoder.pad_right) * 2 outpad_lefts = [None] * 3 outpad_rights = [None] * 3 outpad_lefts[0] = pad_left - model_encoder.pad_left outpad_rights[0] = pad_right - model_encoder.pad_right outpad_lefts[1] = outpad_lefts[0] - model_decoder.pad_left outpad_rights[1] = outpad_rights[0] - model_decoder.pad_right outpad_lefts[2] = outpad_lefts[1] - model_encoder.pad_left outpad_rights[2] = outpad_rights[1] - model_encoder.pad_right for feat_file in feat_list: # convert mcep logging.info("recmcep " + feat_file) feat_org = read_hdf5(feat_file, "/feat_mceplf0cap") logging.info(feat_org.shape) mcep = np.array(feat_org[:, -model_decoder.out_dim:]) with torch.no_grad(): feat = torch.FloatTensor(feat_org).cuda().unsqueeze(0) feat_excit = feat[:, :, :config.excit_dim] if config.ar_enc: spk_logits, lat_src, _, _ = model_encoder(F.pad(feat.transpose(1,2), (pad_left,pad_right), "replicate").transpose(1,2), \ yz_in=yz_in) else: spk_logits, lat_src, _ = model_encoder( F.pad(feat.transpose(1, 2), (pad_left, pad_right), 
"replicate").transpose(1, 2)) idx_vq = nn_search_batch(lat_src, model_vq.weight) lat_src = model_vq(idx_vq) if outpad_rights[0] > 0: unique, counts = np.unique( idx_vq[:, outpad_lefts[0]:-outpad_rights[0]].cpu( ).data.numpy(), return_counts=True) else: unique, counts = np.unique( idx_vq[:, outpad_lefts[0]:].cpu().data.numpy(), return_counts=True) logging.info("input vq") logging.info(dict(zip(unique, counts))) logging.info('input spkpost') if outpad_rights[0] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]: -outpad_rights[0]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]:], dim=-1), 1)) src_code = (torch.ones( (1, lat_src.shape[1])) * spk_idx).cuda().long() if config.ar_dec: cvmcep_src, _, _ = model_decoder(src_code, lat_src, x_in=x_in) else: cvmcep_src, _ = model_decoder(src_code, lat_src) if config.ar_enc: spk_logits, lat_rec, _, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2), cvmcep_src), 2), yz_in=yz_in) else: spk_logits, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2), cvmcep_src), 2)) idx_vq = nn_search_batch(lat_rec, model_vq.weight) lat_rec = model_vq(idx_vq) if outpad_rights[2] > 0: unique, counts = np.unique( idx_vq[:, outpad_lefts[2]:-outpad_rights[2]].cpu( ).data.numpy(), return_counts=True) else: unique, counts = np.unique( idx_vq[:, outpad_lefts[2]:].cpu().data.numpy(), return_counts=True) logging.info("rec vq") logging.info(dict(zip(unique, counts))) logging.info('rec spkpost') if outpad_rights[2] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[2]: -outpad_rights[2]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[2]:], dim=-1), 1)) src_code = (torch.ones( (1, lat_rec.shape[1])) * spk_idx).cuda().long() if config.ar_dec: cvmcep_cyc, _, _ = model_decoder(src_code, lat_rec, x_in=x_in) else: cvmcep_cyc, _ = model_decoder(src_code, lat_rec) if outpad_rights[1] > 0: cvmcep_src = cvmcep_src[:, outpad_lefts[1]: -outpad_rights[1]] else: cvmcep_src = cvmcep_src[:, outpad_lefts[1]:] feat_rec = torch.cat((feat_excit, cvmcep_src), 2)[0].cpu().data.numpy() feat_cyc = torch.cat((feat_excit, cvmcep_cyc), 2)[0].cpu().data.numpy() cvmcep_src = np.array(cvmcep_src[0].cpu().data.numpy(), dtype=np.float64) cvmcep_cyc = np.array(cvmcep_cyc[0].cpu().data.numpy(), dtype=np.float64) logging.info(cvmcep_src.shape) logging.info(cvmcep_cyc.shape) spcidx = read_hdf5(feat_file, "/spcidx_range")[0] _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),1:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),1:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean_cyc = np.mean(mcdpow_arr) mcdpow_std_cyc = 
np.std(mcdpow_arr) mcd_mean_cyc = np.mean(mcd_arr) mcd_std_cyc = np.std(mcd_arr) logging.info("mcdpow_cyc: %.6f dB +- %.6f" % (mcdpow_mean_cyc, mcdpow_std_cyc)) logging.info("mcd_cyc: %.6f dB +- %.6f" % (mcd_mean_cyc, mcd_std_cyc)) dataset = feat_file.split('/')[1].split('_')[0] if 'tr' in dataset: logging.info('trn') mcdpow_cvlist.append(mcdpow_mean) mcdpow_cvlist.append(mcdpow_mean) mcdpowstd_cvlist.append(mcdpow_std) mcd_cvlist.append(mcd_mean) mcdstd_cvlist.append(mcd_std) cvlist.append(np.var(cvmcep_src[:, 1:], axis=0)) logging.info(len(cvlist)) mcdpow_cvlist_cyc.append(mcdpow_mean_cyc) mcdpow_cvlist_cyc.append(mcdpow_mean_cyc) mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc) mcd_cvlist_cyc.append(mcd_mean_cyc) mcdstd_cvlist_cyc.append(mcd_std_cyc) cvlist_cyc.append(np.var(cvmcep_cyc[:, 1:], axis=0)) elif 'dv' in dataset: logging.info('dev') mcdpow_cvlist_dv.append(mcdpow_mean) mcdpowstd_cvlist_dv.append(mcdpow_std) mcd_cvlist_dv.append(mcd_mean) mcdstd_cvlist_dv.append(mcd_std) cvlist_dv.append(np.var(cvmcep_src[:, 1:], axis=0)) logging.info(len(cvlist_dv)) mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc) mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc) mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc) mcd_cvlist_cyc_dv.append(mcd_mean_cyc) mcdstd_cvlist_cyc_dv.append(mcd_std_cyc) cvlist_cyc_dv.append(np.var(cvmcep_cyc[:, 1:], axis=0)) logging.info('write rec to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_rec.shape) write_hdf5(feat_file, args.string_path, feat_rec) logging.info('write cyc to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_cyc.shape) write_hdf5(feat_file, args.string_path, feat_cyc) count += 1
def feature_extract(cpu, wav_list, arr, max_frame_list, max_spc_frame_list): n_wav = len(wav_list) n_sample = 0 n_frame = 0 max_frame = 0 max_spc_frame = 0 count = 1 melfb_t = np.linalg.pinv( librosa.filters.mel(args.fs, args.fftl, n_mels=args.mel_dim)) for wav_name in wav_list: # load wavfile and apply low cut filter fs, x = read_wav(wav_name, cutoff=args.highpass_cutoff) n_sample += x.shape[0] logging.info("cpu-" + str(cpu + 1) + " " + str(len(wav_list)) + " " + wav_name + " " + str(x.shape[0]) + " " + str(n_sample) + " " + str(count)) logging.info(wav_list) # check sampling frequency if not fs == args.fs: logging.info("ERROR: sampling frequency is not matched.") sys.exit(1) hdf5name = args.hdf5dir + "/" + os.path.basename(wav_name).replace( ".wav", ".h5") if not args.init: if args.minf0 != 40 and args.maxf0 != 700: time_axis_range, f0_range, spc_range, ap_range = analyze_range(x, fs=fs, minf0=args.minf0, \ maxf0=args.maxf0, fperiod=args.shiftms, fftl=args.fftl) else: logging.info('open spk') time_axis_range, f0_range, spc_range, ap_range = analyze( x, fs=fs, fperiod=args.shiftms, fftl=args.fftl) write_hdf5(hdf5name, "/f0_range", f0_range) write_hdf5(hdf5name, "/time_axis", time_axis_range) melmagsp = melsp(x, n_mels=args.mel_dim, n_fft=args.fftl, shiftms=args.shiftms, winms=args.winms, fs=fs) logging.info(melmagsp.shape) write_hdf5(hdf5name, "/log_1pmelmagsp", np.log(1 + 10000 * melmagsp)) uv_range, cont_f0_range = convert_continuos_f0( np.array(f0_range)) unique, counts = np.unique(uv_range, return_counts=True) logging.info(dict(zip(unique, counts))) cont_f0_lpf_range = \ low_pass_filter(cont_f0_range, int(1.0 / (args.shiftms * 0.001)), cutoff=20) mcep_range = ps.sp2mc(spc_range, args.mcep_dim, args.mcep_alpha) npow_range = spc2npow(spc_range) _, spcidx_range = extfrm(mcep_range, npow_range, power_threshold=args.pow) codeap_range = pw.code_aperiodicity(ap_range, fs) cont_f0_lpf_range = np.expand_dims(cont_f0_lpf_range, axis=-1) uv_range = np.expand_dims(uv_range, axis=-1) unique, counts = np.unique(uv_range, return_counts=True) logging.info(dict(zip(unique, counts))) feat_orglf0 = np.c_[uv_range, np.log(cont_f0_lpf_range), codeap_range, mcep_range] logging.info(feat_orglf0.shape) write_hdf5(hdf5name, "/feat_org_lf0", feat_orglf0) write_hdf5(hdf5name, "/spcidx_range", spcidx_range) logging.info(hdf5name) n_codeap = codeap_range.shape[-1] for i in range(n_codeap): logging.info('codeap: %d' % (i + 1)) uv_codeap_i, cont_codeap_i = convert_continuos_codeap( np.array(codeap_range[:, i])) cont_codeap_i = np.log( -np.clip(cont_codeap_i, a_min=np.amin(cont_codeap_i), a_max=MAX_CODEAP)) if i > 0: cont_codeap = np.c_[ cont_codeap, np.expand_dims(cont_codeap_i, axis=-1)] else: uv_codeap = np.expand_dims(uv_codeap_i, axis=-1) cont_codeap = np.expand_dims(cont_codeap_i, axis=-1) uv_codeap_i = np.expand_dims(uv_codeap_i, axis=-1) unique, counts = np.unique(uv_codeap_i, return_counts=True) logging.info(dict(zip(unique, counts))) logging.info((uv_range == uv_codeap_i).all()) logging.info((uv_codeap == uv_codeap_i).all()) logging.info(uv_codeap.shape) logging.info(cont_codeap.shape) feat_mceplf0cap = np.c_[uv_range, np.log(cont_f0_lpf_range), uv_codeap, cont_codeap, mcep_range] logging.info(feat_mceplf0cap.shape) write_hdf5(hdf5name, "/feat_mceplf0cap", feat_mceplf0cap) n_frame += feat_orglf0.shape[0] if max_frame < feat_orglf0.shape[0]: max_frame = feat_orglf0.shape[0] if max_spc_frame < spcidx_range[0].shape[0]: max_spc_frame = spcidx_range[0].shape[0] if args.highpass_cutoff != 0 and args.wavfiltdir 
is not None: sf.write( args.wavfiltdir + "/" + os.path.basename(wav_name), x, fs, 'PCM_16') wavpath = args.wavdir + "/" + os.path.basename(wav_name) logging.info("cpu-" + str(cpu + 1) + " " + wavpath) sp_rec = ps.mc2sp(mcep_range, args.mcep_alpha, args.fftl) wav = np.clip(pw.synthesize(f0_range, sp_rec, ap_range, fs, frame_period=args.shiftms), \ -1, 1) logging.info(wavpath) sf.write(wavpath, wav, fs, 'PCM_16') recmagsp = np.matmul(melfb_t, melmagsp.T) hop_length = int((args.fs / 1000) * args.shiftms) win_length = int((args.fs / 1000) * args.winms) wav = np.clip( librosa.core.griffinlim(recmagsp, hop_length=hop_length, win_length=win_length, window='hann'), -1, 1) wavpath = args.wavgfdir + "/" + os.path.basename(wav_name) logging.info(wavpath) sf.write(wavpath, wav, fs, 'PCM_16') else: time_axis, f0, spc, ap = analyze(x, fs=fs, fperiod=args.shiftms, fftl=args.fftl) write_hdf5(hdf5name, "/f0", f0) npow = spc2npow(spc) write_hdf5(hdf5name, "/npow", npow) n_frame += f0.shape[0] if max_frame < f0.shape[0]: max_frame = f0.shape[0] count += 1 arr[0] += n_wav arr[1] += n_sample arr[2] += n_frame max_frame_list.append(max_frame) max_spc_frame_list.append(max_spc_frame) if (n_wav > 0): logging.info(str(arr[0])+" "+str(n_wav)+" "+str(arr[1])+" "+str(n_sample/n_wav)+" "+\ str(arr[2])+" "+str(n_frame/n_wav)+" max_frame = "+str(max_frame)+" max_spc_frame = "+str(max_spc_frame))
def decode_RNN(feat_list, gpu, cvlist=None, mcd_cvlist_src=None, mcdstd_cvlist_src=None, mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None,\ mcd_cvlist_cyc=None, mcdstd_cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None,\ mcd_cvlist=None, mcdstd_cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, \ lat_dist_rmse_list=None, lat_dist_cosim_list=None): with torch.cuda.device(gpu): # define model and load parameters with torch.no_grad(): model_encoder = GRU_VAE_ENCODER( in_dim=config.mcep_dim+config.excit_dim, n_spk=n_spk, lat_dim=config.lat_dim, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=False, ar=False, pad_first=True, right_size=config.right_size_enc) logging.info(model_encoder) model_decoder = GRU_SPEC_DECODER( feat_dim=config.lat_dim, out_dim=config.mcep_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_dec, hidden_units=config.hidden_units_dec, kernel_size=config.kernel_size_dec, dilation_size=config.dilation_size_dec, causal_conv=config.causal_conv_dec, bi=False, ar=False, pad_first=True, right_size=config.right_size_dec) logging.info(model_decoder) model_post = GRU_POST_NET( spec_dim=config.mcep_dim, excit_dim=2, n_spk=n_spk, hidden_layers=config.hidden_layers_post, hidden_units=config.hidden_units_post, kernel_size=config.kernel_size_post, dilation_size=config.dilation_size_post, causal_conv=config.causal_conv_post, pad_first=True, right_size=config.right_size_post) #excit_dim=config.excit_dim, #excit_dim=None, logging.info(model_post) model_encoder.load_state_dict(torch.load(args.model)["model_encoder"]) model_decoder.load_state_dict(torch.load(args.model)["model_decoder"]) model_post.load_state_dict(torch.load(args.model)["model_post"]) model_encoder.remove_weight_norm() model_decoder.remove_weight_norm() model_post.remove_weight_norm() model_encoder.cuda() model_decoder.cuda() model_post.cuda() model_encoder.eval() model_decoder.eval() model_post.eval() for param in model_encoder.parameters(): param.requires_grad = False for param in model_decoder.parameters(): param.requires_grad = False for param in model_post.parameters(): param.requires_grad = False count = 0 pad_left = (model_encoder.pad_left + model_decoder.pad_left + model_post.pad_left)*2 pad_right = (model_encoder.pad_right + model_decoder.pad_right + model_post.pad_right)*2 outpad_lefts = [None]*5 outpad_rights = [None]*5 outpad_lefts[0] = pad_left-model_encoder.pad_left outpad_rights[0] = pad_right-model_encoder.pad_right outpad_lefts[1] = outpad_lefts[0]-model_decoder.pad_left outpad_rights[1] = outpad_rights[0]-model_decoder.pad_right outpad_lefts[2] = outpad_lefts[1]-model_post.pad_left outpad_rights[2] = outpad_rights[1]-model_post.pad_right outpad_lefts[3] = outpad_lefts[2]-model_encoder.pad_left outpad_rights[3] = outpad_rights[2]-model_encoder.pad_right outpad_lefts[4] = outpad_lefts[3]-model_decoder.pad_left outpad_rights[4] = outpad_rights[3]-model_decoder.pad_right logging.info(f'{pad_left} {pad_right}') logging.info(outpad_lefts) logging.info(outpad_rights) for feat_file in feat_list: # convert mcep spk_src = os.path.basename(os.path.dirname(feat_file)) src_idx = spk_list.index(spk_src) logging.info('%s --> %s' % (spk_src, args.spk_trg)) file_trg = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk_trg, os.path.basename(feat_file)) trg_exist = False if os.path.exists(file_trg): logging.info('exist: %s' % (file_trg)) feat_trg 
= read_hdf5(file_trg, config.string_path) mcep_trg = feat_trg[:,-config.mcep_dim:] logging.info(mcep_trg.shape) trg_exist = True feat_org = read_hdf5(feat_file, config.string_path) mcep = np.array(feat_org[:,-config.mcep_dim:]) codeap = np.array(np.rint(feat_org[:,2:3])*(-np.exp(feat_org[:,3:config.excit_dim]))) sp = np.array(ps.mc2sp(mcep, args.mcep_alpha, args.fftl)) ap = pw.decode_aperiodicity(codeap, args.fs, args.fftl) feat_cvf0_lin = np.expand_dims(convert_f0(np.exp(feat_org[:,1]), src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std), axis=-1) feat_cv = np.c_[feat_org[:,:1], np.log(feat_cvf0_lin), feat_org[:,2:config.excit_dim]] logging.info("generate") with torch.no_grad(): feat = F.pad(torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1,2), (pad_left,pad_right), "replicate").transpose(1,2) feat_excit = torch.FloatTensor(feat_org[:,:config.excit_dim]).cuda().unsqueeze(0) feat_excit_cv = torch.FloatTensor(feat_cv).cuda().unsqueeze(0) spk_logits, _, lat_src, _ = model_encoder(feat, sampling=False) logging.info('input spkpost') if outpad_rights[0] > 0: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:], dim=-1), 1)) if trg_exist: spk_trg_logits, _, lat_trg, _ = model_encoder(F.pad(torch.FloatTensor(feat_trg).cuda().unsqueeze(0).transpose(1,2), \ (model_encoder.pad_left,model_encoder.pad_right), "replicate").transpose(1,2), sampling=False) logging.info('target spkpost') logging.info(torch.mean(F.softmax(spk_trg_logits, dim=-1), 1)) cvmcep_src, _ = model_decoder((torch.ones((1, lat_src.shape[1]))*src_idx).cuda().long(), lat_src) cvmcep_src_post, _ = model_post(cvmcep_src, y=(torch.ones((1, cvmcep_src.shape[1]))*src_idx).cuda().long(), e=F.pad(feat_excit[:,:,:2].transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2)) #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2)) if model_post.pad_right > 0: spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:-model_post.pad_right]), 2), sampling=False) else: spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:]), 2), sampling=False) logging.info('rec spkpost') if outpad_rights[3] > 0: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:], dim=-1), 1)) cvmcep, _ = model_decoder((torch.ones((1, lat_src.shape[1]))*trg_idx).cuda().long(), lat_src) cvmcep_post, _ = model_post(cvmcep, y=(torch.ones((1, cvmcep.shape[1]))*trg_idx).cuda().long(), e=F.pad(feat_excit_cv[:,:,:2].transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2)) #e=F.pad(feat_excit_cv.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2)) if model_post.pad_right > 0: spk_logits, _, lat_cv, _ = model_encoder(torch.cat((F.pad(feat_excit_cv.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep[:,model_post.pad_left:-model_post.pad_right]), 2), sampling=False) else: spk_logits, _, lat_cv, _ = model_encoder(torch.cat((F.pad(feat_excit_cv.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), 
cvmcep[:,model_post.pad_left:]), 2), sampling=False) logging.info('cv spkpost') if outpad_rights[3] > 0: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:], dim=-1), 1)) cvmcep_cyc, _ = model_decoder((torch.ones((1, lat_cv.shape[1]))*src_idx).cuda().long(), lat_cv) cvmcep_cyc_post, _ = model_post(cvmcep_cyc, y=(torch.ones((1, cvmcep_cyc.shape[1]))*src_idx).cuda().long(), e=F.pad(feat_excit[:,:,:2].transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2)) #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2)) if outpad_rights[2] > 0: cvmcep_src = np.array(cvmcep_src_post[0,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), dtype=np.float64) cvmcep = np.array(cvmcep_post[0,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), dtype=np.float64) else: cvmcep_src = np.array(cvmcep_src_post[0,outpad_lefts[2]:].cpu().data.numpy(), dtype=np.float64) cvmcep = np.array(cvmcep_post[0,outpad_lefts[2]:].cpu().data.numpy(), dtype=np.float64) cvmcep_cyc = np.array(cvmcep_cyc_post[0].cpu().data.numpy(), dtype=np.float64) if trg_exist: if outpad_rights[0] > 0: lat_src = lat_src[:,outpad_lefts[0]:-outpad_rights[0]] else: lat_src = lat_src[:,outpad_lefts[0]:] logging.info(cvmcep_src.shape) logging.info(cvmcep.shape) logging.info(cvmcep_cyc.shape) if trg_exist: logging.info(lat_src.shape) logging.info(lat_trg.shape) cvlist.append(np.var(cvmcep[:,1:], axis=0)) logging.info("cvf0lin") f0_range = read_hdf5(feat_file, "/f0_range") cvf0_range_lin = convert_f0(f0_range, src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std) uv_range_lin, cont_f0_range_lin = convert_continuos_f0(np.array(cvf0_range_lin)) unique, counts = np.unique(uv_range_lin, return_counts=True) logging.info(dict(zip(unique, counts))) cont_f0_lpf_range_lin = \ low_pass_filter(cont_f0_range_lin, int(1.0 / (args.shiftms * 0.001)), cutoff=20) uv_range_lin = np.expand_dims(uv_range_lin, axis=-1) cont_f0_lpf_range_lin = np.expand_dims(cont_f0_lpf_range_lin, axis=-1) # plain converted feat for neural vocoder feat_cv = np.c_[uv_range_lin, np.log(cont_f0_lpf_range_lin), feat_cv[:,2:config.excit_dim], cvmcep] logging.info(feat_cv.shape) logging.info("mcd acc") spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0]) _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_src[spcidx], dtype=np.float64)) _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_src[spcidx,1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_src_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_src_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) mcdpow_cvlist_src.append(mcdpow_mean) mcdpowstd_cvlist_src.append(mcdpow_std) mcd_cvlist_src.append(mcd_mean) mcdstd_cvlist_src.append(mcd_std) if trg_exist: spcidx_trg = np.array(read_hdf5(file_trg, "/spcidx_range")[0]) _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx], \ dtype=np.float64), np.array(mcep_trg[spcidx_trg], dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx,1:], \ dtype=np.float64), np.array(mcep_trg[spcidx_trg,1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_trg: %.6f dB +- %.6f" % (mcdpow_mean, 
mcdpow_std)) logging.info("mcd_trg: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) mcdpow_cvlist.append(mcdpow_mean) mcdpowstd_cvlist.append(mcdpow_std) mcd_cvlist.append(mcd_mean) mcdstd_cvlist.append(mcd_std) spcidx_src = torch.LongTensor(spcidx).cuda() spcidx_trg = torch.LongTensor(spcidx_trg).cuda() trj_lat_src = np.array(torch.index_select(lat_src[0],0,spcidx_src).cpu().data.numpy(), dtype=np.float64) trj_lat_trg = np.array(torch.index_select(lat_trg[0],0,spcidx_trg).cpu().data.numpy(), dtype=np.float64) aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg) lat_dist_srctrg = np.mean(np.sqrt(np.mean((aligned_lat_srctrg-trj_lat_trg)**2, axis=0))) _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src, mcd=0) aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src) lat_dist_trgsrc = np.mean(np.sqrt(np.mean((aligned_lat_trgsrc-trj_lat_src)**2, axis=0))) _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg, mcd=0) logging.info("%lf %lf %lf %lf" % (lat_dist_srctrg, lat_cdist_srctrg, lat_dist_trgsrc, lat_cdist_trgsrc)) lat_dist_rmse = (lat_dist_srctrg+lat_dist_trgsrc)/2 lat_dist_cosim = (lat_cdist_srctrg+lat_cdist_trgsrc)/2 lat_dist_rmse_list.append(lat_dist_rmse) lat_dist_cosim_list.append(lat_dist_cosim) logging.info("lat_dist: %.6f %.6f" % (lat_dist_rmse, lat_dist_cosim)) _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_cyc[spcidx], dtype=np.float64)) _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_cyc[spcidx,1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_cyc_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_cyc_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) mcdpow_cvlist_cyc.append(mcdpow_mean) mcdpowstd_cvlist_cyc.append(mcdpow_std) mcd_cvlist_cyc.append(mcd_mean) mcdstd_cvlist_cyc.append(mcd_std) logging.info("synth anasyn") wav = np.clip(pw.synthesize(f0_range, sp, ap, args.fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5","_anasyn.wav")) sf.write(wavpath, wav, args.fs, 'PCM_16') logging.info(wavpath) logging.info("synth voco rec") cvsp_src = ps.mc2sp(cvmcep_src, args.mcep_alpha, args.fftl) logging.info(cvsp_src.shape) wav = np.clip(pw.synthesize(f0_range, cvsp_src, ap, args.fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_rec.wav")) sf.write(wavpath, wav, args.fs, 'PCM_16') logging.info(wavpath) logging.info("synth voco cv") cvsp = ps.mc2sp(cvmcep, args.mcep_alpha, args.fftl) logging.info(cvsp.shape) wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp, ap, args.fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_cv.wav")) sf.write(wavpath, wav, args.fs, 'PCM_16') logging.info(wavpath) logging.info("synth voco cv GV") datamean = np.mean(cvmcep[:,1:], axis=0) cvmcep_gv = np.c_[cvmcep[:,0], args.gv_coeff*(np.sqrt(gv_mean_trg/cvgv_mean) * \ (cvmcep[:,1:]-datamean) + datamean) + (1-args.gv_coeff)*cvmcep[:,1:]] cvmcep_gv = mod_pow(cvmcep_gv, cvmcep, alpha=args.mcep_alpha, irlen=IRLEN) cvsp_gv = ps.mc2sp(cvmcep_gv, args.mcep_alpha, args.fftl) logging.info(cvsp_gv.shape) wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp_gv, ap, args.fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join(args.outdir, 
os.path.basename(feat_file).replace(".h5", "_cvGV.wav")) sf.write(wavpath, wav, args.fs, 'PCM_16') logging.info(wavpath) #logging.info("synth diffGV") #shiftl = int(args.fs/1000*args.shiftms) #mc_cv_diff = cvmcep_gv-mcep #b = np.apply_along_axis(ps.mc2b, 1, mc_cv_diff, args.mcep_alpha) #logging.info(b.shape) #assert np.isfinite(b).all #mlsa_fil = ps.synthesis.Synthesizer(MLSADF(mcep_dim, alpha=args.mcep_alpha), shiftl) #x, fs_ = sf.read(os.path.join(os.path.dirname(feat_file).replace("hdf5", "wav_filtered"), os.path.basename(feat_file).replace(".h5", ".wav"))) #assert(fs_ == args.fs) #wav = mlsa_fil.synthesis(x, b) #wav = np.clip(wav, -1, 1) #wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_DiffGV.wav")) #sf.write(wavpath, wav, args.fs, 'PCM_16') #logging.info(wavpath) #logging.info("synth diffGVF0") #time_axis = read_hdf5(feat_file, "/time_axis") #sp_diff = pw.cheaptrick(wav, f0_range, time_axis, args.fs, fft_size=args.fftl) #logging.info(sp_diff.shape) #ap_diff = pw.d4c(wav, f0_range, time_axis, args.fs, fft_size=args.fftl) #logging.info(ap_diff.shape) #wav = pw.synthesize(cvf0_range_lin, sp_diff, ap_diff, args.fs, frame_period=args.shiftms) #wav = np.clip(wav, -1, 1) #wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5", "_DiffGVF0.wav")) #sf.write(wavpath, wav, args.fs, 'PCM_16') #logging.info(wavpath) #logging.info("analysis diffGVF0") #sp_diff_anasyn = pw.cheaptrick(wav, cvf0_range_lin, time_axis, args.fs, fft_size=args.fftl) #logging.info(sp_diff_anasyn.shape) #mc_cv_diff_anasyn = ps.sp2mc(sp_diff_anasyn, mcep_dim, args.mcep_alpha) #ap_diff_anasyn = pw.d4c(wav, cvf0_range_lin, time_axis, args.fs, fft_size=args.fftl) #code_ap_diff_anasyn = pw.code_aperiodicity(ap_diff_anasyn, args.fs) ## convert to continouos codeap with uv #for i in range(code_ap_diff_anasyn.shape[-1]): # logging.info('codeap: %d' % (i+1)) # uv_codeap_i, cont_codeap_i = convert_continuos_codeap(np.array(code_ap_diff_anasyn[:,i])) # cont_codeap_i = np.log(-np.clip(cont_codeap_i, a_min=np.amin(cont_codeap_i), a_max=MAX_CODEAP)) # if i > 0: # cont_codeap = np.c_[cont_codeap, np.expand_dims(cont_codeap_i, axis=-1)] # else: # uv_codeap = np.expand_dims(uv_codeap_i, axis=-1) # cont_codeap = np.expand_dims(cont_codeap_i, axis=-1) # uv_codeap_i = np.expand_dims(uv_codeap_i, axis=-1) # unique, counts = np.unique(uv_codeap_i, return_counts=True) # logging.info(dict(zip(unique, counts))) ## postprocessed converted feat for neural vocoder #feat_diffgv_anasyn = np.c_[feat_cv[:,:2], uv_codeap, cont_codeap, mc_cv_diff_anasyn] #logging.info("write lat") #outTxtDir = os.path.join(args.outdir, os.path.basename(os.path.dirname(feat_file))) #if not os.path.exists(outTxtDir): # os.mkdir(outTxtDir) #outTxt = os.path.join(outTxtDir, os.path.basename(feat_file).replace(".wav", ".txt")) #logging.info(outTxt) #g = open(outTxt, "wt") #idx_frm = 0 #nfrm = trj_lat_src.shape[0] #dim = trj_lat_src.shape[1] #if not args.time_flag: ##if True: # while idx_frm < nfrm: # idx_elmt = 1 # for elmt in trj_lat_src[idx_frm]: # if idx_elmt < dim: # g.write("%lf " % (elmt)) # else: # g.write("%lf\n" % (elmt)) # idx_elmt += 1 # idx_frm += 1 #else: # while idx_frm < nfrm: # idx_elmt = 1 # for elmt in trj_lat_src[idx_frm]: # if idx_elmt < dim: # if idx_elmt > 1: # g.write("%lf " % (elmt)) # else: # g.write("%lf %lf " % (time_axis[idx_frm], elmt)) # else: # g.write("%lf\n" % (elmt)) # idx_elmt += 1 # idx_frm += 1 #g.close() logging.info('write to h5') outh5dir = 
os.path.join(os.path.dirname(os.path.dirname(feat_file)), spk_src+"-"+args.spk_trg) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) # cv write_path = args.string_path logging.info(feat_file + ' ' + write_path) logging.info(feat_cv.shape) write_hdf5(feat_file, write_path, feat_cv) ## diffGVF0 #write_path = args.string_path+"_diffgvf0" #logging.info(feat_file + ' ' + write_path) #logging.info(feat_diffgv_anasyn.shape) #write_hdf5(feat_file, write_path, feat_diffgv_anasyn) count += 1
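# The decode_RNN routine above converts F0 through a `convert_f0(f0, src_f0_mean,
# src_f0_std, trg_f0_mean, trg_f0_std)` helper whose body is not shown in this
# section. A minimal sketch of what such a mean-variance F0 conversion typically
# looks like follows; the log-F0 domain and the unvoiced handling are assumptions,
# not taken from the repository, so the helper is renamed to make that explicit.
import numpy as np

def convert_f0_sketch(f0, src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std):
    # Hypothetical mean-variance F0 conversion: voiced frames (f0 > 0) are
    # normalized with the source log-F0 statistics and de-normalized with the
    # target statistics; unvoiced frames (f0 == 0) are left as zero.
    cvf0 = np.zeros_like(f0, dtype=np.float64)
    voiced = f0 > 0
    lf0 = np.log(f0[voiced])
    cvf0[voiced] = np.exp((lf0 - src_f0_mean) / src_f0_std * trg_f0_std + trg_f0_mean)
    return cvf0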
def main(): parser = argparse.ArgumentParser() # decode setting parser.add_argument("--feats", required=True, type=str, help="list or directory of source eval feat files") parser.add_argument("--spk", required=True, type=str, help="speaker name to be reconstructed") parser.add_argument("--model", required=True, type=str, help="model file") parser.add_argument("--config", required=True, type=str, help="configure file") parser.add_argument("--n_gpus", default=1, type=int, help="number of gpus") parser.add_argument("--outdir", required=True, type=str, help="directory to save log") parser.add_argument("--string_path", required=True, type=str, help="path of h5 generated feature") # other setting parser.add_argument("--GPU_device", default=None, type=int, help="selection of GPU device") parser.add_argument("--GPU_device_str", default=None, type=str, help="selection of GPU device") parser.add_argument("--verbose", default=1, type=int, help="log level") args = parser.parse_args() if args.GPU_device is not None or args.GPU_device_str is not None: os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if args.GPU_device_str is None: os.environ["CUDA_VISIBLE_DEVICES"] = str(args.GPU_device) else: os.environ["CUDA_VISIBLE_DEVICES"] = args.GPU_device_str # check directory existence if not os.path.exists(args.outdir): os.makedirs(args.outdir) # set log level (check DEBUG first; otherwise --verbose > 1 can never be reached) if args.verbose > 1: logging.basicConfig( level=logging.DEBUG, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filemode='w', filename=args.outdir + "/decode.log") logging.getLogger().addHandler(logging.StreamHandler()) elif args.verbose > 0: logging.basicConfig( level=logging.INFO, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filemode='w', filename=args.outdir + "/decode.log") logging.getLogger().addHandler(logging.StreamHandler()) else: logging.basicConfig( level=logging.WARN, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filemode='w', filename=args.outdir + "/decode.log") logging.getLogger().addHandler(logging.StreamHandler()) logging.warn("logging is disabled.") # load config config = torch.load(args.config) # get source feat list if os.path.isdir(args.feats): feat_list = sorted(find_files(args.feats, "*.h5")) elif os.path.isfile(args.feats): feat_list = read_txt(args.feats) else: logging.error("--feats should be directory or list.") sys.exit(1) # prepare the file list for parallel decoding feat_lists = np.array_split(feat_list, args.n_gpus) feat_lists = [f_list.tolist() for f_list in feat_lists] for i in range(args.n_gpus): logging.info('%d: %d' % (i + 1, len(feat_lists[i]))) spk_list = config.spk_list.split('@') n_spk = len(spk_list) spk_idx = spk_list.index(args.spk) stats_list = config.stats_list.split('@') assert (n_spk == len(stats_list)) spk_stat = stats_list[spk_idx] gv_mean = read_hdf5(spk_stat, "/gv_melsp_mean") model_epoch = os.path.basename(args.model).split('.')[0].split('-')[1] logging.info('epoch: ' + model_epoch) model_name = os.path.basename(os.path.dirname(args.model)).split('_')[1] logging.info('mdl_name: ' + model_name) logging.info(config) # define gpu decode function def gpu_decode(feat_list, gpu, cvlist=None, lsd_cvlist=None, lsdstd_cvlist=None, cvlist_dv=None, lsd_cvlist_dv=None, lsdstd_cvlist_dv=None, f0rmse_cvlist=None, f0corr_cvlist=None, caprmse_cvlist=None, f0rmse_cvlist_dv=None, f0corr_cvlist_dv=None, caprmse_cvlist_dv=None, cvlist_cyc=None,
lsd_cvlist_cyc=None, lsdstd_cvlist_cyc=None, cvlist_cyc_dv=None, lsd_cvlist_cyc_dv=None, lsdstd_cvlist_cyc_dv=None, f0rmse_cvlist_cyc=None, f0corr_cvlist_cyc=None, caprmse_cvlist_cyc=None, f0rmse_cvlist_cyc_dv=None, f0corr_cvlist_cyc_dv=None, caprmse_cvlist_cyc_dv=None): with torch.cuda.device(gpu): # define model and load parameters with torch.no_grad(): model_encoder_melsp = GRU_VAE_ENCODER( in_dim=config.mel_dim, n_spk=n_spk, lat_dim=config.lat_dim, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=False, ar=False, pad_first=True, right_size=config.right_size_enc) logging.info(model_encoder_melsp) model_decoder_melsp = GRU_SPEC_DECODER( feat_dim=config.lat_dim + config.lat_dim_e, excit_dim=config.excit_dim, out_dim=config.mel_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_dec, hidden_units=config.hidden_units_dec, kernel_size=config.kernel_size_dec, dilation_size=config.dilation_size_dec, causal_conv=config.causal_conv_dec, bi=False, ar=False, pad_first=True, right_size=config.right_size_dec) logging.info(model_decoder_melsp) model_encoder_excit = GRU_VAE_ENCODER( in_dim=config.mel_dim, n_spk=n_spk, lat_dim=config.lat_dim_e, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=False, ar=False, pad_first=True, right_size=config.right_size_enc) logging.info(model_encoder_excit) model_decoder_excit = GRU_EXCIT_DECODER( feat_dim=config.lat_dim_e, cap_dim=config.cap_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_lf0, hidden_units=config.hidden_units_lf0, kernel_size=config.kernel_size_lf0, dilation_size=config.dilation_size_lf0, causal_conv=config.causal_conv_lf0, bi=False, ar=False, pad_first=True, right_size=config.right_size_lf0) logging.info(model_decoder_excit) if (config.spkidtr_dim > 0): model_spkidtr = SPKID_TRANSFORM_LAYER( n_spk=n_spk, spkidtr_dim=config.spkidtr_dim) logging.info(model_spkidtr) model_encoder_melsp.load_state_dict( torch.load(args.model)["model_encoder_melsp"]) model_decoder_melsp.load_state_dict( torch.load(args.model)["model_decoder_melsp"]) model_encoder_excit.load_state_dict( torch.load(args.model)["model_encoder_excit"]) model_decoder_excit.load_state_dict( torch.load(args.model)["model_decoder_excit"]) if (config.spkidtr_dim > 0): model_spkidtr.load_state_dict( torch.load(args.model)["model_spkidtr"]) model_encoder_melsp.cuda() model_decoder_melsp.cuda() model_encoder_excit.cuda() model_decoder_excit.cuda() if (config.spkidtr_dim > 0): model_spkidtr.cuda() model_encoder_melsp.eval() model_decoder_melsp.eval() model_encoder_excit.eval() model_decoder_excit.eval() if (config.spkidtr_dim > 0): model_spkidtr.eval() for param in model_encoder_melsp.parameters(): param.requires_grad = False for param in model_decoder_melsp.parameters(): param.requires_grad = False for param in model_encoder_excit.parameters(): param.requires_grad = False for param in model_decoder_excit.parameters(): param.requires_grad = False if (config.spkidtr_dim > 0): for param in model_spkidtr.parameters(): param.requires_grad = False count = 0 pad_left = (model_encoder_melsp.pad_left + model_decoder_excit.pad_left + model_decoder_melsp.pad_left) * 2 pad_right = (model_encoder_melsp.pad_right + model_decoder_excit.pad_right + model_decoder_melsp.pad_right) * 2 outpad_lefts = [None] * 5 outpad_rights = 
[None] * 5 outpad_lefts[0] = pad_left - model_encoder_melsp.pad_left outpad_rights[0] = pad_right - model_encoder_melsp.pad_right outpad_lefts[1] = outpad_lefts[0] - model_decoder_excit.pad_left outpad_rights[1] = outpad_rights[0] - model_decoder_excit.pad_right outpad_lefts[2] = outpad_lefts[1] - model_decoder_melsp.pad_left outpad_rights[2] = outpad_rights[1] - model_decoder_melsp.pad_right outpad_lefts[3] = outpad_lefts[2] - model_encoder_melsp.pad_left outpad_rights[3] = outpad_rights[2] - model_encoder_melsp.pad_right outpad_lefts[4] = outpad_lefts[3] - model_decoder_excit.pad_left outpad_rights[4] = outpad_rights[3] - model_decoder_excit.pad_right for feat_file in feat_list: # reconst. melsp logging.info("recmelsp " + feat_file) feat_org = read_hdf5(feat_file, "/log_1pmelmagsp") logging.info(feat_org.shape) with torch.no_grad(): feat = F.pad( torch.FloatTensor(feat_org).cuda().unsqueeze( 0).transpose(1, 2), (pad_left, pad_right), "replicate").transpose(1, 2) spk_logits, _, lat_src, _ = model_encoder_melsp( feat, sampling=False) spk_logits_e, _, lat_src_e, _ = model_encoder_excit( feat, sampling=False) logging.info('input spkpost') if outpad_rights[0] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]: -outpad_rights[0]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]:], dim=-1), 1)) logging.info('input spkpost_e') if outpad_rights[0] > 0: logging.info( torch.mean( F.softmax(spk_logits_e[:, outpad_lefts[0]: -outpad_rights[0]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits_e[:, outpad_lefts[0]:], dim=-1), 1)) if config.spkidtr_dim > 0: src_code = model_spkidtr((torch.ones( (1, lat_src_e.shape[1])) * spk_idx).cuda().long()) else: src_code = (torch.ones( (1, lat_src_e.shape[1])) * spk_idx).cuda().long() cvlf0_src, _ = model_decoder_excit(src_code, lat_src_e) if model_decoder_excit.pad_right > 0: lat_cat = torch.cat(( lat_src_e[:, model_decoder_excit. pad_left:-model_decoder_excit.pad_right], lat_src[:, model_decoder_excit. pad_left:-model_decoder_excit.pad_right]), 2) else: lat_cat = torch.cat( (lat_src_e[:, model_decoder_excit.pad_left:], lat_src[:, model_decoder_excit.pad_left:]), 2) if config.spkidtr_dim > 0: src_code = model_spkidtr((torch.ones( (1, lat_cat.shape[1])) * spk_idx).cuda().long()) else: src_code = (torch.ones( (1, lat_cat.shape[1])) * spk_idx).cuda().long() cvmelsp_src, _ = model_decoder_melsp( lat_cat, y=src_code, e=cvlf0_src[:, :, :config.excit_dim]) spk_logits, _, lat_rec, _ = model_encoder_melsp( cvmelsp_src, sampling=False) spk_logits_e, _, lat_rec_e, _ = model_encoder_excit( cvmelsp_src, sampling=False) logging.info('rec spkpost') if outpad_rights[3] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[3]: -outpad_rights[3]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[3]:], dim=-1), 1)) logging.info('rec spkpost_e') if outpad_rights[3] > 0: logging.info( torch.mean( F.softmax(spk_logits_e[:, outpad_lefts[3]: -outpad_rights[3]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits_e[:, outpad_lefts[3]:], dim=-1), 1)) if config.spkidtr_dim > 0: src_code = model_spkidtr((torch.ones( (1, lat_rec_e.shape[1])) * spk_idx).cuda().long()) else: src_code = (torch.ones( (1, lat_rec_e.shape[1])) * spk_idx).cuda().long() cvlf0_cyc, _ = model_decoder_excit(src_code, lat_rec_e) if model_decoder_excit.pad_right > 0: lat_cat = torch.cat(( lat_rec_e[:, model_decoder_excit. 
pad_left:-model_decoder_excit.pad_right], lat_rec[:, model_decoder_excit. pad_left:-model_decoder_excit.pad_right]), 2) else: lat_cat = torch.cat( (lat_rec_e[:, model_decoder_excit.pad_left:], lat_rec[:, model_decoder_excit.pad_left:]), 2) if config.spkidtr_dim > 0: src_code = model_spkidtr((torch.ones( (1, lat_cat.shape[1])) * spk_idx).cuda().long()) else: src_code = (torch.ones( (1, lat_cat.shape[1])) * spk_idx).cuda().long() cvmelsp_cyc, _ = model_decoder_melsp( lat_cat, y=src_code, e=cvlf0_cyc[:, :, :config.excit_dim]) if outpad_rights[1] > 0: cvlf0_src = cvlf0_src[:, outpad_lefts[1]: -outpad_rights[1]] else: cvlf0_src = cvlf0_src[:, outpad_lefts[1]:] if outpad_rights[2] > 0: cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]: -outpad_rights[2]] else: cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]:] if outpad_rights[4] > 0: cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]: -outpad_rights[4]] else: cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]:] feat_rec = cvmelsp_src[0].cpu().data.numpy() feat_cyc = cvmelsp_cyc[0].cpu().data.numpy() cvmelsp_src = np.array(cvmelsp_src[0].cpu().data.numpy(), dtype=np.float64) cvlf0_src = np.array(cvlf0_src[0].cpu().data.numpy(), dtype=np.float64) cvmelsp_cyc = np.array(cvmelsp_cyc[0].cpu().data.numpy(), dtype=np.float64) cvlf0_cyc = np.array(cvlf0_cyc[0].cpu().data.numpy(), dtype=np.float64) logging.info(cvlf0_src.shape) logging.info(cvmelsp_src.shape) logging.info(cvlf0_cyc.shape) logging.info(cvmelsp_cyc.shape) melsp = np.array(feat_org) feat_world = read_hdf5(feat_file, "/feat_mceplf0cap") f0 = np.array( np.rint(feat_world[:, 0]) * np.exp(feat_world[:, 1])) codeap = np.array( np.rint(feat_world[:, 2:3]) * (-np.exp(feat_world[:, 3:config.full_excit_dim]))) cvf0_src = np.array( np.rint(cvlf0_src[:, 0]) * np.exp(cvlf0_src[:, 1])) cvcodeap_src = np.array( np.rint(cvlf0_src[:, 2:3]) * (-np.exp(cvlf0_src[:, 3:]))) f0_rmse = np.sqrt(np.mean((cvf0_src - f0)**2)) logging.info('F0_rmse_rec: %lf Hz' % (f0_rmse)) cvf0_src_mean = np.mean(cvf0_src) f0_mean = np.mean(f0) f0_corr = np.sum((cvf0_src-cvf0_src_mean)*(f0-f0_mean))/\ (np.sqrt(np.sum((cvf0_src-cvf0_src_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2))) logging.info('F0_corr_rec: %lf' % (f0_corr)) codeap_rmse = np.sqrt( np.mean((cvcodeap_src - codeap)**2, axis=0)) for i in range(codeap_rmse.shape[-1]): logging.info('codeap-%d_rmse_rec: %lf dB' % (i + 1, codeap_rmse[i])) cvf0_cyc = np.array( np.rint(cvlf0_cyc[:, 0]) * np.exp(cvlf0_cyc[:, 1])) cvcodeap_cyc = np.array( np.rint(cvlf0_cyc[:, 2:3]) * (-np.exp(cvlf0_cyc[:, 3:]))) f0_rmse_cyc = np.sqrt(np.mean((cvf0_cyc - f0)**2)) logging.info('F0_rmse_cyc: %lf Hz' % (f0_rmse_cyc)) cvf0_cyc_mean = np.mean(cvf0_cyc) f0_mean = np.mean(f0) f0_corr_cyc = np.sum((cvf0_cyc-cvf0_cyc_mean)*(f0-f0_mean))/\ (np.sqrt(np.sum((cvf0_cyc-cvf0_cyc_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2))) logging.info('F0_corr_cyc: %lf' % (f0_corr_cyc)) codeap_rmse_cyc = np.sqrt( np.mean((cvcodeap_cyc - codeap)**2, axis=0)) for i in range(codeap_rmse_cyc.shape[-1]): logging.info('codeap-%d_rmse_cyc: %lf dB' % (i + 1, codeap_rmse_cyc[i])) spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0]) melsp_rest = (np.exp(melsp) - 1) / 10000 melsp_src_rest = (np.exp(cvmelsp_src) - 1) / 10000 melsp_cyc_rest = (np.exp(cvmelsp_cyc) - 1) / 10000 lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_src_rest[spcidx], a_min=1e-16, a_max=None))\ -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1)) lsd_mean = np.mean(lsd_arr) lsd_std = np.std(lsd_arr) logging.info("lsd_rec: %.6f dB +- %.6f" % (lsd_mean, 
lsd_std)) lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cyc_rest[spcidx], a_min=1e-16, a_max=None))\ -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1)) lsd_mean_cyc = np.mean(lsd_arr) lsd_std_cyc = np.std(lsd_arr) logging.info("lsd_cyc: %.6f dB +- %.6f" % (lsd_mean_cyc, lsd_std_cyc)) logging.info('org f0') logging.info(f0[10:15]) logging.info('rec f0') logging.info(cvf0_src[10:15]) logging.info('cyc f0') logging.info(cvf0_cyc[10:15]) logging.info('org cap') logging.info(codeap[10:15]) logging.info('rec cap') logging.info(cvcodeap_src[10:15]) logging.info('cyc cap') logging.info(cvcodeap_cyc[10:15]) dataset = feat_file.split('/')[1].split('_')[0] if 'tr' in dataset: logging.info('trn') f0rmse_cvlist.append(f0_rmse) f0corr_cvlist.append(f0_corr) caprmse_cvlist.append(codeap_rmse) lsd_cvlist.append(lsd_mean) lsdstd_cvlist.append(lsd_std) cvlist.append(np.var(melsp_src_rest, axis=0)) logging.info(len(cvlist)) f0rmse_cvlist_cyc.append(f0_rmse_cyc) f0corr_cvlist_cyc.append(f0_corr_cyc) caprmse_cvlist_cyc.append(codeap_rmse_cyc) lsd_cvlist_cyc.append(lsd_mean_cyc) lsdstd_cvlist_cyc.append(lsd_std_cyc) cvlist_cyc.append(np.var(melsp_cyc_rest, axis=0)) elif 'dv' in dataset: logging.info('dev') f0rmse_cvlist_dv.append(f0_rmse) f0corr_cvlist_dv.append(f0_corr) caprmse_cvlist_dv.append(codeap_rmse) lsd_cvlist_dv.append(lsd_mean) lsdstd_cvlist_dv.append(lsd_std) cvlist_dv.append(np.var(melsp_src_rest, axis=0)) logging.info(len(cvlist_dv)) f0rmse_cvlist_cyc_dv.append(f0_rmse_cyc) f0corr_cvlist_cyc_dv.append(f0_corr_cyc) caprmse_cvlist_cyc_dv.append(codeap_rmse_cyc) lsd_cvlist_cyc_dv.append(lsd_mean_cyc) lsdstd_cvlist_cyc_dv.append(lsd_std_cyc) cvlist_cyc_dv.append(np.var(melsp_cyc_rest, axis=0)) logging.info('write rec to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_rec.shape) write_hdf5(feat_file, args.string_path, feat_rec) logging.info('write cyc to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_cyc.shape) write_hdf5(feat_file, args.string_path, feat_cyc) count += 1 #if count >= 5: # break # parallel decode training with mp.Manager() as manager: gpu = 0 processes = [] cvlist = manager.list() lsd_cvlist = manager.list() lsdstd_cvlist = manager.list() f0rmse_cvlist = manager.list() f0corr_cvlist = manager.list() caprmse_cvlist = manager.list() cvlist_dv = manager.list() lsd_cvlist_dv = manager.list() lsdstd_cvlist_dv = manager.list() f0rmse_cvlist_dv = manager.list() f0corr_cvlist_dv = manager.list() caprmse_cvlist_dv = manager.list() cvlist_cyc = manager.list() lsd_cvlist_cyc = manager.list() lsdstd_cvlist_cyc = manager.list() f0rmse_cvlist_cyc = manager.list() f0corr_cvlist_cyc = manager.list() caprmse_cvlist_cyc = manager.list() cvlist_cyc_dv = manager.list() lsd_cvlist_cyc_dv = manager.list() lsdstd_cvlist_cyc_dv = manager.list() f0rmse_cvlist_cyc_dv = manager.list() f0corr_cvlist_cyc_dv = manager.list() caprmse_cvlist_cyc_dv = manager.list() for i, feat_list in enumerate(feat_lists): logging.info(i) p = mp.Process(target=gpu_decode, args=( feat_list, gpu, cvlist, 
lsd_cvlist, lsdstd_cvlist, cvlist_dv, lsd_cvlist_dv, lsdstd_cvlist_dv, f0rmse_cvlist, f0corr_cvlist, caprmse_cvlist, f0rmse_cvlist_dv, f0corr_cvlist_dv, caprmse_cvlist_dv, cvlist_cyc, lsd_cvlist_cyc, lsdstd_cvlist_cyc, cvlist_cyc_dv, lsd_cvlist_cyc_dv, lsdstd_cvlist_cyc_dv, f0rmse_cvlist_cyc, f0corr_cvlist_cyc, caprmse_cvlist_cyc, f0rmse_cvlist_cyc_dv, f0corr_cvlist_cyc_dv, caprmse_cvlist_cyc_dv, )) p.start() processes.append(p) gpu += 1 if (i + 1) % args.n_gpus == 0: gpu = 0 # wait for all process for p in processes: p.join() # calculate cv_gv statistics if len(lsd_cvlist) > 0: logging.info("lsd_rec: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(lsd_cvlist)), \ np.std(np.array(lsd_cvlist)),np.mean(np.array(lsdstd_cvlist)),\ np.std(np.array(lsdstd_cvlist)))) cvgv_mean = np.mean(np.array(cvlist), axis=0) cvgv_var = np.var(np.array(cvlist), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))))) logging.info("f0rmse_rec: %.6f Hz (+- %.6f)" % (np.mean( np.array(f0rmse_cvlist)), np.std(np.array(f0rmse_cvlist)))) logging.info("f0corr_rec: %.6f (+- %.6f)" % (np.mean( np.array(f0corr_cvlist)), np.std(np.array(f0corr_cvlist)))) caprmse_cvlist = np.array(caprmse_cvlist) for i in range(caprmse_cvlist.shape[-1]): logging.info("caprmse-%d_rec: %.6f dB (+- %.6f)" % (i + 1, np.mean(caprmse_cvlist[:, i]), np.std(caprmse_cvlist[:, i]))) logging.info("lsd_cyc: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(lsd_cvlist_cyc)), \ np.std(np.array(lsd_cvlist_cyc)),np.mean(np.array(lsdstd_cvlist_cyc)),\ np.std(np.array(lsdstd_cvlist_cyc)))) cvgv_mean = np.mean(np.array(cvlist_cyc), axis=0) cvgv_var = np.var(np.array(cvlist_cyc), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))))) logging.info("f0rmse_cyc: %.6f Hz (+- %.6f)" % (np.mean(np.array(f0rmse_cvlist_cyc)), np.std(np.array(f0rmse_cvlist_cyc)))) logging.info("f0corr_cyc: %.6f (+- %.6f)" % (np.mean(np.array(f0corr_cvlist_cyc)), np.std(np.array(f0corr_cvlist_cyc)))) caprmse_cvlist_cyc = np.array(caprmse_cvlist_cyc) for i in range(caprmse_cvlist_cyc.shape[-1]): logging.info("caprmse-%d_cyc: %.6f dB (+- %.6f)" % (i + 1, np.mean(caprmse_cvlist_cyc[:, i]), np.std(caprmse_cvlist_cyc[:, i]))) cvgv_mean = np.mean(np.array(np.r_[cvlist, cvlist_cyc]), axis=0) cvgv_var = np.var(np.array(np.r_[cvlist, cvlist_cyc]), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))))) string_path = model_name+"-"+str(config.n_half_cyc)+"-"+str(config.lat_dim)+"-"+str(config.lat_dim_e)\ +"-"+str(config.spkidtr_dim)+"-"+model_epoch logging.info(string_path) string_mean = "/recgv_mean_" + string_path string_var = "/recgv_var_" + string_path write_hdf5(spk_stat, string_mean, cvgv_mean) write_hdf5(spk_stat, string_var, cvgv_var) if len(lsd_cvlist_dv) > 0: logging.info("lsd_rec_dv: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(lsd_cvlist_dv)), \ np.std(np.array(lsd_cvlist_dv)),np.mean(np.array(lsdstd_cvlist_dv)),\ np.std(np.array(lsdstd_cvlist_dv)))) cvgv_mean = np.mean(np.array(cvlist_dv), axis=0) cvgv_var = np.var(np.array(cvlist_dv), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))))) logging.info("f0rmse_rec_dv: 
%.6f Hz (+- %.6f)" % (np.mean(np.array(f0rmse_cvlist_dv)), np.std(np.array(f0rmse_cvlist_dv)))) logging.info("f0corr_rec_dv: %.6f (+- %.6f)" % (np.mean(np.array(f0corr_cvlist_dv)), np.std(np.array(f0corr_cvlist_dv)))) caprmse_cvlist_dv = np.array(caprmse_cvlist_dv) for i in range(caprmse_cvlist_dv.shape[-1]): logging.info("caprmse-%d_rec_dv: %.6f dB (+- %.6f)" % (i + 1, np.mean(caprmse_cvlist_dv[:, i]), np.std(caprmse_cvlist_dv[:, i]))) logging.info("lsd_cyc_dv: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(lsd_cvlist_cyc_dv)), \ np.std(np.array(lsd_cvlist_cyc_dv)),np.mean(np.array(lsdstd_cvlist_cyc_dv)),\ np.std(np.array(lsdstd_cvlist_cyc_dv)))) cvgv_mean = np.mean(np.array(cvlist_cyc_dv), axis=0) cvgv_var = np.var(np.array(cvlist_cyc_dv), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))))) logging.info("f0rmse_cyc_dv: %.6f Hz (+- %.6f)" % (np.mean(np.array(f0rmse_cvlist_cyc_dv)), np.std(np.array(f0rmse_cvlist_cyc_dv)))) logging.info("f0corr_cyc_dv: %.6f (+- %.6f)" % (np.mean(np.array(f0corr_cvlist_cyc_dv)), np.std(np.array(f0corr_cvlist_cyc_dv)))) caprmse_cvlist_cyc_dv = np.array(caprmse_cvlist_cyc_dv) for i in range(caprmse_cvlist_cyc_dv.shape[-1]): logging.info("caprmse-%d_cyc_dv: %.6f dB (+- %.6f)" % (i + 1, np.mean(caprmse_cvlist_cyc_dv[:, i]), np.std(caprmse_cvlist_cyc_dv[:, i])))
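# Both decode functions above carry `outpad_lefts`/`outpad_rights` bookkeeping:
# the input is padded by twice the total left/right context of the module chain
# (two encode/decode passes), and each module consumes its own pad from what is
# left, so intermediate outputs can be trimmed consistently. A self-contained
# sketch of that arithmetic with illustrative pad sizes (not from any config):
module_pads = [(2, 1), (3, 0), (1, 1)]    # illustrative (pad_left, pad_right) per module

pad_left = sum(l for l, _ in module_pads) * 2
pad_right = sum(r for _, r in module_pads) * 2

outpad_lefts, outpad_rights = [], []
left, right = pad_left, pad_right
for l, r in module_pads * 2:              # two passes through the chain (reconstruction + cyclic)
    left, right = left - l, right - r
    outpad_lefts.append(left)
    outpad_rights.append(right)

# After the last module has run, no padding remains to trim.
assert outpad_lefts[-1] == 0 and outpad_rights[-1] == 0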
def main(): parser = argparse.ArgumentParser() parser.add_argument("--feats", default=None, required=True, help="name of the list of hdf5 files") parser.add_argument("--stats", default=None, required=True, help="filename of hdf5 format") parser.add_argument("--expdir", required=True, type=str, help="directory to save the log") parser.add_argument("--stdim", default=5, type=int, help="directory to save the log") parser.add_argument("--spkr", default=None, type=str, help="directory to save the log") parser.add_argument("--verbose", default=1, type=int, help="log message level") args = parser.parse_args() # set log level if args.verbose == 1: logging.basicConfig( level=logging.INFO, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filename=args.expdir + "/calc_stats.log") logging.getLogger().addHandler(logging.StreamHandler()) elif args.verbose > 1: logging.basicConfig( level=logging.DEBUG, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filename=args.expdir + "/calc_stats.log") logging.getLogger().addHandler(logging.StreamHandler()) else: logging.basicConfig( level=logging.WARN, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filename=args.expdir + "/calc_stats.log") logging.getLogger().addHandler(logging.StreamHandler()) logging.warn("logging is disabled.") # read list and define scaler filenames = read_txt(args.feats) scaler_feat_org_lf0 = StandardScaler() logging.info("number of training utterances = " + str(len(filenames))) #var = [] var_range = [] f0s_range = np.empty((0)) # process over all of data for filename in filenames: logging.info(filename) feat_org_lf0 = read_hdf5(filename, "/feat_org_lf0") scaler_feat_org_lf0.partial_fit(feat_org_lf0) mcep_range = feat_org_lf0[:, args.stdim:] var_range.append(np.var(mcep_range, axis=0)) logging.info(mcep_range.shape) if check_hdf5(filename, "/f0_range"): f0_range = read_hdf5(filename, "/f0_range") else: f0_range = read_hdf5(filename, "/f0") nonzero_indices = np.nonzero(f0_range) logging.info(f0_range[nonzero_indices].shape) logging.info(f0s_range.shape) f0s_range = np.concatenate([f0s_range, f0_range[nonzero_indices]]) logging.info(f0s_range.shape) mean_feat_org_lf0 = scaler_feat_org_lf0.mean_ scale_feat_org_lf0 = scaler_feat_org_lf0.scale_ gv_range_mean = np.mean(np.array(var_range), axis=0) gv_range_var = np.var(np.array(var_range), axis=0) logging.info(gv_range_mean) logging.info(gv_range_var) f0_range_mean = np.mean(f0s_range) f0_range_std = np.std(f0s_range) logging.info(f0_range_mean) logging.info(f0_range_std) lf0_range_mean = np.mean(np.log(f0s_range)) lf0_range_std = np.std(np.log(f0s_range)) logging.info(lf0_range_mean) logging.info(lf0_range_std) logging.info(np.array_equal(f0_range_mean, np.exp(lf0_range_mean))) logging.info(np.array_equal(f0_range_std, np.exp(lf0_range_std))) logging.info(mean_feat_org_lf0) logging.info(scale_feat_org_lf0) write_hdf5(args.stats, "/mean_feat_org_lf0", mean_feat_org_lf0) write_hdf5(args.stats, "/scale_feat_org_lf0", scale_feat_org_lf0) write_hdf5(args.stats, "/gv_range_mean", gv_range_mean) write_hdf5(args.stats, "/gv_range_var", gv_range_var) write_hdf5(args.stats, "/f0_range_mean", f0_range_mean) write_hdf5(args.stats, "/f0_range_std", f0_range_std) write_hdf5(args.stats, "/lf0_range_mean", lf0_range_mean) write_hdf5(args.stats, "/lf0_range_std", lf0_range_std)
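# The statistics script above accumulates mean/scale with
# StandardScaler.partial_fit, so the corpus never has to be concatenated in
# memory. A tiny self-contained check of that equivalence on random data
# (illustration only, not part of the pipeline):
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
chunks = [rng.normal(size=(100, 4)) for _ in range(5)]   # stand-ins for per-utterance features

streaming = StandardScaler()
for chunk in chunks:
    streaming.partial_fit(chunk)                          # update running mean/variance per chunk

batch = StandardScaler().fit(np.concatenate(chunks))      # fit on everything at once

assert np.allclose(streaming.mean_, batch.mean_)
assert np.allclose(streaming.scale_, batch.scale_)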
modulo_index = 0 if len(sys.argv) > 3: modulo_index = max(0, int(sys.argv[3])) print('[Data] modulo index %d' % modulo_index) config_files = [config_file for config_file in os.listdir(config_folder)] config = utils.read_json(config_folder + config_files[-1]) scaled_directory = config['scaled_directory'] + '/' assert os.path.exists( scaled_directory), 'directory %s does not exist' % scaled_directory depth_directory = config['depth_directory'] + '/' utils.makedir(depth_directory) off_files = utils.read_ordered_directory(scaled_directory) timer = Timer() Rs = get_views(config['watertight_rendering']['n_views']) for n in range(len(off_files)): if (n - modulo_index) % modulo_base == 0: timer.reset() mesh = Mesh.from_off(off_files[n]) depths = render(mesh, Rs) depth_file = depth_directory + '%d.hdf5' % n utils.write_hdf5(depth_file, np.array(depths)) print('[Data] wrote %s (%f seconds)' % (depth_file, timer.elapsed()))
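# The `(n - modulo_index) % modulo_base == 0` test above shards the mesh list so
# that several independent invocations of the script render disjoint subsets.
# A minimal illustration of how that partitioning behaves (numbers are hypothetical):
files = list(range(10))
modulo_base = 3

shards = {idx: [n for n in files if (n - idx) % modulo_base == 0]
          for idx in range(modulo_base)}
# shards == {0: [0, 3, 6, 9], 1: [1, 4, 7], 2: [2, 5, 8]}

# Every file lands in exactly one shard.
assert sorted(sum(shards.values(), [])) == files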
truncation = config['truncation'] sdfs = utils.read_hdf5(common.filename(config, 'sdf_file')) tsdfs = sdfs.copy() tsdfs[tsdfs > truncation] = truncation tsdfs[tsdfs < -truncation] = -truncation ltsdfs = tsdfs.copy() ltsdfs[ltsdfs > 0] = np.log(ltsdfs[ltsdfs > 0] + 1) ltsdfs[ltsdfs < 0] = -np.log(np.abs(ltsdfs[ltsdfs < 0]) + 1) tsdf_file = common.filename(config, 'tsdf_file') ltsdf_file = common.filename(config, 'ltsdf_file') utils.write_hdf5(tsdf_file, tsdfs) print('[Data] wrote ' + tsdf_file) utils.write_hdf5(ltsdf_file, ltsdfs) print('[Data] wrote ' + ltsdf_file) config_files = [ config_file for config_file in os.listdir(config_folder) if config_file.find('prior') < 0 ] for config_file in config_files: print('[Data] reading ' + config_folder + config_file) config = utils.read_json(config_folder + config_file) input_sdfs = utils.read_hdf5(common.filename(config, 'input_sdf_file')) input_tsdfs = input_sdfs.copy()
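# The truncation and log-scaling above compress the SDF symmetrically around
# zero, i.e. ltsdf = sign(tsdf) * log(1 + |tsdf|). The inverse mapping is not
# needed in this script, but a sketch of the pair (helper names are ours) makes
# the transform explicit:
import numpy as np

def log_tsdf(tsdf):
    # Symmetric log compression as used above: sign(x) * log(1 + |x|).
    return np.sign(tsdf) * np.log1p(np.abs(tsdf))

def inv_log_tsdf(ltsdf):
    # Hypothetical inverse (not in the original script): sign(x) * (exp(|x|) - 1).
    return np.sign(ltsdf) * np.expm1(np.abs(ltsdf))

x = np.linspace(-0.5, 0.5, 11)
assert np.allclose(inv_log_tsdf(log_tsdf(x)), x)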
def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None, \ mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None, \ mcd_cvlist_dv=None, mcdstd_cvlist_dv=None): with torch.cuda.device(gpu): mean_jnt = torch.FloatTensor( read_hdf5(args.stats_jnt, "/mean_feat_org_lf0")[config.stdim:]).cuda() std_jnt = torch.FloatTensor( read_hdf5(args.stats_jnt, "/scale_feat_org_lf0")[config.stdim:]).cuda() # define model and load parameters logging.info("model") logging.info(config) with torch.no_grad(): model_encoder = GRU_RNN_STOCHASTIC( in_dim=config.in_dim, out_dim=config.lat_dim, hidden_layers=config.hidden_layers, hidden_units=config.hidden_units, kernel_size=config.kernel_size, dilation_size=config.dilation_size, spk_dim=n_spk, scale_out_flag=False) model_decoder = GRU_RNN(in_dim=config.lat_dim + n_spk, out_dim=config.out_dim, hidden_layers=config.hidden_layers, hidden_units=config.hidden_units, kernel_size=config.kernel_size, dilation_size=config.dilation_size, scale_in_flag=False) logging.info(model_encoder) logging.info(model_decoder) model_encoder.load_state_dict( torch.load(args.model)["model_encoder"]) model_decoder.load_state_dict( torch.load(args.model)["model_decoder"]) model_encoder.cuda() model_decoder.cuda() model_encoder.eval() model_decoder.eval() for param in model_encoder.parameters(): param.requires_grad = False for param in model_decoder.parameters(): param.requires_grad = False init_pp = np.zeros((1, 1, config.lat_dim * 2 + n_spk)) y_in_pp = torch.FloatTensor(init_pp).cuda() y_in_src = torch.unsqueeze( torch.unsqueeze((0 - mean_jnt) / std_jnt, 0), 0) for feat_file in feat_list: # convert mcep logging.info("recmcep " + feat_file) feat = read_hdf5(feat_file, "/feat_org_lf0") logging.info(feat.shape) with torch.no_grad(): lat_feat, _, _, _, _ = model_encoder(torch.FloatTensor(feat).cuda(), \ y_in_pp, sampling=False) spk_code = np.zeros((lat_feat.shape[0], n_spk)) spk_code[:, spk_code_idx] = 1 spk_code = torch.FloatTensor(spk_code).cuda() cvmcep, _, _ = model_decoder( torch.cat((spk_code, lat_feat), 1), y_in_src) cvmcep = np.array(cvmcep.cpu().data.numpy(), dtype=np.float64) logging.info(cvmcep.shape) mcep = feat[:, config.stdim:] spcidx = read_hdf5(feat_file, "/spcidx_range")[0] _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[np.array(spcidx),:], dtype=np.float64), \ np.array(cvmcep[np.array(spcidx),:], dtype=np.float64)) _, mcd_arr = dtw.calc_mcd(np.array(mcep[np.array(spcidx),1:], dtype=np.float64), \ np.array(cvmcep[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) dataset = feat_file.split('/')[1].split('_')[0] if 'tr' in dataset: logging.info('trn') mcdpow_cvlist.append(mcdpow_mean) mcdpowstd_cvlist.append(mcdpow_std) mcd_cvlist.append(mcd_mean) mcdstd_cvlist.append(mcd_std) cvlist.append(np.var(cvmcep[:, 1:], axis=0)) logging.info(len(cvlist)) elif 'dv' in dataset: logging.info('dev') mcdpow_cvlist_dv.append(mcdpow_mean) mcdpowstd_cvlist_dv.append(mcdpow_std) mcd_cvlist_dv.append(mcd_mean) mcdstd_cvlist_dv.append(mcd_std) cvlist_dv.append(np.var(cvmcep[:, 1:], axis=0)) logging.info(len(cvlist_dv)) logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) logging.info("mod_pow") cvmcep = mod_pow(cvmcep, mcep, alpha=args.mcep_alpha, irlen=IRLEN) logging.info(cvmcep.shape) feat_cvmcep = np.c_[feat[:, :config.stdim], cvmcep] 
logging.info(feat_cvmcep.shape) write_path = '/feat_recmcep_cycvae-' + model_epoch outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), spk + "-" + spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + write_path) write_hdf5(feat_file, write_path, feat_cvmcep)
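# The mel-cepstral distortion values logged by these decode functions (via
# dtw.calc_mcd / dtw.dtw_org_to_trg) follow the standard MCD definition. A
# self-contained sketch of the frame-wise MCD in dB, assuming the mel-cepstra
# are already time-aligned (the helper name is ours, not from the repository):
import numpy as np

def mcd_db(mcep_ref, mcep_cv):
    # Frame-wise mel-cepstral distortion in dB between aligned sequences:
    #   mcd = (10 / ln 10) * sqrt(2 * sum_d (c_ref_d - c_cv_d)^2)
    # Pass mcep[:, 1:] to exclude the 0th (power) coefficient, as the "mcd"
    # metrics above do; keep it for the "mcdpow" variants.
    diff = np.asarray(mcep_ref, dtype=np.float64) - np.asarray(mcep_cv, dtype=np.float64)
    return (10.0 / np.log(10.0)) * np.sqrt(2.0 * np.sum(diff ** 2, axis=-1))

# Usage, mirroring the logging above:
#   mcd_arr = mcd_db(mcep[spcidx, 1:], cvmcep[spcidx, 1:]); mcd_arr.mean(), mcd_arr.std()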
def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None, \ mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None, \ mcd_cvlist_dv=None, mcdstd_cvlist_dv=None, \ cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None, \ mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None, \ mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None): with torch.cuda.device(gpu): # define model and load parameters with torch.no_grad(): model_encoder = GRU_VAE_ENCODER( in_dim=config.mcep_dim + config.excit_dim, n_spk=n_spk, lat_dim=config.lat_dim, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=False, ar=False, pad_first=True, right_size=config.right_size_enc) logging.info(model_encoder) model_decoder = GRU_SPEC_DECODER( feat_dim=config.lat_dim, out_dim=config.mcep_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_dec, hidden_units=config.hidden_units_dec, kernel_size=config.kernel_size_dec, dilation_size=config.dilation_size_dec, causal_conv=config.causal_conv_dec, bi=False, ar=False, pad_first=True, right_size=config.right_size_dec) logging.info(model_decoder) model_post = GRU_POST_NET( spec_dim=config.mcep_dim, excit_dim=2, n_spk=n_spk, hidden_layers=config.hidden_layers_post, hidden_units=config.hidden_units_post, kernel_size=config.kernel_size_post, dilation_size=config.dilation_size_post, causal_conv=config.causal_conv_post, pad_first=True, right_size=config.right_size_post) #excit_dim=config.excit_dim, #excit_dim=None, logging.info(model_post) model_encoder.load_state_dict( torch.load(args.model)["model_encoder"]) model_decoder.load_state_dict( torch.load(args.model)["model_decoder"]) model_post.load_state_dict( torch.load(args.model)["model_post"]) model_encoder.remove_weight_norm() model_decoder.remove_weight_norm() model_post.remove_weight_norm() model_encoder.cuda() model_decoder.cuda() model_post.cuda() model_encoder.eval() model_decoder.eval() model_post.eval() for param in model_encoder.parameters(): param.requires_grad = False for param in model_decoder.parameters(): param.requires_grad = False for param in model_post.parameters(): param.requires_grad = False count = 0 pad_left = (model_encoder.pad_left + model_decoder.pad_left + model_post.pad_left) * 2 pad_right = (model_encoder.pad_right + model_decoder.pad_right + model_post.pad_right) * 2 outpad_lefts = [None] * 5 outpad_rights = [None] * 5 outpad_lefts[0] = pad_left - model_encoder.pad_left outpad_rights[0] = pad_right - model_encoder.pad_right outpad_lefts[1] = outpad_lefts[0] - model_decoder.pad_left outpad_rights[1] = outpad_rights[0] - model_decoder.pad_right outpad_lefts[2] = outpad_lefts[1] - model_post.pad_left outpad_rights[2] = outpad_rights[1] - model_post.pad_right outpad_lefts[3] = outpad_lefts[2] - model_encoder.pad_left outpad_rights[3] = outpad_rights[2] - model_encoder.pad_right outpad_lefts[4] = outpad_lefts[3] - model_decoder.pad_left outpad_rights[4] = outpad_rights[3] - model_decoder.pad_right logging.info(f'{pad_left} {pad_right}') logging.info(outpad_lefts) logging.info(outpad_rights) for feat_file in feat_list: # convert mcep logging.info("recmcep " + feat_file) feat_org = read_hdf5(feat_file, "/feat_mceplf0cap") logging.info(feat_org.shape) mcep = np.array(feat_org[:, -config.mcep_dim:]) with torch.no_grad(): feat = F.pad( 
torch.FloatTensor(feat_org).cuda().unsqueeze( 0).transpose(1, 2), (pad_left, pad_right), "replicate").transpose(1, 2) feat_excit = torch.FloatTensor( feat_org[:, :config.excit_dim]).cuda().unsqueeze(0) spk_logits, _, lat_src, _ = model_encoder(feat, sampling=False) logging.info('input spkpost') if outpad_rights[0] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]: -outpad_rights[0]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]:], dim=-1), 1)) cvmcep_src, _ = model_decoder((torch.ones( (1, lat_src.shape[1])) * spk_idx).cuda().long(), lat_src) cvmcep_src_post, _ = model_post( cvmcep_src, y=(torch.ones( (1, cvmcep_src.shape[1])) * spk_idx).cuda().long(), e=F.pad(feat_excit[:, :, :2].transpose(1, 2), (outpad_lefts[1], outpad_rights[1]), "replicate").transpose(1, 2)) #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2)) if model_post.pad_right > 0: spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:-model_post.pad_right]), 2), sampling=False) else: spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:]), 2), sampling=False) logging.info('rec spkpost') if outpad_rights[3] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[3]: -outpad_rights[3]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[3]:], dim=-1), 1)) cvmcep_cyc, _ = model_decoder((torch.ones( (1, lat_rec.shape[1])) * spk_idx).cuda().long(), lat_rec) cvmcep_cyc_post, _ = model_post( cvmcep_cyc, y=(torch.ones( (1, cvmcep_cyc.shape[1])) * spk_idx).cuda().long(), e=F.pad(feat_excit[:, :, :2].transpose(1, 2), (outpad_lefts[4], outpad_rights[4]), "replicate").transpose(1, 2)) #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2)) if outpad_rights[2] > 0: feat_rec = torch.cat( (feat_excit, cvmcep_src_post[:, outpad_lefts[2]:-outpad_rights[2]] ), 2)[0].cpu().data.numpy() else: feat_rec = torch.cat( (feat_excit, cvmcep_src_post[:, outpad_lefts[2]:]), 2)[0].cpu().data.numpy() feat_cyc = torch.cat((feat_excit, cvmcep_cyc_post), 2)[0].cpu().data.numpy() if outpad_rights[2] > 0: cvmcep_src = np.array( cvmcep_src_post[:, outpad_lefts[2]:-outpad_rights[2]] [0].cpu().data.numpy(), dtype=np.float64) else: cvmcep_src = np.array(cvmcep_src_post[:, outpad_lefts[2]:] [0].cpu().data.numpy(), dtype=np.float64) cvmcep_cyc = np.array( cvmcep_cyc_post[0].cpu().data.numpy(), dtype=np.float64) logging.info(cvmcep_src.shape) logging.info(cvmcep_cyc.shape) spcidx = read_hdf5(feat_file, "/spcidx_range")[0] _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),1:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),:], 
dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),1:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean_cyc = np.mean(mcdpow_arr) mcdpow_std_cyc = np.std(mcdpow_arr) mcd_mean_cyc = np.mean(mcd_arr) mcd_std_cyc = np.std(mcd_arr) logging.info("mcdpow_cyc: %.6f dB +- %.6f" % (mcdpow_mean_cyc, mcdpow_std_cyc)) logging.info("mcd_cyc: %.6f dB +- %.6f" % (mcd_mean_cyc, mcd_std_cyc)) dataset = feat_file.split('/')[1].split('_')[0] if 'tr' in dataset: logging.info('trn') mcdpow_cvlist.append(mcdpow_mean) mcdpowstd_cvlist.append(mcdpow_std) mcd_cvlist.append(mcd_mean) mcdstd_cvlist.append(mcd_std) cvlist.append(np.var(cvmcep_src[:, 1:], axis=0)) logging.info(len(cvlist)) mcdpow_cvlist_cyc.append(mcdpow_mean_cyc) mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc) mcd_cvlist_cyc.append(mcd_mean_cyc) mcdstd_cvlist_cyc.append(mcd_std_cyc) cvlist_cyc.append(np.var(cvmcep_cyc[:, 1:], axis=0)) elif 'dv' in dataset: logging.info('dev') mcdpow_cvlist_dv.append(mcdpow_mean) mcdpowstd_cvlist_dv.append(mcdpow_std) mcd_cvlist_dv.append(mcd_mean) mcdstd_cvlist_dv.append(mcd_std) cvlist_dv.append(np.var(cvmcep_src[:, 1:], axis=0)) logging.info(len(cvlist_dv)) mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc) mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc) mcd_cvlist_cyc_dv.append(mcd_mean_cyc) mcdstd_cvlist_cyc_dv.append(mcd_std_cyc) cvlist_cyc_dv.append(np.var(cvmcep_cyc[:, 1:], axis=0)) logging.info('write rec to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_rec.shape) write_hdf5(feat_file, args.string_path, feat_rec) logging.info('write cyc to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_cyc.shape) write_hdf5(feat_file, args.string_path, feat_cyc) count += 1
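# This gpu_decode variant is launched the same way as the one in the earlier
# main(): the feature list is split with np.array_split and one process per
# chunk is started, cycling over the available GPUs. A condensed sketch of that
# dispatch pattern (function and list names here are illustrative):
import multiprocessing as mp
import numpy as np

def dispatch(feat_list, n_gpus, worker):
    # Split the file list into n_gpus chunks and run one worker process per
    # chunk, assigning GPU ids round-robin.
    chunks = [c.tolist() for c in np.array_split(feat_list, n_gpus)]
    with mp.Manager() as manager:
        results = manager.list()          # shared list the workers append to
        processes, gpu = [], 0
        for i, chunk in enumerate(chunks):
            p = mp.Process(target=worker, args=(chunk, gpu, results))
            p.start()
            processes.append(p)
            gpu += 1
            if (i + 1) % n_gpus == 0:     # wrap back to the first GPU
                gpu = 0
        for p in processes:
            p.join()
        return list(results)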
data = data.reshape((shape[0], np.prod(np.array(shape[1:])))) print('[Validation] reshaped data ' + 'x'.join(map(str, data.shape))) mean_file = args.mean_file V_file = args.V_file var_file = args.var_file mean = utils.read_hdf5(mean_file) print('[Validation] read ' + mean_file) V = utils.read_hdf5(V_file) print('[Validation] read ' + V_file) var = utils.read_hdf5(var_file)[0] print('[Validation] read ' + var_file) print('[Validation] var is ' + str(var)) I = np.eye(V.shape[1]) M = V.T.dot(V) + I*var M_inv = np.linalg.inv(M) means = np.repeat(mean.reshape((mean.shape[0], 1)), data.shape[0], axis = 1) codes = M_inv.dot(V.T.dot(data.T - means)) code_mean = np.mean(codes) code_var = np.var(codes) print('[Validation] codes: ' + str(code_mean) + ' / ' + str(code_var)) preds = np.dot(V, codes) + means preds = preds.T utils.write_hdf5(args.output, preds.reshape(shape)) print('[Validation] wrote ' + args.output)
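# The validation block above is the posterior-mean reconstruction of a
# linear-Gaussian (probabilistic-PCA-style) model: with loadings V, mean m and
# noise variance var, codes are (V^T V + var*I)^{-1} V^T (x - m) and the
# prediction is V*codes + m. A compact numpy sketch of the same computation on
# random data (dimensions are illustrative; np.linalg.solve replaces the
# explicit matrix inverse used above):
import numpy as np

rng = np.random.default_rng(0)
D, K, N = 16, 4, 100                     # observed dim, latent dim, samples (illustrative)
V = rng.normal(size=(D, K))              # loading matrix, analogous to V_file above
mean = rng.normal(size=D)                # data mean, analogous to mean_file above
var = 0.1                                # noise variance, analogous to var_file above
data = rng.normal(size=(N, D))

M = V.T @ V + var * np.eye(K)                       # K x K
codes = np.linalg.solve(M, V.T @ (data - mean).T)   # posterior-mean codes, K x N
preds = (V @ codes).T + mean                        # reconstructions, N x D
assert preds.shape == data.shape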
def main(): parser = argparse.ArgumentParser() parser.add_argument("--feats", default=None, required=True, help="name of the list of hdf5 files") parser.add_argument("--stats", default=None, required=True, help="filename of stats for hdf5 format") parser.add_argument("--expdir", required=True, type=str, help="directory to save the log") parser.add_argument("--mcep_dim", default=50, type=int, help="dimension of mel-cepstrum") parser.add_argument("--n_jobs", default=10, type=int, help="number of parallel jobs") parser.add_argument("--verbose", default=1, type=int, help="log message level") args = parser.parse_args() # set log level if args.verbose == 1: logging.basicConfig( level=logging.INFO, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filename=args.expdir + "/calc_stats.log") logging.getLogger().addHandler(logging.StreamHandler()) elif args.verbose > 1: logging.basicConfig( level=logging.DEBUG, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filename=args.expdir + "/calc_stats.log") logging.getLogger().addHandler(logging.StreamHandler()) else: logging.basicConfig( level=logging.WARN, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filename=args.expdir + "/calc_stats.log") logging.getLogger().addHandler(logging.StreamHandler()) logging.warn("logging is disabled.") # read list and define scaler filenames = read_txt(args.feats) logging.info("number of training utterances = " + str(len(filenames))) def calc_stats(filenames, cpu, feat_mceplf0cap_list, feat_orglf0_list, varmcep_list, f0_list, melsp_list, varmelsp_list, melworldsp_list, varmelworldsp_list): feat_mceplf0cap_arr = None feat_orglf0_arr = None varmcep_arr = None f0_arr = None melsp_arr = None varmelsp_arr = None melworldsp_arr = None varmelworldsp_arr = None count = 0 # process over all of data for filename in filenames: logging.info(filename) feat_mceplf0cap = read_hdf5(filename, "/feat_mceplf0cap") logging.info(feat_mceplf0cap.shape) feat_orglf0 = read_hdf5(filename, "/feat_org_lf0") logging.info(feat_orglf0.shape) melsp = read_hdf5(filename, "/log_1pmelmagsp") logging.info(melsp.shape) melworldsp = read_hdf5(filename, "/log_1pmelworldsp") logging.info(melworldsp.shape) if feat_mceplf0cap_arr is not None: feat_mceplf0cap_arr = np.r_[feat_mceplf0cap_arr, feat_mceplf0cap] else: feat_mceplf0cap_arr = feat_mceplf0cap if feat_orglf0_arr is not None: feat_orglf0_arr = np.r_[feat_orglf0_arr, feat_orglf0] else: feat_orglf0_arr = feat_orglf0 logging.info('feat') logging.info(feat_mceplf0cap_arr.shape) logging.info(feat_orglf0_arr.shape) if varmcep_arr is not None: varmcep_arr = np.r_[varmcep_arr, np.var(feat_mceplf0cap[:,-args.mcep_dim:], \ axis=0, keepdims=True)] else: varmcep_arr = np.var(feat_mceplf0cap[:, -args.mcep_dim:], axis=0, keepdims=True) logging.info('var') logging.info(varmcep_arr.shape) logging.info('f0') f0 = read_hdf5(filename, "/f0_range") logging.info(f0.shape) logging.info('f0 > 0') f0 = f0[np.nonzero(f0)] logging.info(f0.shape) if f0_arr is not None: f0_arr = np.r_[f0_arr, f0] else: f0_arr = f0 logging.info(f0_arr.shape) if melsp_arr is not None: melsp_arr = np.r_[melsp_arr, melsp] else: melsp_arr = melsp logging.info(melsp_arr.shape) if varmelsp_arr is not None: varmelsp_arr = np.r_[varmelsp_arr, np.var((np.exp(melsp)-1)/10000, axis=0, \ keepdims=True)] else: varmelsp_arr = np.var((np.exp(melsp) - 1) / 10000, axis=0, keepdims=True) logging.info('var 
melsp') logging.info(varmelsp_arr.shape) if melworldsp_arr is not None: melworldsp_arr = np.r_[melworldsp_arr, melworldsp] else: melworldsp_arr = melworldsp logging.info(melworldsp_arr.shape) if varmelworldsp_arr is not None: varmelworldsp_arr = np.r_[varmelworldsp_arr, np.var((np.exp(melworldsp)-1)/10000, axis=0, \ keepdims=True)] else: varmelworldsp_arr = np.var((np.exp(melworldsp) - 1) / 10000, axis=0, keepdims=True) logging.info('var melworldsp') logging.info(varmelworldsp_arr.shape) count += 1 logging.info("cpu %d %d %d %d %d %d %d %d %d %d" % (cpu, count, len(feat_mceplf0cap_arr), len(feat_orglf0_arr), len(varmcep_arr), len(f0_arr), len(melsp_arr), len(varmelsp_arr), len(melworldsp_arr), len(varmelworldsp_arr))) #if count >= 5: # break feat_mceplf0cap_list.append(feat_mceplf0cap_arr) feat_orglf0_list.append(feat_orglf0_arr) varmcep_list.append(varmcep_arr) f0_list.append(f0_arr) melsp_list.append(melsp_arr) varmelsp_list.append(varmelsp_arr) melworldsp_list.append(melworldsp_arr) varmelworldsp_list.append(varmelworldsp_arr) # divie list feat_lists = np.array_split(filenames, args.n_jobs) feat_lists = [f_list.tolist() for f_list in feat_lists] for i in range(len(feat_lists)): logging.info("%d %d" % (i + 1, len(feat_lists[i]))) # multi processing with mp.Manager() as manager: processes = [] feat_mceplf0cap_list = manager.list() feat_orglf0_list = manager.list() varmcep_list = manager.list() f0_list = manager.list() melsp_list = manager.list() varmelsp_list = manager.list() melworldsp_list = manager.list() varmelworldsp_list = manager.list() for i, feat_list in enumerate(feat_lists): p = mp.Process(target=calc_stats, args=( feat_list, i + 1, feat_mceplf0cap_list, feat_orglf0_list, varmcep_list, f0_list, melsp_list, varmelsp_list, melworldsp_list, varmelworldsp_list, )) p.start() processes.append(p) # wait for all process for p in processes: p.join() feat_mceplf0cap = None for i in range(len(feat_mceplf0cap_list)): if feat_mceplf0cap_list[i] is not None: logging.info(i) logging.info(feat_mceplf0cap_list[i].shape) if feat_mceplf0cap is not None: feat_mceplf0cap = np.r_[feat_mceplf0cap, feat_mceplf0cap_list[i]] else: feat_mceplf0cap = feat_mceplf0cap_list[i] logging.info('feat mceplf0cap: %d' % (len(feat_mceplf0cap))) logging.info(feat_mceplf0cap.shape) feat_orglf0 = None for i in range(len(feat_orglf0_list)): if feat_orglf0_list[i] is not None: logging.info(i) logging.info(feat_orglf0_list[i].shape) if feat_orglf0 is not None: feat_orglf0 = np.r_[feat_orglf0, feat_orglf0_list[i]] else: feat_orglf0 = feat_orglf0_list[i] logging.info('feat orglf0: %d' % (len(feat_orglf0))) logging.info(feat_orglf0.shape) var_range = None for i in range(len(varmcep_list)): if varmcep_list[i] is not None: logging.info(i) logging.info(varmcep_list[i].shape) if var_range is not None: var_range = np.r_[var_range, varmcep_list[i]] else: var_range = varmcep_list[i] logging.info('var mcep: %d' % (len(var_range))) logging.info(var_range.shape) f0s_range = None for i in range(len(f0_list)): if f0_list[i] is not None: logging.info(i) logging.info(f0_list[i].shape) if f0s_range is not None: f0s_range = np.r_[f0s_range, f0_list[i]] else: f0s_range = f0_list[i] logging.info('f0: %d' % (len(f0s_range))) logging.info(f0s_range.shape) melsp = None for i in range(len(melsp_list)): if melsp_list[i] is not None: logging.info(i) logging.info(melsp_list[i].shape) if melsp is not None: melsp = np.r_[melsp, melsp_list[i]] else: melsp = melsp_list[i] logging.info('melsp: %d' % (len(melsp))) logging.info(melsp.shape) var_melsp = 
None for i in range(len(varmelsp_list)): if varmelsp_list[i] is not None: logging.info(i) logging.info(varmelsp_list[i].shape) if var_melsp is not None: var_melsp = np.r_[var_melsp, varmelsp_list[i]] else: var_melsp = varmelsp_list[i] logging.info('var melsp: %d' % (len(var_melsp))) logging.info(var_melsp.shape) melworldsp = None for i in range(len(melworldsp_list)): if melworldsp_list[i] is not None: logging.info(i) logging.info(melworldsp_list[i].shape) if melworldsp is not None: melworldsp = np.r_[melworldsp, melworldsp_list[i]] else: melworldsp = melworldsp_list[i] logging.info('melworldsp: %d' % (len(melworldsp))) logging.info(melworldsp.shape) var_melworldsp = None for i in range(len(varmelworldsp_list)): if varmelworldsp_list[i] is not None: logging.info(i) logging.info(varmelworldsp_list[i].shape) if var_melworldsp is not None: var_melworldsp = np.r_[var_melworldsp, varmelworldsp_list[i]] else: var_melworldsp = varmelworldsp_list[i] logging.info('var melworldsp: %d' % (len(var_melworldsp))) logging.info(var_melworldsp.shape) scaler_feat_mceplf0cap = StandardScaler() scaler_feat_orglf0 = StandardScaler() logging.info(feat_mceplf0cap.shape) #min_mcep = np.min(feat_mceplf0cap[:,-args.mcep_dim:], axis=0) #max_mcep = np.max(feat_mceplf0cap[:,-args.mcep_dim:], axis=0) #logging.info(min_mcep) #logging.info(max_mcep) #write_hdf5(args.stats, "/min_mcep", min_mcep) #write_hdf5(args.stats, "/max_mcep", max_mcep) scaler_feat_mceplf0cap.partial_fit(feat_mceplf0cap) scaler_feat_orglf0.partial_fit(feat_orglf0) logging.info(melsp.shape) #min_melsp = np.min(melsp, axis=0) #max_melsp = np.max(melsp, axis=0) #logging.info(min_melsp) #logging.info(max_melsp) #write_hdf5(args.stats, "/min_melsp", min_melsp) #write_hdf5(args.stats, "/max_melsp", max_melsp) scaler_melsp = StandardScaler() scaler_melsp.partial_fit(melsp) mean_feat_mceplf0cap = scaler_feat_mceplf0cap.mean_ scale_feat_mceplf0cap = scaler_feat_mceplf0cap.scale_ #logging.info("mcep_bound") #min_mcep_bound = min_mcep-scale_feat_mceplf0cap[-args.mcep_dim:] #max_mcep_bound = max_mcep+scale_feat_mceplf0cap[-args.mcep_dim:] #logging.info(min_mcep_bound) #logging.info(max_mcep_bound) #write_hdf5(args.stats, "/min_mcep_bound", min_mcep_bound) #write_hdf5(args.stats, "/max_mcep_bound", max_mcep_bound) mean_feat_orglf0 = scaler_feat_orglf0.mean_ scale_feat_orglf0 = scaler_feat_orglf0.scale_ gv_range_mean = np.mean(np.array(var_range), axis=0) gv_range_var = np.var(np.array(var_range), axis=0) logging.info(gv_range_mean) logging.info(gv_range_var) f0_range_mean = np.mean(f0s_range) f0_range_std = np.std(f0s_range) logging.info(f0_range_mean) logging.info(f0_range_std) lf0_range_mean = np.mean(np.log(f0s_range)) lf0_range_std = np.std(np.log(f0s_range)) logging.info(lf0_range_mean) logging.info(lf0_range_std) logging.info(mean_feat_mceplf0cap) logging.info(scale_feat_mceplf0cap) write_hdf5(args.stats, "/mean_feat_mceplf0cap", mean_feat_mceplf0cap) write_hdf5(args.stats, "/scale_feat_mceplf0cap", scale_feat_mceplf0cap) logging.info(mean_feat_orglf0) logging.info(scale_feat_orglf0) write_hdf5(args.stats, "/mean_feat_org_lf0", mean_feat_orglf0) write_hdf5(args.stats, "/scale_feat_org_lf0", scale_feat_orglf0) write_hdf5(args.stats, "/gv_range_mean", gv_range_mean) write_hdf5(args.stats, "/gv_range_var", gv_range_var) write_hdf5(args.stats, "/f0_range_mean", f0_range_mean) write_hdf5(args.stats, "/f0_range_std", f0_range_std) write_hdf5(args.stats, "/lf0_range_mean", lf0_range_mean) write_hdf5(args.stats, "/lf0_range_std", lf0_range_std) mean_melsp = 
scaler_melsp.mean_ scale_melsp = scaler_melsp.scale_ #logging.info("melsp_bound") #min_melsp_bound = min_melsp-scale_melsp #max_melsp_bound = max_melsp+scale_melsp #logging.info(min_melsp_bound) #logging.info(max_melsp_bound) #write_hdf5(args.stats, "/min_melsp_bound", min_melsp_bound) #write_hdf5(args.stats, "/max_melsp_bound", max_melsp_bound) gv_melsp_mean = np.mean(np.array(var_melsp), axis=0) gv_melsp_var = np.var(np.array(var_melsp), axis=0) logging.info(gv_melsp_mean) logging.info(gv_melsp_var) logging.info(mean_melsp) logging.info(scale_melsp) write_hdf5(args.stats, "/mean_melsp", mean_melsp) write_hdf5(args.stats, "/scale_melsp", scale_melsp) write_hdf5(args.stats, "/gv_melsp_mean", gv_melsp_mean) write_hdf5(args.stats, "/gv_melsp_var", gv_melsp_var) scaler_melworldsp = StandardScaler() scaler_melworldsp.partial_fit(melworldsp) mean_melworldsp = scaler_melworldsp.mean_ scale_melworldsp = scaler_melworldsp.scale_ #logging.info("melworldsp_bound") #min_melworldsp_bound = min_melworldsp-scale_melworldsp #max_melworldsp_bound = max_melworldsp+scale_melworldsp #logging.info(min_melworldsp_bound) #logging.info(max_melworldsp_bound) #write_hdf5(args.stats, "/min_melworldsp_bound", min_melworldsp_bound) #write_hdf5(args.stats, "/max_melworldsp_bound", max_melworldsp_bound) gv_melworldsp_mean = np.mean(np.array(var_melworldsp), axis=0) gv_melworldsp_var = np.var(np.array(var_melworldsp), axis=0) logging.info(gv_melworldsp_mean) logging.info(gv_melworldsp_var) logging.info(mean_melworldsp) logging.info(scale_melworldsp) write_hdf5(args.stats, "/mean_melworldsp", mean_melworldsp) write_hdf5(args.stats, "/scale_melworldsp", scale_melworldsp) write_hdf5(args.stats, "/gv_melworldsp_mean", gv_melworldsp_mean) write_hdf5(args.stats, "/gv_melworldsp_var", gv_melworldsp_var)
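The statistics script above pools several kinds of quantities: frame-level features go into StandardScaler objects for the mean/scale entries, while global-variance (GV) and F0 statistics are computed per utterance and then summarized. Below is a minimal sketch (the helper name utterance_stats is hypothetical, not part of the script) of the per-utterance quantities that calc_stats() accumulates:

import numpy as np

def utterance_stats(mcep, f0):
    """mcep: (T, mcep_dim) mel-cepstrum; f0: (T,) F0 track in Hz (0 = unvoiced)."""
    gv = np.var(mcep, axis=0, keepdims=True)  # per-utterance global variance
    voiced_f0 = f0[np.nonzero(f0)]            # drop unvoiced frames before F0 stats
    return gv, voiced_f0, np.log(voiced_f0)   # log-F0 feeds lf0_range_mean/std

The stats file then stores the mean and variance of these per-utterance GVs (gv_*_mean, gv_*_var) and the mean/std of F0 and log-F0 pooled over all voiced frames (f0_range_*, lf0_range_*).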
import os
import sys
sys.path.insert(1, os.path.realpath('../lib/py/'))
import utils
import argparse
import numpy as np


def get_parser():
    """
    Get parser.

    :return: parser
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--code_size', type=int)
    parser.add_argument('--number', type=int)
    return parser


if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()

    # draw a (code_size, number) matrix of standard-normal codes and save it
    codes = np.random.randn(args.code_size, args.number)
    utils.write_hdf5(
        'codes_' + str(args.code_size) + '_' + str(args.number) + '.h5',
        codes)
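Assuming the snippet above is saved as, say, generate_codes.py (the filename is hypothetical; it is not given in the source), a typical invocation and its output would be:

python generate_codes.py --code_size 10 --number 1000
# -> writes codes_10_1000.h5 holding a (10, 1000) matrix of standard-normal samples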
def gpu_decode(feat_list, gpu, cvlist=None, lsd_cvlist=None, lsdstd_cvlist=None, cvlist_dv=None, lsd_cvlist_dv=None, lsdstd_cvlist_dv=None, f0rmse_cvlist=None, f0corr_cvlist=None, caprmse_cvlist=None, f0rmse_cvlist_dv=None, f0corr_cvlist_dv=None, caprmse_cvlist_dv=None, cvlist_cyc=None, lsd_cvlist_cyc=None, lsdstd_cvlist_cyc=None, cvlist_cyc_dv=None, lsd_cvlist_cyc_dv=None, lsdstd_cvlist_cyc_dv=None, f0rmse_cvlist_cyc=None, f0corr_cvlist_cyc=None, caprmse_cvlist_cyc=None, f0rmse_cvlist_cyc_dv=None, f0corr_cvlist_cyc_dv=None, caprmse_cvlist_cyc_dv=None): with torch.cuda.device(gpu): # define model and load parameters with torch.no_grad(): model_encoder_melsp = GRU_VAE_ENCODER( in_dim=config.mel_dim, n_spk=n_spk, lat_dim=config.lat_dim, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=False, ar=False, pad_first=True, right_size=config.right_size_enc) logging.info(model_encoder_melsp) model_decoder_melsp = GRU_SPEC_DECODER( feat_dim=config.lat_dim + config.lat_dim_e, excit_dim=config.excit_dim, out_dim=config.mel_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_dec, hidden_units=config.hidden_units_dec, kernel_size=config.kernel_size_dec, dilation_size=config.dilation_size_dec, causal_conv=config.causal_conv_dec, bi=False, ar=False, pad_first=True, right_size=config.right_size_dec) logging.info(model_decoder_melsp) model_encoder_excit = GRU_VAE_ENCODER( in_dim=config.mel_dim, n_spk=n_spk, lat_dim=config.lat_dim_e, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=False, ar=False, pad_first=True, right_size=config.right_size_enc) logging.info(model_encoder_excit) model_decoder_excit = GRU_EXCIT_DECODER( feat_dim=config.lat_dim_e, cap_dim=config.cap_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_lf0, hidden_units=config.hidden_units_lf0, kernel_size=config.kernel_size_lf0, dilation_size=config.dilation_size_lf0, causal_conv=config.causal_conv_lf0, bi=False, ar=False, pad_first=True, right_size=config.right_size_lf0) logging.info(model_decoder_excit) if (config.spkidtr_dim > 0): model_spkidtr = SPKID_TRANSFORM_LAYER( n_spk=n_spk, spkidtr_dim=config.spkidtr_dim) logging.info(model_spkidtr) model_encoder_melsp.load_state_dict( torch.load(args.model)["model_encoder_melsp"]) model_decoder_melsp.load_state_dict( torch.load(args.model)["model_decoder_melsp"]) model_encoder_excit.load_state_dict( torch.load(args.model)["model_encoder_excit"]) model_decoder_excit.load_state_dict( torch.load(args.model)["model_decoder_excit"]) if (config.spkidtr_dim > 0): model_spkidtr.load_state_dict( torch.load(args.model)["model_spkidtr"]) model_encoder_melsp.cuda() model_decoder_melsp.cuda() model_encoder_excit.cuda() model_decoder_excit.cuda() if (config.spkidtr_dim > 0): model_spkidtr.cuda() model_encoder_melsp.eval() model_decoder_melsp.eval() model_encoder_excit.eval() model_decoder_excit.eval() if (config.spkidtr_dim > 0): model_spkidtr.eval() for param in model_encoder_melsp.parameters(): param.requires_grad = False for param in model_decoder_melsp.parameters(): param.requires_grad = False for param in model_encoder_excit.parameters(): param.requires_grad = False for param in model_decoder_excit.parameters(): param.requires_grad = False if (config.spkidtr_dim > 0): for param in model_spkidtr.parameters(): 
param.requires_grad = False count = 0 pad_left = (model_encoder_melsp.pad_left + model_decoder_excit.pad_left + model_decoder_melsp.pad_left) * 2 pad_right = (model_encoder_melsp.pad_right + model_decoder_excit.pad_right + model_decoder_melsp.pad_right) * 2 outpad_lefts = [None] * 5 outpad_rights = [None] * 5 outpad_lefts[0] = pad_left - model_encoder_melsp.pad_left outpad_rights[0] = pad_right - model_encoder_melsp.pad_right outpad_lefts[1] = outpad_lefts[0] - model_decoder_excit.pad_left outpad_rights[1] = outpad_rights[0] - model_decoder_excit.pad_right outpad_lefts[2] = outpad_lefts[1] - model_decoder_melsp.pad_left outpad_rights[2] = outpad_rights[1] - model_decoder_melsp.pad_right outpad_lefts[3] = outpad_lefts[2] - model_encoder_melsp.pad_left outpad_rights[3] = outpad_rights[2] - model_encoder_melsp.pad_right outpad_lefts[4] = outpad_lefts[3] - model_decoder_excit.pad_left outpad_rights[4] = outpad_rights[3] - model_decoder_excit.pad_right for feat_file in feat_list: # reconst. melsp logging.info("recmelsp " + feat_file) feat_org = read_hdf5(feat_file, "/log_1pmelmagsp") logging.info(feat_org.shape) with torch.no_grad(): feat = F.pad( torch.FloatTensor(feat_org).cuda().unsqueeze( 0).transpose(1, 2), (pad_left, pad_right), "replicate").transpose(1, 2) spk_logits, _, lat_src, _ = model_encoder_melsp( feat, sampling=False) spk_logits_e, _, lat_src_e, _ = model_encoder_excit( feat, sampling=False) logging.info('input spkpost') if outpad_rights[0] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]: -outpad_rights[0]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]:], dim=-1), 1)) logging.info('input spkpost_e') if outpad_rights[0] > 0: logging.info( torch.mean( F.softmax(spk_logits_e[:, outpad_lefts[0]: -outpad_rights[0]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits_e[:, outpad_lefts[0]:], dim=-1), 1)) if config.spkidtr_dim > 0: src_code = model_spkidtr((torch.ones( (1, lat_src_e.shape[1])) * spk_idx).cuda().long()) else: src_code = (torch.ones( (1, lat_src_e.shape[1])) * spk_idx).cuda().long() cvlf0_src, _ = model_decoder_excit(src_code, lat_src_e) if model_decoder_excit.pad_right > 0: lat_cat = torch.cat(( lat_src_e[:, model_decoder_excit. pad_left:-model_decoder_excit.pad_right], lat_src[:, model_decoder_excit. 
pad_left:-model_decoder_excit.pad_right]), 2) else: lat_cat = torch.cat( (lat_src_e[:, model_decoder_excit.pad_left:], lat_src[:, model_decoder_excit.pad_left:]), 2) if config.spkidtr_dim > 0: src_code = model_spkidtr((torch.ones( (1, lat_cat.shape[1])) * spk_idx).cuda().long()) else: src_code = (torch.ones( (1, lat_cat.shape[1])) * spk_idx).cuda().long() cvmelsp_src, _ = model_decoder_melsp( lat_cat, y=src_code, e=cvlf0_src[:, :, :config.excit_dim]) spk_logits, _, lat_rec, _ = model_encoder_melsp( cvmelsp_src, sampling=False) spk_logits_e, _, lat_rec_e, _ = model_encoder_excit( cvmelsp_src, sampling=False) logging.info('rec spkpost') if outpad_rights[3] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[3]: -outpad_rights[3]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[3]:], dim=-1), 1)) logging.info('rec spkpost_e') if outpad_rights[3] > 0: logging.info( torch.mean( F.softmax(spk_logits_e[:, outpad_lefts[3]: -outpad_rights[3]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits_e[:, outpad_lefts[3]:], dim=-1), 1)) if config.spkidtr_dim > 0: src_code = model_spkidtr((torch.ones( (1, lat_rec_e.shape[1])) * spk_idx).cuda().long()) else: src_code = (torch.ones( (1, lat_rec_e.shape[1])) * spk_idx).cuda().long() cvlf0_cyc, _ = model_decoder_excit(src_code, lat_rec_e) if model_decoder_excit.pad_right > 0: lat_cat = torch.cat(( lat_rec_e[:, model_decoder_excit. pad_left:-model_decoder_excit.pad_right], lat_rec[:, model_decoder_excit. pad_left:-model_decoder_excit.pad_right]), 2) else: lat_cat = torch.cat( (lat_rec_e[:, model_decoder_excit.pad_left:], lat_rec[:, model_decoder_excit.pad_left:]), 2) if config.spkidtr_dim > 0: src_code = model_spkidtr((torch.ones( (1, lat_cat.shape[1])) * spk_idx).cuda().long()) else: src_code = (torch.ones( (1, lat_cat.shape[1])) * spk_idx).cuda().long() cvmelsp_cyc, _ = model_decoder_melsp( lat_cat, y=src_code, e=cvlf0_cyc[:, :, :config.excit_dim]) if outpad_rights[1] > 0: cvlf0_src = cvlf0_src[:, outpad_lefts[1]: -outpad_rights[1]] else: cvlf0_src = cvlf0_src[:, outpad_lefts[1]:] if outpad_rights[2] > 0: cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]: -outpad_rights[2]] else: cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]:] if outpad_rights[4] > 0: cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]: -outpad_rights[4]] else: cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]:] feat_rec = cvmelsp_src[0].cpu().data.numpy() feat_cyc = cvmelsp_cyc[0].cpu().data.numpy() cvmelsp_src = np.array(cvmelsp_src[0].cpu().data.numpy(), dtype=np.float64) cvlf0_src = np.array(cvlf0_src[0].cpu().data.numpy(), dtype=np.float64) cvmelsp_cyc = np.array(cvmelsp_cyc[0].cpu().data.numpy(), dtype=np.float64) cvlf0_cyc = np.array(cvlf0_cyc[0].cpu().data.numpy(), dtype=np.float64) logging.info(cvlf0_src.shape) logging.info(cvmelsp_src.shape) logging.info(cvlf0_cyc.shape) logging.info(cvmelsp_cyc.shape) melsp = np.array(feat_org) feat_world = read_hdf5(feat_file, "/feat_mceplf0cap") f0 = np.array( np.rint(feat_world[:, 0]) * np.exp(feat_world[:, 1])) codeap = np.array( np.rint(feat_world[:, 2:3]) * (-np.exp(feat_world[:, 3:config.full_excit_dim]))) cvf0_src = np.array( np.rint(cvlf0_src[:, 0]) * np.exp(cvlf0_src[:, 1])) cvcodeap_src = np.array( np.rint(cvlf0_src[:, 2:3]) * (-np.exp(cvlf0_src[:, 3:]))) f0_rmse = np.sqrt(np.mean((cvf0_src - f0)**2)) logging.info('F0_rmse_rec: %lf Hz' % (f0_rmse)) cvf0_src_mean = np.mean(cvf0_src) f0_mean = np.mean(f0) f0_corr = np.sum((cvf0_src-cvf0_src_mean)*(f0-f0_mean))/\ 
(np.sqrt(np.sum((cvf0_src-cvf0_src_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2))) logging.info('F0_corr_rec: %lf' % (f0_corr)) codeap_rmse = np.sqrt( np.mean((cvcodeap_src - codeap)**2, axis=0)) for i in range(codeap_rmse.shape[-1]): logging.info('codeap-%d_rmse_rec: %lf dB' % (i + 1, codeap_rmse[i])) cvf0_cyc = np.array( np.rint(cvlf0_cyc[:, 0]) * np.exp(cvlf0_cyc[:, 1])) cvcodeap_cyc = np.array( np.rint(cvlf0_cyc[:, 2:3]) * (-np.exp(cvlf0_cyc[:, 3:]))) f0_rmse_cyc = np.sqrt(np.mean((cvf0_cyc - f0)**2)) logging.info('F0_rmse_cyc: %lf Hz' % (f0_rmse_cyc)) cvf0_cyc_mean = np.mean(cvf0_cyc) f0_mean = np.mean(f0) f0_corr_cyc = np.sum((cvf0_cyc-cvf0_cyc_mean)*(f0-f0_mean))/\ (np.sqrt(np.sum((cvf0_cyc-cvf0_cyc_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2))) logging.info('F0_corr_cyc: %lf' % (f0_corr_cyc)) codeap_rmse_cyc = np.sqrt( np.mean((cvcodeap_cyc - codeap)**2, axis=0)) for i in range(codeap_rmse_cyc.shape[-1]): logging.info('codeap-%d_rmse_cyc: %lf dB' % (i + 1, codeap_rmse_cyc[i])) spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0]) melsp_rest = (np.exp(melsp) - 1) / 10000 melsp_src_rest = (np.exp(cvmelsp_src) - 1) / 10000 melsp_cyc_rest = (np.exp(cvmelsp_cyc) - 1) / 10000 lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_src_rest[spcidx], a_min=1e-16, a_max=None))\ -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1)) lsd_mean = np.mean(lsd_arr) lsd_std = np.std(lsd_arr) logging.info("lsd_rec: %.6f dB +- %.6f" % (lsd_mean, lsd_std)) lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cyc_rest[spcidx], a_min=1e-16, a_max=None))\ -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1)) lsd_mean_cyc = np.mean(lsd_arr) lsd_std_cyc = np.std(lsd_arr) logging.info("lsd_cyc: %.6f dB +- %.6f" % (lsd_mean_cyc, lsd_std_cyc)) logging.info('org f0') logging.info(f0[10:15]) logging.info('rec f0') logging.info(cvf0_src[10:15]) logging.info('cyc f0') logging.info(cvf0_cyc[10:15]) logging.info('org cap') logging.info(codeap[10:15]) logging.info('rec cap') logging.info(cvcodeap_src[10:15]) logging.info('cyc cap') logging.info(cvcodeap_cyc[10:15]) dataset = feat_file.split('/')[1].split('_')[0] if 'tr' in dataset: logging.info('trn') f0rmse_cvlist.append(f0_rmse) f0corr_cvlist.append(f0_corr) caprmse_cvlist.append(codeap_rmse) lsd_cvlist.append(lsd_mean) lsdstd_cvlist.append(lsd_std) cvlist.append(np.var(melsp_src_rest, axis=0)) logging.info(len(cvlist)) f0rmse_cvlist_cyc.append(f0_rmse_cyc) f0corr_cvlist_cyc.append(f0_corr_cyc) caprmse_cvlist_cyc.append(codeap_rmse_cyc) lsd_cvlist_cyc.append(lsd_mean_cyc) lsdstd_cvlist_cyc.append(lsd_std_cyc) cvlist_cyc.append(np.var(melsp_cyc_rest, axis=0)) elif 'dv' in dataset: logging.info('dev') f0rmse_cvlist_dv.append(f0_rmse) f0corr_cvlist_dv.append(f0_corr) caprmse_cvlist_dv.append(codeap_rmse) lsd_cvlist_dv.append(lsd_mean) lsdstd_cvlist_dv.append(lsd_std) cvlist_dv.append(np.var(melsp_src_rest, axis=0)) logging.info(len(cvlist_dv)) f0rmse_cvlist_cyc_dv.append(f0_rmse_cyc) f0corr_cvlist_cyc_dv.append(f0_corr_cyc) caprmse_cvlist_cyc_dv.append(codeap_rmse_cyc) lsd_cvlist_cyc_dv.append(lsd_mean_cyc) lsdstd_cvlist_cyc_dv.append(lsd_std_cyc) cvlist_cyc_dv.append(np.var(melsp_cyc_rest, axis=0)) logging.info('write rec to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) 
logging.info(feat_rec.shape) write_hdf5(feat_file, args.string_path, feat_rec) logging.info('write cyc to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_cyc.shape) write_hdf5(feat_file, args.string_path, feat_cyc) count += 1
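For reference, the evaluation numbers logged in the reconstruction loop above reduce to standard forms. The (np.exp(melsp) - 1) / 10000 step apparently inverts a log(1 + 10000*m) mel-magnitude compression applied at feature-extraction time, and on the resulting linear magnitudes the per-frame log-spectral distortion over the speech frames listed in /spcidx_range is

$$\mathrm{LSD}_t = \sqrt{\tfrac{1}{D}\sum_{d=1}^{D}\bigl(20\,(\log_{10}\hat{s}_{t,d}-\log_{10}s_{t,d})\bigr)^{2}}\ \mathrm{dB},$$

reported as mean +- std over frames. The excitation metrics are the plain F0 RMSE, $\sqrt{\tfrac{1}{T}\sum_t(\hat{f}_{0,t}-f_{0,t})^{2}}$ in Hz, the Pearson correlation between converted and original F0 tracks, and a per-band RMSE of the coded aperiodicity.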
def main(): parser = argparse.ArgumentParser() # decode setting parser.add_argument("--feats", required=True, type=str, help="list or directory of source eval feat files") parser.add_argument("--spk", required=True, type=str, help="speaker name to be reconstructed") parser.add_argument("--model", required=True, type=str, help="model file") parser.add_argument("--config", required=True, type=str, help="configure file") parser.add_argument("--n_gpus", default=1, type=int, help="number of gpus") parser.add_argument("--outdir", required=True, type=str, help="directory to save log") parser.add_argument("--string_path", required=True, type=str, help="path of h5 generated feature") # other setting parser.add_argument("--GPU_device", default=None, type=int, help="selection of GPU device") parser.add_argument("--GPU_device_str", default=None, type=str, help="selection of GPU device") parser.add_argument("--verbose", default=1, type=int, help="log level") args = parser.parse_args() if args.GPU_device is not None or args.GPU_device_str is not None: os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if args.GPU_device_str is None: os.environ["CUDA_VISIBLE_DEVICES"] = str(args.GPU_device) else: os.environ["CUDA_VISIBLE_DEVICES"] = args.GPU_device_str # check directory existence if not os.path.exists(args.outdir): os.makedirs(args.outdir) # set log level if args.verbose > 0: logging.basicConfig( level=logging.INFO, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filemode='w', filename=args.outdir + "/decode.log") logging.getLogger().addHandler(logging.StreamHandler()) elif args.verbose > 1: logging.basicConfig( level=logging.DEBUG, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filemode='w', filename=args.outdir + "/decode.log") logging.getLogger().addHandler(logging.StreamHandler()) else: logging.basicConfig( level=logging.WARN, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filemode='w', filename=args.outdir + "/decode.log") logging.getLogger().addHandler(logging.StreamHandler()) logging.warn("logging is disabled.") # load config config = torch.load(args.config) # get source feat list if os.path.isdir(args.feats): feat_list = sorted(find_files(args.feats, "*.h5")) elif os.path.isfile(args.feats): feat_list = read_txt(args.feats) else: logging.error("--feats should be directory or list.") sys.exit(1) # prepare the file list for parallel decoding feat_lists = np.array_split(feat_list, args.n_gpus) feat_lists = [f_list.tolist() for f_list in feat_lists] for i in range(args.n_gpus): logging.info('%d: %d' % (i + 1, len(feat_lists[i]))) spk_list = config.spk_list.split('@') n_spk = len(spk_list) spk_idx = spk_list.index(args.spk) stats_list = config.stats_list.split('@') assert (n_spk == len(stats_list)) spk_stat = stats_list[spk_idx] gv_mean = read_hdf5(spk_stat, "/gv_range_mean")[1:] model_epoch = os.path.basename(args.model).split('.')[0].split('-')[1] logging.info('epoch: ' + model_epoch) model_name = os.path.basename(os.path.dirname(args.model)).split('_')[1] logging.info('mdl_name: ' + model_name) logging.info(config) # define gpu decode function def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None, \ mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None, \ mcd_cvlist_dv=None, mcdstd_cvlist_dv=None, \ cvlist_cyc=None, mcdpow_cvlist_cyc=None, 
mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None, \ mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None, \ mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None): with torch.cuda.device(gpu): # define model and load parameters with torch.no_grad(): model_encoder = GRU_VAE_ENCODER( in_dim=config.mcep_dim + config.excit_dim, n_spk=n_spk, lat_dim=config.lat_dim, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=config.bi_enc, cont=False, pad_first=True, right_size=config.right_size, ar=config.ar_enc) logging.info(model_encoder) model_decoder = GRU_SPEC_DECODER( feat_dim=config.lat_dim, out_dim=config.mcep_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_dec, hidden_units=config.hidden_units_dec, kernel_size=config.kernel_size_dec, dilation_size=config.dilation_size_dec, causal_conv=config.causal_conv_dec, bi=config.bi_dec, pad_first=True, ar=config.ar_dec) logging.info(model_decoder) model_vq = torch.nn.Embedding(config.ctr_size, config.lat_dim) logging.info(model_vq) model_encoder.load_state_dict( torch.load(args.model)["model_encoder"]) model_decoder.load_state_dict( torch.load(args.model)["model_decoder"]) model_vq.load_state_dict(torch.load(args.model)["model_vq"]) model_encoder.cuda() model_decoder.cuda() model_vq.cuda() model_encoder.eval() model_decoder.eval() model_vq.eval() for param in model_encoder.parameters(): param.requires_grad = False for param in model_decoder.parameters(): param.requires_grad = False for param in model_vq.parameters(): param.requires_grad = False if config.ar_enc: yz_in = torch.zeros((1, 1, n_spk + config.lat_dim)).cuda() if config.ar_dec: mean_stats = torch.FloatTensor( read_hdf5( config.stats, "/mean_" + config.string_path.replace("/", ""))) scale_stats = torch.FloatTensor( read_hdf5( config.stats, "/scale_" + config.string_path.replace("/", ""))) x_in = ((torch.zeros((1, 1, config.mcep_dim)) - mean_stats[config.excit_dim:]) / scale_stats[config.excit_dim:]).cuda() count = 0 pad_left = (model_encoder.pad_left + model_decoder.pad_left) * 2 pad_right = (model_encoder.pad_right + model_decoder.pad_right) * 2 outpad_lefts = [None] * 3 outpad_rights = [None] * 3 outpad_lefts[0] = pad_left - model_encoder.pad_left outpad_rights[0] = pad_right - model_encoder.pad_right outpad_lefts[1] = outpad_lefts[0] - model_decoder.pad_left outpad_rights[1] = outpad_rights[0] - model_decoder.pad_right outpad_lefts[2] = outpad_lefts[1] - model_encoder.pad_left outpad_rights[2] = outpad_rights[1] - model_encoder.pad_right for feat_file in feat_list: # convert mcep logging.info("recmcep " + feat_file) feat_org = read_hdf5(feat_file, "/feat_mceplf0cap") logging.info(feat_org.shape) mcep = np.array(feat_org[:, -model_decoder.out_dim:]) with torch.no_grad(): feat = torch.FloatTensor(feat_org).cuda().unsqueeze(0) feat_excit = feat[:, :, :config.excit_dim] if config.ar_enc: spk_logits, lat_src, _, _ = model_encoder(F.pad(feat.transpose(1,2), (pad_left,pad_right), "replicate").transpose(1,2), \ yz_in=yz_in) else: spk_logits, lat_src, _ = model_encoder( F.pad(feat.transpose(1, 2), (pad_left, pad_right), "replicate").transpose(1, 2)) idx_vq = nn_search_batch(lat_src, model_vq.weight) lat_src = model_vq(idx_vq) if outpad_rights[0] > 0: unique, counts = np.unique( idx_vq[:, outpad_lefts[0]:-outpad_rights[0]].cpu( ).data.numpy(), return_counts=True) else: unique, counts = np.unique( idx_vq[:, 
outpad_lefts[0]:].cpu().data.numpy(), return_counts=True) logging.info("input vq") logging.info(dict(zip(unique, counts))) logging.info('input spkpost') if outpad_rights[0] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]: -outpad_rights[0]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[0]:], dim=-1), 1)) src_code = (torch.ones( (1, lat_src.shape[1])) * spk_idx).cuda().long() if config.ar_dec: cvmcep_src, _, _ = model_decoder(src_code, lat_src, x_in=x_in) else: cvmcep_src, _ = model_decoder(src_code, lat_src) if config.ar_enc: spk_logits, lat_rec, _, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2), cvmcep_src), 2), yz_in=yz_in) else: spk_logits, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2), cvmcep_src), 2)) idx_vq = nn_search_batch(lat_rec, model_vq.weight) lat_rec = model_vq(idx_vq) if outpad_rights[2] > 0: unique, counts = np.unique( idx_vq[:, outpad_lefts[2]:-outpad_rights[2]].cpu( ).data.numpy(), return_counts=True) else: unique, counts = np.unique( idx_vq[:, outpad_lefts[2]:].cpu().data.numpy(), return_counts=True) logging.info("rec vq") logging.info(dict(zip(unique, counts))) logging.info('rec spkpost') if outpad_rights[2] > 0: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[2]: -outpad_rights[2]], dim=-1), 1)) else: logging.info( torch.mean( F.softmax(spk_logits[:, outpad_lefts[2]:], dim=-1), 1)) src_code = (torch.ones( (1, lat_rec.shape[1])) * spk_idx).cuda().long() if config.ar_dec: cvmcep_cyc, _, _ = model_decoder(src_code, lat_rec, x_in=x_in) else: cvmcep_cyc, _ = model_decoder(src_code, lat_rec) if outpad_rights[1] > 0: cvmcep_src = cvmcep_src[:, outpad_lefts[1]: -outpad_rights[1]] else: cvmcep_src = cvmcep_src[:, outpad_lefts[1]:] feat_rec = torch.cat((feat_excit, cvmcep_src), 2)[0].cpu().data.numpy() feat_cyc = torch.cat((feat_excit, cvmcep_cyc), 2)[0].cpu().data.numpy() cvmcep_src = np.array(cvmcep_src[0].cpu().data.numpy(), dtype=np.float64) cvmcep_cyc = np.array(cvmcep_cyc[0].cpu().data.numpy(), dtype=np.float64) logging.info(cvmcep_src.shape) logging.info(cvmcep_cyc.shape) spcidx = read_hdf5(feat_file, "/spcidx_range")[0] _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),1:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),1:], \ dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64)) mcdpow_mean_cyc = np.mean(mcdpow_arr) mcdpow_std_cyc = np.std(mcdpow_arr) mcd_mean_cyc = np.mean(mcd_arr) mcd_std_cyc = np.std(mcd_arr) logging.info("mcdpow_cyc: %.6f dB +- %.6f" % (mcdpow_mean_cyc, mcdpow_std_cyc)) logging.info("mcd_cyc: %.6f dB +- %.6f" % (mcd_mean_cyc, mcd_std_cyc)) dataset = feat_file.split('/')[1].split('_')[0] if 'tr' in 
dataset: logging.info('trn') mcdpow_cvlist.append(mcdpow_mean) mcdpow_cvlist.append(mcdpow_mean) mcdpowstd_cvlist.append(mcdpow_std) mcd_cvlist.append(mcd_mean) mcdstd_cvlist.append(mcd_std) cvlist.append(np.var(cvmcep_src[:, 1:], axis=0)) logging.info(len(cvlist)) mcdpow_cvlist_cyc.append(mcdpow_mean_cyc) mcdpow_cvlist_cyc.append(mcdpow_mean_cyc) mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc) mcd_cvlist_cyc.append(mcd_mean_cyc) mcdstd_cvlist_cyc.append(mcd_std_cyc) cvlist_cyc.append(np.var(cvmcep_cyc[:, 1:], axis=0)) elif 'dv' in dataset: logging.info('dev') mcdpow_cvlist_dv.append(mcdpow_mean) mcdpowstd_cvlist_dv.append(mcdpow_std) mcd_cvlist_dv.append(mcd_mean) mcdstd_cvlist_dv.append(mcd_std) cvlist_dv.append(np.var(cvmcep_src[:, 1:], axis=0)) logging.info(len(cvlist_dv)) mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc) mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc) mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc) mcd_cvlist_cyc_dv.append(mcd_mean_cyc) mcdstd_cvlist_cyc_dv.append(mcd_std_cyc) cvlist_cyc_dv.append(np.var(cvmcep_cyc[:, 1:], axis=0)) logging.info('write rec to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_rec.shape) write_hdf5(feat_file, args.string_path, feat_rec) logging.info('write cyc to h5') outh5dir = os.path.join( os.path.dirname(os.path.dirname(feat_file)), args.spk + "-" + args.spk + "-" + args.spk) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, os.path.basename(feat_file)) logging.info(feat_file + ' ' + args.string_path) logging.info(feat_cyc.shape) write_hdf5(feat_file, args.string_path, feat_cyc) count += 1 #if count >= 5: # break # parallel decode training with mp.Manager() as manager: gpu = 0 processes = [] cvlist = manager.list() mcd_cvlist = manager.list() mcdstd_cvlist = manager.list() mcdpow_cvlist = manager.list() mcdpowstd_cvlist = manager.list() cvlist_dv = manager.list() mcd_cvlist_dv = manager.list() mcdstd_cvlist_dv = manager.list() mcdpow_cvlist_dv = manager.list() mcdpowstd_cvlist_dv = manager.list() cvlist_cyc = manager.list() mcd_cvlist_cyc = manager.list() mcdstd_cvlist_cyc = manager.list() mcdpow_cvlist_cyc = manager.list() mcdpowstd_cvlist_cyc = manager.list() cvlist_cyc_dv = manager.list() mcd_cvlist_cyc_dv = manager.list() mcdstd_cvlist_cyc_dv = manager.list() mcdpow_cvlist_cyc_dv = manager.list() mcdpowstd_cvlist_cyc_dv = manager.list() for i, feat_list in enumerate(feat_lists): logging.info(i) p = mp.Process(target=gpu_decode, args=(feat_list, gpu, cvlist, mcdpow_cvlist, mcdpowstd_cvlist, \ mcd_cvlist, mcdstd_cvlist, cvlist_dv, mcdpow_cvlist_dv, \ mcdpowstd_cvlist_dv, mcd_cvlist_dv, mcdstd_cvlist_dv,\ cvlist_cyc, mcdpow_cvlist_cyc, mcdpowstd_cvlist_cyc, \ mcd_cvlist_cyc, mcdstd_cvlist_cyc, cvlist_cyc_dv, mcdpow_cvlist_cyc_dv, \ mcdpowstd_cvlist_cyc_dv, mcd_cvlist_cyc_dv, mcdstd_cvlist_cyc_dv,)) p.start() processes.append(p) gpu += 1 if (i + 1) % args.n_gpus == 0: gpu = 0 # wait for all process for p in processes: p.join() # calculate cv_gv statistics if len(mcdpow_cvlist) > 0: logging.info("mcdpow_rec: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(mcdpow_cvlist)), \ np.std(np.array(mcdpow_cvlist)),np.mean(np.array(mcdpowstd_cvlist)),\ np.std(np.array(mcdpowstd_cvlist)))) logging.info("mcd_rec: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % 
(np.mean(np.array(mcd_cvlist)), \ np.std(np.array(mcd_cvlist)),np.mean(np.array(mcdstd_cvlist)),\ np.std(np.array(mcdstd_cvlist)))) cvgv_mean = np.mean(np.array(cvlist), axis=0) cvgv_var = np.var(np.array(cvlist), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))))) logging.info("mcdpow_cyc: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(mcdpow_cvlist_cyc)), \ np.std(np.array(mcdpow_cvlist_cyc)),np.mean(np.array(mcdpowstd_cvlist_cyc)),\ np.std(np.array(mcdpowstd_cvlist_cyc)))) logging.info("mcd_cyc: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(mcd_cvlist_cyc)), \ np.std(np.array(mcd_cvlist_cyc)),np.mean(np.array(mcdstd_cvlist_cyc)),\ np.std(np.array(mcdstd_cvlist_cyc)))) cvgv_mean = np.mean(np.array(cvlist_cyc), axis=0) cvgv_var = np.var(np.array(cvlist_cyc), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))))) cvgv_mean = np.mean(np.array(np.r_[cvlist, cvlist_cyc]), axis=0) cvgv_var = np.var(np.array(np.r_[cvlist, cvlist_cyc]), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))))) string_path = model_name + "-" + str(config.detach) + "-" + str( config.n_half_cyc) + "-" + str(config.lat_dim) + "-" + str( config.ctr_size) + "-" + str(config.ar_enc) + "-" + str( config.ar_dec) + "-" + model_epoch logging.info(string_path) string_mean = "/recgv_mean_" + string_path string_var = "/recgv_var_" + string_path write_hdf5(spk_stat, string_mean, cvgv_mean) write_hdf5(spk_stat, string_var, cvgv_var) if len(mcdpow_cvlist_dv) > 0: logging.info("mcdpow_rec_dv: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(mcdpow_cvlist_dv)), \ np.std(np.array(mcdpow_cvlist_dv)),np.mean(np.array(mcdpowstd_cvlist_dv)),\ np.std(np.array(mcdpowstd_cvlist_dv)))) logging.info("mcd_rec_dv: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(mcd_cvlist_dv)), \ np.std(np.array(mcd_cvlist_dv)),np.mean(np.array(mcdstd_cvlist_dv)),\ np.std(np.array(mcdstd_cvlist_dv)))) cvgv_mean = np.mean(np.array(cvlist_dv), axis=0) cvgv_var = np.var(np.array(cvlist_dv), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))))) logging.info("mcdpow_cyc_dv: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(mcdpow_cvlist_cyc_dv)), \ np.std(np.array(mcdpow_cvlist_cyc_dv)),np.mean(np.array(mcdpowstd_cvlist_cyc_dv)),\ np.std(np.array(mcdpowstd_cvlist_cyc_dv)))) logging.info("mcd_cyc_dv: %.6f dB (+- %.6f) +- %.6f (+- %.6f)" % (np.mean(np.array(mcd_cvlist_cyc_dv)), \ np.std(np.array(mcd_cvlist_cyc_dv)),np.mean(np.array(mcdstd_cvlist_cyc_dv)),\ np.std(np.array(mcdstd_cvlist_cyc_dv)))) cvgv_mean = np.mean(np.array(cvlist_cyc_dv), axis=0) cvgv_var = np.var(np.array(cvlist_cyc_dv), axis=0) logging.info("%lf +- %lf" % (np.mean(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean)))), \ np.std(np.sqrt(np.square(np.log(cvgv_mean)-np.log(gv_mean))))))
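Two details of the summary statistics above are easy to miss. The mel-cepstral distortions come from DTW-aligned speech frames (dtw.dtw_org_to_trg): the mcdpow_* values include the 0th (power) coefficient, while the mcd_* values use dimensions 1: only. And since sqrt(square(x)) is simply |x|, each "%lf +- %lf" line reports, over mel-cepstral dimensions d, the mean and standard deviation of

$$\bigl|\log \mathrm{GV}^{\mathrm{cv}}_{d} - \log \mathrm{GV}^{\mathrm{ref}}_{d}\bigr|,$$

i.e. the absolute log-ratio between the global variance of the converted features and the reference GV (gv_range_mean) read from the speaker's stats file.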
def main(): parser = argparse.ArgumentParser() parser.add_argument("--feats_src", default=None, required=True, help="name of the list of hdf5 files") parser.add_argument("--feats_trg", default=None, required=True, help="name of the list of hdf5 files") parser.add_argument("--feats_trg_all", default=None, help="name of the list of hdf5 files") parser.add_argument("--stats", default=None, required=True, help="filename of hdf5 format") parser.add_argument("--stats_trg", default=None, help="filename of hdf5 format") parser.add_argument("--expdir", required=True, type=str, help="directory to save the log") parser.add_argument("--verbose", default=1, type=int, help="log message level") args = parser.parse_args() # set log level if args.verbose == 1: logging.basicConfig( level=logging.INFO, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filename=args.expdir + "/calc_stats.log") logging.getLogger().addHandler(logging.StreamHandler()) elif args.verbose > 1: logging.basicConfig( level=logging.DEBUG, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filename=args.expdir + "/calc_stats.log") logging.getLogger().addHandler(logging.StreamHandler()) else: logging.basicConfig( level=logging.WARN, format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S', filename=args.expdir + "/calc_stats.log") logging.getLogger().addHandler(logging.StreamHandler()) logging.warn("logging is disabled.") # define scaler #scaler_sdfeatuvcat_range = StandardScaler() scaler_feat_org_lf0_jnt = StandardScaler() if args.feats_trg_all is not None: scaler_feat_org_lf0_trg_jnt = StandardScaler() # read source list filenames = read_txt(args.feats_src) print("number of source training utterances =", len(filenames)) for filename in filenames: #sdfeatuv_cat_range = read_hdf5(filename, "/sdfeat_uv_cat_range") #scaler_sdfeatuvcat_range.partial_fit(sdfeatuv_cat_range[:, :]) feat_org_lf0 = read_hdf5(filename, "/feat_org_lf0") scaler_feat_org_lf0_jnt.partial_fit(feat_org_lf0[:, :]) # read target list filenames = read_txt(args.feats_trg) print("number of target training utterances =", len(filenames)) for filename in filenames: #sdfeatuv_cat_range = read_hdf5(filename, "/sdfeat_uv_cat_range") #scaler_sdfeatuvcat_range.partial_fit(sdfeatuv_cat_range[:, :]) feat_org_lf0 = read_hdf5(filename, "/feat_org_lf0") scaler_feat_org_lf0_jnt.partial_fit(feat_org_lf0[:, :]) if args.feats_trg_all is not None: scaler_feat_org_lf0_trg_jnt.partial_fit(feat_org_lf0[:, :]) if args.feats_trg_all is not None: # read target all list filenames = read_txt(args.feats_trg_all) print("number of target all training utterances =", len(filenames)) for filename in filenames: #sdfeatuv_cat_range = read_hdf5(filename, "/sdfeat_uv_cat_range") #scaler_sdfeatuvcat_range.partial_fit(sdfeatuv_cat_range[:, :]) feat_org_lf0 = read_hdf5(filename, "/feat_org_lf0") scaler_feat_org_lf0_jnt.partial_fit(feat_org_lf0[:, :]) scaler_feat_org_lf0_trg_jnt.partial_fit(feat_org_lf0[:, :]) #mean_sdfeatuvcat_range = scaler_sdfeatuvcat_range.mean_ #scale_sdfeatuvcat_range = scaler_sdfeatuvcat_range.scale_ mean_feat_org_lf0_jnt = scaler_feat_org_lf0_jnt.mean_ scale_feat_org_lf0_jnt = scaler_feat_org_lf0_jnt.scale_ if args.feats_trg_all is not None: mean_feat_org_lf0_trg_jnt = scaler_feat_org_lf0_trg_jnt.mean_ scale_feat_org_lf0_trg_jnt = scaler_feat_org_lf0_trg_jnt.scale_ # write to hdf5 #write_hdf5(args.stats, "/mean_sdfeat_uv_cat_range", 
mean_sdfeatuvcat_range) #write_hdf5(args.stats, "/scale_sdfeat_uv_cat_range", scale_sdfeatuvcat_range) print(mean_feat_org_lf0_jnt) print(scale_feat_org_lf0_jnt) if args.feats_trg_all is not None: print(mean_feat_org_lf0_trg_jnt) print(scale_feat_org_lf0_trg_jnt) write_hdf5(args.stats, "/mean_feat_org_lf0_jnt", mean_feat_org_lf0_jnt) write_hdf5(args.stats, "/scale_feat_org_lf0_jnt", scale_feat_org_lf0_jnt) if args.feats_trg_all is not None: write_hdf5(args.stats_trg, "/mean_feat_org_lf0_trg_jnt", mean_feat_org_lf0_trg_jnt) write_hdf5(args.stats_trg, "/scale_feat_org_lf0_trg_jnt", scale_feat_org_lf0_trg_jnt)
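The joint statistics above rely on StandardScaler.partial_fit accumulating a running mean and variance, so feeding the source and target lists one file at a time is equivalent (up to floating-point error) to fitting once on all frames concatenated. A minimal self-contained check, with hypothetical arrays standing in for the source/target features:

import numpy as np
from sklearn.preprocessing import StandardScaler

src = np.random.randn(100, 5)   # stand-in for source feat_org_lf0 frames
trg = np.random.randn(80, 5)    # stand-in for target feat_org_lf0 frames

joint = StandardScaler().fit(np.concatenate([src, trg], axis=0))
incremental = StandardScaler()
incremental.partial_fit(src)
incremental.partial_fit(trg)

assert np.allclose(joint.mean_, incremental.mean_)
assert np.allclose(joint.scale_, incremental.scale_)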