def get_aligned_jointdata(orgdata, orgnpow, tardata, tarnpow, cvdata=None):
    """Get alignment between features

    Parameters
    ---------
    orgdata : array, shape (`T_org`, `dim`)
        Acoustic feature of source speaker
    orgnpow : array, shape (`T_org`)
        Normalized power of source speaker
    tardata : array, shape (`T_tar`, `dim`)
        Acoustic feature of target speaker
    tarnpow : array, shape (`T_tar`)
        Normalized power of target speaker
    cvdata : array, optional, shape (`T_org`, `dim`)
        Converted acoustic feature from source into target.
        When given, it replaces the source feature for estimating the
        time warping function and the distortion; the joint feature is
        still built from the source feature.

    Returns
    ---------
    jdata : array
        Joint feature vector between source and target
    twf : array
        Time warping function; ``twf[0]`` indexes the source frames and
        ``twf[1]`` the target frames
    mcd : float,
        Mel-cepstrum distortion between the aligned sequences

    Raises
    ---------
    ValueError
        If `cvdata` is given and its shape differs from `orgdata`

    """
    # validate first so that a bad call fails before any feature extraction
    if cvdata is not None and orgdata.shape != cvdata.shape:
        raise ValueError(
            'Dimension mismatch between orgdata and cvdata: {} {}'.format(
                orgdata.shape, cvdata.shape))

    # extract extsddata
    org_extsddata = extfrm(static_delta(orgdata), orgnpow)
    tar_extsddata = extfrm(static_delta(tardata), tarnpow)

    if cvdata is None:
        # align the source itself against the target
        align_extsddata = org_extsddata
    else:
        # align the converted feature; frames are selected with the
        # source power, exactly as for the source feature
        align_extsddata = extfrm(static_delta(cvdata), orgnpow)

    # calculate twf and mel-cd
    twf = estimate_twf(align_extsddata, tar_extsddata, distance='melcd')
    mcd = melcd(align_extsddata[twf[0]], tar_extsddata[twf[1]])

    # concatenate joint feature data into joint feature matrix
    jdata = np.c_[org_extsddata[twf[0]], tar_extsddata[twf[1]]]

    return jdata, twf, mcd
def exts_post_FA(data, FA, delta=False):
    """Remove the frames labelled ``sil`` in a forced-alignment file

    Each row of the tab-separated file `FA` holds ``"<start> <end>"`` in
    its first column and a label in its second column. Times are
    multiplied by 100 to obtain frame indexes (presumably seconds with a
    10 ms frame shift -- confirm against the feature extraction).

    Parameters
    ---------
    data : array, shape (`T`, `dim`)
        Frame-wise feature indexed by frame number
    FA : str
        Path of the forced-alignment file
    delta : bool, optional
        If true, return `static_delta` of the kept frames
        Default set to False

    Returns
    ---------
    array
        Kept frames of `data` in increasing frame order, optionally
        passed through `static_delta`

    """
    spans = []
    phones = []
    with open(FA, 'r') as f:
        for row in csv.reader(f, dialect='excel', delimiter='\t'):
            spans.append(row[0])
            phones.append(row[1])

    # frame indexes covered by every 'sil' segment
    reject_frames = set()
    for span, phone in zip(spans, phones):
        if phone != 'sil':
            continue
        start_time, end_time = span.split(' ')
        reject_frames.update(range(int(float(start_time) * 100) + 1,
                                   int(float(end_time) * 100) + 1))

    # the end time of the last row gives the total number of frames
    num_frames = int(float(spans[-1].split(' ')[1]) * 100)

    # walk the range in order: iterating a set difference gives no
    # ordering guarantee, which would shuffle frames in time
    kept_frames = [i for i in range(num_frames) if i not in reject_frames]
    new_data = np.array([data[i] for i in kept_frames])

    if not delta:
        return new_data
    return static_delta(new_data)
def extsddata(data, npow, power_threshold=-20):
    """Get power extract static and delta feature vector

    Parameters
    ---------
    data : array, shape (`T`, `dim`)
        Acoustic feature vector
    npow : array, shape (`T`)
        Normalized power vector; when its length differs from `data`
        it is sliced to the first ``len(data)`` entries
    power_threshold : float, optional,
        Power threshold
        Default set to -20

    Returns
    -------
    extsddata : array, shape (`T_new` `dim * 2`)
        Silence remove static and delta feature vector

    """
    num_frames = len(data)
    if len(npow) != num_frames:
        # slicing only shortens; a shorter npow is passed on unchanged
        npow = npow[:num_frames]

    return extfrm(static_delta(data), npow, power_threshold=power_threshold)
def generate_align_indexes(pair_path: Tuple[Path, Path]):
    """Compute and save the alignment indexes of one pair of audio files

    The first file is analyzed, its mel-cepstrum (all but the 0th
    coefficient) is converted with `mcepgmm`, and the result is aligned
    against the mel-cepstrum of the second file. Settings are read from
    the enclosing scope (`arguments`, `sconf1`, `sconf2`, `feat1`,
    `feat2`, `mcepgmm`, `pconf`).

    Parameters
    ---------
    pair_path : tuple of Path
        Paths of the original and the target audio file

    Returns
    ---------
    None
        The result is written to ``<arguments.output>/<stem>.npy``;
        nothing is done when that file exists and overwriting is disabled

    """
    org_path, tar_path = pair_path
    if org_path.stem != tar_path.stem:
        print('warning: the file names are different', org_path, tar_path)

    out = Path(arguments.output, org_path.stem + '.npy')
    if out.exists() and not arguments.enable_overwrite:
        return

    def _analyze(path, sconf, feat, pad_second, threshold_db):
        # load, pad, low-cut filter and analyze one file; optionally keep
        # only the frames selected by the loudness threshold
        wave = Wave.load(path=path, sampling_rate=sconf.wav_fs)
        wave = wave.pad(pre_second=pad_second, post_second=pad_second)
        filtered = low_cut_filter(wave.wave, wave.sampling_rate, cutoff=70)
        feat.analyze(filtered)
        mcep = feat.mcep(dim=sconf.mcep_dim, alpha=sconf.mcep_alpha)
        if threshold_db is None:
            return mcep
        effective = wave.get_effective_frame(
            threshold_db=threshold_db,
            fft_length=sconf.wav_fftl,
            frame_period=sconf.wav_shiftms,
        )
        return mcep[effective]

    # original
    org_mcep = _analyze(
        org_path,
        sconf1,
        feat1,
        arguments.pad_second1,
        arguments.threshold_db1,
    )
    converted_wopow = mcepgmm.convert(
        static_delta(org_mcep[:, 1:]),
        cvtype=pconf.GMM_mcep_cvtype,
    )
    # the 0th coefficient is carried over unconverted
    mcep1 = numpy.c_[org_mcep[:, 0], converted_wopow]

    # target
    mcep2 = _analyze(
        tar_path,
        sconf2,
        feat2,
        arguments.pad_second2,
        arguments.threshold_db2,
    )

    # align
    indexes = AlignIndexes.extract(
        AcousticFeature(mc=mcep1),
        AcousticFeature(mc=mcep2),
        dtype=arguments.dtype,
    )
    indexes.save(path=out, validate=True, ignores=arguments.ignore_feature)
def test_BlockDiagonalGMM(self):
    """Train a block-diagonal GMM and check the converted output shape."""
    joint = np.random.rand(1000, 20)
    trainer = GMMTrainer(n_mix=32, n_iter=10, covtype='block_diag')
    trainer.train(joint)

    static = np.random.rand(200, 5)
    static_and_delta = static_delta(static)

    convertor = GMMConvertor(n_mix=32, covtype='full', gmmmode=None)
    convertor.open_from_param(trainer.param)

    # the MLPG result is not inspected; the call only has to succeed
    convertor.convert(static_and_delta, cvtype='mlpg')
    converted = convertor.convert(static_and_delta, cvtype='mmse')

    assert converted.shape == static.shape
def feature_conversion(pconf, org_mceps, gmm, gmmmode=None):
    """Conversion of mel-cepstrums

    Parameters
    ---------
    pconf : PairYAML,
        Class of PairYAML
    org_mceps : list, shape (`num_mceps`)
        List of mel-cepstrums
    gmm : object,
        Trained GMM; only its `param` attribute is read and handed to
        `GMMConvertor.open_from_param`
    gmmmode : str, optional
        Flag to parameter generation technique
        `None` : Normal VC
        `diff` : Differential VC
        Default set to `None`

    Returns
    ---------
    cv_mceps : list , shape(`num_mceps`)
        List of converted mel-cepstrums

    Raises
    ---------
    ValueError
        If `gmmmode` is neither `None` nor `diff`

    """
    # reject an unsupported mode before any model is built or any
    # utterance is converted
    if gmmmode is not None and gmmmode != 'diff':
        raise ValueError('gmmmode must be `None` or `diff`.')

    cvgmm = GMMConvertor(
        n_mix=pconf.GMM_mcep_n_mix,
        covtype=pconf.GMM_mcep_covtype,
        gmmmode=gmmmode,
    )
    cvgmm.open_from_param(gmm.param)

    sd = 1  # start dimension to convert
    cv_mceps = []
    for mcep in org_mceps:
        mcep_0th = mcep[:, 0]
        cvmcep = cvgmm.convert(static_delta(mcep[:, sd:]),
                               cvtype=pconf.GMM_mcep_cvtype)
        # put the unconverted 0th coefficient back in front
        cvmcep = np.c_[mcep_0th, cvmcep]
        if gmmmode == 'diff':
            # add the source back onto the converted part
            cvmcep[:, sd:] += mcep[:, sd:]
        cv_mceps.append(cvmcep)

    return cv_mceps
def feature_conversion(pconf, org_mceps, gmm, gmmmode=None):
    """Conversion of mel-cepstrums

    Parameters
    ---------
    pconf : PairYAML,
        Class of PairYAML
    org_mceps : list, shape (`num_mceps`)
        List of mel-cepstrums
    gmm : object,
        Trained GMM; only its `param` attribute is read and handed to
        `GMMConvertor.open_from_param`
    gmmmode : str, optional
        Flag to parameter generation technique
        `None` : Normal VC
        `diff` : Differential VC
        Default set to `None`

    Returns
    ---------
    cv_mceps : list , shape(`num_mceps`)
        List of converted mel-cepstrums

    Raises
    ---------
    ValueError
        If `gmmmode` is neither `None` nor `diff`

    """
    # the mode does not depend on the utterance, so it is checked once
    # here instead of after converting the first mel-cepstrum
    if gmmmode not in (None, 'diff'):
        raise ValueError('gmmmode must be `None` or `diff`.')
    is_diff = gmmmode == 'diff'

    cvgmm = GMMConvertor(n_mix=pconf.GMM_mcep_n_mix,
                         covtype=pconf.GMM_mcep_covtype,
                         gmmmode=gmmmode,
                         )
    cvgmm.open_from_param(gmm.param)

    sd = 1  # start dimension to convert
    cv_mceps = []
    for mcep in org_mceps:
        converted = cvgmm.convert(static_delta(mcep[:, sd:]),
                                  cvtype=pconf.GMM_mcep_cvtype)
        # the 0th coefficient is carried over unconverted
        cvmcep = np.c_[mcep[:, 0], converted]
        if is_diff:
            # add the source back onto the converted part
            cvmcep[:, sd:] += mcep[:, sd:]
        cv_mceps.append(cvmcep)

    return cv_mceps
def extsddata(data, npow, power_threshold=-20):
    """Get power extract static and delta feature vector

    Parameters
    ---------
    data : array, shape (`T`, `dim`)
        Acoustic feature vector
    npow : array, shape (`T`)
        Normalized power vector
    power_threshold : float, optional,
        Power threshold
        Default set to -20

    Returns
    -------
    extsddata : array, shape (`T_new` `dim * 2`)
        Silence remove static and delta feature vector

    """
    sddata = static_delta(data)
    return extfrm(sddata, npow, power_threshold=power_threshold)
def main(*argv):
    """Convert the evaluation utterances of the original speaker

    For every entry of the evaluation list the wav file is analyzed, F0
    and mel-cepstrum are converted, a GV postfilter is applied and the
    synthesized waveform is written below ``<pair_dir>/test``.

    Parameters
    ---------
    *argv : str
        Command-line arguments; ``sys.argv[1:]`` is used when none are
        given

    Raises
    ---------
    ValueError
        If ``--gmmmode`` is neither unset nor ``diff``; no synthesis path
        is implemented here for any other mode

    """
    argv = argv if argv else sys.argv[1:]
    # Options for python
    description = 'estimate joint feature of source and target speakers'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-gmmmode', '--gmmmode', type=str, default=None,
                        help='mode of the GMM [None, diff, or intra]')
    parser.add_argument('org', type=str,
                        help='Original speaker')
    parser.add_argument('tar', type=str,
                        help='Target speaker')
    parser.add_argument('org_yml', type=str,
                        help='Yml file of the original speaker')
    parser.add_argument('pair_yml', type=str,
                        help='Yml file of the speaker pair')
    parser.add_argument('eval_list_file', type=str,
                        help='List file for evaluation')
    parser.add_argument('wav_dir', type=str,
                        help='Directory path of source spekaer')
    parser.add_argument('pair_dir', type=str,
                        help='Directory path of pair directory')
    args = parser.parse_args(argv)

    # only these two modes have a synthesis branch below; fail before any
    # model is loaded instead of hitting an unbound `wav` per file
    if args.gmmmode not in (None, 'diff'):
        raise ValueError(
            'unsupported gmmmode for conversion: {}'.format(args.gmmmode))

    # read parameters from speaker yml
    sconf = SpeakerYML(args.org_yml)
    pconf = PairYML(args.pair_yml)

    # read GMM for mcep
    mcepgmmpath = os.path.join(args.pair_dir, 'model/GMM_mcep.pkl')
    mcepgmm = GMMConvertor(
        n_mix=pconf.GMM_mcep_n_mix,
        covtype=pconf.GMM_mcep_covtype,
        gmmmode=args.gmmmode,
    )
    # NOTE: joblib.load unpickles the file; only load trusted model files
    param = joblib.load(mcepgmmpath)
    mcepgmm.open_from_param(param)
    print("GMM for mcep conversion mode: {}".format(args.gmmmode))

    # read F0 statistics
    stats_dir = os.path.join(args.pair_dir, 'stats')
    orgstatspath = os.path.join(stats_dir, args.org + '.h5')
    orgstats_h5 = HDF5(orgstatspath, mode='r')
    try:
        orgf0stats = orgstats_h5.read(ext='f0stats')
    finally:
        orgstats_h5.close()

    # read F0 and GV statistics for target
    tarstatspath = os.path.join(stats_dir, args.tar + '.h5')
    tarstats_h5 = HDF5(tarstatspath, mode='r')
    try:
        tarf0stats = tarstats_h5.read(ext='f0stats')
        targvstats = tarstats_h5.read(ext='gv')
    finally:
        tarstats_h5.close()

    # read GV statistics for converted mcep
    cvgvstatspath = os.path.join(args.pair_dir, 'model', 'cvgv.h5')
    cvgvstats_h5 = HDF5(cvgvstatspath, mode='r')
    try:
        cvgvstats = cvgvstats_h5.read(ext='cvgv')
        diffcvgvstats = cvgvstats_h5.read(ext='diffcvgv')
    finally:
        cvgvstats_h5.close()

    mcepgv = GV()
    f0stats = F0statistics()

    # construct FeatureExtractor class
    feat = FeatureExtractor(analyzer=sconf.analyzer,
                            fs=sconf.wav_fs,
                            fftl=sconf.wav_fftl,
                            shiftms=sconf.wav_shiftms,
                            minf0=sconf.f0_minf0,
                            maxf0=sconf.f0_maxf0)

    # construct Synthesizer class
    synthesizer = Synthesizer(fs=sconf.wav_fs,
                              fftl=sconf.wav_fftl,
                              shiftms=sconf.wav_shiftms)

    # test directory
    test_dir = os.path.join(args.pair_dir, 'test')
    os.makedirs(os.path.join(test_dir, args.org), exist_ok=True)

    # conversion in each evaluation file
    with open(args.eval_list_file, 'r') as fp:
        for line in fp:
            f = line.rstrip()
            if not f:
                # a blank line names no utterance
                continue

            # open wav file
            wavf = os.path.join(args.wav_dir, f + '.wav')
            fs, x = wavfile.read(wavf)
            # `np.float` was only an alias of the builtin float and has
            # been removed from NumPy; float64 is the same dtype
            x = x.astype(np.float64)
            x = low_cut_filter(x, fs, cutoff=70)
            assert fs == sconf.wav_fs

            # analyze F0, mcep, and ap
            f0, _, ap = feat.analyze(x)
            mcep = feat.mcep(dim=sconf.mcep_dim, alpha=sconf.mcep_alpha)
            mcep_0th = mcep[:, 0]

            # convert F0
            cvf0 = f0stats.convert(f0, orgf0stats, tarf0stats)

            # convert mcep
            cvmcep_wopow = mcepgmm.convert(static_delta(mcep[:, 1:]),
                                           cvtype=pconf.GMM_mcep_cvtype)
            cvmcep = np.c_[mcep_0th, cvmcep_wopow]

            if args.gmmmode is None:
                # synthesis VC w/ GV
                cvmcep_wGV = mcepgv.postfilter(cvmcep,
                                               targvstats,
                                               cvgvstats=cvgvstats,
                                               alpha=pconf.GV_morph_coeff,
                                               startdim=1)
                wav = synthesizer.synthesis(cvf0,
                                            cvmcep_wGV,
                                            ap,
                                            rmcep=mcep,
                                            alpha=sconf.mcep_alpha,
                                            )
                wavpath = os.path.join(test_dir, f + '_VC.wav')
            else:
                # synthesis DIFFVC w/ GV
                cvmcep[:, 0] = 0.0
                cvmcep_wGV = mcepgv.postfilter(mcep + cvmcep,
                                               targvstats,
                                               cvgvstats=diffcvgvstats,
                                               alpha=pconf.GV_morph_coeff,
                                               startdim=1) - mcep
                wav = synthesizer.synthesis_diff(x,
                                                 cvmcep_wGV,
                                                 rmcep=mcep,
                                                 alpha=sconf.mcep_alpha,
                                                 )
                wavpath = os.path.join(test_dir, f + '_DIFFVC.wav')

            # write waveform; make sure the directory that will hold the
            # wav exists (not a directory named after the utterance)
            os.makedirs(os.path.dirname(wavpath), exist_ok=True)
            wav = np.clip(wav, -32768, 32767)
            wavfile.write(wavpath, fs, wav.astype(np.int16))
            print(wavpath)
def main(*argv):
    """Estimate the joint feature of source and target speakers

    The first pass aligns the source and target mel-cepstrums directly.
    Every following pass trains a GMM on the current joint feature,
    converts the source with it and re-aligns using the converted
    feature. The final joint feature, the last trained GMM (if any) and
    the last time warping functions are saved below `pair_dir`.

    Parameters
    ---------
    *argv : str
        Command-line arguments; ``sys.argv[1:]`` is used when none are
        given

    Raises
    ---------
    ValueError
        If the list files name no utterance

    """
    argv = argv if argv else sys.argv[1:]
    # Options for python
    description = 'estimate joint feature of source and target speakers'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('pair_yml', type=str,
                        help='Yml file of the speaker pair')
    parser.add_argument('org_list_file', type=str,
                        help='List file of original speaker')
    parser.add_argument('tar_list_file', type=str,
                        help='List file of target speaker')
    parser.add_argument('pair_dir', type=str,
                        help='Directory path of h5 files')
    args = parser.parse_args(argv)

    # read pair-dependent yml file
    pconf = PairYML(args.pair_yml)

    # read source and target features from HDF file
    h5_dir = os.path.join(args.pair_dir, 'h5')
    org_mceps = read_feats(args.org_list_file, h5_dir, ext='mcep')
    org_npows = read_feats(args.org_list_file, h5_dir, ext='npow')
    tar_mceps = read_feats(args.tar_list_file, h5_dir, ext='mcep')
    tar_npows = read_feats(args.tar_list_file, h5_dir, ext='npow')
    assert len(org_mceps) == len(tar_mceps)
    assert len(org_npows) == len(tar_npows)
    assert len(org_mceps) == len(org_npows)

    itnum = 1
    sd = 1  # start dimension for alignment of mcep
    num_files = len(org_mceps)
    if num_files == 0:
        raise ValueError('no utterance to align: the feature lists are empty')
    print('{}-th joint feature extraction starts.'.format(itnum))

    # first iteration
    # the twfs are kept so that they can still be saved when no further
    # iteration is configured
    jdatas, twfs = [], []
    for i in range(num_files):
        jdata, twf, mcd = get_aligned_jointdata(org_mceps[i][:, sd:],
                                                org_npows[i],
                                                tar_mceps[i][:, sd:],
                                                tar_npows[i])
        print('distortion [dB] for {}-th file: {}'.format(i + 1, mcd))
        jdatas.append(jdata)
        twfs.append(twf)
    # stack once instead of growing the matrix file by file
    jnt = np.concatenate(jdatas, axis=0)
    itnum += 1

    # second through final iteration
    trgmm = None
    while itnum < pconf.jnt_n_iter + 1:
        print('{}-th joint feature extraction starts.'.format(itnum))
        # train GMM
        trgmm = GMMTrainer(n_mix=pconf.GMM_mcep_n_mix,
                           n_iter=pconf.GMM_mcep_n_iter,
                           covtype=pconf.GMM_mcep_covtype)
        trgmm.train(jnt)

        cvgmm = GMMConvertor(n_mix=pconf.GMM_mcep_n_mix,
                             covtype=pconf.GMM_mcep_covtype)
        cvgmm.open_from_param(trgmm.param)

        jdatas, twfs = [], []
        for i in range(num_files):
            cvmcep = cvgmm.convert(static_delta(org_mceps[i][:, sd:]),
                                   cvtype=pconf.GMM_mcep_cvtype)
            jdata, twf, mcd = get_aligned_jointdata(org_mceps[i][:, sd:],
                                                    org_npows[i],
                                                    tar_mceps[i][:, sd:],
                                                    tar_npows[i],
                                                    cvdata=cvmcep)
            print('distortion [dB] for {}-th file: {}'.format(i + 1, mcd))
            jdatas.append(jdata)
            twfs.append(twf)
        jnt = np.concatenate(jdatas, axis=0)
        itnum += 1

    # NOTE: `itnum` is one past the last completed iteration here; the
    # file names below keep that numbering

    # save joint feature vector
    jnt_dir = os.path.join(args.pair_dir, 'jnt')
    os.makedirs(jnt_dir, exist_ok=True)
    jntpath = os.path.join(jnt_dir, 'it' + str(itnum) + '_jnt.h5')
    jnth5 = HDF5(jntpath, mode='w')
    try:
        jnth5.save(jnt, ext='jnt')
    finally:
        jnth5.close()

    # save GMM (none has been trained when only one iteration is set)
    if trgmm is not None:
        gmm_dir = os.path.join(args.pair_dir, 'GMM')
        os.makedirs(gmm_dir, exist_ok=True)
        gmmpath = os.path.join(gmm_dir, 'it' + str(itnum) + '_gmm.pkl')
        joblib.dump(trgmm.param, gmmpath)

    # save twf
    twf_dir = os.path.join(args.pair_dir, 'twf')
    os.makedirs(twf_dir, exist_ok=True)
    with open(args.org_list_file, 'r') as fp:
        for line, twf in zip(fp, twfs):
            f = os.path.basename(line.rstrip())
            twfpath = os.path.join(
                twf_dir, 'it' + str(itnum) + '_' + f + '.h5')
            twfh5 = HDF5(twfpath, mode='w')
            try:
                twfh5.save(twf, ext='twf')
            finally:
                twfh5.close()
def align_feature_vectors(odata, onpows, tdata, tnpows, pconf,
                          opow=-100, tpow=-100, itnum=3, sd=0,
                          given_twfs=None, otflag=None):
    """Get alignment to create joint feature vector

    Parameters
    ---------
    odata : list, (`num_files`)
        List of original feature vectors
    onpows : list , (`num_files`)
        List of original npows
    tdata : list, (`num_files`)
        List of target feature vectors
    tnpows : list , (`num_files`)
        List of target npows
    pconf : PairYML,
        Pair configuration; the `GMM_mcep_*` settings are read from it
    opow : float, optional,
        Power threshold of original
        Default set to -100
    tpow : float, optional,
        Power threshold of target
        Default set to -100
    itnum : int , optional,
        The number of iteration
        Default set to 3
    sd : int , optional,
        Start dimension of feature vector to be used for alignment
        Default set to 0
    given_twfs : list, optional, (`num_files`)
        Per-file alignment handed to `get_alignment` during the 1st
        iteration only
        Default set to None
    otflag : str, optional
        Alignment into the length of specification
        'org' : alignment into original length
        'tar' : alignment into target length
        Default set to None

    Returns
    -------
    jfvs : list,
        List of joint feature vectors of the final iteration
    twfs : list,
        List of time warping functions of the final iteration

    Raises
    -------
    ValueError
        If `itnum` is smaller than 1

    """
    # without at least one pass there is nothing to return
    if itnum < 1:
        raise ValueError('itnum must be 1 or more: {}'.format(itnum))

    num_files = len(odata)
    cvgmm, cvdata = None, None
    for it in range(1, itnum + 1):
        print('{}-th joint feature extraction starts.'.format(it))
        twfs, jfvs = [], []
        for i in range(num_files):
            if it == 1 and given_twfs is not None:
                gtwf = given_twfs[i]
            else:
                gtwf = None
            if it > 1:
                # align with the feature converted by the previous GMM
                cvdata = cvgmm.convert(static_delta(odata[i][:, sd:]),
                                       cvtype=pconf.GMM_mcep_cvtype)
            jdata, twf, mcd = get_alignment(odata[i],
                                            onpows[i],
                                            tdata[i],
                                            tnpows[i],
                                            opow=opow,
                                            tpow=tpow,
                                            sd=sd,
                                            cvdata=cvdata,
                                            given_twf=gtwf,
                                            otflag=otflag)
            twfs.append(twf)
            jfvs.append(jdata)
            print('distortion [dB] for {}-th file: {}'.format(i + 1, mcd))

        if it != itnum:
            # train GMM, if not final iteration; the stacked joint data
            # is only needed for this training
            jnt_data = transform_jnt(jfvs)
            datagmm = GMMTrainer(n_mix=pconf.GMM_mcep_n_mix,
                                 n_iter=pconf.GMM_mcep_n_iter,
                                 covtype=pconf.GMM_mcep_covtype)
            datagmm.train(jnt_data)
            cvgmm = GMMConvertor(n_mix=pconf.GMM_mcep_n_mix,
                                 covtype=pconf.GMM_mcep_covtype)
            cvgmm.open_from_param(datagmm.param)

    return jfvs, twfs
def align_ppg_feature_vectors(odata, onpows, tdata, tnpows, pconf,
                              s_list_file, tar_list_file,
                              opow=-100, tpow=-100, itnum=3, sd=0,
                              given_twfs=None, otflag=None):
    """Get PPG-based joint feature vectors and time warping functions

    The joint feature vectors come from `get_ppg_alignment` and are
    built during the 1st iteration only. The time warping functions are
    re-estimated with `get_alignment` in every iteration, from the 2nd
    one on with the feature converted by a GMM trained on the joint data.

    Parameters
    ---------
    odata : list, (`num_files`)
        List of original feature vectors
    onpows : list , (`num_files`)
        List of original npows
    tdata : list, (`num_files`)
        List of target feature vectors
    tnpows : list , (`num_files`)
        List of target npows
    pconf : PairYML,
        Pair configuration; the `GMM_mcep_*` settings are read from it
    s_list_file : str
        Path of a text file read with `np.loadtxt`; its i-th entry is
        passed to `get_ppg_alignment` for the i-th file
    tar_list_file : str
        Same as `s_list_file`, for the target side
    opow : float, optional,
        Power threshold of original
        Default set to -100
    tpow : float, optional,
        Power threshold of target
        Default set to -100
    itnum : int , optional,
        The number of iteration
        Default set to 3
    sd : int , optional,
        Start dimension of feature vector to be used for alignment
        Default set to 0
    given_twfs : list, optional, (`num_files`)
        Per-file alignment used during the 1st iteration only
        Default set to None
    otflag : str, optional
        Passed through to the alignment functions
        Default set to None

    Returns
    -------
    jfvs : list,
        List of joint feature vectors from the 1st iteration
    twfs : list,
        List of time warping functions of the final iteration

    Raises
    -------
    ValueError
        If `itnum` is smaller than 1

    """
    # without at least one pass there is nothing to return
    if itnum < 1:
        raise ValueError('itnum must be 1 or more: {}'.format(itnum))

    # a list file with a single entry is loaded as a 0-d array, which
    # cannot be indexed per file; force at least one dimension
    s_list_file = np.atleast_1d(np.loadtxt(s_list_file, dtype='str'))
    tar_list_file = np.atleast_1d(np.loadtxt(tar_list_file, dtype='str'))

    num_files = len(odata)
    cvgmm, cvdata = None, None
    jfvs = []
    jnt_data = None
    for it in range(1, itnum + 1):
        print('{}-th joint feature extraction starts.'.format(it))
        twfs = []
        for i in range(num_files):
            if it == 1 and given_twfs is not None:
                gtwf = given_twfs[i]
            else:
                gtwf = None
            if it > 1:
                cvdata = cvgmm.convert(static_delta(odata[i][:, sd:]),
                                       cvtype=pconf.GMM_mcep_cvtype)
            if it == 1:
                jdata = get_ppg_alignment(odata[i],
                                          onpows[i],
                                          tdata[i],
                                          tnpows[i],
                                          s_list_file[i],
                                          tar_list_file[i],
                                          opow=opow,
                                          tpow=tpow,
                                          sd=sd,
                                          cvdata=cvdata,
                                          given_twf=gtwf,
                                          otflag=otflag)
                print(s_list_file[i])
                jfvs.append(jdata)
            # NOTE(review): the twf is taken from get_alignment for every
            # file in every iteration -- confirm this is the intended
            # nesting
            _, twf, _ = get_alignment(odata[i],
                                      onpows[i],
                                      tdata[i],
                                      tnpows[i],
                                      opow=opow,
                                      tpow=tpow,
                                      sd=sd,
                                      cvdata=cvdata,
                                      given_twf=gtwf,
                                      otflag=otflag)
            twfs.append(twf)

        if it != itnum:
            # train GMM, if not final iteration
            if jnt_data is None:
                # jfvs stops changing after the 1st iteration, so the
                # stacked joint data is built once and reused
                jnt_data = transform_jnt(jfvs)
            print("training GMM")
            datagmm = GMMTrainer(n_mix=pconf.GMM_mcep_n_mix,
                                 n_iter=pconf.GMM_mcep_n_iter,
                                 covtype=pconf.GMM_mcep_covtype)
            datagmm.train(jnt_data)
            cvgmm = GMMConvertor(n_mix=pconf.GMM_mcep_n_mix,
                                 covtype=pconf.GMM_mcep_covtype)
            cvgmm.open_from_param(datagmm.param)

    return jfvs, twfs
def align_feature_vectors(odata, onpows, tdata, tnpows, pconf,
                          opow=-100, tpow=-100, itnum=3, sd=0,
                          given_twfs=None, otflag=None):
    """Get alignment to create joint feature vector

    Parameters
    ---------
    odata : list, (`num_files`)
        List of original feature vectors
    onpows : list , (`num_files`)
        List of original npows
    tdata : list, (`num_files`)
        List of target feature vectors
    tnpows : list , (`num_files`)
        List of target npows
    pconf : PairYML,
        Pair configuration; the `GMM_mcep_*` settings are read from it
    opow : float, optional,
        Power threshold of original
        Default set to -100
    tpow : float, optional,
        Power threshold of target
        Default set to -100
    itnum : int , optional,
        The number of iteration
        Default set to 3
    sd : int , optional,
        Start dimension of feature vector to be used for alignment
        Default set to 0
    given_twfs : list, optional, (`num_files`)
        Per-file alignment handed to `get_alignment` during the 1st
        iteration only
        Default set to None
    otflag : str, optional
        Alignment into the length of specification
        'org' : alignment into original length
        'tar' : alignment into target length
        Default set to None

    Returns
    -------
    jfvs : list, (`num_files`)
        Joint feature vector of each file from the final iteration
    twfs : list, (`num_files`)
        Time warping function of each file from the final iteration

    Raises
    -------
    ValueError
        If `itnum` is smaller than 1

    """
    # with no pass at all the result lists would never be created
    if itnum < 1:
        raise ValueError('itnum must be 1 or more: {}'.format(itnum))

    num_files = len(odata)
    cvgmm, cvdata = None, None
    for it in range(1, itnum + 1):
        print('{}-th joint feature extraction starts.'.format(it))
        is_first = it == 1
        is_final = it == itnum

        # alignment
        twfs, jfvs = [], []
        for i in range(num_files):
            gtwf = None
            if is_first and given_twfs is not None:
                gtwf = given_twfs[i]
            if not is_first:
                # align with the feature converted by the previous GMM
                cvdata = cvgmm.convert(static_delta(odata[i][:, sd:]),
                                       cvtype=pconf.GMM_mcep_cvtype)
            jdata, twf, mcd = get_alignment(odata[i],
                                            onpows[i],
                                            tdata[i],
                                            tnpows[i],
                                            opow=opow,
                                            tpow=tpow,
                                            sd=sd,
                                            cvdata=cvdata,
                                            given_twf=gtwf,
                                            otflag=otflag)
            twfs.append(twf)
            jfvs.append(jdata)
            print('distortion [dB] for {}-th file: {}'.format(i + 1, mcd))

        if is_final:
            # no further pass needs a GMM, so nothing is stacked or trained
            break

        # train GMM for the next pass
        jnt_data = transform_jnt(jfvs)
        datagmm = GMMTrainer(n_mix=pconf.GMM_mcep_n_mix,
                             n_iter=pconf.GMM_mcep_n_iter,
                             covtype=pconf.GMM_mcep_covtype)
        datagmm.train(jnt_data)
        cvgmm = GMMConvertor(n_mix=pconf.GMM_mcep_n_mix,
                             covtype=pconf.GMM_mcep_covtype)
        cvgmm.open_from_param(datagmm.param)

    return jfvs, twfs