def process_utterance(self, utt, make_label=True):
    """Extract one context-label line per target node of the utterance.

    utt: utterance object supporting xpath() / get_filename().
    make_label: when True, also write the labels to the utterance's
        output file (binary via put_speech, or text via writelist).
    Returns (utt_data, utt_questions) where utt_data is the list of
    per-node label strings (or a float array when binary_output) and
    utt_questions maps question -> summed count over all nodes.
    """
    utt_data = []
    utt_questions = defaultdict(int)
    nodelist = utt.xpath(self.target_nodes)
    if not nodelist:
        print('WARNING: FeatureDumper\'s target_nodes matches no nodes: %s' % (self.config["target_nodes"]))
    for node in nodelist:
        node_data, node_questions = self.get_node_context_label(node)
        utt_data.append(node_data)
        ## Sum the per-node question counts into the utterance totals
        ## (defaultdict(int) makes the += safe for unseen keys):
        for question in node_questions:
            utt_questions[question] += node_questions[question]
    if make_label:
        label_file = utt.get_filename(self.output_filetype)
        if self.binary_output:
            utt_data = [line.split(' ') for line in utt_data]
            ## In case of string data being present, following line will give:
            ## ValueError: could not convert string to float: a
            utt_data = numpy.array(utt_data, dtype='float')
            put_speech(utt_data, label_file)
        else:
            writelist(utt_data, label_file, uni=True)
    ## Returned for writing utterance-level labels by the caller:
    return (utt_data, utt_questions)
def generate_from_norm_binary_lab(self, bin_label_file, labdim, outwave, enforce_silence=False, mlpg=True, vuv_thresh=0.5, fzero_scale=1.0):
    """Synthesise a waveform from a pre-normalised binary label file.

    bin_label_file: binary label file readable by get_speech.
    labdim: label dimensionality (columns of the binary file).
    outwave: output waveform filename.
    enforce_silence: zero all stream frames flagged silent by the
        label's silent_feature_indices columns.
    mlpg: apply parameter generation (MLPG); otherwise keep statics only.
    vuv_thresh: frames with vuv <= threshold get F0 forced to 0 (unvoiced).
    fzero_scale: multiplicative F0 scaling factor.

    NOTE(review): removed leftover debug code here -- a put_speech dump
    to a hard-coded personal AFS path followed by sys.exit, which made
    the whole synthesis below unreachable, plus a debug print/sys.exit
    inside the enforce_silence loop.
    """
    lab = get_speech(bin_label_file, labdim)
    output = self.predict(lab, input_normalisation=True)
    streams = self.split_into_streams(output)
    if mlpg:
        mlpged = {}
        for (stream, data) in streams.items():
            if stream in self.indims:
                mlpg_data = self.param_generator.generation(data, self.stream_std[stream], self.indims[stream])
            else:
                ## streams without declared indims (e.g. vuv) pass through:
                mlpg_data = data
            mlpged[stream] = mlpg_data
        streams = mlpged
    else:
        ## take statics only!
        statics = {}
        for (stream, data) in streams.items():
            if stream in self.indims:
                statics[stream] = data[:, :self.indims[stream]]
            else:
                ## for e.g. vuv
                statics[stream] = data
        streams = statics
    if enforce_silence:
        ## Frames whose silence indicator columns sum to 1 are silent;
        ## hoisted out of the loop -- it only depends on the input labels.
        silent_frames = numpy.sum(lab[:, self.silent_feature_indices], axis=1)
        for (stream, data) in streams.items():
            data[silent_frames == 1.0, :] = 0.0
            streams[stream] = data
    if 'lf0' in streams:
        ## Convert log-F0 back to linear F0:
        fzero = numpy.exp(streams['lf0'])
        if 'vuv' in streams:
            vuv = streams['vuv']
            fzero[vuv <= vuv_thresh] = 0.0
        fzero *= fzero_scale
        streams['lf0'] = fzero
    self.world_resynth(streams, outwave)
def world_resynth(self, streams, outfile):
    '''
    Resynthesise a waveform from acoustic parameter streams with the
    WORLD vocoder (via SPTK/WORLD command-line tools in self.hts_dir).
    Refactored this from AcousticModel. TODO: clean up more, and replace
    also in AM. NOTE(review): uses fixed /tmp/tmp.* scratch files, so
    concurrent calls would clobber each other -- confirm before
    parallelising.
    '''
    bin_dir = self.hts_dir  ## world here too
    alpha = self.alpha        # e.g. 0.71
    order = self.mcep_order   # e.g. 59
    sr = self.sample_rate     # e.g. 44100
    fftl = self.fftl

    ## Dump each stream and convert float -> double for the WORLD tools:
    for (stream, data) in streams.items():
        put_speech(data, '/tmp/tmp.%s' % (stream))
        comm = bin_dir + "/x2x +fd /tmp/tmp." + stream + " >/tmp/tmp_d." + stream
        print(comm)
        os.system(comm)

    ## Mel-cepstrum -> linear amplitude spectrum (expects an 'mgc' stream):
    comm = "%s/mgc2sp -a %s -g 0 -m %s -l %s -o 2 /tmp/tmp.mgc | %s/sopr -d 32768.0 -P | %s/x2x +fd -o > /tmp/tmp.spec" % (bin_dir, alpha, order, fftl, bin_dir, bin_dir)
    print(comm)
    os.system(comm)
    '''Avoid:
    x2x : error: input data is over the range of type 'double'!
    -o : clip by minimum and maximum of output data type if input data is over the range of output data type.
    '''
    comm = "%s/synth %s %s /tmp/tmp_d.lf0 /tmp/tmp.spec /tmp/tmp_d.bap /tmp/tmp.resyn.wav" % (bin_dir, fftl, sr)
    print(comm)
    os.system(comm)
    os.system("mv /tmp/tmp.resyn.wav " + outfile)
    print('Produced %s' % (outfile))
def process_utterance(self, utt):
    """Extract WORLD acoustic features (+deltas) from the utterance's
    waveform and write them as a single HTK-headered .cmp file.

    Pipeline: sox normalisation -> WORLD analysis (f0/sp/bap) ->
    mel-cepstra via SPTK mcep -> log-F0 -> delta windows -> hstack
    mgc|lf0|bap -> HTK header -> delete intermediates.
    Silently returns on any tool failure (prints a warning).
    NOTE(review): removed unused local `sample_rate` and renamed local
    `cmp` (shadowed the builtin); shell commands are unchanged.
    """
    ## If there is no waveform attached to the utt, don't do anything:
    if not utt.has_attribute("waveform"):
        return

    ## Add some data to the utt structure recording the structure of the
    ## associated acoustic features we've produced. Do this first, in case
    ## we use existing features.
    self.stream_sizes[1] = '1'  ## otherwise '1 1 1' for F0 TODO: fix this nicely!
    utt.add_acoustic_stream_info(self.feats, self.stream_sizes)

    ## If a feature file already exists, skip:
    if utt.has_external_data(self.output_filetype):
        ## TODO: check description against existing feats?
        return

    ## else extract features
    infile = utt.get("waveform")
    outfile = utt.get_filename(self.output_filetype)

    ## strip suffix .cmp:-
    assert outfile.endswith('.' + self.output_filetype)
    chars_to_strip = len(self.output_filetype) + 1
    outstem = outfile[:-chars_to_strip]

    rate = self.rate
    alpha = self.alpha
    order = self.order
    fftl = self.fftl
    apsize = self.apsize
    frameshift_ms = self.frameshift_ms
    script_dir = self.voice_resources.path[c.SCRIPT]

    ## 1) remove wave header, downsample etc. with sox:
    comm = "sox -t wav " + infile
    comm += " -c 1 -e signed-integer "
    comm += " -r %s" % (rate)
    comm += " -b 16 "
    comm += " " + outstem + ".wav"
    comm += " dither"  ## added for hi and rj data blizz 2014
    success = os.system(comm)   ## NB: 'success' is the shell return code; 0 means OK
    if success != 0:
        print('sox failed on utterance ' + utt.get("utterance_name"))
        return

    ## 2) WORLD analysis -> F0 / spectrum / band aperiodicity (doubles):
    comm = "%s/analysis %s.wav %s.f0.double %s.sp.double %s.bap.double > %s.log" % (
        self.tool, outstem, outstem, outstem, outstem, outstem)
    success = os.system(comm)  # This command is very slow
    if success != 0:
        print('world analysis failed on utterance ' + utt.get("utterance_name"))
        return

    if self.resynthesise_training_data:
        ## resynthesis to test
        comm = "%s/synth %s %s %s.f0.double %s.sp.double %s.bap.double %s.resyn.wav > %s.log" % (
            self.tool, fftl, rate, outstem, outstem, outstem, outstem, outstem)
        success = os.system(comm)
        if success != 0:
            print('world synthesis failed on utterance ' + utt.get("utterance_name"))
            return

    ## 3) WORLD spectrum -> mel-cepstra (scale by 32768 first):
    comm = "%s/x2x +df %s.sp.double | %s/sopr -R -m 32768.0 | %s/mcep -a %s -m %s -l %s -j 0 -f 0.0 -q 3 > %s.mgc" % (
        self.tool, outstem, self.tool, self.tool, alpha, order, fftl, outstem)  ## -e 1.0E-8
    success = os.system(comm)  # This command is very slow
    if success != 0:
        print('conversion of world spectrum to mel cepstra failed on utterance ' + utt.get("utterance_name"))
        return

    ## 4) type conversions of the remaining streams:
    for stream in ['bap']:
        comm = "%s/x2x +df %s.%s.double > %s.%s" % (
            self.tool, outstem, stream, outstem, stream)
        success = os.system(comm)
        if success != 0:
            print('double -> float conversion (stream: ' + stream + ') failed on utterance ' + utt.get("utterance_name"))
            return
    for stream in ['f0']:
        comm = "%s/x2x +da %s.%s.double > %s.%s.txt" % (
            self.tool, outstem, stream, outstem, stream)
        success = os.system(comm)
        if success != 0:
            print('double -> ascii conversion (stream: ' + stream + ') failed on utterance ' + utt.get("utterance_name"))
            return

    ## 5) F0 conversion: log-F0, with -1.0E10 as the 'unvoiced' marker:
    f0 = [float(val) for val in readlist(outstem + '.f0.txt')]
    log_f0 = []
    for val in f0:
        if val == 0.0:
            log_f0.append('-1.0E10')
        else:
            log_f0.append(math.log(val))
    writelist(log_f0, outstem + '.f0.log')
    comm = "%s/x2x +af %s.f0.log > %s.lf0" % (self.tool, outstem, outstem)
    success = os.system(comm)
    if success != 0:
        print('writing log f0 failed on utterance ' + utt.get("utterance_name"))
        return

    ## 6) add mcep/ap/f0 deltas:
    for (stream, dimen) in [('mgc', order + 1), ('bap', apsize), ('lf0', 1)]:
        comm = "perl %s/window.pl %s " % (script_dir, dimen)
        comm += "%s.%s %s > %s.%s.delta" % (outstem, stream, ' '.join(self.winfiles), outstem, stream)
        success = os.system(comm)  # This command is very slow
        if success != 0:
            print('delta (' + stream + ') extraction failed on utterance ' + utt.get("utterance_name"))
            return

    ### combined streams:--
    ap = get_speech(outstem + '.bap.delta', apsize * len(self.winfiles))
    mgc = get_speech(outstem + '.mgc.delta', (order + 1) * len(self.winfiles))
    lf0 = get_speech(outstem + '.lf0.delta', 1 * len(self.winfiles))
    cmp_data = numpy.hstack([mgc, lf0, ap])
    put_speech(cmp_data, outfile)

    ## 7) add header
    floats_per_frame = (order + 2 + apsize) * len(self.winfiles)  ## +2 for energy and F0
    add_htk_header(outfile, floats_per_frame, frameshift_ms)

    ## 8) tidy:
    self.extensions_to_keep = ['.' + self.output_filetype, '.f0.txt']  ## TODO: make configuable?
    self.extensions_to_keep.append('.resyn.wav')
    self.extensions_to_keep.extend(['.mgc', '.bap', '.lf0'])
    keepfiles = [outstem + ending for ending in self.extensions_to_keep]
    for junk in glob.glob(outstem + '.*'):
        if not junk in keepfiles:
            os.remove(junk)
def process_utterance(self, utt):
    ## Synthesise a waveform for the utterance: generate parameter
    ## streams with hts_engine, patch silences, then resynthesise with
    ## the WORLD/SPTK command-line tools.  No-ops if the utterance
    ## already carries a natural waveform or the model is untrained.
    from numpy import loadtxt, savetxt,exp, mean,median
    if utt.has_attribute("waveform"):
        #print "Utt has a natural waveform -- don't synthesise"
        return
    if not self.trained:
        print 'WARNING: Cannot apply processor %s till model is trained'%(self.processor_name)
        return
    #self.postfilter_coeff = self.postfilter_coeff
    #self.scale_var = self.config.get('scale_var','n')
    #self.speech_rate = float(self.config.get('speech_rate',1.0))
    self.model_dir = os.path.join(self.get_location())
    bin_dir = self.voice_resources.path[c.BIN]
    label = utt.get_filename(self.input_label_filetype)
    owave = utt.get_filename(self.output_filetype)
    # generate parameters with hts_engine, one stream at the time
    feats = str.split(self.stream_definitions["STREAM_NAMES"])
    os.system('mkdir -p ./tmp')
    #self.vuv = 0.4
    for f in feats:
        #for f in ['mgc']:
        ## One hts_engine call per stream; only -tm/-mm vary with f, so
        ## duration/lf0 are regenerated identically each time.
        comm = self.hts_dir + '/hts_engine '
        comm += " -td %s/tree-duration.inf "%(self.model_dir)
        comm += " -md %s/duration.pdf "%(self.model_dir)
        comm += " -tf %s/tree-lf0.inf "%(self.model_dir)
        comm += " -mf %s/lf0.pdf "%(self.model_dir)
        comm += " -tm %s/tree-%s.inf "%(self.model_dir, f)
        comm += " -mm %s/%s.pdf "%(self.model_dir,f)
        comm += " -ow ./tmp/tmp.wav"
        comm += " -om ./tmp/tmp.%s"%(f)
        comm += " -of ./tmp/tmp.lf0"
        ## delta window files for both F0 ('f') and spectrum ('m') streams:
        for stream in ['f', 'm']:
            for winfile in self.winfiles:
                comm += " -d%s %s "%(stream, winfile)
        comm += " -b %s "%(self.postfilter_coeff) ## for postfiltering
        comm += " -r %s "%(self.speech_rate)
        #comm += "-r 0.75 "
        #comm += "-p 240 "
        #comm += "-s 48000 "
        comm += " -u %s "%(self.vuv)  ## voiced/unvoiced threshold
        #comm += " -u 0.95 "
        #comm += " -ow %s "%(owave)
        comm += " -ot %s.log "%(label)  ## trace log, re-read below for silences
        comm += " -od ./tmp/tmp.dur "
        comm += " %s "%(label)
        print comm
        os.system(comm)
        if f=='mgc':
            ## keep a copy of hts_engine's own waveform for comparison:
            os.system('cp ./tmp/tmp.wav '+'./tmp/hts.wav')
    #return
    ### hack -- tile silences with pure silence:
    sils = silence_frames_from_trace(label+ '.log')
    fftl, ap_dim = get_world_fft_and_apdim(self.sample_rate)
    fz= get_speech('./tmp/tmp.lf0',1)
    mgc= get_speech('./tmp/tmp.mgc',self.speech_coding_config['order']+1) # 40)
    ap= get_speech('./tmp/tmp.bap',ap_dim)
    for (i,val) in enumerate(sils):
        ## zero out frames flagged silent (fz -1.0 = unvoiced marker):
        if val == 1:
            mgc[i,:] = 0.0
            fz[i] = -1.0
            ap[i] = 0.0
    #ap = np.zeros(np.shape(ap))
    # var sscale:
    if self.scale_var != 1.0:
        mgc = scale_variance(mgc, scale_factor=self.scale_var)
    ## NOTE(review): aperiodicity is zeroed for ALL frames here,
    ## discarding the generated bap entirely -- presumably a deliberate
    ## hack (cf. commented-out line above); confirm before changing.
    ap =np.zeros(np.shape(ap))
    put_speech(fz, './tmp/tmp.lf0')
    put_speech(mgc, './tmp/tmp.mgc')
    put_speech(ap, './tmp/tmp.bap')
    # process parameters -- OSW todo wavesynth processor sharing config with extraction
    f0 = []
    for f in reversed(feats):
        if f == "lf0":
            ## lf0: float -> ascii, exponentiate voiced frames back to
            ## linear F0, clamp unvoiced to 0, then ascii -> double:
            os.system(bin_dir+"/x2x +fa ./tmp/tmp."+f+" >./tmp/tmp_a."+f)
            f0 = loadtxt('./tmp/tmp_a.lf0')
            f0[f0>0]=exp(f0[f0>0])
            f0[f0<=0] = 0
            savetxt("./tmp/tmp_a.f0", f0.astype('float'), fmt = '%.8f')
            os.system(bin_dir+"/x2x +ad ./tmp/tmp_a.f0 > ./tmp/tmp_a.f0.d")
        else:
            ## other streams: plain float -> double conversion:
            os.system(bin_dir+"/x2x +fd ./tmp/tmp."+f+" >./tmp/tmp_d."+f)
    bin = self.hts_dir ## world here too
    alpha = self.alpha
    order = self.mcep_order
    sr = self.sample_rate
    '''
    alpha = 0.77
    order = 59
    fftl = 2048
    sr = 48000
    '''
    print 'h1'
    ## mel-cepstra -> linear spectrum for the WORLD synth tool:
    comm = "%s/mgc2sp -a %s -g 0 -m %s -l %s -o 2 ./tmp/tmp.mgc | %s/sopr -d 32768.0 -P | %s/x2x +fd -o > ./tmp/tmp.spec"%(bin, alpha, order, fftl, bin, bin)
    #comm = "%s/mgc2sp -a %s -g 0 -m %s -l %s -o 2 ./tmp/tmp.mgc | %s/sopr -d 32768.0 -P > ./tmp/tmp.spec"%(bin, alpha, order, fftl, bin, bin)
    os.system(comm)
    '''Avoid:
    x2x : error: input data is over the range of type 'double'!
    -o : clip by minimum and maximum of output data type if input data is over the range of output data type.
    '''
    comm = "%s/synth %s %s ./tmp/tmp_a.f0.d ./tmp/tmp.spec ./tmp/tmp_d.bap ./tmp/tmp.resyn.wav"%(bin, fftl, sr)
    print comm
    os.system(comm)
    os.system("mv ./tmp/tmp.resyn.wav "+owave)