def state_durations_from_trace(fname):
    """Return per-phone state durations parsed from a trace file.

    Lines of *fname* containing 'Name' are treated as phone entries (the
    label is the third '-'-separated field, cut at the first '+'), and lines
    containing '(frames)' as state entries whose first integer is a duration
    in frames.  The first '(frames)' entry is discarded.

    Returns a numpy array with one row per phone and one column per state
    (5 states per phone).

    Raises AssertionError if the number of state durations is not exactly
    5 times the number of phones.
    """
    states_per_phone = 5

    lines = readlist(fname)
    phones = [l.split('-')[2].split('+')[0] for l in lines if 'Name' in l]
    frames = [int(re.findall(r'\d+', l)[0])
              for l in lines if '(frames)' in l][1:]

    # Compare by multiplication: under Python 2, len(frames)/5 floors, so a
    # ragged state count would pass the check and fail later with IndexError.
    assert len(phones) * states_per_phone == len(frames)

    outdata = [frames[start:start + states_per_phone]
               for start in range(0, len(frames), states_per_phone)]
    return numpy.array(outdata)
def load_letternames(self):
    """Populate self.letternames from the file named by self.letter_fname.

    Each non-empty line is split on its first run of whitespace into a
    letter and its pronunciation; self.letternames maps letter -> pron.
    A later line for the same letter overwrites an earlier one.

    Raises ValueError if a non-empty line has no whitespace separator.
    """
    data = readlist(self.letter_fname)
    self.letternames = {}
    for line in data:
        line = line.strip(' \n')
        if not line:
            # Blank lines (e.g. a trailing newline) carry no entry; without
            # this guard the two-way unpacking below raises ValueError.
            continue
        letter, pron = re.split(r'\s+', line, maxsplit=1)
        self.letternames[letter] = pron
def silence_frames_from_trace(fname):
    """Return a per-frame 0/1 silence mask parsed from a trace file.

    Lines of *fname* containing 'Name' are treated as phone entries (the
    label is the third '-'-separated field, cut at the first '+'), and lines
    containing '(frames)' as state entries whose first integer is a duration
    in frames.  The first '(frames)' entry is discarded; each phone owns 5
    consecutive state durations.

    Returns an integer numpy array whose length is the total frame count,
    holding 1 for frames of phones labelled '_END_' or 'sil' and 0 elsewhere.

    Raises AssertionError if the number of state durations is not exactly
    5 times the number of phones.
    """
    states_per_phone = 5
    silent_labels = ('_END_', 'sil')

    lines = readlist(fname)
    phones = [l.split('-')[2].split('+')[0] for l in lines if 'Name' in l]
    durations = [int(re.findall(r'\d+', l)[0])
                 for l in lines if '(frames)' in l][1:]

    # Compare by multiplication: under Python 2, len(durations)/5 floors, so
    # a ragged state count would pass the check and fail later.
    assert len(phones) * states_per_phone == len(durations)

    silence = np.zeros(sum(durations), dtype=int)
    start = 0
    for state, length in enumerate(durations):
        # State k belongs to phone k // states_per_phone.
        if phones[state // states_per_phone] in silent_labels:
            silence[start:start + length] = 1
        start += length
    return silence
def get_silent_feature_indices(self, question_file_name, silence_pattern):
    """Return indices of the questions that mention *silence_pattern*.

    The question file is read with readlist; empty lines and lines
    containing 'CQS' are dropped before indexing, so the returned indices
    refer to positions in the remaining list.  Every match is echoed to
    stdout.
    """
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print('get_silent_feature_indices')
    questions = [q for q in readlist(question_file_name)
                 if q != '' and 'CQS' not in q]
    indices = []
    for (i, question) in enumerate(questions):
        if silence_pattern in question:
            indices.append(i)
            print('silence question found:')
            print(question)
    return indices
def load_lexicon(self):
    """Read self.lexicon_fname into self.entries.

    Every line must hold exactly three tab-separated fields: head word,
    comma-separated tags, pronunciation.  self.entries maps each head word
    to a list of (tag_list, pron) tuples in file order.
    """
    assert os.path.isfile(self.lexicon_fname)
    records = readlist(self.lexicon_fname)
    self.entries = {}
    for record in records:
        head, tag_field, pron = record.split('\t')
        tags = tag_field.split(',')
        # Several lines may share a head word; keep all of them.
        self.entries.setdefault(head, []).append((tags, pron))
def load_stream_info(self):
    """Load stream names and dimensions from <model_dir>/stream_info.txt.

    The file must contain exactly four space-separated lines, in order:
    input stream names, input dimensions, output stream names, output
    dimensions.  Sets self.instreams / self.outstreams (lists of names) and
    self.indims / self.outdims (name -> int dimension).  The output
    dimensions must sum to self.outdim.
    """
    info_path = os.path.join(self.model_dir, 'stream_info.txt')
    assert os.path.isfile(info_path)

    rows = [row.split(' ') for row in readlist(info_path)]
    assert len(rows) == 4
    (self.instreams, in_sizes, self.outstreams, out_sizes) = rows

    in_sizes = list(map(int, in_sizes))
    out_sizes = list(map(int, out_sizes))

    ## note that indims are not network input, but input to acoustic
    ## preprocessing of data!
    assert self.outdim == sum(out_sizes)

    self.indims = {name: size
                   for (name, size) in zip(self.instreams, in_sizes)}
    self.outdims = {name: size
                    for (name, size) in zip(self.outstreams, out_sizes)}
def load_lexicon(self):
    """Read self.lexicon_fname into self.entries and self.phone_inventory.

    Every line must hold exactly two tab-separated fields: head word and a
    space-separated pronunciation.  self.entries maps head word -> pron
    string; self.phone_inventory lists each distinct phone once, in order
    of first appearance.
    """
    ## assume one entry per head word -- take first if multiple
    assert os.path.isfile(self.lexicon_fname)
    items = readlist(self.lexicon_fname)
    self.entries = {}
    self.phone_inventory = []
    for item in items:
        (head, pron) = item.split('\t')
        if head not in self.entries:
            self.entries[head] = pron
        # NOTE(review): the original indentation was lost; the inventory is
        # assumed here to be collected from every line, including
        # pronunciations of repeated head words that are not stored in
        # self.entries -- confirm against the upstream source.
        phones = pron.split(' ')
        for phone in phones:
            if phone not in self.phone_inventory:
                self.phone_inventory.append(phone)
def load_extra_lexicon(self, extra_lex):
    """Merge the entries of lexicon file *extra_lex* into self.entries.

    Lines starting with '#' and lines that are empty or whitespace-only are
    skipped.  Every other line must hold exactly three tab-separated
    fields: head word, comma-separated tags, pronunciation.  A
    pronunciation without a '|' is passed through self.syllabify first.
    Each entry is appended as (tag_list, pron) to the list stored under its
    head word; self.entries must already exist.
    """
    assert os.path.isfile(extra_lex), 'not file: ' + extra_lex
    items = readlist(extra_lex)
    for item in items:
        # Raw string: '\A', '\s' and '\Z' are not valid str escapes.
        if item.startswith('#') or re.match(r'\A\s*\Z', item):
            continue
        (head, tag, pron) = item.split('\t')
        tag = tag.split(',')
        if '|' not in pron:
            pron = self.syllabify(pron)
        self.entries.setdefault(head, []).append((tag, pron))
def process_utterance(self, utt):
    """Extract acoustic features for *utt* by running external tools.

    Does nothing if the utterance has no "waveform" attribute.  Otherwise
    the stream description is attached to the utterance first and, unless
    a feature file of type self.output_filetype already exists, the
    waveform is converted with sox, analysed and converted by the binaries
    under self.tool, given deltas by window.pl, and the mgc / lf0 / bap
    streams are stacked into the output file, which then receives an HTK
    header.  Intermediate files sharing the output stem are deleted at the
    end, except for a fixed list of extensions.

    If any external command exits non-zero, a message is printed and the
    method returns early, leaving intermediate files in place.  Always
    returns None.
    """

    ## If there is no waveform attached to the utt, don't do anything:
    if not utt.has_attribute("waveform"):
        return

    ## Add some data to the utt structure recording the structure of the
    ## associated acoustic features we've produced. Do this first, in case
    ## we use existing features.
    self.stream_sizes[1] = '1'  ## otherwise '1 1 1' for F0 TODO: fix this nicely!
    utt.add_acoustic_stream_info(self.feats, self.stream_sizes)

    ## If a feature file already exists, skip:
    if utt.has_external_data(self.output_filetype):
        ## TODO: check description against existing feats?
        return

    ## else extract features
    infile = utt.get("waveform")
    outfile = utt.get_filename(self.output_filetype)

    ## strip the '.<output_filetype>' suffix to get the stem shared by all
    ## intermediate files:
    suffix = '.' + self.output_filetype
    assert outfile.endswith(suffix)
    outstem = outfile[:-len(suffix)]

    rate = self.rate
    alpha = self.alpha
    order = self.order
    fftl = self.fftl
    apsize = self.apsize
    frameshift_ms = self.frameshift_ms
    script_dir = self.voice_resources.path[c.SCRIPT]

    def run(comm, what):
        """Run *comm* in a shell; on non-zero exit report *what* and return False."""
        if os.system(comm) != 0:
            print(what + ' failed on utterance ' + utt.get("utterance_name"))
            return False
        return True

    ## 1) remove wave header, downsample etc. with sox:
    comm = "sox -t wav " + infile
    comm += " -c 1 -e signed-integer "
    comm += " -r %s" % (rate)
    comm += " -b 16 "
    comm += " " + outstem + ".wav"
    comm += " dither"  ## added for hi and rj data blizz 2014
    if not run(comm, 'sox'):
        return

    ## 2) analysis into f0 / spectrum / aperiodicity (this command is very slow):
    comm = "%s/analysis %s.wav %s.f0.double %s.sp.double %s.bap.double > %s.log" % (
        self.tool, outstem, outstem, outstem, outstem, outstem)
    if not run(comm, 'world analysis'):
        return

    if self.resynthesise_training_data:
        ## resynthesis to test
        comm = "%s/synth %s %s %s.f0.double %s.sp.double %s.bap.double %s.resyn.wav > %s.log" % (
            self.tool, fftl, rate, outstem, outstem, outstem, outstem, outstem)
        if not run(comm, 'world synthesis'):
            return

    ## 3) spectrum -> mel cepstra (this command is very slow):
    comm = "%s/x2x +df %s.sp.double | %s/sopr -R -m 32768.0 | %s/mcep -a %s -m %s -l %s -j 0 -f 0.0 -q 3 > %s.mgc" % (
        self.tool, outstem, self.tool, self.tool, alpha, order, fftl, outstem)  ## -e 1.0E-8
    if not run(comm, 'conversion of world spectrum to mel cepstra'):
        return

    ## 4) type conversions of the remaining streams:
    for stream in ['bap']:
        comm = "%s/x2x +df %s.%s.double > %s.%s" % (
            self.tool, outstem, stream, outstem, stream)
        if not run(comm, 'double -> float conversion (stream: ' + stream + ')'):
            return

    for stream in ['f0']:
        comm = "%s/x2x +da %s.%s.double > %s.%s.txt" % (
            self.tool, outstem, stream, outstem, stream)
        if not run(comm, 'double -> ascii conversion (stream: ' + stream + ')'):
            return

    ## 5) F0 conversion: an f0 of exactly 0.0 is written as the literal
    ## string '-1.0E10' instead of a logarithm.
    f0 = [float(val) for val in readlist(outstem + '.f0.txt')]
    log_f0 = ['-1.0E10' if val == 0.0 else math.log(val) for val in f0]
    writelist(log_f0, outstem + '.f0.log')

    comm = "%s/x2x +af %s.f0.log > %s.lf0" % (self.tool, outstem, outstem)
    if not run(comm, 'writing log f0'):
        return

    ## 6) add mcep/ap/f0 deltas (these commands are very slow):
    for (stream, dimen) in [('mgc', order + 1), ('bap', apsize), ('lf0', 1)]:
        comm = "perl %s/window.pl %s " % (script_dir, dimen)
        comm += "%s.%s %s > %s.%s.delta" % (
            outstem, stream, ' '.join(self.winfiles), outstem, stream)
        if not run(comm, 'delta (' + stream + ') extraction'):
            return

    ### combined streams:--
    nwindows = len(self.winfiles)
    ap = get_speech(outstem + '.bap.delta', apsize * nwindows)
    mgc = get_speech(outstem + '.mgc.delta', (order + 1) * nwindows)
    lf0 = get_speech(outstem + '.lf0.delta', 1 * nwindows)
    combined = numpy.hstack([mgc, lf0, ap])
    put_speech(combined, outfile)

    ## 7) add header
    floats_per_frame = (order + 2 + apsize) * nwindows  ## +2 for energy and F0
    add_htk_header(outfile, floats_per_frame, frameshift_ms)

    ## 8) tidy: delete every file sharing the stem except the kept extensions.
    ## TODO: make configuable?
    self.extensions_to_keep = [
        '.' + self.output_filetype, '.f0.txt', '.resyn.wav',
        '.mgc', '.bap', '.lf0',
    ]
    keepfiles = set(outstem + ending for ending in self.extensions_to_keep)
    for junk in glob.glob(outstem + '.*'):
        if junk not in keepfiles:
            os.remove(junk)
def main_work():
    """Command-line entry point: synthesise each given text file with a voice.

    Parses the command line, builds a Voice for the requested speaker,
    language, configuration and stage, then calls voice.synth_utterance
    once per input file.  Waveforms go to the -o directory, or to
    <voice location>/output/wav when -o is not given; utterance files are
    written only when -u is given.

    NOTE(review): the help text for 'files' mentions reading from stdin,
    but with no files on the command line the loop below does nothing.
    """

    #################################################

    # ROOT is the parent of the directory that contains this file (i.e. the
    # directory above the 'scripts' folder), with a trailing slash.
    ROOT = os.path.split(
        os.path.realpath(
            os.path.abspath(
                os.path.dirname(inspect.getfile(
                    inspect.currentframe())))))[0] + '/'

    dirs = {
        'ROOT': ROOT,
        'CONFIG': ROOT + "configs/",
        'VOICES': ROOT + "voices/",
        'TRAIN': ROOT + "train/",
        'RULES': ROOT + "rules/",
        'CORPUS': ROOT + "corpus/",
        'BIN': ROOT + "/tools/bin/"
    }

    # ======== Get stuff from command line ==========

    a = ArgumentParser()
    a.add_argument('-s', dest='speaker', required=True,
                   help="the name of the speaker: <ROOT>/corpus/<LANG>/<SPEAKER>")
    a.add_argument('-l', dest='lang', required=True,
                   help="the language of the speaker: <ROOT>/corpus/<LANG>")
    a.add_argument('-o', dest='output', required=False, default=False,
                   help="output audio here")
    a.add_argument('-t', dest='stage', required=False, default="runtime",
                   help=""" defines the current usage stage (definitions of stages should by found in <config>/recipe.cfg""")
    a.add_argument('-u', dest='output_utt', required=False, default=False,
                   help="output utt files here")
    a.add_argument('-play', dest='play', action="store_true", required=False, default=False,
                   help=" play audio after synthesis")
    a.add_argument('-lab', dest='make_label', action="store_true", default=False,
                   help="make label file as well as wave in output location")
    a.add_argument('config',
                   help="""configuration to use: naive, semi-naive, gold, as defined in <ROOT>/recipes/<config> -directory""")
    a.add_argument('files', nargs='*',
                   help="text files to speak, reading from stdin by default")
    opts = a.parse_args()

    voice_location = os.path.join(dirs['VOICES'], opts.lang, opts.speaker,
                                  opts.config)

    ## Make Voice object to contain voice elements trained on this corpus:
    voice = Voice(opts.speaker, opts.lang, opts.config, opts.stage, dirs)

    if not opts.output:
        output_dir = os.path.join(voice_location, 'output', 'wav')
    else:
        output_dir = opts.output

    if opts.output_utt:
        if not os.path.isdir(opts.output_utt):
            os.makedirs(opts.output_utt)

    output_extensions = []
    if opts.make_label:
        output_extensions.append('dnn_lab')

    for filename in opts.files:
        base = get_basename(filename)
        output_wavefile = os.path.join(output_dir, base + '.wav')

        text = ' '.join(readlist(filename))

        # Echoing the text is best effort: characters the console encoding
        # cannot represent must not abort synthesis.  Only encoding errors
        # are swallowed, so e.g. Ctrl-C still propagates.
        try:
            print(text)
        except UnicodeError:
            print(' ')  ## weird characters
        print(base)

        synth_args = {
            'output_wavefile': output_wavefile,
            'output_extensions': output_extensions,
        }
        if opts.output_utt:
            synth_args['output_uttfile'] = os.path.join(opts.output_utt,
                                                        base + '.utt')
        voice.synth_utterance(text, **synth_args)
def load_onsets(self):
    """Read self.onsets_fname into self.onsets.

    Each line is split on single spaces into a tuple; self.onsets maps
    every such tuple to itself, so it works as a membership table keyed by
    the tuple.
    """
    onset_lines = readlist(self.onsets_fname)
    self.onsets = {}
    for onset_line in onset_lines:
        onset = tuple(onset_line.split(' '))
        self.onsets[onset] = onset