def wavelet_manipulation(sequence, std_scaling_factors, scale_distance=0.5, num_octaves=12):
    """Manipulate a signal in the wavelet domain by rescaling band variances.

    Decomposes *sequence* with a Mexican-hat continuous wavelet transform,
    z-normalises each wavelet band, multiplies each band by the matching
    entry of *std_scaling_factors*, de-normalises, and reconstructs by
    summing the bands.

    :param sequence: 1-D signal (e.g. an F0 track); must support len().
    :param std_scaling_factors: one multiplier per wavelet band; an assert
        checks the count matches the number of bands produced.
    :param scale_distance: distance of bands in octaves (dj).
    :param num_octaves: number of octaves; maximum scale is
        len(sequence) / 2**num_octaves.
    :return: reconstructed signal truncated to len(sequence).
    """
    # capetown wavelet package setup
    s0 = 2                     # first scale in number of frames
    dj = scale_distance        # distance of bands in octaves
    maxscale = len(sequence) / (2.0 ** num_octaves)  # max scale as proportion of the signal

    # perform wavelet transform and scale it for direct summation-reconstruction
    wavelet_matrix = cwt.MexicanHat(sequence, maxscale,
                                    int(1 / scale_distance), scaling="log")
    wavelet_matrix = util.cwt_utils.scale_for_reconstruction(
        wavelet_matrix.getdata(), dj, s0)

    scales = np.transpose(wavelet_matrix)  # -> (time, band)
    (m, n) = np.shape(scales)
    assert len(
        std_scaling_factors
    ) == n, 'need one std scaling factor for each of %s wavelet scales' % (n)

    # z-normalise each band over time
    means = np.mean(scales, axis=0)
    stds = np.std(scales, axis=0)
    stds = np.maximum(stds, 0.0000001)  # floor to avoid div by 0 problems
    norm_scales = (scales - means) / stds

    # BUG FIX: previously the scaling factors were asserted but never applied,
    # and the reconstruction summed the untouched bands, so the function was a
    # no-op. Apply the per-band scaling, then de-normalise and reconstruct.
    norm_scales = norm_scales * np.array(std_scaling_factors)
    denormed = (norm_scales * stds) + means
    recon = np.sum(denormed, axis=1)
    return recon[:len(sequence)]
def do_training(self, speech_corpus, text_corpus):
    """Fit the labeller on *speech_corpus*.

    Two passes over the corpus:
      1. pick ``self.wscale``: the wavelet scale closest to the corpus-wide
         mean duration (in frames) of the nodes matched by ``self.level``,
         found via a dummy Mexican-hat transform on a zero signal;
      2. fill ``self.variances`` with the mean per-utterance standard
         deviation of each acoustic feature, capped at 100 utterances.

    ``text_corpus`` is accepted for interface compatibility; unused here.
    """
    # Pass 1: average node (word) duration in frames across the corpus.
    frame_total = 0
    node_total = 0
    for utt in speech_corpus:
        nodes = utt.xpath(self.level)
        assert nodes != [], 'Xpath %s does not match any nodes!' % (
            self.level)
        node_total += len(nodes)
        for node in nodes:
            span = float(node.get("end")) - float(node.get("start"))
            frame_total += span / int(self.frame_len)
    mean_word_length = frame_total / node_total

    # Dummy wavelet transform on a 1000-frame zero signal: its scale vector
    # tells us which band lies closest to the average word duration.
    maxscale = 1000 / (2.0**10)
    dummy = cwt.MexicanHat(np.zeros(1000), maxscale,
                           int(1 / self.scale_distance), scaling="log")
    self.wscale = np.abs(dummy.scales * 2 - mean_word_length).argmin() - 1

    # Pass 2: per-feature spread, averaged over at most 100 utterances.
    # (should get wscale variance as well)
    max_utt = 100
    seen = 0
    self.variances = defaultdict(float)
    self.word_variances = defaultdict(float)
    for utt in speech_corpus:
        if seen == max_utt:
            break
        feats = self._process_acoustic_feats(utt)
        for feat_name in self.feats:
            self.variances[feat_name] += np.std(feats[feat_name])
        seen += 1
    for feat_name in self.feats:
        self.variances[feat_name] /= seen
    print(self.variances)
def process_utterance(self, utt):
    """Annotate every node matched by ``self.level`` with a prominence value.

    Normalises the utterance's acoustic features, runs a Mexican-hat
    continuous wavelet transform on each, selects one wavelet scale (either
    the corpus-fitted ``self.wscale`` or a per-utterance dynamic choice),
    measures per-word peak heights on that scale, and writes the weighted,
    clipped, rounded sum into ``self.output_attribute`` on each word node.
    """
    ## Only apply in training, where an utterance has a waveform / audio:
    if not (utt.has_external_data("wav") or utt.has_external_data("cmp")):
        return

    # load and process acoustic features, normalised by the per-feature
    # spreads estimated during training (self.variances)
    feats = self._process_acoustic_feats(utt)
    for f in feats:
        feats[f] = util.cwt_utils.normalize(feats[f], self.variances[f])

    # duration energy integration more usable than raw duration
    # if 'dur' in feats:
    #     feats['dur'] = (feats['dur']+feats['Gain'][:len(feats['dur'])])/2.0

    word_nodes = utt.xpath(self.level)
    assert word_nodes != [], 'Xpath %s does not match any nodes!' % (
        self.level)
    words = self._get_durations(utt, self.level)

    # capetown wavelet package setup
    s0 = 2  # first scale in number of frames
    dj = self.scale_distance  # distance of bands in octaves
    J = self.num_octaves  # number of octaves
    maxscale = len(feats[self.fzero_feat]) / (
        2.0**J)  # maximum scale defined as proportion of the signal

    # debug view: normalised feature tracks against word labels
    if CWT_DEBUG:
        pylab.clf()
        for f in feats:
            pylab.plot(feats[f], label=f)
        util.cwt_utils.plot_labels(words, shift=-3)
        raw_input()  # pause until keypress (Python 2)
        pylab.clf()

    # perform wavelet transform, select appropriate scale and calculate peak heights
    prominences = {}
    if self.dynamic_size_wavelet:
        # dynamic choice: pick the wavelet scale closest to THIS utterance's
        # mean segment duration (in frames), mirroring do_training's fit
        seg_length_sum = 0
        segs = utt.xpath(self.level)
        assert segs != [], 'Xpath %s does not match any nodes!' % (
            self.level)
        seg_count = len(segs)
        for w in segs:
            seg_length_sum += (float(w.get("end")) -
                               float(w.get("start"))) / int(self.frame_len)
        mean_seg_length = seg_length_sum / seg_count
        # perform dummy wavelet transform to determine scale closest to average word duration
        maxscale = (1000 / (2.0**10))
        scales = cwt.MexicanHat(np.zeros(1000), maxscale,
                                int(1 / self.scale_distance),
                                scaling="log").scales * 2
        scale_to_use = np.abs(scales - mean_seg_length).argmin() - 1
    else:
        # corpus-wide scale fitted once in do_training
        scale_to_use = self.wscale

    i = 1  # vertical offset index for the debug plots below
    for f in feats:
        # perform wavelet transform
        wavelet_matrix = cwt.MexicanHat(feats[f], maxscale,
                                        int(1 / self.scale_distance),
                                        scaling="log")
        wavelet_matrix = util.cwt_utils.scale_for_reconstruction(
            wavelet_matrix.getdata(), dj, s0)
        # get peaks from word scale
        #prom_param = util.cwt_utils.normalize(wavelet_matrix[wscale].astype(float))
        prom_param = wavelet_matrix[scale_to_use].astype(float)
        prominences[f] = util.cwt_utils.calc_prominence(prom_param,
                                                        words,
                                                        np.max,
                                                        use_peaks=True)
        if CWT_DEBUG:
            pylab.ylim(0, 7)  # -5,20)
            pylab.title(f)
            pylab.plot(feats[f] + i * 3, label="orig", color='gray')
            pylab.plot(prom_param + i * 3, label=f, color='red')
            util.cwt_utils.plot_labels(words)
            util.cwt_utils.plot_prom_labels(words, prominences[f], shift=i)
            #os.system("afplay %s" %utt.get("waveform"))
        if self.use_stress_track:
            # NOTE(review): weighted_track is only plotted, never folded back
            # into prominences -- presumably debug-only; confirm before use
            stress_track = self._get_stress_track(utt)
            weighted_track = prom_param * stress_track
            if CWT_DEBUG:
                pylab.plot(stress_track + i * 4, label="stress",
                           color='green')
                pylab.plot(weighted_track + i * 4,
                           color='blue',
                           label='weighted by stress')
        i = i + 1
    if CWT_DEBUG:
        raw_input()

    # combine measurements and add prominence attribute to words
    for i in range(len(words)):
        prominence = 0
        feat_i = 0
        for f in feats:
            # weighted sum of per-feature prominences; 1.2 appears to be an
            # empirical gain factor -- TODO confirm its provenance
            prominence += prominences[f][i] * float(
                self.prom_weights[feat_i]) * 1.2
            feat_i += 1
        # clip to [0, 3]
        if prominence > 3:
            prominence = 3
        if prominence < 0:
            prominence = 0
        # quantization for HTS decision trees, for DNNs, do not round
        word_nodes[i].set(self.output_attribute,
                          str(int(round(prominence))))