Example #1
def wavelet_manipulation(sequence,
                         std_scaling_factors,
                         scale_distance=0.5,
                         num_octaves=12):

    #   sequence = sequence[:512]
    #    self.scale_distance = float(self.config.get('scale_distance',0.5))
    #    self.num_octaves = int(self.config.get('num_octaves', 12))

    # capetown wavelet package setup
    s0 = 2  # first scale in number of frames
    dj = scale_distance  # distance of bands in octaves
    J = num_octaves  #number of octaves
    maxscale = len(sequence) / (
        2.0**J)  #maximum scale defined as proportion of the signal

    # perform wavelet transform, select appropriate scale
    wavelet_matrix = cwt.MexicanHat(sequence,
                                    maxscale,
                                    int(1 / scale_distance),
                                    scaling="log")

    wavelet_matrix = util.cwt_utils.scale_for_reconstruction(
        wavelet_matrix.getdata(), dj, s0)

    print(np.shape(wavelet_matrix))

    #wavelet_matrix = wavelet_matrix.getdata()

    scales = np.transpose(wavelet_matrix)
    print(np.shape(scales))

    (m, n) = np.shape(scales)
    assert len(
        std_scaling_factors
    ) == n, 'need one std scaling factor for each of %s wavelet scales' % (n)

    means = np.mean(scales, axis=0)

    stds = np.std(scales, axis=0)
    stds = np.maximum(stds, 0.0000001)  ## floor to avoid div by 0 problems

    norm_scales = (scales - means) / stds
    print(np.shape(norm_scales))
    print(np.mean(norm_scales, axis=0))
    print(np.std(norm_scales, axis=0))

    # emphasise / attenuate each scale by its std scaling factor
    norm_scales *= np.array(std_scaling_factors)

    # undo the normalisation, restoring each scale's original mean and std
    denormed = (norm_scales * stds) + means

    # reconstruct the signal by summing over the manipulated scales
    recon = np.sum(denormed, axis=1)

    return recon[:len(sequence)]
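A minimal, self-contained sketch of the normalise / scale / reconstruct step used above, assuming only numpy; the matrix shape and the scaling factors are hypothetical, and the project-local cwt / util.cwt_utils modules are not needed for this illustration.

import numpy as np

# hypothetical (n_frames, n_scales) matrix of wavelet scales
rng = np.random.RandomState(0)
n_frames, n_scales = 200, 6
scales = rng.randn(n_frames, n_scales) * np.arange(1, n_scales + 1)

# one (hypothetical) std scaling factor per wavelet scale
std_scaling_factors = np.array([1.0, 1.0, 2.0, 2.0, 0.5, 0.5])

means = np.mean(scales, axis=0)
stds = np.maximum(np.std(scales, axis=0), 1e-7)   # floor to avoid div by 0

norm_scales = (scales - means) / stds             # z-normalise each scale
norm_scales = norm_scales * std_scaling_factors   # emphasise / attenuate scales
denormed = (norm_scales * stds) + means           # undo normalisation, keeping the scaling

recon = np.sum(denormed, axis=1)                  # reconstruct by summing over scales
print(recon.shape)                                # -> (200,)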
Example #2
    def do_training(self, speech_corpus, text_corpus):

        # fix word scale based on average word duration in corpus
        word_length_sum = 0
        word_count = 0
        for utt in speech_corpus:

            words = utt.xpath(self.level)
            assert words != [], 'Xpath %s does not match any nodes!' % (
                self.level)
            word_count += len(words)
            for w in words:
                word_length_sum += (float(w.get("end")) - float(
                    w.get("start"))) / int(self.frame_len)

        mean_word_length = word_length_sum / word_count
        # perform dummy wavelet transform to determine the scale closest to the average word duration
        maxscale = (1000 / (2.0**10))
        scales = cwt.MexicanHat(np.zeros(1000),
                                maxscale,
                                int(1 / self.scale_distance),
                                scaling="log").scales * 2
        self.wscale = np.abs(scales - mean_word_length).argmin() - 1

        # get variance (here, the mean per-utterance std) of acoustic features
        # should get wscale variance as well
        max_utt = 100
        utt_i = 0
        self.variances = defaultdict(float)
        self.word_variances = defaultdict(float)
        for utt in speech_corpus:
            if utt_i == max_utt:
                break
            feats = self._process_acoustic_feats(utt)
            for f in self.feats:
                self.variances[f] += np.std(feats[f])

            utt_i += 1

        for f in self.feats:
            self.variances[f] /= utt_i

        print(self.variances)
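A standalone illustration (numpy only, hypothetical values) of the closest-scale lookup above: given the lengths of the wavelet scales in frames, pick the index of the scale nearest to the mean word duration. In the code above, the actual scale lengths come from cwt.MexicanHat(...).scales * 2.

import numpy as np

# hypothetical wavelet scale lengths in frames, one per octave
scale_lengths = 2.0 * 2.0 ** np.arange(1, 11)    # 4, 8, 16, ..., 2048 frames

mean_word_length = 55.0                           # hypothetical mean word duration in frames

# index of the scale closest to the mean word duration; the code above
# subtracts 1, i.e. steps down to the next-finer scale
wscale = np.abs(scale_lengths - mean_word_length).argmin() - 1
print(wscale, scale_lengths[wscale])              # -> 3 32.0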
Example #3
    def process_utterance(self, utt):

        ## Only apply in training, where an utterance has a waveform / audio:
        if not (utt.has_external_data("wav") or utt.has_external_data("cmp")):
            return

        # load and process acoustic features
        feats = self._process_acoustic_feats(utt)

        for f in feats:
            feats[f] = util.cwt_utils.normalize(feats[f], self.variances[f])

        # duration-energy integration is more usable than raw duration
#         if 'dur' in feats:
#             feats['dur'] = (feats['dur']+feats['Gain'][:len(feats['dur'])])/2.0
#
        word_nodes = utt.xpath(self.level)
        assert word_nodes != [], 'Xpath %s does not match any nodes!' % (
            self.level)
        words = self._get_durations(utt, self.level)

        # capetown wavelet package setup
        s0 = 2  # first scale in number of frames
        dj = self.scale_distance  # distance of bands in octaves
        J = self.num_octaves  #number of octaves
        maxscale = len(feats[self.fzero_feat]) / (
            2.0**J)  #maximum scale defined as proportion of the signal

        if CWT_DEBUG:
            pylab.clf()
            for f in feats:
                pylab.plot(feats[f], label=f)
                util.cwt_utils.plot_labels(words, shift=-3)
            raw_input()
            pylab.clf()

        # perform wavelet transform, select appropriate scale and calculate peak heights
        prominences = {}

        if self.dynamic_size_wavelet:

            seg_length_sum = 0
            segs = utt.xpath(self.level)
            assert segs != [], 'Xpath %s does not match any nodes!' % (
                self.level)
            seg_count = len(segs)
            for w in segs:
                seg_length_sum += (float(w.get("end")) -
                                   float(w.get("start"))) / int(self.frame_len)

            mean_seg_length = seg_length_sum / seg_count
            # perform dummy wavelet transform to determine the scale closest to the average segment duration
            maxscale = (1000 / (2.0**10))
            scales = cwt.MexicanHat(np.zeros(1000),
                                    maxscale,
                                    int(1 / self.scale_distance),
                                    scaling="log").scales * 2
            scale_to_use = np.abs(scales - mean_seg_length).argmin() - 1
        else:
            scale_to_use = self.wscale

        i = 1
        for f in feats:
            # perform wavelet transform
            wavelet_matrix = cwt.MexicanHat(feats[f],
                                            maxscale,
                                            int(1 / self.scale_distance),
                                            scaling="log")
            wavelet_matrix = util.cwt_utils.scale_for_reconstruction(
                wavelet_matrix.getdata(), dj, s0)

            # get peaks from word scale
            #prom_param = util.cwt_utils.normalize(wavelet_matrix[wscale].astype(float))
            prom_param = wavelet_matrix[scale_to_use].astype(float)

            prominences[f] = util.cwt_utils.calc_prominence(prom_param,
                                                            words,
                                                            np.max,
                                                            use_peaks=True)

            if CWT_DEBUG:

                pylab.ylim(0, 7)  # -5,20)
                pylab.title(f)
                pylab.plot(feats[f] + i * 3, label="orig", color='gray')
                pylab.plot(prom_param + i * 3, label=f, color='red')
                util.cwt_utils.plot_labels(words)
                util.cwt_utils.plot_prom_labels(words, prominences[f], shift=i)
                #os.system("afplay %s" %utt.get("waveform"))

            if self.use_stress_track:
                stress_track = self._get_stress_track(utt)

                weighted_track = prom_param * stress_track

                if CWT_DEBUG:
                    pylab.plot(stress_track + i * 4,
                               label="stress",
                               color='green')

                    pylab.plot(weighted_track + i * 4,
                               color='blue',
                               label='weighted by stress')

            i = i + 1
        if CWT_DEBUG:
            raw_input()
        # combine measurements and add prominence attribute to words

        for i in range(len(words)):

            prominence = 0
            feat_i = 0
            for f in feats:
                prominence += prominences[f][i] * float(
                    self.prom_weights[feat_i]) * 1.2
                feat_i += 1
            if prominence > 3:
                prominence = 3
            if prominence < 0:
                prominence = 0
            # quantization for HTS decision trees; for DNNs, do not round

            word_nodes[i].set(self.output_attribute,
                              str(int(round(prominence))))