def build_aa_dataset(in_samples, out_samples, shift, n_train=100, n_valid=10):
    aa_seqs = np.load('/data/lisa/data/timit/readable/per_phone/wav_aa.npy')
    
    mean = np.mean(np.hstack(aa_seqs))
    std = np.std(np.hstack(aa_seqs))
    print "mean:%f , std:%f"%(mean,std)
    aa_max,aa_min = np.max(np.hstack(aa_seqs)), np.min(np.hstack(aa_seqs))

    norm_seqs = np.asarray([(seq.astype('float32')-mean)/std \
                            for seq in aa_seqs])
    # n_seq = norm_seqs.shape[0]
    # n_train = n_seq*9/10
    # train_aa_seqs = norm_seqs[:n_train]
    # valid_aa_seqs = norm_seqs[n_train:]

    # n_train = 100
    # n_valid = 10
    train_aa_seqs = norm_seqs[:n_train]
    valid_aa_seqs = norm_seqs[n_train:n_train+n_valid]
    
    print 'train sequences:', train_aa_seqs.shape[0]
    print 'valid sequences:', valid_aa_seqs.shape[0]

    frame_len = in_samples + out_samples
    overlap = frame_len - shift
    
    train_samples = []
    valid_samples = []
    
    for wav_seq in train_aa_seqs:
        train_samples.append(segment_axis(wav_seq, frame_len, overlap))
    train_samples = np.vstack(train_samples[:])

    np.random.seed(123)
    train_samples = np.random.permutation(train_samples)
    
    for wav_seq in valid_aa_seqs:
        valid_samples.append(segment_axis(wav_seq, frame_len, overlap))
    valid_samples = np.vstack(valid_samples[:])
        
    print 'train examples:', train_samples.shape
    print 'valid examples:', valid_samples.shape
    train_x = train_samples[:,:in_samples]
    train_y = train_samples[:,in_samples:]
    print train_x.shape, train_y.shape

    valid_x = valid_samples[:,:in_samples]
    valid_y = valid_samples[:,in_samples:]
    print valid_x.shape, valid_y.shape


    return utils.shared_dataset(train_x), \
           utils.shared_dataset(train_y), \
           utils.shared_dataset(valid_x), \
           utils.shared_dataset(valid_y)
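# Hypothetical usage sketch (assumes the hard-coded TIMIT per-phone .npy file,
# segment_axis and a utils module providing shared_dataset, as used above):
# predict the next waveform sample of an 'aa' phone from the 240 preceding ones.
# train_x, train_y, valid_x, valid_y = build_aa_dataset(
#     in_samples=240, out_samples=1, shift=1, n_train=100, n_valid=10)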
Example #2
def get_emph_spec(audio, nperseg=256, noverlap=96, nfft=512, fs=16000):
    # Function to generate the emphasized spectrogram
    prefac = 0.97
    w = hamming(nperseg, sym=0)
    extract = preemp(audio, prefac)
    framed = segment_axis(extract, nperseg, noverlap) * w
    spec = np.abs(fft(framed, nfft, axis=-1))
    return spec
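# Hypothetical usage sketch (assumes preemp, hamming and segment_axis from
# scikits.talkbox and fft are importable, as in the function above):
# emphasized magnitude spectrogram of one second of 16 kHz noise.
import numpy as np
dummy_audio = np.random.randn(16000)
spec = get_emph_spec(dummy_audio, nperseg=256, noverlap=96, nfft=512, fs=16000)
# spec has one row per 256-sample frame (160-sample hop) and nfft columns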
Example #3
def pro_signal(signal, window='hanning', frame_len=1024, overlap=512):
    if window == 'hanning':
        # w = np.hanning(frame_len)
        w = sqrt_hann(frame_len)
    else:
        w = window
    y = segment_axis(signal, frame_len, overlap=overlap,
                     end='cut')  # use cut instead of pad
    y = w * y
    return y
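# Hypothetical usage sketch (assumes sqrt_hann and segment_axis are defined as
# in the surrounding code): split a signal into windowed, half-overlapping
# 1024-sample frames, e.g. as the analysis step of an STFT.
import numpy as np
sig = np.random.randn(16000)
frames = pro_signal(sig, window='hanning', frame_len=1024, overlap=512)
# frames has shape (n_frames, 1024); each row is already multiplied by the window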
Example #4
    def frames(self, utterance, framelen, overlap):
        phtimes = self.phone_times(utterance)
        s = self.samples(utterance)[1]
        uttfr = []
        uttph = []
        # each p is (phone_label, start_sample, end_sample); only phones longer
        # than one frame are segmented
        for p in phtimes:
            if p[2] - p[1] > framelen:
                uttfr.append(segment_axis(s[p[1]:p[2]], framelen, overlap))
                uttph.append(list(itertools.repeat(p[0], uttfr[-1].shape[0])))
        return np.vstack(uttfr), list(itertools.chain(*uttph))
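# Usage note (sketch, hypothetical names): on a corpus wrapper exposing
# phone_times() and samples(), e.g.
#     fr, ph = corpus.frames('SA1', framelen=256, overlap=128)
# fr stacks one row per frame and ph is the parallel list of phone labels.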
Example #5
def build_data_sets(frame_len):
    """builds data sets for training/validating/testing the models"""
    
    print 'loading data...'

    save_stdout = sys.stdout
    sys.stdout = open('timit.log', 'w')

    # creating wrapper object for TIMIT dataset
    dataset = TIMIT()
    dataset.load("train")
    
    sys.stdout = save_stdout

    overlap = frame_len - 1

    wav_seqs = dataset.train_raw_wav[0:10]
    norm_seqs = utils.normalize(wav_seqs)
    
    # Segment into frames
    samples = map(lambda seq: segment_axis(seq, frame_len, overlap),
                  norm_seqs)

    # stack all data in one matrix, each row is a frame
    data = np.vstack(samples)
    # shuffle the frames so we can assume data is IID
    np.random.seed(123)
    data = np.random.permutation(data)

    # take 10% for test, 10% for valid, and 80% for training
    chunk = data.shape[0] // 10
    # now split data to x and y for train, valid, and test
    train_x = data[:8*chunk,:-1]
    train_y = data[:8*chunk,-1]
    valid_x = data[8*chunk:9*chunk,:-1]
    valid_y = data[8*chunk:9*chunk,-1]
    test_x = data[9*chunk:,:-1]
    test_y = data[9*chunk:,-1]

    print 'train_x shape', train_x.shape
    print 'train_y shape', train_y.shape

    print 'Done'
    print 'There are %d training samples'%train_x.shape[0]
    print 'There are %d validation samples'%valid_x.shape[0]
    
    return utils.shared_dataset_xy((train_x,train_y)),\
           utils.shared_dataset_xy((valid_x,valid_y)),\
           utils.shared_dataset_xy((test_x,test_y))
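# Hypothetical usage sketch (assumes the TIMIT wrapper class and utils module
# imported by this script): each 241-sample frame is split into 240 input
# samples and a single target sample (the last column of the frame).
# train_set, valid_set, test_set = build_data_sets(frame_len=241)
# each *_set is the (x, y) pair returned by utils.shared_dataset_xy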
Example #7
    def get_markov_frames(self, subset, id):
        """
        Given the subset and an id, this method returns the list [input_frames, 
        input_phonemes, input_words, output_phoneme, output_word, spkr_info, 
        output_frame, ending_phoneme, ending_word]. 
        
        """
        assert subset + "_intervals_seq" in self.__dict__.keys()
        assert id < self.__dict__[subset + "_intervals_seq"][-1]

        n_frames_in = self.__dict__[subset + "_n_frames_in"]
        frame_length = self.__dict__[subset + "_frame_length"]
        overlap = self.__dict__[subset + "_overlap"]
        wav_length = self.__dict__[subset + "_wav_length"]
        intervals_seq = self.__dict__[subset + "_intervals_seq"]

        # Find the acoustic samples sequence we are looking for
        seq_id = np.digitize([id], intervals_seq) - 1
        seq_id = seq_id[0]

        # Find the position in this sequence
        idx_in_seq = id - intervals_seq[seq_id] - (wav_length - frame_length \
                     + overlap)

        # Get the sequence
        wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

        # Get the phonemes
        phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
        phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
        phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
        phn_seq = np.zeros_like(wav_seq)
        # Some timestamps do not correspond to any phoneme, so 0 is
        # the index for "NO_PHONEME" and the other indices are shifted by one
        for (phn_start, phn_end, phn) in phn_start_end:
            phn_seq[phn_start:phn_end] = phn + 1

        # Get the words
        wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
        wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
        wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
        wrd_seq = np.zeros_like(wav_seq)
        # Some timestamps do not correspond to any word, so 0 is
        # the index for "NO_WORD" and the other indices are shifted by one
        for (wrd_start, wrd_end, wrd) in wrd_start_end:
            wrd_seq[wrd_start:wrd_end] = wrd + 1

        # Binary variable announcing the end of the word or phoneme
        end_phn = np.zeros_like(phn_seq)
        end_wrd = np.zeros_like(wrd_seq)

        for i in range(len(phn_seq) - 1):
            if phn_seq[i] != phn_seq[i + 1]:
                end_phn[i] = 1
            if wrd_seq[i] != wrd_seq[i + 1]:
                end_wrd[i] = 1

        end_phn[-1] = 1
        end_wrd[-1] = 1

        # Find the speaker id
        spkr_id = self.__dict__[subset + "_spkr"][seq_id]
        # Find the speaker info
        spkr_info = self.spkrinfo[spkr_id]

        # Pick the selected segment
        padded_wav_seq = np.zeros((wav_length))
        if idx_in_seq < 0:
            padded_wav_seq[-idx_in_seq:] = wav_seq[0:(wav_length + idx_in_seq)]
        else:
            padded_wav_seq = wav_seq[idx_in_seq:(idx_in_seq + wav_length)]

        padded_phn_seq = np.zeros((wav_length))
        if idx_in_seq < 0:
            padded_phn_seq[-idx_in_seq:] = phn_seq[0:(wav_length + idx_in_seq)]
        else:
            padded_phn_seq = phn_seq[idx_in_seq:(idx_in_seq + wav_length)]

        padded_wrd_seq = np.zeros((wav_length))
        if idx_in_seq < 0:
            padded_wrd_seq[-idx_in_seq:] = wrd_seq[0:(wav_length + idx_in_seq)]
        else:
            padded_wrd_seq = wrd_seq[idx_in_seq:(idx_in_seq + wav_length)]

        # Segment into frames
        wav_seq = segment_axis(padded_wav_seq, frame_length, overlap)

        # Take the most occurring phoneme in each frame
        phn_seq = segment_axis(padded_phn_seq, frame_length, overlap)
        phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
        phn_seq = np.asarray(phn_seq, dtype='int')

        # Take the most occurring word in each frame
        wrd_seq = segment_axis(padded_wrd_seq, frame_length, overlap)
        wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
        wrd_seq = np.asarray(wrd_seq, dtype='int')

        # Announce the end if and only if it was announced in the current frame
        end_phn = segment_axis(end_phn, frame_length, overlap)
        end_phn = end_phn.max(axis=1)
        end_wrd = segment_axis(end_wrd, frame_length, overlap)
        end_wrd = end_wrd.max(axis=1)

        # Put names on the output
        input_frames = wav_seq[:-1]
        input_phonemes = phn_seq[:-1]
        input_words = wrd_seq[:-1]
        output_phoneme = phn_seq[-1]
        output_word = wrd_seq[-1]
        output_frame = wav_seq[-1]
        ending_phoneme = end_phn[-1]
        ending_word = end_wrd[-1]

        return [input_frames, input_phonemes, input_words, output_phoneme, \
                output_word, spkr_info, output_frame, ending_phoneme, \
                ending_word]
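# Usage note (sketch, hypothetical call): `subset` names a loaded split and
# `id` indexes a frame position in that split's intervals_seq layout, e.g.
#     item = dataset.get_markov_frames("train", 0)
# item unpacks into [input_frames, input_phonemes, input_words, output_phoneme,
# output_word, spkr_info, output_frame, ending_phoneme, ending_word].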
Example #8
    def get_raw_seq(self, subset, seq_id, frame_length, overlap):
        """
        Given the subset, the sequence id, the frame length and the overlap
        between frames, this method returns a sequence of frames from that
        subset, the associated phoneme and word sequences (each with a binary
        variable indicating a change), and the speaker information vector.
        
        """
        self.check_subset_value(subset)
        self.check_subset_presence(subset)

        # Check if the id is valid
        n_seq = self.__dict__[subset + "_n_seq"]
        if seq_id >= n_seq:
            raise ValueError("This sequence does not exist.")

        # Get the sequence
        wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

        # Get the phonemes
        phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
        phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
        phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
        phn_seq = np.zeros_like(wav_seq)
        # Some timestamp does not correspond to any phoneme so 0 is
        # the index for "NO_PHONEME" and the other index are shifted by one
        for (phn_start, phn_end, phn) in phn_start_end:
            phn_seq[phn_start:phn_end] = phn + 1

        # Get the words
        wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
        wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
        wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
        wrd_seq = np.zeros_like(wav_seq)
        # Some timestamp does not correspond to any word so 0 is
        # the index for "NO_WORD" and the other index are shifted by one
        for (wrd_start, wrd_end, wrd) in wrd_start_end:
            wrd_seq[wrd_start:wrd_end] = wrd + 1

        # Binary variable announcing the end of the word or phoneme
        end_phn = np.zeros_like(phn_seq)
        end_wrd = np.zeros_like(wrd_seq)

        for i in range(len(phn_seq) - 1):
            if phn_seq[i] != phn_seq[i + 1]:
                end_phn[i] = 1
            if wrd_seq[i] != wrd_seq[i + 1]:
                end_wrd[i] = 1

        end_phn[-1] = 1
        end_wrd[-1] = 1

        # Find the speaker id
        spkr_id = self.__dict__[subset + "_spkr"][seq_id]
        # Find the speaker info
        spkr_info = self.spkrinfo[spkr_id]

        # Segment into frames
        wav_seq = segment_axis(wav_seq, frame_length, overlap)

        # Take the most occurring phoneme in a frame
        phn_seq = segment_axis(phn_seq, frame_length, overlap)
        phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
        phn_seq = np.asarray(phn_seq, dtype='int')

        # Take the most occurring word in a frame
        wrd_seq = segment_axis(wrd_seq, frame_length, overlap)
        wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
        wrd_seq = np.asarray(wrd_seq, dtype='int')

        # Announce the end if and only if it was announced in the current frame
        end_phn = segment_axis(end_phn, frame_length, overlap)
        end_phn = end_phn.max(axis=1)
        end_wrd = segment_axis(end_wrd, frame_length, overlap)
        end_wrd = end_wrd.max(axis=1)

        return [wav_seq, phn_seq, end_phn, wrd_seq, end_wrd, spkr_info]
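# Usage note (sketch, hypothetical call on the same TIMIT wrapper):
#     wav_fr, phn_fr, end_phn, wrd_fr, end_wrd, spkr = dataset.get_raw_seq(
#         "train", seq_id=0, frame_length=256, overlap=128)
# wav_fr holds one waveform frame per row; phn_fr and wrd_fr give the dominant
# phone/word label per frame, and end_phn/end_wrd flag boundary frames.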
Example #9
def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients.

    Parameters
    ----------
    input: ndarray
        input from which the coefficients are computed

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients
    mspec: ndarray
        Log-spectrum in the mel-domain.
    spec: ndarray
        Spectrum magnitude.

    Notes
    -----
    MFCCs are computed as follows:
        * Pre-processing in time-domain (pre-emphasis)
        * Compute the spectrum amplitude by windowing with a Hamming window
        * Filter the signal in the spectral domain with a triangular
        filter-bank, whose filters are approximately linearly spaced on the
        mel scale and have equal bandwidth in the mel scale
        * Compute the DCT of the log-spectrum

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""

    # MFCC parameters: taken from auditory toolbox
    over = nwin - 160
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the
    # radiation at the lips level)
    prefac = 0.97

    #lowfreq = 400 / 3.
    lowfreq = 133.33
    #highfreq = 6855.4976
    linsc = 200 / 3.
    logsc = 1.0711703

    nlinfil = 13
    nlogfil = 27
    nfil = nlinfil + nlogfil

    w = hamming(nwin, sym=0)

    fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]

    #------------------
    # Compute the MFCC
    #------------------
    extract = preemp(input, prefac)
    framed = segment_axis(extract, nwin, over) * w

    # Compute the spectrum magnitude
    spec = np.abs(fft(framed, nfft, axis=-1))
    # Filter the spectrum through the triangle filterbank
    mspec = np.log10(np.dot(spec, fbank.T))
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, :nceps]

    return ceps, mspec, spec
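# Hypothetical usage sketch (assumes preemp, trfbank, hamming, segment_axis,
# fft and dct are importable as in the rest of this listing): 13 MFCCs per
# 256-sample frame (160-sample hop) for one second of 16 kHz noise.
import numpy as np
audio = np.random.randn(16000)
ceps, mspec, spec = mfcc(audio, nwin=256, nfft=512, fs=16000, nceps=13)
# ceps: (n_frames, 13) cepstra; mspec: log mel spectrum; spec: magnitude spectrum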
Example #13
def _build_frames_w_phn(dataset, subset, wav_seqs, seqs_to_phns,
                        in_samples, out_samples, shift,
                        win_width, shuffle):
        
    #import pdb; pdb.set_trace()
    norm_seqs = utils.standardize(wav_seqs)
    #norm_seqs = utils.normalize(wav_seqs)
    
    frame_len = in_samples + out_samples
    overlap = frame_len - shift
    
    samples = []
    seqs_phn_info = []
    seqs_phn_shift = []

        
    # CAUTION: the reduced phone set is used here; we could also use the full
    # set, but then we must store phn+1 because 0 would no longer refer to
    # 'h#' (no speech)

    for ind in range(len(norm_seqs)):
        #import pdb; pdb.set_trace()
        wav_seq = norm_seqs[ind]
        phn_seq = seqs_to_phns[ind]
        phn_start_end = dataset.__dict__[subset+"_phn"][phn_seq[0]:phn_seq[1]]

        # create a matrix with consecutive windows
        # phones are padded by h#, because each window will be shifted once
        # the first phone samples has passed

        phones = np.append(phn_start_end[:,2].astype('int16'),
                           np.zeros((1,),dtype='int16'))
        # phones = np.append(phn_start_end[:,2],
        #                    np.zeros((1,)))

        phn_windows = segment_axis(phones, win_width, win_width-1)

        # array that has endings of each phone
        phn_ends = phn_start_end[:,1]
        # extend the last phone till the end; this is not wrong as long as the
        # last phone is the no-speech phone (h#)
        phn_ends[-1] = wav_seq.shape[0]-1

        # create a mapping from each sample to phn_window
        phn_win_shift = np.zeros_like(wav_seq,dtype='int16')
        phn_win_shift[phn_ends] = 1
        phn_win = phn_win_shift.cumsum(dtype='int16')
        # minor correction!
        phn_win[-1] = phn_win[-2]

        # Segment samples into frames
        samples.append(segment_axis(wav_seq, frame_len, overlap))

        # for phones we care only about one value marking the start of a new
        # window. The start of a phone window in a frame is when all samples of
        # the previous phone have passed, so we use 'min' to choose the current
        # phone of the frame
        phn_frames = segment_axis(phn_win, frame_len, overlap).min(axis=1)
        # replace the window index with the window itself
        win_frames = phn_windows[phn_frames]
        seqs_phn_info.append(win_frames)

        #import pdb; pdb.set_trace()
        # create a window shift for each frame
        shift_frames_aux = np.roll(phn_frames,1)
        shift_frames_aux[0] = 0
        shift_frames = phn_frames - shift_frames_aux
        # to mark the ending of the sequence - countering the first correction!
        shift_frames[-1] = 1
        seqs_phn_shift.append(shift_frames)
        #import pdb; pdb.set_trace()
    
        
    #import pdb; pdb.set_trace()
    # stack all data in one matrix, each row is a frame
    samples_data = np.vstack(samples[:])
    phn_data = np.vstack(seqs_phn_info[:])
    shift_data = np.hstack(seqs_phn_shift[:])

    
    #convert phone data to one-hot
    from pylearn2.format.target_format import OneHotFormatter
    fmt = OneHotFormatter(max_labels=39, dtype='float32')
    
    phn_data = fmt.format(phn_data)
    phn_data = phn_data.reshape(phn_data.shape[0],
                                phn_data.shape[1]*phn_data.shape[2])
    
    full_data = np.hstack([samples_data[:,:in_samples], phn_data, #input
                           samples_data[:,in_samples:], #out1
                           shift_data.reshape(shift_data.shape[0],1)]) #out2
    
    if shuffle:
        np.random.seed(123)
        full_data = np.random.permutation(full_data)

    
    data_x = full_data[:,:in_samples+win_width*39]
    data_y1 = full_data[:,in_samples+win_width*39:-1]
    data_y2 = full_data[:,-1]
    
        
    print 'Done'
    print 'There are %d examples in %s set'%(data_x.shape[0],subset)

    print "--------------"
    print 'data_x.shape', data_x.shape
    print 'data_y1.shape', data_y1.shape
    
    return utils.shared_dataset(data_x), \
           utils.shared_dataset(data_y1),\
           utils.shared_dataset(data_y2)
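# Usage note (sketch, hypothetical arguments): wav_seqs and seqs_to_phns would
# come from the TIMIT wrapper, e.g. dataset.train_raw_wav and
# dataset.train_seq_to_phn, giving
#     x, y1, y2 = _build_frames_w_phn(dataset, "train", wav_seqs, seqs_to_phns,
#                                     in_samples=240, out_samples=1, shift=1,
#                                     win_width=2, shuffle=True)
# x stacks the waveform inputs with the one-hot phone windows, y1 holds the
# target samples and y2 the phone-window shift flag.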
Example #14
import numpy as np
from scipy.io import wavfile
from scikits.talkbox import segment_axis
from sklearn.decomposition import SparseCoder
# gammatone_matrix and erb_space (gammatone filterbank helpers) are assumed to
# be defined elsewhere in the original script

resolution = 160
step = 8
b = 1.019
n_channels = 64
overlap = 80

# Compute a multiscale dictionary

D_multi = np.r_[tuple(gammatone_matrix(b, fc, resolution, step) for
                      fc in erb_space(150, 8000, n_channels))]

# Load test signal
fs, y = wavfile.read('/home/jfsantos/data/TIMIT/TRAIN/DR1/FCJF0/SA1.WAV')
y = y / 2.0**15  # scale 16-bit integer samples to [-1, 1)
Y = segment_axis(y, resolution, overlap=overlap, end='pad')
Y = np.hanning(resolution) * Y

# segments should be windowed and overlap

coder = SparseCoder(dictionary=D_multi, transform_n_nonzero_coefs=None,
                    transform_alpha=1., transform_algorithm='omp')
X = coder.transform(Y)
density = len(np.flatnonzero(X))
out = np.zeros(int((np.ceil(len(y) / float(resolution)) + 1) * resolution))
for k in range(0, len(X)):
    idx = range(k * (resolution - overlap), k * (resolution - overlap) + resolution)
    out[idx] += np.dot(X[k], D_multi)
squared_error = np.sum((y - out[0:len(y)]) ** 2)
wavfile.write('reconst_%d_%d.wav' % (resolution, overlap), fs,
              np.asarray(out, dtype=np.float32))
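# Note (sketch): the loop above overlap-adds each reconstructed frame
# np.dot(X[k], D_multi) at its hop position k * (resolution - overlap);
# squared_error is the residual energy against the original signal y.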
Example #15
    def make_frames(self, signal, fs, frame_duration, overlap=0.5):
        # frame_duration is in milliseconds; overlap is a fraction of the frame
        nsamples_pframe = fs * frame_duration / 1000
        overlapframes = nsamples_pframe * overlap
        frames = segment_axis(signal, nsamples_pframe, overlapframes)
        return frames
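# Hypothetical usage sketch: 25 ms frames with 50% overlap at fs = 16000 Hz
# give 400-sample frames overlapping by 200 samples (assumes segment_axis as
# used throughout this listing):
#     frames = feature_extractor.make_frames(signal, fs=16000,
#                                            frame_duration=25, overlap=0.5)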
Example #16
    for fc in erb_space(150, 8000, n_channels)
]).flatten()
centers = np.array([
    gammatone_matrix(b, fc, resolution, step)[2] + i * resolution
    for i, fc in enumerate(erb_space(150, 8000, n_channels))
]).flatten()

# Load test signal
filename = 'data/fsew/fsew0_001.wav'
f = Sndfile(filename, 'r')
nf = f.nframes
fs = f.samplerate
length_sound = 20000
y = f.read_frames(5000)   # read and discard the first 5000 frames
y = f.read_frames(length_sound)
Y = segment_axis(y, resolution, overlap=overlap, end='pad')
Y = np.hanning(resolution) * Y

# Encoding with matching pursuit
X = np.zeros((Y.shape[0], D_multi.shape[0]))
for idx in range(Y.shape[0]):
    X[idx, :] = matching_pursuit(Y[idx, :], D_multi)

# Reconstruction of the signal
out = np.zeros(int((np.ceil(len(y) / resolution) + 1) * resolution))
for k in range(0, len(X)):
    idx = range(k * (resolution - overlap),
                k * (resolution - overlap) + resolution)
    out[idx] += np.dot(X[k], D_multi)
squared_error = np.sum((y - out[0:len(y)])**2)