Example 1
    # Assumes numpy as np, a segment_axis helper and frame_length are
    # available in the enclosing scope.
    def seg_data(data):
        first = False
        for samples_sequence in data:
            # Standardize
            #samples_sequence = (samples_sequence - _mean) / _std

            # Min-max scale each sequence into [0, 1]
            min_value = min(samples_sequence)
            max_value = max(samples_sequence)
            samples_sequence = (samples_sequence - min_value) / (max_value - min_value)

            # overlap = frame_length - 1 gives a one-sample hop
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      frame_length - 1)
            if not first:
                samples_sequences = samples_segmented_sequence
                first = True
            else:
                samples_sequences = np.append(samples_sequences,
                                              samples_segmented_sequence,
                                              axis=0)
        return samples_sequences
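
For reference, the call above uses overlap = frame_length - 1, i.e. a one-sample hop, so a sequence of length N produces N - frame_length + 1 heavily overlapping frames. Below is a minimal sketch of the assumed segment_axis semantics; sliding_frames is a stand-in written for illustration, since the real helper is not shown in the snippet:

    import numpy as np

    def sliding_frames(x, frame_length, overlap):
        # Stand-in for segment_axis: hop = frame_length - overlap
        hop = frame_length - overlap
        n_frames = (len(x) - frame_length) // hop + 1
        return np.stack([x[i * hop:i * hop + frame_length]
                         for i in range(n_frames)])

    print(sliding_frames(np.arange(6), 3, 2))
    # [[0 1 2]
    #  [1 2 3]
    #  [2 3 4]
    #  [3 4 5]]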
Example 2
    def __init__(self, which_set, frame_length, overlap=0,
                 frames_per_example=1, start=0, stop=None, audio_only=True,
                 rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `True`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap  # hop between frame starts
        self.audio_only = audio_only
        self.topo = None
        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                         in self.phones]) + 1
            self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                           in self.phonemes]) + 1
            self.num_words = numpy.max([numpy.max(sequence) for sequence
                                        in self.words]) + 1

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]

        examples_per_sequence = [0]

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            if not self.audio_only:
                # Phones segmentation
                phones_sequence = self.phones[sequence_id]
                phones_segmented_sequence = segment_axis(phones_sequence,
                                                         frame_length,
                                                         overlap)
                self.phones[sequence_id] = phones_segmented_sequence
                # phones_segmented_sequence = scipy.stats.mode(
                #     phones_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phones_segmented_sequence = numpy.asarray(
                #     phones_segmented_sequence,
                #     dtype='int'
                # )
                # phones_sequence_list.append(phones_segmented_sequence)
                # Phonemes segmentation
                phonemes_sequence = self.phonemes[sequence_id]
                phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                           frame_length,
                                                           overlap)
                self.phonemes[sequence_id] = phonemes_segmented_sequence
                # phonemes_segmented_sequence = scipy.stats.mode(
                #     phonemes_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phonemes_segmented_sequence = numpy.asarray(
                #     phonemes_segmented_sequence,
                #     dtype='int'
                # )
                # phonemes_sequence_list.append(phonemes_segmented_sequence)
                # Words segmentation
                words_sequence = self.words[sequence_id]
                words_segmented_sequence = segment_axis(words_sequence,
                                                        frame_length,
                                                        overlap)
                self.words[sequence_id] = words_segmented_sequence
                # words_segmented_sequence = scipy.stats.mode(
                #     words_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # words_segmented_sequence = numpy.asarray(words_segmented_sequence,
                #                                          dtype='int')
                # words_sequence_list.append(words_segmented_sequence)

            # TODO: look at this, does it force copying the data?
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      overlap)
            self.raw_wav[sequence_id] = samples_segmented_sequence

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
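            # Each example takes frames_per_example consecutive frames as
            # features and the following frame as target, so the last
            # frames_per_example frames cannot start an example.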
            num_frames = samples_segmented_sequence.shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(
            dim=self.frame_length * self.frames_per_example
        )
        features_source = 'features'
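        # The map functions below turn flat example indexes into
        # (sequence, offset) pairs via self._fetch_index (not shown in
        # this snippet) and slice out the corresponding frames.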
        def features_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][
                    example_index:example_index + self.frames_per_example
                ].ravel())
            return rval

        targets_space = VectorSpace(dim=self.frame_length)
        targets_source = 'targets'
        def targets_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][
                    example_index + self.frames_per_example
                ].ravel())
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSpace(max_labels=self.num_phones, dim=1,
                                      dtype=str(self.phones_sequences[0].dtype))
            phones_source = 'phones'
            def phones_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phones_sequences[sequence_index][
                        example_index + self.frames_per_example
                    ].ravel())
                return rval

            phonemes_space = IndexSpace(max_labels=self.num_phonemes, dim=1,
                                        dtype=str(self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'
            def phonemes_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phonemes_sequences[sequence_index][
                        example_index + self.frames_per_example
                    ].ravel())
                return rval

            words_space = IndexSpace(max_labels=self.num_words, dim=1,
                                     dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'
            def words_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.words_sequences[sequence_index][
                        example_index + self.frames_per_example
                    ].ravel())
                return rval

            space_components.extend([phones_space, phonemes_space,
                                     words_space])
            source_components.extend([phones_source, phonemes_source,
                                     words_source])
            map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                     words_map_fn])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
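
The class spreads examples across sequences and addresses them through cumulative_example_indexes. The snippet never shows _fetch_index, so the following is an assumed, self-contained reconstruction of that lookup, written for illustration only:

    import numpy as np

    examples_per_sequence = [0, 5, 3, 7]           # toy per-sequence counts
    cumulative = np.cumsum(examples_per_sequence)  # array([ 0,  5,  8, 15])

    def fetch_index(flat_index):
        # Locate the sequence whose cumulative range contains flat_index,
        # then the example's offset inside that sequence.
        sequence_index = np.searchsorted(cumulative, flat_index,
                                         side='right') - 1
        example_index = flat_index - cumulative[sequence_index]
        return int(sequence_index), int(example_index)

    print(fetch_index(0))   # (0, 0): first example of sequence 0
    print(fetch_index(5))   # (1, 0): first example of sequence 1
    print(fetch_index(14))  # (2, 6): last of the 15 examples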
Example 3
    def __init__(self, which_set, frame_length, start=0, stop=None,
                 audio_only=False, rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in the sliding window
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                         in self.phones]) + 1
            self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                           in self.phonemes]) + 1
            self.num_words = numpy.max([numpy.max(sequence) for sequence
                                        in self.words]) + 1

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]

        samples_sequences = []
        targets_sequences = []
        phones_sequences = []
        phonemes_sequences = []
        words_sequences = []
        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            # Sequence segmentation
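            # A one-sample hop yields len(seq) - frame_length + 1 frames;
            # [:-1] drops the last frame, whose next-sample target would
            # fall past the end of the sequence.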
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      frame_length - 1)[:-1]
            samples_sequences.append(samples_segmented_sequence)
            targets_sequences.append(samples_sequence[frame_length:].reshape(
                (samples_sequence[frame_length:].shape[0], 1)
            ))
            if not self.audio_only:
                target_phones = self.phones[sequence_id][frame_length:]
                phones_sequences.append(target_phones.reshape(
                    (target_phones.shape[0], 1)
                ))
                target_phonemes = self.phonemes[sequence_id][frame_length:]
                phonemes_sequences.append(target_phonemes.reshape(
                    (target_phonemes.shape[0], 1)
                ))
                target_words = self.words[sequence_id][frame_length:]
                words_sequences.append(target_words.reshape(
                    (target_words.shape[0], 1)
                ))

        del self.raw_wav
        self.samples_sequences = samples_sequences
        self.targets_sequences = targets_sequences
        self.data = [samples_sequences, targets_sequences]
        if not self.audio_only:
            del self.phones
            del self.phonemes
            del self.words
            self.phones_sequences = phones_sequences
            self.phonemes_sequences = phonemes_sequences
            self.words_sequences = words_sequences
            self.data.extend([phones_sequences, phonemes_sequences,
                              words_sequences])
        self.num_examples = len(samples_sequences)

        # DataSpecs
        features_space = VectorSequenceSpace(dim=self.frame_length)
        features_source = 'features'

        targets_space = VectorSequenceSpace(dim=1)
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSequenceSpace(
                max_labels=self.num_phones,
                dim=1,
                dtype=str(self.phones_sequences[0].dtype)
            )
            phones_source = 'phones'

            phonemes_space = IndexSequenceSpace(
                max_labels=self.num_phonemes,
                dim=1,
                dtype=str(self.phonemes_sequences[0].dtype)
            )
            phonemes_source = 'phonemes'

            words_space = IndexSequenceSpace(
                max_labels=self.num_words,
                dim=1,
                dtype=str(self.words_sequences[0].dtype)
            )
            words_source = 'words'

            space_components.extend([phones_space, phonemes_space,
                                     words_space])
            source_components.extend([phones_source, phonemes_source,
                                     words_source])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
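
This variant frames the task as next-sample prediction: the features at each step are the previous frame_length samples and the target is the sample that follows. A minimal stand-alone numpy sketch of the same alignment (illustration, not the class's own code):

    import numpy as np

    x = np.arange(10.0)  # toy waveform
    frame_length = 3
    feats = np.stack([x[i:i + frame_length]
                      for i in range(len(x) - frame_length)])
    targets = x[frame_length:].reshape(-1, 1)
    # feats[i] holds x[i:i + frame_length]; targets[i] is x[i + frame_length]
    assert feats.shape == (7, 3) and targets.shape == (7, 1)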
Example 4
# NOTE: the `slice` parameter shadows the built-in name
def load_data_timit_seq(which_set='train', start=0, stop=None, lend=np.inf,
                        wavtype=None, slice='N', normtype='std', ndata=10000,
                        rand='N'):

    def preproc(data, normalize='std'):
        # _mean and _std are assumed to be module-level statistics
        # defined elsewhere.
        samples_sequences = np.zeros(data.shape)
        for ind, samples_sequence in enumerate(data):
            if normalize == 'std':
                # Standardize (zero mean, unit variance)
                samples_sequence = (samples_sequence - _mean) / _std
            elif normalize == 'sigm':
                # Min-max scale into [0, 1] (sigmoid output range)
                min_value = samples_sequence.min()
                max_value = samples_sequence.max()
                samples_sequence = (samples_sequence - min_value) / (max_value - min_value)
            elif normalize == 'tanh':
                # Min-max scale into [-1, 1] (tanh output range)
                min_value = samples_sequence.min()
                max_value = samples_sequence.max()
                samples_sequence = ((2 * samples_sequence - max_value -
                                     min_value) / (max_value - min_value))
            samples_sequences[ind, :] = samples_sequence
        return samples_sequences
    
    if wavtype == 'timit':
        datadir = '/data/lisa/exp/kimtaeho/speech_synthesis/datasets/'
        # Load the dataset
        if which_set in ['test', 'train', 'valid']:
            dataset = np.load(datadir + which_set + '_wav_aa.npy')
        else:
            raise ValueError('Invalid which_set')
        if rand == 'Y':
            # Shuffle before slicing so the slice is a random subset
            np.random.shuffle(dataset)
        dataset = dataset[start:stop]
        # NOTE: this second shuffle reorders the slice even when rand == 'N'
        np.random.shuffle(dataset)
        
        if slice == 'Y':
            ov = lend // 2  # consecutive slices overlap by half a window
            nframes = 0
            for seq in dataset:
                if seq.shape[0] <= lend:
                    nframes += 1
                else:
                    nframes += int(np.ceil(float(seq.shape[0] - lend) / ov)) + 1
            ndataset = np.zeros((nframes, lend))
            print('nframes ' + str(nframes))
            ind = 0
            for seq in dataset:
                seg_seq = segment_axis(seq, lend, ov)
                ndataset[ind:ind + seg_seq.shape[0], :] = seg_seq
                ind += seg_seq.shape[0]
        elif slice == 'N':
            ndataset = np.zeros((dataset.shape[0], lend))
            for ind, seq in enumerate(dataset):
                if seq.shape[0] < lend:
                    # Zero-pad sequences shorter than the window
                    ndataset[ind, :seq.shape[0]] = seq
                else:
                    ndataset[ind, :] = seq[:lend]
        ndataset = preproc(data=ndataset, normalize=normtype)
    
    else:
        # Generate synthetic waveforms, all of length `lend` (a finite
        # lend must be passed for these wavtypes)
        ndataset = np.zeros((ndata, lend))
        ramp_template = np.arange(ndataset.shape[1])
        sin_template = 1 * np.sin(ramp_template / 5.)
        sin_template2 = 1 * np.sin(ramp_template / 10.)
        sin_template3 = 1 * np.sin(ramp_template / 100.)
        sin_template4 = 1 * np.sin(ramp_template / 3.)
        cos_template = 1 * np.cos(ramp_template / 10.)
    
        if wavtype == 'sin':
            ndataset = np.tile(sin_template, (ndata, 1))
        elif wavtype == 'chirps':
            from chirps2D import CHIRPS
            ndataset = CHIRPS(
                which_set=which_set,
                nexamples=ndata,
                nvis=lend, total_length=lend).X
        elif wavtype == 'sin2':
            ndataset = np.tile(sin_template2, (ndata, 1))
        elif wavtype == 'slowsin':
            slowsin = np.sin(ramp_template / 100.)
            ndataset = np.tile(slowsin, (ndata, 1))
        elif wavtype == 'fastsin':
            fastsin = np.sin(ramp_template / 3.)
            ndataset = np.tile(fastsin, (ndata, 1))
        elif wavtype=='varsin':
            ndataset = np.tile(sin_template,(ndata,1))
            ndataset[::2,:] =sin_template2
        elif wavtype=='varsin2':
            ndataset = np.tile(ramp_template,(ndata,1))
            randnum = np.random.normal(5,10,(ndata,1))
            ndataset = np.sin(ndataset/randnum)
        elif wavtype=='cos':
            ndataset = np.tile(cos_template,(ndata,1))
        elif wavtype=='comp':
            ndataset = np.tile(0.5*sin_template+0.5*cos_template,(ndata,1))
        elif wavtype=='sumsin2':
            ndataset = np.tile(0.3*sin_template+0.3*sin_template2+sin_template3,(ndata,1))
        elif wavtype=='sumsin3':
            ndataset = np.tile(0.3*sin_template+0.3*sin_template2+0.3*sin_template3,(ndata,1))
        elif wavtype=='sumsin4':
            ndataset = np.tile(0.3*sin_template+0.3*sin_template2+0.3*sin_template4,(ndata,1))
        elif wavtype=='mulsin1':
            ndataset = np.tile(sin_template*sin_template3,(ndata,1))
        elif wavtype == 'rect':
            # copy() so the rectification does not mutate sin_template
            rect_template = sin_template.copy()
            rect_template[rect_template > 0] = 1
            rect_template[rect_template <= 0] = -1
            ndataset = np.tile(rect_template, (ndata, 1))
        elif wavtype=='line':
            ndataset = np.zeros(ndataset.shape)
        elif wavtype=='line2':
            ndataset = np.zeros(ndataset.shape)+0.5
        elif wavtype=='mm':
            ndataset = np.zeros(ndataset.shape)
            ndataset[::2,:] = sin_template
        elif wavtype == 'blob':
            x = np.linspace(-5, 5, lend)
            mu = 0
            sig = 1
            # Gaussian bump; parenthesize so the variance divides correctly
            gau = np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))
            ndataset = np.tile(gau, (ndata, 1))
        elif wavtype == 'rectsin':
            # copy() so rectifying does not also overwrite sin_template,
            # which is tiled as the other half of the dataset below
            rect_template = sin_template.copy()
            rect_template[rect_template > 0] = 1
            rect_template[rect_template <= 0] = -1
            ndataset = np.tile(sin_template, (ndata, 1))
            ndataset[::2, :] = rect_template
        elif wavtype=='noise':
            ndataset = np.zeros(ndataset.shape)
            ndataset = np.random.normal(ndataset,1.) 
        elif wavtype == 'linsum':
            ndataset = np.tile(ramp_template, (ndata, 1))
            randnum = np.random.uniform(5, 10, (ndata, 1))
            ndataset = np.sin(ndataset / randnum)
            multcoef = np.random.uniform(0, 1, (ndata, 1))
            # NOTE: sum() over rows collapses this to a single 1-D
            # sequence of length lend
            ndataset = sum(ndataset * multcoef)
    thdataset = theano.shared(np.asarray(ndataset, dtype=theano.config.floatX),
                              borrow=True)
    if 'dataset' in locals():
        del dataset
    return thdataset
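
A hypothetical call using one of the synthetic waveform types, so no TIMIT files are needed (the argument values are illustrative; theano, np and segment_axis must already be importable for the function itself to run):

    train_x = load_data_timit_seq(which_set='train', lend=200,
                                  wavtype='sin', ndata=64)
    # train_x is a Theano shared variable wrapping a (64, 200) array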