def seg_data(data):
    """
    Min-max normalize every sequence and cut it into sliding windows.

    Each sequence is rescaled with its own minimum and maximum, then
    segmented with ``segment_axis(sequence, frame_length,
    frame_length - 1)``.  The segmented sequences are stacked along
    axis 0.

    ``frame_length`` and ``segment_axis`` are not parameters; they are
    resolved from the enclosing (module) scope.

    Parameters
    ----------
    data : iterable of numpy.ndarray
        Sequences to normalize and segment.

    Returns
    -------
    numpy.ndarray
        Segments of all sequences concatenated along axis 0.

    Raises
    ------
    ValueError
        If `data` contains no sequence.
    """
    segmented_sequences = []
    for samples_sequence in data:
        # Per-sequence min-max scaling.
        # NOTE(review): a constant sequence makes the denominator zero and
        # yields NaN/inf; this is unchanged from the previous behaviour.
        min_value = np.min(samples_sequence)
        max_value = np.max(samples_sequence)
        samples_sequence = ((samples_sequence - min_value) /
                            (max_value - min_value))
        segmented_sequences.append(
            segment_axis(samples_sequence, frame_length, frame_length - 1)
        )

    if not segmented_sequences:
        raise ValueError('seg_data() needs at least one sequence')

    # A single concatenation avoids re-copying the accumulated result on
    # every iteration, which is what repeated np.append calls do.
    return np.concatenate(segmented_sequences, axis=0)
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None, audio_only=True,
             rng=_default_seed):
    """
    Load the requested split, standardize it and cut every sequence into
    frames from which (features, target) examples are drawn.

    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive frames.
        Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`, meaning
        sequences are selected all the way to the end of the array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary information.
        Defaults to `True`.
    rng : object, optional
        A random number generator used for picking random indices into the
        design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only
    self.topo = None

    # RNG initialization: accept a ready-made generator, otherwise treat
    # `rng` as a seed.
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk (expected to set self.raw_wav and, unless
    # audio_only, self.phones / self.phonemes / self.words).
    self._load_data(which_set)

    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    # Label counts are taken over the whole split, before slicing.
    if not self.audio_only:
        self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                     in self.phones]) + 1
        self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                       in self.phonemes]) + 1
        self.num_words = numpy.max([numpy.max(sequence) for sequence
                                    in self.words]) + 1

    # Slice data; `stop=None` selects everything from `start` onwards.
    self.raw_wav = self.raw_wav[start:stop]
    if not self.audio_only:
        self.phones = self.phones[start:stop]
        self.phonemes = self.phonemes[start:stop]
        self.words = self.words[start:stop]

    examples_per_sequence = [0]

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            # Phones segmentation
            self.phones[sequence_id] = segment_axis(
                self.phones[sequence_id], frame_length, overlap
            )
            # Phonemes segmentation
            self.phonemes[sequence_id] = segment_axis(
                self.phonemes[sequence_id], frame_length, overlap
            )
            # Words segmentation
            self.words[sequence_id] = segment_axis(
                self.words[sequence_id], frame_length, overlap
            )

        # TODO: look at this, does it force copying the data?
        # Sequence segmentation
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length,
                                                  overlap)
        self.raw_wav[sequence_id] = samples_segmented_sequence

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map.
        # An example needs `frames_per_example` feature frames plus one
        # target frame; a sequence that is too short contributes zero
        # examples instead of a negative count, which would make the
        # cumulative indexes non-monotonic.
        num_frames = samples_segmented_sequence.shape[0]
        num_examples = max(num_frames - self.frames_per_example, 0)
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    if not self.audio_only:
        self.phones_sequences = self.phones
        self.phonemes_sequences = self.phonemes
        self.words_sequences = self.words
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(
        dim=self.frame_length * self.frames_per_example
    )
    features_source = 'features'

    def features_map_fn(indexes):
        # `frames_per_example` consecutive frames, flattened.
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index:example_index + self.frames_per_example
            ].ravel())
        return rval

    targets_space = VectorSpace(dim=self.frame_length)
    targets_source = 'targets'

    def targets_map_fn(indexes):
        # The frame immediately following the feature frames.
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example
            ].ravel())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        phones_space = IndexSpace(
            max_labels=self.num_phones,
            dim=1,
            dtype=str(self.phones_sequences[0].dtype)
        )
        phones_source = 'phones'

        def phones_map_fn(indexes):
            rval = []
            for sequence_index, example_index \
                    in self._fetch_index(indexes):
                rval.append(self.phones_sequences[sequence_index][
                    example_index + self.frames_per_example
                ].ravel())
            return rval

        phonemes_space = IndexSpace(
            max_labels=self.num_phonemes,
            dim=1,
            dtype=str(self.phonemes_sequences[0].dtype)
        )
        phonemes_source = 'phonemes'

        def phonemes_map_fn(indexes):
            rval = []
            for sequence_index, example_index \
                    in self._fetch_index(indexes):
                rval.append(self.phonemes_sequences[sequence_index][
                    example_index + self.frames_per_example
                ].ravel())
            return rval

        words_space = IndexSpace(
            max_labels=self.num_words,
            dim=1,
            dtype=str(self.words_sequences[0].dtype)
        )
        words_source = 'words'

        def words_map_fn(indexes):
            rval = []
            for sequence_index, example_index \
                    in self._fetch_index(indexes):
                rval.append(self.words_sequences[sequence_index][
                    example_index + self.frames_per_example
                ].ravel())
            return rval

        space_components.extend([phones_space, phonemes_space,
                                 words_space])
        source_components.extend([phones_source, phonemes_source,
                                  words_source])
        map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                  words_map_fn])
        batch_components.extend([None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
def __init__(self, which_set, frame_length, start=0, stop=None,
             audio_only=False, rng=_default_seed):
    """
    Load the requested split and expose every utterance as one sequence
    of sliding windows paired with next-sample targets.

    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in the sliding window
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`, meaning
        sequences are selected all the way to the end of the array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary information.
        Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices into the
        design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.audio_only = audio_only

    # Use `rng` directly when it already behaves like a RandomState,
    # otherwise interpret it as a seed.
    self.rng = (rng if hasattr(rng, 'random_integers')
                else numpy.random.RandomState(rng))

    # Read the split from disk, then standardize each waveform.
    self._load_data(which_set)
    for position, waveform in enumerate(self.raw_wav):
        self.raw_wav[position] = (waveform - TIMIT._mean) / TIMIT._std

    with_labels = not self.audio_only

    # Label vocabulary sizes are measured on the full split, before the
    # [start:stop] selection below.
    if with_labels:
        self.num_phones = numpy.max(
            [numpy.max(labels) for labels in self.phones]) + 1
        self.num_phonemes = numpy.max(
            [numpy.max(labels) for labels in self.phonemes]) + 1
        self.num_words = numpy.max(
            [numpy.max(labels) for labels in self.words]) + 1

    # Keep only the requested range of sequences.
    selection = slice(start, stop)
    self.raw_wav = self.raw_wav[selection]
    if with_labels:
        self.phones = self.phones[selection]
        self.phonemes = self.phonemes[selection]
        self.words = self.words[selection]

    def as_column(vector):
        # Turn a length-n vector into an (n, 1) array.
        return vector.reshape((vector.shape[0], 1))

    feature_seqs = []
    target_seqs = []
    phone_seqs = []
    phoneme_seqs = []
    word_seqs = []

    for seq_id, waveform in enumerate(self.raw_wav):
        # Windows advance by one sample; the last window is dropped
        # because no target sample follows it.
        windows = segment_axis(waveform, frame_length, frame_length - 1)
        feature_seqs.append(windows[:-1])
        target_seqs.append(as_column(waveform[frame_length:]))

        if with_labels:
            phone_seqs.append(
                as_column(self.phones[seq_id][frame_length:]))
            phoneme_seqs.append(
                as_column(self.phonemes[seq_id][frame_length:]))
            word_seqs.append(
                as_column(self.words[seq_id][frame_length:]))

    # The raw arrays are no longer needed once segmented.
    del self.raw_wav
    self.samples_sequences = feature_seqs
    self.targets_sequences = target_seqs
    self.data = [feature_seqs, target_seqs]

    if with_labels:
        del self.phones
        del self.phonemes
        del self.words
        self.phones_sequences = phone_seqs
        self.phonemes_sequences = phoneme_seqs
        self.words_sequences = word_seqs
        self.data.extend([phone_seqs, phoneme_seqs, word_seqs])

    self.num_examples = len(feature_seqs)

    # DataSpecs
    features_source = 'features'
    targets_source = 'targets'
    features_space = VectorSequenceSpace(dim=self.frame_length)
    targets_space = VectorSequenceSpace(dim=1)

    spaces = [features_space, targets_space]
    sources = [features_source, targets_source]
    buffers = [None, None]

    if with_labels:
        phones_space = IndexSequenceSpace(
            max_labels=self.num_phones,
            dim=1,
            dtype=str(self.phones_sequences[0].dtype)
        )
        phonemes_space = IndexSequenceSpace(
            max_labels=self.num_phonemes,
            dim=1,
            dtype=str(self.phonemes_sequences[0].dtype)
        )
        words_space = IndexSequenceSpace(
            max_labels=self.num_words,
            dim=1,
            dtype=str(self.words_sequences[0].dtype)
        )
        spaces += [phones_space, phonemes_space, words_space]
        sources += ['phones', 'phonemes', 'words']
        buffers += [None, None, None]

    self.data_specs = (CompositeSpace(spaces), tuple(sources))
    self.batch_buffers = buffers

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (
        CompositeSpace((features_space, targets_space)),
        (features_source, targets_source),
    )
def load_data_timit_seq(which_set='train', start=0, stop=None, lend=np.inf,
                        wavtype=None, slice='N', normtype='std', ndata=10000,
                        rand='N'):
    """
    Build a 2-D dataset of equal-length waveforms and wrap it in a Theano
    shared variable.

    With ``wavtype == 'timit'`` the sequences are loaded from disk, cut or
    zero-padded to `lend` samples and normalized.  Any other `wavtype`
    selects a synthetic waveform; an unrecognized name (including the
    default `None`) yields an all-zero array.

    Parameters
    ----------
    which_set : str, optional
        "train", "valid" or "test".  Validated only for
        ``wavtype == 'timit'``; also forwarded to the 'chirps' generator.
    start : int, optional
        First loaded sequence to keep (TIMIT only).
    stop : int, optional
        One past the last loaded sequence to keep (TIMIT only).  `None`
        keeps everything from `start` on.
    lend : int, optional
        Number of samples per output row.  It is used as an array
        dimension, so a finite integer must be supplied in practice.
    wavtype : str, optional
        'timit' or the name of a synthetic waveform ('sin', 'cos',
        'rect', 'noise', ...).
    slice : str, optional
        TIMIT only.  'Y' cuts every sequence into windows of `lend`
        samples with ``lend // 2`` passed to `segment_axis` as overlap;
        'N' keeps one row per sequence, truncated or zero-padded.
    normtype : str, optional
        TIMIT only.  'std', 'sigm' or 'tanh'; any other value leaves the
        rows unnormalized.
    ndata : int, optional
        Number of rows for the synthetic waveforms.
    rand : str, optional
        TIMIT only.  'Y' shuffles the loaded sequences before the
        ``[start:stop]`` selection.

    Returns
    -------
    theano shared variable
        Holds the dataset cast to ``theano.config.floatX``.

    Raises
    ------
    ValueError
        If ``wavtype == 'timit'`` and `which_set` or `slice` is invalid.
    """

    def preproc(data, normalize='std'):
        """Normalize every row of `data` independently."""
        normalized = np.zeros(data.shape)
        for ind, row in enumerate(data):
            if normalize == 'std':
                # Standardize with the module-level statistics.
                row = (row - _mean) / _std
            elif normalize == 'sigm':
                # Rescale to [0, 1] for sigmoid outputs.
                min_value = row.min()
                max_value = row.max()
                row = (row - min_value) / (max_value - min_value)
            elif normalize == 'tanh':
                # Rescale to [-1, 1] for tanh outputs.
                min_value = row.min()
                max_value = row.max()
                row = ((2 * row - max_value - min_value) /
                       (max_value - min_value))
            normalized[ind, :] = row
        return normalized

    if wavtype == 'timit':
        datadir = '/data/lisa/exp/kimtaeho/speech_synthesis/datasets/'

        # Load the dataset
        if which_set not in ('test', 'train', 'valid'):
            # The exception used to be constructed without being raised.
            raise ValueError('Invalid which_set')
        dataset = np.load(datadir + which_set + '_wav_aa.npy')

        if rand == 'Y':
            np.random.shuffle(dataset)
        dataset = dataset[start:stop]
        # NOTE(review): the selected sequences are shuffled regardless of
        # `rand`; confirm that this is intended.
        np.random.shuffle(dataset)

        if slice == 'Y':
            # Floor division keeps the overlap an integer on Python 3.
            ov = lend // 2
            nframes = 0
            for seq in dataset:
                if seq.shape[0] <= lend:
                    nframes += 1
                else:
                    # int() keeps the row count usable as an array shape.
                    nframes += int(
                        np.ceil(float(seq.shape[0] - lend) / ov)) + 1
            ndataset = np.zeros((nframes, lend))
            print('nframes ' + str(nframes))
            ind = 0
            for seq in dataset:
                seg_seq = segment_axis(seq, lend, ov)
                ndataset[ind:ind + seg_seq.shape[0], :] = seg_seq
                ind += seg_seq.shape[0]
        elif slice == 'N':
            # Crop or zero-pad so that all sequences have the same length.
            ndataset = np.zeros((dataset.shape[0], lend))
            for ind, seq in enumerate(dataset):
                if seq.shape[0] < lend:
                    ndataset[ind, :seq.shape[0]] = seq
                else:
                    ndataset[ind, :] = seq[:lend]
        else:
            raise ValueError("slice must be 'Y' or 'N'")

        ndataset = preproc(data=ndataset, normalize=normtype)
    else:
        ndataset = np.zeros((ndata, lend))
        ramp_template = np.arange(ndataset.shape[1])
        sin_template = np.sin(ramp_template / 5.)
        sin_template2 = np.sin(ramp_template / 10.)
        sin_template3 = np.sin(ramp_template / 100.)
        sin_template4 = np.sin(ramp_template / 3.)
        cos_template = np.cos(ramp_template / 10.)
        # Square wave with the sign of sin_template, built as a new array
        # so that sin_template itself stays untouched.
        rect_template = np.where(sin_template > 0, 1., -1.)

        if wavtype == 'sin':
            ndataset = np.tile(sin_template, (ndata, 1))
        elif wavtype == 'chirps':
            from chirps2D import CHIRPS
            ndataset = CHIRPS(
                which_set=which_set,
                nexamples=ndata,
                nvis=lend,
                total_length=lend).X
        elif wavtype == 'sin2':
            ndataset = np.tile(sin_template2, (ndata, 1))
        elif wavtype == 'slowsin':
            ndataset = np.tile(sin_template3, (ndata, 1))
        elif wavtype == 'fastsin':
            ndataset = np.tile(sin_template4, (ndata, 1))
        elif wavtype == 'varsin':
            ndataset = np.tile(sin_template, (ndata, 1))
            ndataset[::2, :] = sin_template2
        elif wavtype == 'varsin2':
            ndataset = np.tile(ramp_template, (ndata, 1))
            randnum = np.random.normal(5, 10, (ndata, 1))
            ndataset = np.sin(ndataset / randnum)
        elif wavtype == 'cos':
            ndataset = np.tile(cos_template, (ndata, 1))
        elif wavtype == 'comp':
            ndataset = np.tile(0.5 * sin_template + 0.5 * cos_template,
                               (ndata, 1))
        elif wavtype == 'sumsin2':
            ndataset = np.tile(
                0.3 * sin_template + 0.3 * sin_template2 + sin_template3,
                (ndata, 1))
        elif wavtype == 'sumsin3':
            ndataset = np.tile(
                0.3 * sin_template + 0.3 * sin_template2 +
                0.3 * sin_template3,
                (ndata, 1))
        elif wavtype == 'sumsin4':
            ndataset = np.tile(
                0.3 * sin_template + 0.3 * sin_template2 +
                0.3 * sin_template4,
                (ndata, 1))
        elif wavtype == 'mulsin1':
            ndataset = np.tile(sin_template * sin_template3, (ndata, 1))
        elif wavtype == 'rect':
            ndataset = np.tile(rect_template, (ndata, 1))
        elif wavtype == 'line':
            ndataset = np.zeros(ndataset.shape)
        elif wavtype == 'line2':
            ndataset = np.zeros(ndataset.shape) + 0.5
        elif wavtype == 'mm':
            ndataset = np.zeros(ndataset.shape)
            ndataset[::2, :] = sin_template
        elif wavtype == 'blob':
            x = np.linspace(-5, 5, lend)
            mu = 0
            sig = 1
            # Gaussian bump; the variance term belongs in the denominator.
            gau = np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))
            ndataset = np.tile(gau, (ndata, 1))
        elif wavtype == 'rectsin':
            # Odd rows stay sinusoidal, even rows are rectangular.
            ndataset = np.tile(sin_template, (ndata, 1))
            ndataset[::2, :] = rect_template
        elif wavtype == 'noise':
            ndataset = np.random.normal(np.zeros(ndataset.shape), 1.)
        elif wavtype == 'linsum':
            ndataset = np.tile(ramp_template, (ndata, 1))
            randnum = np.random.uniform(5, 10, (ndata, 1))
            ndataset = np.sin(ndataset / randnum)
            multcoef = np.random.uniform(0, 1, (ndata, 1))
            # NOTE(review): summing over the rows collapses the dataset to
            # a single waveform of length `lend`; confirm this is wanted.
            ndataset = sum(ndataset * multcoef)

    thdataset = theano.shared(
        np.asarray(ndataset, dtype=theano.config.floatX), borrow=True)
    return thdataset