def create_input_space(self):
    """
    Build the composite input space of the model.

    Returns
    -------
    CompositeSpace
        Three components, in order: an `IndexSpace` of
        ``2 * self.ws + 1`` indices with `self.vocab_size` labels, an
        `IndexSpace` of `self.feat_num` indices with `self.total_feats`
        labels, and a `VectorSpace` of dimension
        ``self.extender_dim * (2 * self.ws + 1)``.
    """
    window = 2 * self.ws + 1
    token_space = IndexSpace(max_labels=self.vocab_size, dim=window)
    feature_space = IndexSpace(max_labels=self.total_feats,
                               dim=self.feat_num)
    extender_space = VectorSpace(dim=self.extender_dim * window)
    return CompositeSpace([token_space, feature_space, extender_space])
def set_topological_view(self, V, axes=('b', 0, 1, 'c')):
    """
    Make the dataset represent `V`, a batch of topological views.

    A `DefaultViewConverter` is built from the per-example shape of `V`,
    `self.X` is replaced by the design matrix the converter produces,
    and the data specs are rebuilt. When targets are present the specs
    also include a 'latents' source sized from `self.latent`.

    .. todo::

        Why is this parameter named 'V'?

    Parameters
    ----------
    V : ndarray
        Batch of examples in topological format, laid out according to
        `axes`. Must not contain NaN.
    axes : tuple, optional
        Axis labels of `V`; the entries 0, 1 and 'c' are looked up to
        find the rows, columns and channels axes.
    """
    assert not contains_nan(V)

    example_shape = [V.shape[axes.index(label)] for label in (0, 1, 'c')]
    converter = DefaultViewConverter(example_shape, axes=axes)
    self.view_converter = converter
    self.X = converter.topo_view_to_design_mat(V)

    # Default topological space; only used when self.iterator is called
    # without data_specs and with the deprecated "topo=True".
    self.X_topo_space = converter.topo_space
    assert not contains_nan(self.X)

    # Rebuild the data specs.
    features_space = VectorSpace(dim=self.X.shape[1])
    features_source = 'features'

    if self.y is None:
        specs = (features_space, features_source)
    else:
        target_dim = 1 if self.y.ndim == 1 else self.y.shape[-1]

        # getattr is used to support old pickled models that may lack
        # these attributes.
        label_count = getattr(self, 'y_labels', None)
        if label_count is None:
            label_count = getattr(self, 'max_labels', None)

        if label_count is None:
            targets_space = VectorSpace(dim=target_dim)
        else:
            targets_space = IndexSpace(dim=target_dim,
                                       max_labels=label_count)

        latents_space = VectorSpace(dim=self.latent.shape[-1])
        specs = (
            CompositeSpace((features_space, targets_space, latents_space)),
            (features_source, 'targets', 'latents'),
        )

    self.data_specs = specs
    self.X_space = features_space
    self._iter_data_specs = (features_space, features_source)
def set_spaces(self, dataset, dim_X, dim_Y, m):
    """
    Attach index-valued feature/target spaces and data specs to `dataset`.

    Parameters
    ----------
    dataset : object
        Object that receives the `X_space`, `y_space`,
        `_iter_data_specs` and `data_specs` attributes.
    dim_X : int
        `dim` of the feature `IndexSpace`.
    dim_Y : int
        `dim` of the target `IndexSpace`.
    m : int
        `max_labels` shared by both spaces.
    """
    features_space = IndexSpace(dim=dim_X, max_labels=m)
    targets_space = IndexSpace(dim=dim_Y, max_labels=m)

    dataset.X_space = features_space
    dataset.y_space = targets_space

    # By default iterators only serve the features.
    dataset._iter_data_specs = (features_space, 'features')
    dataset.data_specs = (
        CompositeSpace((features_space, targets_space)),
        ('features', 'targets'),
    )
def set_topological_view(self, V, axes=('b', 0, 1, 'c')):
    """
    Set up dataset topological view, without building an in-memory
    design matrix.

    This is mostly copied from DenseDesignMatrix, except:

    * HDF5ViewConverter is used instead of DefaultViewConverter
    * Data specs are derived from topo_view, not X
    * NaN checks have been moved to HDF5DatasetIterator.next

    Note that y may be loaded into memory for reshaping if y.ndim != 2.

    Parameters
    ----------
    V : ndarray
        Topological view.
    axes : tuple, optional (default ('b', 0, 1, 'c'))
        Order of axes in topological view.
    """
    shape = [
        V.shape[axes.index('b')],
        V.shape[axes.index(0)],
        V.shape[axes.index(1)],
        V.shape[axes.index('c')],
    ]
    self.view_converter = HDF5ViewConverter(shape[1:], axes=axes)
    self.X = self.view_converter.topo_view_to_design_mat(V)

    # self.X_topo_space stores a "default" topological space that
    # will be used only when self.iterator is called without a
    # data_specs, and with "topo=True", which is deprecated.
    self.X_topo_space = self.view_converter.topo_space

    # Update data specs. The design-matrix space has one entry per
    # feature of a single example (rows * cols * channels). The previous
    # code used the size of the batch axis here, which describes the
    # number of examples, not the width of a design-matrix row.
    X_space = VectorSpace(dim=shape[1] * shape[2] * shape[3])
    X_source = 'features'

    if self.y is None:
        space = X_space
        source = X_source
    else:
        if self.y.ndim == 1:
            dim = 1
        else:
            dim = self.y.shape[-1]

        # Use an IndexSpace when a label count has been specified,
        # otherwise fall back to a dense VectorSpace.
        if getattr(self, 'y_labels', None) is not None:
            y_space = IndexSpace(dim=dim, max_labels=self.y_labels)
        elif getattr(self, 'max_labels', None) is not None:
            y_space = IndexSpace(dim=dim, max_labels=self.max_labels)
        else:
            y_space = VectorSpace(dim=dim)
        y_source = 'targets'

        space = CompositeSpace((X_space, y_space))
        source = (X_source, y_source)

    self.data_specs = (space, source)
    self.X_space = X_space
    self._iter_data_specs = (X_space, X_source)
def test_np_format_as_index2index():
    """
    Formatting numpy index data from one IndexSpace to another keeps the
    values, and casts to the destination space's dtype.
    """
    data = np.array([[0], [2], [1], [3], [5], [8], [1]])
    source_space = IndexSpace(max_labels=10, dim=1)

    # Destination identical to the source.
    same_space = IndexSpace(max_labels=10, dim=1)
    formatted = source_space.np_format_as(data, same_space)
    assert source_space == same_space
    assert np.all(formatted == data)

    # Destination that differs only by dtype.
    int32_space = IndexSpace(max_labels=10, dim=1, dtype='int32')
    formatted = source_space.np_format_as(data, int32_space)
    assert source_space != int32_space
    assert np.all(formatted == data)
    assert formatted.dtype == 'int32'
    assert data.dtype == 'int64'
def __init__(self, which_set, which_day, path=None):
    """
    Open ``click_data.h5`` and select the rows of one day.

    Parameters
    ----------
    which_set : str
        One of 'train', 'valid' or 'test'. For 'test' the whole test
        array is used and there are no targets; for the other values
        the rows belonging to `which_day` are selected from the
        training arrays.
    which_day : int
        Day to select, between 21 and 31 inclusive.
    path : None, optional
        Must be None; the data file is read from the current directory.

    Raises
    ------
    NotImplementedError
        If `path` is not None.
    """
    self.daylist = range(21, 32)
    self.mapper = {'train': 0, 'valid': 1, 'test': 2}
    assert which_set in self.mapper.keys()
    assert which_day in self.daylist

    # The context manager guarantees the file is closed even when
    # unpickling raises (the previous open()/close() pair leaked it).
    # NOTE(review): this absolute path is machine-specific.
    with open('/home/whale/Documents/click/dayrows.pkl', 'r') as f:
        self.dayrows = cPickle.load(f)

    # Copy the constructor arguments onto the instance. As before, this
    # also copies the local `f` (now a closed file object).
    self.__dict__.update(locals())
    del self.self

    if path is not None:
        raise NotImplementedError("Data path is the current directory.")

    # load data
    file_n = "click_data.h5"
    self.h5file = tables.open_file(file_n, mode='r')

    if which_set == 'test':
        test_group = self.h5file.root.test.test_raw
        self.X = test_group.X_t
        self.y = None
        self.samples = slice(0, self.X.shape[0])
        self.sample_index = self.samples.start
        self.examples = self.X.shape[0]
    else:
        train_group = self.h5file.root.train.train_raw
        self.X = train_group.X
        self.y = train_group.y
        # self.dayrows[k] is the row count of day 21 + k, so the
        # cumulative sums give the first and one-past-last row of
        # `which_day`.
        day_offset = which_day - 21
        self.samples = slice(sum(self.dayrows[:day_offset]),
                             sum(self.dayrows[:day_offset + 1]))
        self.sample_index = self.samples.start
        self.examples = self.dayrows[day_offset]

    max_labels = 2

    X_source = 'features'
    X_space = VectorSpace(dim=23)
    if self.y is None:
        space = X_space
        source = X_source
    else:
        y_space = IndexSpace(dim=1, max_labels=max_labels)
        y_source = 'targets'
        space = CompositeSpace((X_space, y_space))
        source = (X_source, y_source)
    self.data_specs = (space, source)
    self.X_space = X_space

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = (self.X_space, 'features')
def __init__(self, which_set, path=None):
    """
    Open ``click_data.h5`` and bind the arrays of the requested subset.

    Parameters
    ----------
    which_set : str
        One of 'train', 'valid' or 'test'. The test subset has no
        targets.
    path : None, optional
        Must be None; the data file is read from the current directory.

    Raises
    ------
    NotImplementedError
        If `path` is not None.
    """
    self.mapper = {'train': 0, 'valid': 1, 'test': 2}
    assert which_set in self.mapper

    # Mirror the constructor arguments as instance attributes. This has
    # to happen before any other local is created.
    self.__dict__.update(locals())
    del self.self

    if path is not None:
        raise NotImplementedError("Data path is the current directory.")

    # load data
    self.h5file = tables.open_file("click_data.h5", mode='r')
    root = self.h5file.root

    if which_set == 'test':
        self.X = root.test.test_raw.X_t
        self.y = None
    elif which_set == 'train':
        raw_group = root.train.train_raw
        self.X = raw_group.X_train
        self.y = raw_group.y_train
    else:
        raw_group = root.train.train_raw
        self.X = raw_group.X_valid
        self.y = raw_group.y_valid

    num_rows = self.X.shape[0]
    self.samples = slice(0, num_rows)
    self.sample_index = self.samples.start
    self.examples = self.X.shape[0]

    features_space = VectorSpace(dim=23)
    if self.y is None:
        self.data_specs = (features_space, 'features')
    else:
        targets_space = IndexSpace(dim=1, max_labels=2)
        self.data_specs = (
            CompositeSpace((features_space, targets_space)),
            ('features', 'targets'),
        )
    self.X_space = features_space

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = (self.X_space, 'features')
    self._iter_subset_class = resolve_iterator_class('even_sequential')
def test_y_index_space(self):
    """
    Iterating over the targets while requesting them in an IndexSpace
    must run to completion.
    """
    target_specs = (IndexSpace(max_labels=10, dim=1), 'targets')
    batches = self.test.iterator(mode='sequential',
                                 data_specs=target_specs,
                                 batch_size=100)
    # Exhaust the iterator; only the absence of errors is checked.
    for _ in batches:
        pass
class BowmanWordnetDataset(object):
    """
    Class-level description of the data layout: two index inputs
    ('left_input', 'right_input') and one index target ('target').
    """

    dtype = 'int32'
    # Number of distinct target labels.
    chromaticity = 3

    input_source = ('left_input', 'right_input')
    target_source = ('target', )

    # Two separate single-index spaces over BWD_vertex_count labels.
    input_components = (
        IndexSpace(dim=1, max_labels=BWD_vertex_count, dtype=dtype),
        IndexSpace(dim=1, max_labels=BWD_vertex_count, dtype=dtype),
    )
    input_space = CompositeSpace(components=input_components)

    target_component = IndexSpace(dim=1, max_labels=chromaticity,
                                  dtype=dtype)

    # Full specs: both inputs followed by the target.
    data_specs = (
        CompositeSpace(components=input_components + (target_component, )),
        input_source + target_source,
    )

    def __init__(self):
        pass
def test_np_format_as_sequence2other():
    """
    Formatting sequence-space data as a non-sequence space must raise
    ValueError, for both vector and index sequences.
    """
    # Vector sequence -> plain vector space.
    seq_space = VectorSequenceSpace(dim=3)
    flat_space = VectorSpace(dim=3)
    values = np.random.uniform(low=0.0, high=1.0, size=(10, 3))
    np.testing.assert_raises(ValueError, seq_space.np_format_as,
                             values, flat_space)

    # Index sequence -> plain index space.
    seq_space = IndexSequenceSpace(max_labels=6, dim=1)
    flat_space = IndexSpace(max_labels=6, dim=1)
    values = np.random.randint(low=0, high=5, size=(10, 1))
    np.testing.assert_raises(ValueError, seq_space.np_format_as,
                             values, flat_space)
def set_topological_view(self, V, axes=('b', 0, 1, 'c')):
    """
    Make the dataset represent `V`, a batch of topological views.

    A `DefaultViewConverter` is built from the per-example shape of `V`,
    `self.X` is replaced by the design matrix the converter produces,
    and the data specs are rebuilt.

    Parameters
    ----------
    V : ndarray
        Batch of examples in topological format, laid out according to
        `axes`. Must not contain NaN.
    axes : tuple, optional
        Axis labels of `V`; the entries 0, 1 and 'c' are looked up to
        find the rows, columns and channels axes.

    .. todo::

        Why is this parameter named 'V'?
    """
    assert not np.any(np.isnan(V))

    example_shape = [V.shape[axes.index(label)] for label in (0, 1, 'c')]
    converter = DefaultViewConverter(example_shape, axes=axes)
    self.view_converter = converter
    self.X = converter.topo_view_to_design_mat(V)

    # Default topological space; only used when self.iterator is called
    # without data_specs and with the deprecated "topo=True".
    self.X_topo_space = converter.topo_space
    assert not np.any(np.isnan(self.X))

    # Rebuild the data specs.
    features_space = VectorSpace(dim=self.X.shape[1])
    features_source = 'features'

    if self.y is None:
        specs = (features_space, features_source)
    else:
        if self.y.ndim == 2:
            targets_space = VectorSpace(dim=self.y.shape[-1])
        else:
            # Targets that are not 2-D are treated as label indices,
            # which requires a truthy label count.
            assert self.max_labels
            targets_space = IndexSpace(max_labels=self.max_labels, dim=1)
        specs = (
            CompositeSpace((features_space, targets_space)),
            (features_source, 'targets'),
        )

    self.data_specs = specs
    self.X_space = features_space
    self._iter_data_specs = (features_space, features_source)
def __init__(self, which_set, data_mode, context_len=None, shuffle=True):
    """
    Cut the raw data into consecutive chunks of inputs and
    shifted-by-one targets.

    Parameters
    ----------
    which_set : str
        Forwarded to `self._load_data`.
    data_mode : str
        Forwarded to `self._load_data`.
    context_len : int, optional
        Length of each chunk. When None, a single chunk spanning all of
        the raw data but one element is used.
    shuffle : bool, optional
        Not used by this constructor.
    """
    self._load_data(which_set, context_len, data_mode)

    if context_len is None:
        context_len = len(self._raw_data) - 1

    num_chunks = int(np.ceil((len(self._raw_data) - 1) /
                             float(context_len)))

    def chunked(stream):
        # Each chunk gets a trailing axis of length one.
        return np.asarray([
            stream[k * context_len:(k + 1) * context_len, np.newaxis]
            for k in range(num_chunks)
        ])

    # Targets are the inputs shifted by one position.
    X = chunked(self._raw_data[:-1])
    y = chunked(self._raw_data[1:])

    space = CompositeSpace([
        SequenceDataSpace(IndexSpace(dim=1, max_labels=self._max_labels)),
        SequenceDataSpace(IndexSpace(dim=1, max_labels=self._max_labels)),
    ])
    source = ('features', 'targets')

    super(PennTreebankSequences, self).__init__(
        data=(X, y), data_specs=(space, source))
def create_model(self):
    """
    Build the MLP (projection -> tanh -> softmax) over an index-valued
    input window and store it in `self.model`.
    """
    # The input indices are projected.
    # (Open question kept from the original: ProjectionLayer?
    # MatrixMul? IndexSpace?)
    projection = ProjectionLayer(layer_name='X', dim=self.edim, irange=0.)

    # sparse_init=15?
    hidden = Tanh(layer_name='h0', dim=self.hdim, irange=.01)

    softmax = Softmax(layer_name='softmax', n_classes=self.vocab_size,
                      irange=0., binary_target_dim=1)

    window_space = IndexSpace(max_labels=self.vocab_size,
                              dim=self.window_size)

    self.model = MLP(layers=[projection, hidden, softmax],
                     input_space=window_space)
def test_compare_index():
    """
    IndexSpace equality: equal only for matching `dim` and `max_labels`,
    and never equal to a space of another type.
    """
    configs = [(5, 10), (5, 10), (5, 9), (6, 10)]
    index_spaces = [IndexSpace(dim=dim, max_labels=max_label)
                    for dim, max_label in configs]

    # The first two share both dim and max_labels.
    assert index_spaces[0] == index_spaces[1]

    # The last three differ pairwise in dim or max_labels.
    for i, j in itertools.combinations([1, 2, 3], 2):
        assert not (index_spaces[i] == index_spaces[j])

    vector_space = VectorSpace(dim=5)
    conv2d_space = Conv2DSpace(shape=(8, 8), num_channels=3,
                               axes=('b', 'c', 0, 1))
    composite_space = CompositeSpace((index_spaces[0], ))

    for other in (vector_space, composite_space, conv2d_space):
        for index_space in index_spaces:
            assert not (index_space == other)
class ChainDataset(Dataset):
    """Training data generator.

    Emits state sequences sampled from a fixed three-state transition
    matrix. Supports the PyLearn2 dataset interface.
    """

    num_states = 3
    # Row i holds the probabilities used to draw the state following i.
    trans_prob = numpy.array([[0.1, 0.5, 0.4],
                              [0.1, 0.9, 0.0],
                              [0.3, 0.3, 0.4]])

    # Eigenvector of the transposed matrix for the largest eigenvalue,
    # normalised to sum to one.
    values, vectors = numpy.linalg.eig(trans_prob.T)
    equilibrium = vectors[:, values.argmax()]
    equilibrium = equilibrium / equilibrium.sum()

    # The small constant keeps log() finite for zero probabilities.
    trans_entropy = trans_prob * numpy.log(trans_prob + 1e-6)
    entropy = equilibrium.dot(trans_entropy).sum()

    data_specs = (
        SequenceDataSpace(IndexSpace(max_labels=num_states, dim=1)),
        'x',
    )

    def __init__(self, rng, seq_len):
        update_instance(self, locals())

    def iterator(self, batch_size, data_specs, return_tuple, mode,
                 num_batches, rng=None):
        """Returns a PyLearn2 compatible iterator."""
        assert return_tuple
        dataset = self

        class Iterator(six.Iterator):
            # This is not true, but let PyLearn2 think that this
            # iterator is not stochastic.
            # Makes life easier for now.
            stochastic = False
            num_examples = num_batches * batch_size

            def __init__(self, **kwargs):
                self.batches_retrieved = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.batches_retrieved >= num_batches:
                    raise StopIteration()
                self.batches_retrieved += 1
                # Append a trailing axis of length one to the batch.
                fresh = dataset._next_batch(batch_size)
                return (fresh[..., None],)

        return Iterator()

    def get_num_examples(self):
        """Part of the PyLearn2 Dataset interface."""
        return float('inf')

    def _next_single(self):
        """Sample one sequence of `self.seq_len` states, starting at 0."""
        # NOTE(review): sampling uses the global numpy.random state, not
        # the `rng` passed to the constructor - confirm this is intended.
        chain = [0]
        while len(chain) != self.seq_len:
            current = chain[-1]
            draw = numpy.random.multinomial(1, self.trans_prob[current])
            chain.append(draw.argmax())
        return chain

    def _next_batch(self, batch_size):
        """Generate random sequences from the family."""
        batch = numpy.zeros((self.seq_len, batch_size), dtype='int64')
        # One sequence per column.
        for column in range(batch_size):
            batch[:, column] = self._next_single()
        return batch
def __init__(self, path, data_node, transformer, X_str, s_str, y_str=None,
             y_labels=None, start=0, stop=None, axes=('b', 0, 1, 'c'),
             rescale=None, rng=_default_seed):
    """
    Open an HDF5 file and expose one of its nodes as a dataset.

    Parameters
    ----------
    path : str
        Path of the HDF5 file; it is preprocessed and locally cached
        before being opened.
    data_node : str
        Name of the node, under the file root, holding the arrays.
    transformer : object
        Object whose `get_shape()` gives the shape of the feature space.
    X_str : str
        Attribute name of the features array on the node.
    s_str : str
        Attribute name of the array whose first row's last entry gives
        the number of channels.
    y_str : str, optional
        Attribute name of the targets array; None means no targets.
    y_labels : int, optional
        Number of labels. When given, targets live in an `IndexSpace`;
        otherwise in a `VectorSpace`.
    start : int, optional
        Index of the first example to use.
    stop : int, optional
        One past the index of the last example to use. Defaults to the
        number of rows of the features array.
    axes : tuple, optional
        Axes of the feature `Conv2DSpace`.
    rescale : float, optional
        Stored as `self.rescale` (converted to float); None is kept as
        None.
    rng : object, optional
        Seed or generator handed to `make_np_rng`.

    Raises
    ------
    ValueError
        If any feature value lies outside [0, 1].
    """
    # Locally cache the files before reading them
    path = preprocess(path)
    datasetCache = cache.datasetCache
    path = datasetCache.cache_file(path)

    self.h5file = tables.openFile(path, mode="r")
    node = self.h5file.getNode('/', data_node)

    # float(None) raises TypeError, which made the default value of
    # `rescale` unusable; keep None when no rescaling is requested.
    self.rescale = None if rescale is None else float(rescale)

    self.rng = make_np_rng(rng, which_method="random_integers")

    self.X = getattr(node, X_str)
    # Make sure images have values in [0, 1]. This is needed for
    # self.adjust_for_viewer, amongst other things.
    # The check is evaluated row by row. The previous form handed a
    # generator to numpy.all, which is always truthy, so the check
    # could never fail.
    if not all(numpy.all((row >= 0) & (row <= 1))
               for row in self.X.iterrows()):
        raise ValueError("features must be normalized between 0 and 1")

    self.axes = axes
    self.s = getattr(node, s_str)
    self.y = getattr(node, y_str) if y_str is not None else None
    self.y_labels = y_labels
    self._check_labels()

    self.transformer = transformer

    X_source = 'features'
    shape = self.transformer.get_shape()
    channels = self.s[0][-1]
    X_space = Conv2DSpace(shape=shape, num_channels=channels, axes=axes)
    if self.y is None:
        space = X_space
        source = X_source
    else:
        if self.y.ndim == 1:
            dim = 1
        else:
            dim = self.y.shape[-1]
        if self.y_labels is not None:
            y_space = IndexSpace(dim=dim, max_labels=self.y_labels)
        else:
            y_space = VectorSpace(dim=dim)
        y_source = 'targets'

        space = CompositeSpace((X_space, y_space))
        source = (X_source, y_source)
    self.data_specs = (space, source)

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class(
        'batchwise_shuffled_sequential')
    self._iter_data_specs = self.data_specs

    self.start = start
    self.stop = self.X.shape[0] if stop is None else stop
    assert (self.start >= 0 and self.start < self.stop and
            self.stop <= self.X.shape[0])
    self.num_examples = self.stop - self.start

    # Data buffers
    self.X_buffer = None
    self.y_buffer = None
def test_np_format_as_index2vector():
    """
    Formatting an IndexSpace batch as a VectorSpace (merged or
    concatenated one-hot) yields the expected shapes and non-zero
    counts, and the Theano implementation agrees with the numpy one.
    """
    # Test 5 random batches for shape, number of non-zeros
    for _ in xrange(5):
        max_labels = np.random.randint(2, 10)
        batch_size = np.random.randint(1, 10)
        labels = np.random.randint(1, 10)
        batch = np.random.random_integers(max_labels - 1,
                                          size=(batch_size, labels))

        index_space = IndexSpace(dim=labels, max_labels=max_labels)
        vector_space_merge = VectorSpace(dim=max_labels)
        vector_space_concatenate = VectorSpace(dim=max_labels * labels)

        merged = index_space.np_format_as(batch, vector_space_merge)
        concatenated = index_space.np_format_as(batch,
                                                vector_space_concatenate)

        assert merged.shape == (batch_size, max_labels)
        assert concatenated.shape == (batch_size, max_labels * labels)
        assert np.count_nonzero(merged) <= batch.size
        assert np.count_nonzero(concatenated) == batch.size
        assert np.all(np.unique(concatenated) == np.array([0, 1]))

    # Make sure Theano variables give the same result. The spaces of the
    # last loop iteration are reused.
    sym_batch = tensor.lmatrix('batch')
    sym_single = tensor.lvector('single')
    batch_size = np.random.randint(1, 10)
    np_batch = np.random.random_integers(max_labels - 1,
                                         size=(batch_size, labels))
    np_single = np.random.random_integers(max_labels - 1, size=labels)

    # (symbolic input, numeric input, destination space)
    cases = [
        (sym_batch, np_batch, vector_space_merge),
        (sym_batch, np_batch, vector_space_concatenate),
        (sym_single, np_single, vector_space_merge),
        (sym_single, np_single, vector_space_concatenate),
    ]

    # Compile every function first, then compare, in the same order.
    compiled = [
        theano.function(
            [symbolic],
            index_space._format_as_impl(False, symbolic, destination))
        for symbolic, _unused, destination in cases
    ]
    for function, (_unused, numeric, destination) in zip(compiled, cases):
        np.testing.assert_allclose(
            function(numeric),
            index_space._format_as_impl(True, numeric, destination))
# Hold out the trailing `args.devpercent` percent of the examples as
# development data.
num_dev = int(X.shape[0] * 0.01 * args.devpercent)
log.info('.. Using %f percent of data (%d examples) as development data',
         args.devpercent, num_dev)
# The guard matters: with num_dev == 0, X[:-0] would be empty.
if num_dev > 0:
    X_dev = X[-num_dev:]
    X = X[:-num_dev]
    speakers_dev = speakers[-num_dev:]
    speakers = speakers[:-num_dev]
    y_dev = y[-num_dev:]
    y = y[:-num_dev]

# Three sources: dense features (one entry per feature_dict key), a
# single speaker index, and a one-dimensional target.
space = CompositeSpace([
    VectorSpace(dim=len(feature_dict)),
    IndexSpace(dim=1, max_labels=num_speakers),
    VectorSpace(dim=1)
])
source = ('features', 'speakers', 'targets')

final_dataset = vector_spaces_dataset.VectorSpacesDataset(
    data=(X, speakers, y), data_specs=(space, source))

if args.save_filename:
    log.info('.. Writing data to %s', args.save_filename)
    serial.save(args.save_filename, final_dataset)

if args.savedev_filename:
    log.info('.. Writing dev data to %s', args.savedev_filename)
    # NOTE(review): X_dev, speakers_dev and y_dev are only bound when
    # num_dev > 0; with num_dev == 0 this raises NameError - confirm
    # callers always pass a positive devpercent with savedev_filename.
    final_dataset_dev = vector_spaces_dataset.VectorSpacesDataset(
        data=(X_dev, speakers_dev, y_dev), data_specs=(space, source))
def __init__(self, which_set, center=False, rescale=False, gcn=None,
             one_hot=None, start=None, stop=None, axes=('b', 'c', 0, 1),
             preprocessor=None, noise_v=0., noise_a=0.):
    """
    Load the video and audio pickles and build the dataset.

    Parameters
    ----------
    which_set : str
        'train' or 'test'.
    center : bool, optional
        If True, subtract fixed constants from video and audio.
    rescale : bool, optional
        If True, divide video and audio by fixed constants.
    gcn : object, optional
        Not used by this constructor.
    one_hot : object, optional
        Deprecated. Any value other than None turns the targets into
        one-hot rows and emits a warning.
    start : int, optional
        First example to keep. When given, `stop` must be given too.
    stop : int, optional
        One past the last example to keep.
    axes : tuple, optional
        Stored as `self.axes`.
    preprocessor : object, optional
        If given, `preprocessor.apply(self)` is called at the end.
    noise_v : float, optional
        Standard deviation of Gaussian noise added to the video data;
        0 disables it.
    noise_a : float, optional
        Standard deviation of Gaussian noise added to the audio data;
        0 disables it.
    """
    # note: there is no such thing as the cifar10 validation set;
    # pylearn1 defined one but really it should be user-configurable
    # (as it is here)
    self.noise_a = noise_a
    self.noise_v = noise_v
    self.axes = axes

    # we also expose the following details:
    self.img_shape = (1, 32, 32)
    self.img_size = N.prod(self.img_shape)
    self.n_classes = 36
    self.label_names = ['fadg0', 'fcft0', 'bird', 'cat', 'deer',
                        'dog', 'frog', 'horse', 'ship', 'truck']

    # Load both pickles; the context managers close the files, which
    # the previous `cPickle.load(file(...))` calls never did.
    path1 = os.path.join(serial.preprocess('${VIDTIMIT}'), 'data',
                         'cut_dataset.pkl')
    with open(path1) as video_file:
        video = cPickle.load(video_file)
    path2 = os.path.join(serial.preprocess('${VIDTIMIT}'), 'data',
                         'audio_dataset.pkl')
    with open(path2) as audio_file:
        audio = cPickle.load(audio_file)

    # process this data
    # The train/test masks come from the video pickle and are applied
    # to the audio data as well.
    train_idx = video['train_idx'] == 1
    test_idx = video['test_idx'] == 1

    if noise_v == 0.:
        v_noise_tr = 0.
        v_noise_te = 0.
    else:
        v_noise_tr = np.random.normal(0., noise_v,
                                      size=video['data'][train_idx].shape)
        v_noise_te = np.random.normal(0., noise_v,
                                      size=video['data'][test_idx].shape)

    if noise_a == 0.:
        a_noise_tr = 0.
        a_noise_te = 0.
    else:
        a_noise_tr = np.random.normal(0., noise_a,
                                      audio['data'][train_idx].shape)
        a_noise_te = np.random.normal(0., noise_a,
                                      audio['data'][test_idx].shape)

    vXs = {'train': video['data'][train_idx] + v_noise_tr,
           'test': video['data'][test_idx] + v_noise_te}
    Ys = {'train': video['labels'][train_idx],
          'test': video['labels'][test_idx]}
    vX = N.cast['float32'](vXs[which_set])
    # Column vector of integer labels.
    y = Ys[which_set][np.newaxis].T.astype('int32')

    aXs = {'train': audio['data'][train_idx] + a_noise_tr,
           'test': audio['data'][test_idx] + a_noise_te}
    aX = N.cast['float32'](aXs[which_set])

    if which_set == 'test':
        assert y.shape[0] == 72
    y = y.reshape((y.shape[0], 1))

    if one_hot is not None:
        # One row per example with a single 1 at the label's column.
        # The previous loop wrote `ynew[y[i]] = 1`, which set the whole
        # row indexed by the label instead of entry [i, label].
        ynew = np.zeros((y.shape[0], 36))
        ynew[np.arange(y.shape[0]), y[:, 0]] = 1
        warnings.warn("the `one_hot` parameter is deprecated. To get "
                      "one-hot encoded targets, request that they "
                      "live in `VectorSpace` through the `data_specs` "
                      "parameter of MNIST's iterator method. "
                      "`one_hot` will be removed on or after "
                      "September 20, 2014.", stacklevel=2)
        y = ynew.astype('int32')

    if center:
        vX -= 0.2845
        aX -= -0.00003
    self.center = center

    if rescale:
        vX /= .1644
        aX /= .02
    self.rescale = rescale

    if start is not None:
        # This needs to come after the prepro so that it doesn't
        # change the pixel means computed above for toronto_prepro
        assert start >= 0
        assert stop > start
        assert stop <= vX.shape[0]
        vX = vX[start:stop, :]
        aX = aX[start:stop, :]
        y = y[start:stop, :]
        assert vX.shape[0] == y.shape[0]

    if which_set == 'test':
        assert vX.shape[0] == 72

    super(FULLVIDTIMIT, self).__init__(
        (vX, aX, y),
        (CompositeSpace([
            Conv2DSpace(shape=[32, 32], num_channels=54,
                        axes=['b', 'c', 0, 1]),
            Conv2DSpace(shape=[1000, 1], num_channels=54,
                        axes=['b', 'c', 0, 1]),
            IndexSpace(36, 1)]),
         ('video', 'audio', 'targets')))

    # assert not contains_nan(self.X)

    if preprocessor:
        preprocessor.apply(self)
def __init__(self, which_set, start=None, stop=None, center=False,
             rescale=False, path='./', axes=('b', 'c')):
    """
    Load 128-dimensional SIFT vectors and their binary labels.

    Parameters
    ----------
    which_set : str
        'train' or 'test'.
    start : int, optional
        First example to keep. None keeps every example.
    stop : int, optional
        One past the last example to keep. Only used when `start` is
        given; defaults to the number of loaded examples.
    center : bool, optional
        Not used by this constructor.
    rescale : bool, optional
        Not used by this constructor.
    path : str, optional
        Prefix prepended verbatim to the file names, so a directory
        must end with a path separator.
    axes : tuple, optional
        Not used by this constructor.

    Raises
    ------
    ValueError
        If `which_set` is not recognised, or if `stop` exceeds the
        number of loaded examples.
    """
    # we also expose the following details:
    # NOTE: `shape` is a plain int here, not a 1-tuple.
    self.shape = (128)
    self.size = np.prod(self.shape)
    self.n_classes = 2

    if which_set not in ['train', 'test']:
        # The previous message had a stray '".' in the middle.
        raise ValueError(
            'Unrecognized which_set value "%s". '
            'Valid values are ["train","test"].' % (which_set, ))

    if which_set == 'train':
        im_path = path + 'trainSIFT_vectors.npy'
        y_path = path + 'trainSIFT_labels.npy'
    else:
        im_path = path + 'testSIFT_vectors.npy'
        y_path = path + 'testSIFT_labels.npy'

    time1 = time.time()
    X = np.load(im_path).reshape((-1, 128))
    Y = np.load(y_path).reshape((-1, 1))
    time2 = time.time()
    # Single-argument call form: valid as a Python 2 print statement
    # and as a Python 3 function call.
    print('Loading data took %0.3f ms' % ((time2 - time1) * 1000.0))

    assert X.shape[1] == 128

    source = ('features', 'targets')
    space = CompositeSpace(
        [VectorSpace(128), IndexSpace(dim=1, max_labels=2)])

    self.X = X
    self.y = Y
    assert not np.any(np.isnan(self.X))

    if start is not None:
        assert start >= 0
        # Without an explicit stop, keep everything from `start` on;
        # previously this combination always failed.
        if stop is None:
            stop = self.X.shape[0]
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        # The labels were reshaped to a column above, so they are
        # always two-dimensional here.
        self.y = self.y[start:stop, :]
        assert self.y.shape[0] == stop - start

    super(FoodData, self).__init__(data=(self.X, self.y),
                                   data_specs=(space, source))
def __init__(self, X_load_path=None, X_from_scipy_sparse_dataset=None,
             X_zipped_npy=False, y_path=None, y_labels=None, y_part=None,
             rng=_default_seed):
    """
    Build a sparse dataset, optionally with targets read from HDF5.

    Parameters
    ----------
    X_load_path : str, optional
        File to load the features from. When None,
        `X_from_scipy_sparse_dataset` is used instead.
    X_from_scipy_sparse_dataset : scipy.sparse matrix, optional
        Already-built sparse feature matrix.
    X_zipped_npy : bool, optional
        If True, `X_load_path` is a gzip-compressed npy file; otherwise
        it is an archive with 'data', 'indices', 'indptr' and 'shape'
        entries.
    y_path : str, optional
        HDF5 file holding the targets at ``/train/train_raw/y``.
    y_labels : int, optional
        Number of labels. When given, targets live in an `IndexSpace`;
        otherwise in a `VectorSpace`.
    y_part : int, optional
        1-based part number selecting a slice of the targets via the
        row counts stored in ``dayrows.pkl``. Required when `y_path` is
        given.
    rng : object, optional
        Not used by this constructor.

    Raises
    ------
    TypeError
        If `X_from_scipy_sparse_dataset` is used and is not sparse.
    """
    if X_load_path is not None:
        if X_zipped_npy is True:
            logger.info('... loading sparse data set from a zip npy file')
            # Close the gzip stream once the array has been read.
            with gzip.open(X_load_path) as zipped:
                self.X = scipy.sparse.csr_matrix(numpy.load(zipped),
                                                 dtype=floatX)
        else:
            logger.info('... loading sparse data set from a npy file')
            loader = numpy.load(X_load_path)
            self.X = scipy.sparse.csr_matrix(
                (loader['data'], loader['indices'], loader['indptr']),
                shape=loader['shape'], dtype=floatX)
    else:
        logger.info('... building from given sparse dataset')
        self.X = X_from_scipy_sparse_dataset
        if not scipy.sparse.issparse(X_from_scipy_sparse_dataset):
            msg = "from_scipy_sparse_dataset is not sparse : %s" \
                  % type(self.X)
            raise TypeError(msg)

    # Always defined, so later code can test `self.y is None`.
    self.y = None
    if y_path is not None:
        logger.info('... loading y data set from a hdf5 file')
        assert y_part is not None

        with open('dayrows.pkl', 'r') as f:
            dayrows = cPickle.load(f)

        # Slicing reads the selected rows into memory, so the HDF5
        # handle can be released right away. Previously it stayed open.
        file_handler = tables.open_file(y_path, mode="r")
        try:
            y = file_handler.root.train.train_raw.y
            self.y = y[sum(dayrows[:y_part - 1]):sum(dayrows[:y_part])]
        finally:
            file_handler.close()
    self.y_labels = y_labels

    X_source = 'features'
    X_space = VectorSpace(dim=self.X.shape[1], sparse=True)

    if y_path is None:
        space = X_space
        source = X_source
    else:
        if self.y.ndim == 1:
            dim = 1
        else:
            dim = self.y.shape[-1]
        if self.y_labels is not None:
            y_space = IndexSpace(dim=dim, max_labels=self.y_labels)
        else:
            y_space = VectorSpace(dim=dim)
        y_source = 'targets'

        space = CompositeSpace((X_space, y_space))
        source = (X_source, y_source)

    self.data_specs = (space, source)
    self.X_space = X_space

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = (self.X_space, 'features')
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None, audio_only=False,
             rng=_default_seed):
    """
    Load the sequences, standardize and segment them into frames, and
    build the data specs together with one mapping function per source.

    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive
        frames. Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices into
        the design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    # Distance between the starts of two consecutive frames.
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only

    # RNG initialization
    # Anything exposing `random_integers` is used as-is; any other value
    # is handed to RandomState as a seed.
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    if not self.audio_only:
        # Label counts are taken as (largest label seen) + 1.
        self.num_phones = numpy.max(
            [numpy.max(sequence) for sequence in self.phones]) + 1
        self.num_phonemes = numpy.max(
            [numpy.max(sequence) for sequence in self.phonemes]) + 1
        self.num_words = numpy.max(
            [numpy.max(sequence) for sequence in self.words]) + 1
        # The following is hard coded. However, the way it is done above
        # could be problematic if a max value (the max over the whole
        # dataset (train + valid + test)) is not present in at least one
        # one of the three subsets. This is the case for speakers. This is
        # not the case for phones.
        self.num_speakers = 630

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
            self.speaker_id = self.speaker_id[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]
            self.speaker_id = self.speaker_id[start:]

    # Number of examples per sequence; the leading 0 makes the
    # cumulative sum below start at zero.
    examples_per_sequence = [0]

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            # Phones segmentation
            phones_sequence = self.phones[sequence_id]
            phones_segmented_sequence = segment_axis(
                phones_sequence, frame_length, overlap)
            self.phones[sequence_id] = phones_segmented_sequence
            # (A commented-out reduction of each segmented frame with
            # scipy.stats.mode used to follow each segmentation here.)

            # Phonemes segmentation
            phonemes_sequence = self.phonemes[sequence_id]
            phonemes_segmented_sequence = segment_axis(
                phonemes_sequence, frame_length, overlap)
            self.phonemes[sequence_id] = phonemes_segmented_sequence

            # Words segmentation
            words_sequence = self.words[sequence_id]
            words_segmented_sequence = segment_axis(
                words_sequence, frame_length, overlap)
            self.words[sequence_id] = words_segmented_sequence

        # TODO: look at this, does it force copying the data?
        # Sequence segmentation
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length, overlap)
        self.raw_wav[sequence_id] = samples_segmented_sequence

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map
        # An example is `frames_per_example` frames plus the frame that
        # follows them (the target), hence the subtraction.
        num_frames = samples_segmented_sequence.shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    if not self.audio_only:
        self.phones_sequences = self.phones
        self.phonemes_sequences = self.phonemes
        self.words_sequences = self.words
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    # Each source gets a space, a name and a map function that turns
    # example indexes into a list of per-example arrays.
    features_space = VectorSpace(dim=self.frame_length *
                                 self.frames_per_example)
    features_source = 'features'

    def features_map_fn(indexes):
        # `frames_per_example` consecutive frames, flattened.
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index]
                        [example_index:example_index +
                         self.frames_per_example].ravel())
        return rval

    targets_space = VectorSpace(dim=self.frame_length)
    targets_source = 'targets'

    def targets_map_fn(indexes):
        # The single frame that follows the feature frames.
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example].ravel())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        phones_space = IndexSpace(max_labels=self.num_phones, dim=1,
                                  dtype=str(
                                      self.phones_sequences[0].dtype))
        phones_source = 'phones'

        def phones_map_fn(indexes):
            # Phone labels of the target frame.
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phones_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        phonemes_space = IndexSpace(max_labels=self.num_phonemes, dim=1,
                                    dtype=str(
                                        self.phonemes_sequences[0].dtype))
        phonemes_source = 'phonemes'

        def phonemes_map_fn(indexes):
            # Phoneme labels of the target frame.
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phonemes_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        words_space = IndexSpace(max_labels=self.num_words, dim=1,
                                 dtype=str(self.words_sequences[0].dtype))
        words_source = 'words'

        def words_map_fn(indexes):
            # Word labels of the target frame.
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.words_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        speaker_id_space = IndexSpace(max_labels=self.num_speakers, dim=1,
                                      dtype=str(self.speaker_id.dtype))
        speaker_id_source = 'speaker_id'

        def speaker_id_map_fn(indexes):
            # Depends only on the sequence, not on the example position.
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.speaker_id[sequence_index].ravel())
            return rval

        # The four sources below read slices of the speaker's row in
        # self.speaker_info_list and convert them with
        # index_from_one_hot.
        # NOTE(review): the slices are [1:9], [9:15], [16:24] and [24:];
        # index 15 is not covered by any of them - confirm this matches
        # the layout of speaker_info_list.
        dialect_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
        dialect_source = 'dialect'

        def dialect_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[1:9]))
            return rval

        education_space = IndexSpace(max_labels=6, dim=1, dtype='int32')
        education_source = 'education'

        def education_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[9:15]))
            return rval

        race_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
        race_source = 'race'

        def race_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[16:24]))
            return rval

        gender_space = IndexSpace(max_labels=2, dim=1, dtype='int32')
        gender_source = 'gender'

        def gender_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[24:]))
            return rval

        # The four lists must stay aligned position by position.
        space_components.extend([
            phones_space, phonemes_space, words_space, speaker_id_space,
            dialect_space, education_space, race_space, gender_space
        ])
        source_components.extend([
            phones_source, phonemes_source, words_source,
            speaker_id_source, dialect_source, education_source,
            race_source, gender_source
        ])
        map_fn_components.extend([
            phones_map_fn, phonemes_map_fn, words_map_fn,
            speaker_id_map_fn, dialect_map_fn, education_map_fn,
            race_map_fn, gender_map_fn
        ])
        batch_components.extend(
            [None, None, None, None, None, None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    # By default only features and targets are iterated over.
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace(
        (features_space, targets_space)),
        (features_source, targets_source))
args.read_features_filename) feature_dict = OrderedDict( durmodel_utils.get_features(args.read_features_filename)) else: log.info('.. Reading features from full_features_and_durs') feature_dict = OrderedDict() for (_features, _speaker_id, d) in full_features_and_durs: for f in _features: feature_name = f[0] feature_dict.setdefault(feature_name, len(feature_dict)) if args.write_features_filename: log.info('.. Writing features to %s', args.write_features_filename) durmodel_utils.write_features( args.write_features_filename, feature_dict) matrices = create_matrices(feature_dict, full_features_and_durs) space = CompositeSpace([VectorSpace(dim=len(feature_dict)), IndexSpace(dim=1, max_labels=len(speaker_ids)), VectorSpace(dim=1)]) source = ('features', 'speakers', 'targets') dataset = vector_spaces_dataset.VectorSpacesDataset( data=matrices, data_specs=(space, source) ) if args.save_filename: log.info('.. Writing data to %s', args.save_filename) serial.save(args.save_filename, dataset)
def test_np_format_as_index2vector():
    """
    Check IndexSpace -> VectorSpace formatting for NumPy and Theano inputs.

    First, five random index batches are converted with ``np_format_as``
    into a "merged" vector space (``dim=max_labels``) and a "concatenated"
    vector space (``dim=max_labels * labels``); output shapes and the number
    of non-zero entries are checked. Then the symbolic ``_format_as`` path
    is compiled with Theano and compared against the NumPy path, for both a
    batch (matrix) and a single example (vector).
    """
    # Test 5 random batches for shape, number of non-zeros
    for _ in xrange(5):
        max_labels = np.random.randint(2, 10)
        batch_size = np.random.randint(1, 10)
        labels = np.random.randint(1, 10)
        # randint's upper bound is exclusive, so this draws indices from
        # [1, max_labels - 1] -- the same range the deprecated
        # random_integers(max_labels - 1) call produced.
        batch = np.random.randint(1, max_labels, size=(batch_size, labels))
        index_space = IndexSpace(dim=labels, max_labels=max_labels)
        vector_space_merge = VectorSpace(dim=max_labels)
        vector_space_concatenate = VectorSpace(dim=max_labels * labels)
        merged = index_space.np_format_as(batch, vector_space_merge)
        concatenated = index_space.np_format_as(batch,
                                                vector_space_concatenate)
        if batch_size > 1:
            assert merged.shape == (batch_size, max_labels)
            assert concatenated.shape == (batch_size, max_labels * labels)
        else:
            # A batch of one example comes back without the batch axis
            assert merged.shape == (max_labels, )
            assert concatenated.shape == (max_labels * labels, )
        # Merging may map several indices of an example onto the same
        # position, hence "<="; concatenation keeps one entry per index.
        assert np.count_nonzero(merged) <= batch.size
        assert np.count_nonzero(concatenated) == batch.size
        assert np.all(np.unique(concatenated) == np.array([0, 1]))

    # Make sure Theano variables give the same result. This reuses the
    # spaces (and max_labels / labels) of the last loop iteration.
    batch = tensor.lmatrix('batch')
    single = tensor.lvector('single')
    batch_size = np.random.randint(2, 10)
    np_batch = np.random.randint(1, max_labels, size=(batch_size, labels))
    np_single = np.random.randint(1, max_labels, size=labels)

    for symbolic, numeric in ((batch, np_batch), (single, np_single)):
        for target_space in (vector_space_merge, vector_space_concatenate):
            compiled = theano.function(
                [symbolic],
                index_space._format_as(symbolic, target_space))
            np.testing.assert_allclose(
                compiled(numeric),
                index_space.np_format_as(numeric, target_space))
def __init__(
        self,
        path,
        name='',  # optional name
        # selectors
        subjects='all',  # optional selector (list) or 'all'
        trial_types='all',  # optional selector (list) or 'all'
        trial_numbers='all',  # optional selector (list) or 'all'
        conditions='all',  # optional selector (list) or 'all'
        partitioner=None,
        # optional channel filter, default: keep all
        # NOTE(review): the default instance is created once, when the
        # function is defined, and is shared by every call that omits it.
        channel_filter=NoChannelFilter(),
        channel_names=None,  # optional channel names (for metadata)
        label_map=None,  # optional conversion of labels
        # optional subtraction of channel mean, usually done already earlier
        remove_dc_offset=False,
        resample=None,  # optional down-sampling
        # optional sub-sequences selection
        start_sample=0,
        stop_sample=None,  # optional for selection of sub-sequences
        # optional signal filter to be applied before splitting the signal
        signal_filter=None,
        # windowing parameters
        frame_size=-1,
        hop_size=-1,  # values > 0 will lead to windowing
        hop_fraction=None,  # alternative to specifying absolute hop_size
        # (disabled spectrum parameters: n_fft, n_freq_bins,
        #  spectrum_log_amplitude, spectrum_normalization_mode,
        #  include_phase)
        flatten_channels=False,
        # (disabled: layout='tf', save_matrix_path=None)
        keep_metadata=False,
        target_mode='label',
):
    '''
    Load EEG data files, cut them into frames and build the dataset.

    The files listed by ``load_datafiles_metadata(path)`` are filtered by
    the selectors, optionally re-partitioned, and then loaded one by one.
    Every kept channel is optionally mean-subtracted, resampled, filtered,
    cropped, normalized with ``librosa.util.normalize`` and (if
    ``frame_size > 0 and hop_size > 0``) split into frames. The resulting
    sequences and their targets are handed to the parent constructor
    together with a matching ``CompositeSpace``.

    Parameters
    ----------
    path : str
        Directory passed to ``load_datafiles_metadata`` and joined with
        each data file name for loading.
    name : str
        Stored as ``self.name``; also passed to the partitioner.
    subjects, trial_types, trial_numbers, conditions : 'all' or container
        Keys to keep at the successive levels of the nested metadata
        dictionary, in that order. ``'all'`` keeps every key.
    partitioner : object, optional
        If given, ``self.datafiles`` is replaced by
        ``partitioner.get_partition(self.name, self.metadb)``.
    channel_filter : object
        Must provide ``keep_channel(channel)``; channels for which it
        returns a false value are skipped.
    channel_names : sequence, optional
        Indexed by channel number to fill ``'channel_name'`` in the
        metadata (only used with ``flatten_channels`` and
        ``keep_metadata``).
    label_map : mapping, optional
        Applied to ``metadata['label']`` of each file.
    remove_dc_offset : bool
        Subtract the channel mean from the samples (in place).
    resample : pair, optional
        ``librosa.resample(samples, resample[0], resample[1])`` is called
        when the two entries differ (presumably original and target
        sampling rate -- confirm against librosa).
    start_sample, stop_sample : int or None
        Slice applied to the samples after resampling and filtering.
    signal_filter : object, optional
        Must provide ``process(samples)``.
    frame_size, hop_size : int
        Windowing parameters passed to ``compute_frames``.
    hop_fraction : number, optional
        Used to derive ``hop_size`` when ``frame_size > 0`` and
        ``hop_size == -1``.
    flatten_channels : bool
        If True every channel becomes its own set of sequences; otherwise
        the channels of a file are stacked with the channel axis last.
    keep_metadata : bool
        If True, a dict describing each appended block of sequences is
        added to ``self.metadata``.
    target_mode : {'label', 'next'}
        ``'label'``: targets are the file label repeated ``frame_size``
        times per sequence. ``'next'``: every frame is made one sample
        longer and that last sample becomes the target.
    '''
    # save params -- must stay the first statement so that locals() only
    # contains the constructor arguments
    self.params = locals().copy()
    del self.params['self']

    # TODO: get the whole filtering into an extra class
    datafiles_metadata, metadb = load_datafiles_metadata(path)

    def apply_filters(filters, node):
        # Walk the nested metadata dict; filters[k] selects the keys kept
        # at depth k. Non-dict nodes are returned unchanged.
        if isinstance(node, dict):
            filtered = []
            keepkeys = filters[0]
            for key, value in node.items():
                if keepkeys == 'all' or key in keepkeys:
                    filtered.extend(apply_filters(filters[1:], value))
            return filtered
        else:
            return node  # [node]

    # keep only files that match the metadata filters
    self.datafiles = apply_filters(
        [subjects, trial_types, trial_numbers, conditions],
        datafiles_metadata)

    # copy metadata for retained files
    self.metadb = {}
    for datafile in self.datafiles:
        self.metadb[datafile] = metadb[datafile]

    self.name = name

    if partitioner is not None:
        self.datafiles = partitioner.get_partition(self.name, self.metadb)

    # used to keep track of original sequences: start index (in frames)
    # of every data file
    self.sequence_partitions = []

    # metadata: [subject, trial_no, stimulus, channel, start, ]
    self.metadata = []

    sequences = []
    labels = []
    targets = []
    n_sequences = 0

    print(hop_size)
    if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
        # NOTE(review): np.ceil returns a float, and under Python 2 the
        # division is an integer division if both operands are ints --
        # confirm compute_frames accepts a float hop_length.
        hop_size = np.ceil(frame_size / hop_fraction)
    print(hop_size)

    if target_mode == 'next':
        # get 1 more value per frame as target
        frame_size += 1

    for i in xrange(len(self.datafiles)):
        with log_timing(log,
                        'loading data from {}'.format(self.datafiles[i])):
            # save start of next sequence
            self.sequence_partitions.append(n_sequences)

            data, metadata = load(os.path.join(path, self.datafiles[i]))

            label = metadata['label']
            if label_map is not None:
                label = label_map[label]

            multi_channel_frames = []
            multi_channel_targets = []

            # process 1 channel at a time
            for channel in xrange(data.shape[1]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = data[:, channel]

                # subtract channel mean
                # FIXME (from the original author). NOTE(review): this is
                # an in-place update of a slice of ``data``; if ``data``
                # is a NumPy array the loaded array itself is modified.
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples, resample[0],
                                               resample[1])

                # apply optional signal filter after down-sampling
                # -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                samples = samples[start_sample:stop_sample]

                # A frequency-spectrum branch used to live here and is
                # disabled: it computed an STFT power spectrum with
                # librosa, optionally kept the phase, cut off high
                # frequency bins, took the log amplitude, normalized
                # ('mean0_std1', 'linear_0_1', 'linear_-1_1') and
                # transposed the result. Only the raw waveform branch
                # below is active.

                ### raw waveform branch ###
                # normalize to max amplitude 1
                s = librosa.util.normalize(samples)
                ### end of raw waveform branch ###

                s = np.asfarray(s, dtype='float32')

                if frame_size > 0 and hop_size > 0:
                    # FIXME: THIS IS NECESSARY - OTHERWISE, THE FOLLOWING
                    # OP DOES NOT WORK!!!!
                    s = s.copy()
                    frames = compute_frames(s, frame_length=frame_size,
                                            hop_length=hop_size)
                else:
                    # no windowing: the whole signal is used as is
                    frames = s
                del s

                if target_mode == 'next':
                    # split every frame into input (all but the last
                    # sample) and target (the last sample)
                    frame_targets = np.empty(len(frames))
                    tmp = []
                    for f, frame in enumerate(frames):
                        tmp.append(frame[:-1])
                        frame_targets[f] = frame[-1]
                    frames = np.asarray(tmp)

                if flatten_channels:
                    # add artificial channel dimension
                    # NOTE(review): this indexes frames.shape[2], i.e. it
                    # requires ``frames`` to have at least three
                    # dimensions here -- confirm what compute_frames
                    # returns for a 1-D signal.
                    frames = frames.reshape(
                        (frames.shape[0], frames.shape[1],
                         frames.shape[2], 1))

                    sequences.append(frames)

                    # increment counter by new number of frames
                    n_sequences += frames.shape[0]

                    if keep_metadata:
                        # determine channel name
                        channel_name = None
                        if channel_names is not None:
                            channel_name = channel_names[channel]
                        elif 'channels' in metadata:
                            channel_name = metadata['channels'][channel]

                        self.metadata.append({
                            'subject': metadata['subject'],  # subject
                            'trial_type': metadata['trial_type'],  # trial_type
                            'trial_no': metadata['trial_no'],  # trial_no
                            'condition': metadata['condition'],  # condition
                            'channel': channel,  # channel
                            'channel_name': channel_name,
                            'start': self.sequence_partitions[-1],  # start
                            'stop': n_sequences  # stop
                        })

                    # one label per frame
                    for _ in xrange(frames.shape[0]):
                        labels.append(label)

                    if target_mode == 'next':
                        # NOTE(review): the loop variable shadows the
                        # builtin ``next`` inside this constructor.
                        for next in frame_targets:
                            targets.append(next)
                else:
                    # collect the channels of this file; they are stacked
                    # after the channel loop
                    multi_channel_frames.append(frames)
                    if target_mode == 'next':
                        multi_channel_targets.append(frame_targets)
            ### end of channel iteration ###

            if not flatten_channels:
                # turn list into array
                multi_channel_frames = np.asfarray(multi_channel_frames,
                                                   dtype='float32')
                # [channels x frames x time x freq] -> cb01
                # [channels x frames x time x 1] -> cb0.

                # move channel dimension to end
                multi_channel_frames = np.rollaxis(
                    multi_channel_frames, 0,
                    len(multi_channel_frames.shape))
                log.info(multi_channel_frames.shape)

                sequences.append(multi_channel_frames)

                # increment counter by new number of frames
                n_sequences += multi_channel_frames.shape[0]

                if keep_metadata:
                    self.metadata.append({
                        'subject': metadata['subject'],  # subject
                        'trial_type': metadata['trial_type'],  # trial_type
                        'trial_no': metadata['trial_no'],  # trial_no
                        'condition': metadata['condition'],  # condition
                        'channel': 'all',  # channel
                        'start': self.sequence_partitions[-1],  # start
                        'stop': n_sequences  # stop
                    })

                # one label per frame
                for _ in xrange(multi_channel_frames.shape[0]):
                    labels.append(label)

                if target_mode == 'next':
                    multi_channel_targets = np.asfarray(
                        multi_channel_targets, dtype='float32')
                    # transpose so that frames come first, channels last
                    targets.append(multi_channel_targets.T)
    ### end of datafile iteration ###

    # turn into numpy arrays
    sequences = np.vstack(sequences)
    print('sequences: {}'.format(sequences.shape))

    labels = np.hstack(labels)
    self.labels = labels
    print('labels: {}'.format(labels.shape))

    if target_mode == 'label':
        targets = labels.copy()

        # copy targets to fit SequenceDataSpace(IndexSpace) structure
        # -> (*, frame_size, 1)
        # NOTE(review): with the default frame_size of -1 the repeat count
        # below is negative -- confirm callers always set frame_size.
        targets = targets.reshape((targets.shape[0], 1))
        targets = np.repeat(targets, frame_size, axis=1)
        targets = targets.reshape((targets.shape[0], targets.shape[1], 1))
        print(targets.shape)
    elif target_mode == 'next':
        targets = np.concatenate(targets)
        # insert a length-1 axis in the middle
        targets = targets.reshape((targets.shape[0], 1, targets.shape[1]))
        print('targets: {}'.format(targets.shape))

    n_channels = sequences.shape[2]
    print('number of channels: {}'.format(n_channels))

    log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))

    source = ('features', 'targets')

    # NOTE(review): for any other target_mode neither ``space`` nor the
    # parent constructor call below is reached.
    if target_mode == 'label':
        space = CompositeSpace([
            SequenceDataSpace(VectorSpace(dim=n_channels)),
            # the number of labels is hard-coded to 12 here
            SequenceDataSpace(IndexSpace(dim=1, max_labels=12)),
        ])
    elif target_mode == 'next':
        space = CompositeSpace([
            # SequenceSpace(VectorSpace(...)) does not work with
            # VectorSpacesDataset
            SequenceDataSpace(VectorSpace(dim=n_channels)),
            SequenceDataSpace(VectorSpace(dim=n_channels))
        ])

    print('sequences: {}'.format(sequences.shape))
    print('targets: {}'.format(targets.shape))

    if target_mode == 'label':
        super(MultiChannelEEGSequencesDataset, self).__init__(
            data=(sequences, targets),  # works with indexspace-target
            data_specs=(space, source))
    elif target_mode == 'next':
        super(MultiChannelEEGSequencesDataset, self).__init__(
            data=(sequences, targets),
            data_specs=(space, source))
def get_target_space(self):
    """
    Return the space targets live in.

    Returns
    -------
    IndexSpace
        A space holding one index per example (``dim=1``) with 11
        possible labels (``max_labels=11``).
    """
    target_space = IndexSpace(dim=1, max_labels=11)
    return target_space
def __init__(self, chromosomes="ALL", dataset_name="snp", read_only=False,
             balance_classes=False, start=None, stop=None, shuffle=False,
             add_noise=False, rng=_default_seed, flip_labels=False):
    """
    Load per-chromosome SNP matrices and their labels.

    Data is read from ``${PYLEARN2_NI_PATH}/<dataset_name>``: one
    ``chr*.npy`` file per chromosome plus ``labels.npy``. Each chromosome
    becomes its own ``VectorSpace`` source named ``chromosomes_<k>``; the
    labels become an ``IndexSpace`` source named ``targets``.

    Parameters
    ----------
    chromosomes : int or "ALL"
        Number of chromosome files to use, taken in the order of the
        digits found in their file names. ``"ALL"`` or a value larger
        than the number of files uses every file.
    dataset_name : str
        Name of the dataset directory below ``${PYLEARN2_NI_PATH}``.
    read_only : bool
        If True, the data is accessed through an HDF5 file
        (``gen.<dataset_name>.h5``, created with ``self.make_h5`` when
        missing) instead of being loaded into memory.
    balance_classes : bool
        If True, every class is truncated to the size of the smallest
        class (keeping the first samples of each class).
    start, stop : int or None
        Slice bounds applied to the samples of labels and data.
    shuffle : bool
        Accepted for interface compatibility; not used by this
        constructor.
    add_noise : bool or float
        If truthy, a ``RandomizeSNPs`` converter is created per
        chromosome; ``True`` selects a corruption probability of 0.05,
        any other truthy value is used as the probability itself.
    rng : object
        Seed or generator passed to ``make_np_rng``.
    flip_labels : bool
        If True, labels are replaced by ``(y + 1) % 2``.
    """
    # Single-argument parenthesised prints produce the same output under
    # the Python 2 print statement and the print function.
    print("Loading %r chromosomes for %s" % (chromosomes, dataset_name))
    if start is not None and stop is not None:
        print("Start: %d, stop: %d" % (start, stop))

    assert isinstance(chromosomes, int) or chromosomes == "ALL",\
        "Can only set chromosomes to be an integer or ALL"

    p = serial.preprocess("${PYLEARN2_NI_PATH}/" + dataset_name)
    data_files = glob(path.join(p, "chr*.npy"))
    label_file = path.join(p, "labels.npy")

    # Order the files numerically by the digits contained in their path
    get_int = lambda y: int(''.join(x for x in y if x.isdigit()))
    data_files.sort(key=get_int)

    if (chromosomes == "ALL" or chromosomes > len(data_files)):
        chromosomes = len(data_files)

    # Labels are stored as a column: one row per sample
    self.y = np.atleast_2d(np.load(label_file)).T[start:stop]
    if flip_labels:
        self.y = (self.y + 1) % 2

    self.Xs = ()
    space = ()
    source = ()

    balanced_idx = None
    if balance_classes:
        num_classes = np.amax(self.y) + 1
        class_counts = [len(np.where(self.y == i)[0].tolist())
                        for i in range(num_classes)]
        min_count = min(class_counts)
        balanced_idx = []
        for i in range(num_classes):
            # keep the first min_count samples of every class
            idx = np.where(self.y == i)[0].tolist()[:min_count]
            balanced_idx += idx
        balanced_idx.sort()
        assert len(balanced_idx) / min_count == num_classes
        assert len(balanced_idx) % min_count == 0

        self.y = self.y[balanced_idx]
        for i in range(num_classes):
            assert len(np.where(self.y == i)[0].tolist()) == min_count

    if read_only:
        # The original code referenced an undefined name ``which_set``
        # here (NameError); ``dataset_name`` is what the parallel
        # on-memory branch reports.
        print("Format is read-only for %s" % dataset_name)
        h5_path = path.join(p, "gen." + dataset_name + ".h5")

        if not path.isfile(h5_path):
            self.make_h5(data_files, h5_path, start=start, stop=stop)

        # The file is left open on purpose: self.Xs refers to nodes of it.
        h5file = tables.openFile(h5_path)
        datas = [h5file.getNode("/", "Chr%d" % (c + 1))
                 for c in range(chromosomes)]
        self.Xs = tuple(data.X for data in datas)
        sizes = [h5file.getNode("/", "Sizes")[c]
                 for c in range(chromosomes)]
        # NOTE(review): balanced_idx is not applied to the data in this
        # branch, only to the labels -- confirm this is intended.
    else:
        print("Format is on-memory for %s" % dataset_name)
        sizes = []
        for c in range(0, chromosomes):
            X = np.load(data_files[c])[start:stop, :]
            # sanity check that the sorted file belongs to chromosome c+1
            assert "%d" % (c+1) in data_files[c]

            if balanced_idx is not None:
                X = X[balanced_idx]

            assert X.shape[0] == self.y.shape[0],\
                "Data and labels have different number of samples (%d vs %d)" %\
                (X.shape[0], self.y.shape[0])

            # values are halved before being stored
            self.Xs = self.Xs + (X / 2.0,)
            sizes.append(X.shape[1])

    print("%s samples are %d" % (dataset_name, self.y.shape[0]))

    # One VectorSpace / source per chromosome, followed by the targets
    space = tuple(VectorSpace(dim=size) for size in sizes)
    source = tuple("chromosomes_%d" % (c + 1) for c in range(chromosomes))

    self.X_space = CompositeSpace(space)
    self.X_source = source

    space = space + (IndexSpace(dim=1, max_labels=2),)
    source = source + ("targets",)
    space = CompositeSpace(space)
    self.data_specs = (space, source)

    self.rng = make_np_rng(rng, which_method="random_integers")
    assert self.rng is not None

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class("shuffled_sequential")
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = self.data_specs

    if add_noise:
        if add_noise is True:
            add_noise = 0.05
        # one converter per chromosome space, and None for the targets
        self.convert = list(
            randomize_snps.RandomizeSNPs(input_space=x_space,
                                         corruption_prob=add_noise)
            for x_space in self.X_space.components) + [None]
    else:
        self.convert = None
def __init__(self, X=None, topo_view=None, y=None, latent=None,
             view_converter=None, axes=('b', 0, 1, 'c'),
             rng=_default_seed, preprocessor=None, fit_preprocessor=False,
             X_labels=None, y_labels=None):
    """
    Build a dense design matrix dataset with an extra 'latents' source.

    Parameters
    ----------
    X : ndarray, optional
        Design matrix. Required unless `topo_view` is given.
    topo_view : ndarray, optional
        Topological view of the examples; if given, `view_converter`
        must be None and the data is set up through
        `set_topological_view`.
    y : ndarray, optional
        Targets. If given, `latent` must be given as well.
    latent : ndarray, optional
        Latent values; its last axis defines the dimension of the
        'latents' VectorSpace.
    view_converter : object, optional
        Must expose a `topo_space` attribute.
    axes : tuple
        Axis order of `topo_view`.
    rng : object
        Seed or generator passed to `make_np_rng`.
    preprocessor : object, optional
        If truthy, `preprocessor.apply(self, can_fit=fit_preprocessor)`
        is called at the end of construction.
    fit_preprocessor : bool
        Forwarded to the preprocessor as `can_fit`.
    X_labels : int, optional
        If given, X is described by an IndexSpace with that many labels.
    y_labels : int, optional
        If given, y is described by an IndexSpace with that many labels.

    Raises
    ------
    ValueError
        If `y` is given without `latent`.
    NotImplementedError
        If `view_converter` has no `topo_space` attribute.
    """
    self.latent = latent
    self.X = X
    self.y = y
    self.view_converter = view_converter
    self.X_labels = X_labels
    self.y_labels = y_labels

    self._check_labels()

    # Whenever targets are present the data specs are built from
    # latent.shape, both here and in set_topological_view. Fail early
    # with a clear message instead of an AttributeError on None.
    if y is not None and latent is None:
        raise ValueError("latent must be provided when y is given: the "
                         "data specs include a 'latents' source")

    if topo_view is not None:
        assert view_converter is None
        self.set_topological_view(topo_view, axes)
    else:
        assert X is not None, ("DenseDesignMatrix needs to be provided "
                               "with either topo_view, or X")
        if view_converter is not None:
            # Get the topo_space (usually Conv2DSpace) from the
            # view_converter
            if not hasattr(view_converter, 'topo_space'):
                raise NotImplementedError("Not able to get a topo_space "
                                          "from this converter: %s"
                                          % view_converter)

            # self.X_topo_space stores a "default" topological space that
            # will be used only when self.iterator is called without a
            # data_specs, and with "topo=True", which is deprecated.
            self.X_topo_space = view_converter.topo_space
        else:
            self.X_topo_space = None

        # Update data specs, if not done in set_topological_view
        X_source = 'features'
        if X_labels is None:
            X_space = VectorSpace(dim=X.shape[1])
        else:
            dim = 1 if X.ndim == 1 else X.shape[-1]
            X_space = IndexSpace(dim=dim, max_labels=X_labels)

        if y is None:
            space = X_space
            source = X_source
        else:
            dim = 1 if y.ndim == 1 else y.shape[-1]
            if y_labels is not None:
                y_space = IndexSpace(dim=dim, max_labels=y_labels)
            else:
                y_space = VectorSpace(dim=dim)
            y_source = 'targets'

            Latent_space = VectorSpace(dim=latent.shape[-1])
            Latent_source = 'latents'

            space = CompositeSpace((X_space, y_space, Latent_space))
            source = (X_source, y_source, Latent_source)

        self.data_specs = (space, source)
        self.X_space = X_space

    self.compress = False
    self.design_loc = None
    self.rng = make_np_rng(rng, which_method="random_integers")

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = (self.X_space, 'features')

    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor