def get_seq_mnist_streams(hidden_dim, batch_size=100, drop_prob=0.5):
    """Build train/valid streams for permuted sequential MNIST.

    Each 784-pixel image is presented one pixel per time-step, in a fixed
    random pixel order shared by train and valid; only the last time-step
    carries the class label.

    Parameters
    ----------
    hidden_dim : int
        Hidden state size, forwarded to ``SampleDrops`` to shape the
        'drops' masks.
    batch_size : int
        Must evenly divide 50000 (train) and 10000 (valid).
    drop_prob : float
        Drop probability forwarded to ``SampleDrops``.

    Returns
    -------
    (train_stream, valid_stream)
        Streams whose sources are ('y', 'x', 'drops').
    """
    # A true permutation of the 784 pixel positions.  The previous
    # np.random.randint(0, 784, ...) sampled WITH replacement, silently
    # dropping some pixels and duplicating others.
    permutation = np.random.permutation(784)
    train_set, valid_set, test_set = load_data('mnist.pkl.gz')

    # Integer division: a float batch count raises in reshape on Python 3.
    train_x = train_set[0].reshape((50000 // batch_size, batch_size, 784))
    train_x = np.swapaxes(train_x, 2, 1)
    # Now the dimension is num_batches x 784 x batch_size x 1
    train_x = train_x[:, :, :, np.newaxis]

    # Label for each time-step is -1; only the last step has the real label.
    train_y = np.zeros(train_set[0].shape) - 1
    train_y[:, -1] = train_set[1]
    train_y = train_y.reshape((50000 // batch_size, batch_size, 784))
    train_y = np.swapaxes(train_y, 2, 1)
    # Now the dimension is num_batches x 784 x batch_size x 1
    train_y = train_y[:, :, :, np.newaxis]

    valid_x = valid_set[0].reshape((10000 // batch_size, batch_size, 784))
    valid_x = np.swapaxes(valid_x, 2, 1)
    # Now the dimension is num_batches x 784 x batch_size x 1
    valid_x = valid_x[:, :, :, np.newaxis]

    valid_y = np.zeros(valid_set[0].shape) - 1
    valid_y[:, -1] = valid_set[1]
    valid_y = valid_y.reshape((10000 // batch_size, batch_size, 784))
    valid_y = np.swapaxes(valid_y, 2, 1)
    # Now the dimension is num_batches x 784 x batch_size x 1
    valid_y = valid_y[:, :, :, np.newaxis]

    # Apply the same fixed pixel ordering along the time axis of both splits.
    train_x = train_x[:, permutation]
    valid_x = valid_x[:, permutation]

    train = IterableDataset({'x': train_x.astype(floatX),
                             'y': train_y[:, -1, :, 0].astype('int32')})
    train_stream = DataStream(train)
    train_stream = SampleDrops(train_stream, drop_prob, hidden_dim, False)
    train_stream.sources = ('y', 'x', 'drops')
    # next() builtin instead of the Python-2-only .next() method.
    next(train_stream.get_epoch_iterator())

    valid = IterableDataset({'x': valid_x.astype(floatX),
                             'y': valid_y[:, -1, :, 0].astype('int32')})
    valid_stream = DataStream(valid)
    valid_stream = SampleDrops(valid_stream, drop_prob, hidden_dim, True)
    valid_stream.sources = ('y', 'x', 'drops')
    return train_stream, valid_stream
def get_seq_mnist_streams(hidden_dim, batch_size=100, drop_prob=0.5):
    """Create (train, valid) streams for permuted sequential MNIST.

    Images are unrolled to 784 one-pixel time-steps in a fixed random
    order; the target is attached to the final time-step only.

    Parameters
    ----------
    hidden_dim : int
        Hidden state size used by ``SampleDrops`` for the 'drops' masks.
    batch_size : int
        Must divide 50000 and 10000 exactly.
    drop_prob : float
        Drop probability passed to ``SampleDrops``.

    Returns
    -------
    tuple
        ``(train_stream, valid_stream)`` with sources ('y', 'x', 'drops').
    """
    # np.random.permutation gives a genuine reordering of all 784 pixels;
    # randint sampled with replacement and lost/duplicated pixels.
    permutation = np.random.permutation(784)
    train_set, valid_set, test_set = load_data('mnist.pkl.gz')

    # '//' keeps the shape integral — '/' is float division on Python 3.
    n_train = 50000 // batch_size
    n_valid = 10000 // batch_size

    train_x = train_set[0].reshape((n_train, batch_size, 784))
    train_x = np.swapaxes(train_x, 2, 1)
    # Dimension is now num_batches x 784 x batch_size x 1
    train_x = train_x[:, :, :, np.newaxis]

    # Every time-step gets label -1 except the last, which gets the digit.
    train_y = np.zeros(train_set[0].shape) - 1
    train_y[:, -1] = train_set[1]
    train_y = train_y.reshape((n_train, batch_size, 784))
    train_y = np.swapaxes(train_y, 2, 1)
    # Dimension is now num_batches x 784 x batch_size x 1
    train_y = train_y[:, :, :, np.newaxis]

    valid_x = valid_set[0].reshape((n_valid, batch_size, 784))
    valid_x = np.swapaxes(valid_x, 2, 1)
    # Dimension is now num_batches x 784 x batch_size x 1
    valid_x = valid_x[:, :, :, np.newaxis]

    valid_y = np.zeros(valid_set[0].shape) - 1
    valid_y[:, -1] = valid_set[1]
    valid_y = valid_y.reshape((n_valid, batch_size, 784))
    valid_y = np.swapaxes(valid_y, 2, 1)
    # Dimension is now num_batches x 784 x batch_size x 1
    valid_y = valid_y[:, :, :, np.newaxis]

    # Same pixel ordering for both splits (axis 1 is the time axis).
    train_x = train_x[:, permutation]
    valid_x = valid_x[:, permutation]

    train = IterableDataset({'x': train_x.astype(floatX),
                             'y': train_y[:, -1, :, 0].astype('int32')})
    train_stream = DataStream(train)
    train_stream = SampleDrops(train_stream, drop_prob, hidden_dim, False)
    train_stream.sources = ('y', 'x', 'drops')
    # Use the next() builtin (.next() no longer exists on Python 3).
    next(train_stream.get_epoch_iterator())

    valid = IterableDataset({'x': valid_x.astype(floatX),
                             'y': valid_y[:, -1, :, 0].astype('int32')})
    valid_stream = DataStream(valid)
    valid_stream = SampleDrops(valid_stream, drop_prob, hidden_dim, True)
    valid_stream.sources = ('y', 'x', 'drops')
    return train_stream, valid_stream
def test_ngram_stream_error_on_multiple_sources():
    """NGrams must reject a data stream that exposes more than one source."""
    lengths = [3, 5, 7]
    sentences = [list(numpy.random.randint(10, size=n)) for n in lengths]
    stream = DataStream(IterableDataset(sentences))
    # Fake a second source so constructing NGrams trips its validation.
    stream.sources = ('1', '2')
    assert_raises(ValueError, NGrams, 4, stream)
def test_ngram_stream_error_on_multiple_sources():
    # NGrams accepts only single-source data streams; two sources must raise.
    sentences = []
    for sentence_length in (3, 5, 7):
        sentences.append(list(numpy.random.randint(10, size=sentence_length)))
    stream = DataStream(IterableDataset(sentences))
    stream.sources = ('1', '2')
    assert_raises(ValueError, NGrams, 4, stream)
def prep_dataset(dataset):
    """Trim and reshape *dataset*, then wrap it in a zoneout-mask stream.

    Relies on module-level configuration globals (seq_len, batch_size,
    z_prob_states, z_prob_cells, drop_prob_igates, layer_size, num_layers,
    stoch_depth, share_mask, gaussian_drop, alphabetsize).
    """
    # Drop the tail so the data splits evenly into seq_len * batch_size chunks.
    usable = len(dataset) - (len(dataset) % (seq_len * batch_size))
    dataset = dataset[:usable]
    dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))
    stream = DataStream(
        IndexableDataset(indexables=OrderedDict([('data', dataset)])),
        iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
    stream = Transpose(stream, [(1, 0)])
    stream = SampleDropsNPWord(
        stream, z_prob_states, z_prob_cells, drop_prob_igates,
        layer_size, num_layers, False, stoch_depth, share_mask,
        gaussian_drop, alphabetsize)
    # Three 'data' aliases precede the originals, zoneout masks come last.
    extra = ('zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
    stream.sources = ('data',) * 3 + stream.sources + extra
    return (stream,)
def prep_dataset(dataset):
    """Reshape *dataset* into (time, batch, seq) and attach zoneout masks.

    Uses module-level globals for all batching / zoneout hyperparameters.
    """
    # Truncate so the length is an exact multiple of seq_len * batch_size.
    chunk = seq_len * batch_size
    dataset = dataset[:len(dataset) - len(dataset) % chunk]
    dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))
    indexable = IndexableDataset(indexables=OrderedDict([('data', dataset)]))
    scheme = SequentialExampleScheme(dataset.shape[0])
    stream = DataStream(indexable, iteration_scheme=scheme)
    stream = Transpose(stream, [(1, 0)])
    stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                               drop_prob_igates, layer_size, num_layers,
                               False, stoch_depth, share_mask,
                               gaussian_drop, alphabetsize)
    stream.sources = (('data',) * 3 + stream.sources
                      + ('zoneouts_states', 'zoneouts_cells',
                         'zoneouts_igates'))
    return (stream,)
def test_sources_setter(self):
    """Assigning to ``sources`` must be reflected by the getter."""
    stream = DataStream(self.dataset)
    new_sources = ('features',)
    stream.sources = new_sources
    assert_equal(stream.sources, new_sources)
def test_sources_setter(self):
    # Setting the sources attribute should round-trip through the getter.
    data_stream = DataStream(self.dataset)
    data_stream.sources = ('features',)
    assert_equal(data_stream.sources, ('features',))