class TestBidirectional(unittest.TestCase):
    """Compare ``Bidirectional`` with a standalone ``SimpleRecurrent``.

    Both children of the bidirectional brick receive the first parameter
    of the standalone network, so the two halves of the bidirectional
    output can be checked against a plain run and a run on reversed input.
    """

    def setUp(self):
        """Create the bricks, tie their weights and build the test inputs."""
        prototype = SimpleRecurrent(dim=3, activation=Tanh())
        self.bidir = Bidirectional(weights_init=Orthogonal(),
                                   prototype=prototype)
        self.simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                                      activation=Tanh(), seed=1)
        # The bidirectional brick is only allocated; its values are
        # overwritten with those of the initialized standalone network.
        self.bidir.allocate()
        self.simple.initialize()
        for direction in (0, 1):
            self.bidir.children[direction].params[0].set_value(
                self.simple.params[0].get_value())

        # 24 permutations of (0, 1, 2, 3), scaled and repeated over 3 features.
        permutations = numpy.asarray(
            list(itertools.permutations(range(4))), dtype=floatX)
        self.x_val = 0.1 * permutations
        self.x_val = (numpy.ones((24, 4, 3), dtype=floatX) *
                      self.x_val[..., None])

        # Mask out the second half of the rows for column 3.
        mask = numpy.ones((24, 4), dtype=floatX)
        mask[12:24, 3] = 0
        self.mask_val = mask

    def test(self):
        """Check both halves of the bidirectional output."""
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        bidir_output = self.bidir.apply(x, mask=mask)
        simple_output = self.simple.apply(x, mask=mask)
        calc_bidir = theano.function([x, mask], [bidir_output])
        calc_simple = theano.function([x, mask], [simple_output])

        h_bidir = calc_bidir(self.x_val, self.mask_val)[0]
        h_simple = calc_simple(self.x_val, self.mask_val)[0]
        reversed_x = self.x_val[::-1]
        reversed_mask = self.mask_val[::-1]
        h_simple_rev = calc_simple(reversed_x, reversed_mask)[0]

        # First 3 features: plain run; remaining features: reversed run.
        assert_allclose(h_simple, h_bidir[..., :3], rtol=1e-04)
        assert_allclose(h_simple_rev, h_bidir[::-1, ..., 3:], rtol=1e-04)
def __init__(self, encoder_type, num_characters, input_dim, encoder_dim,
             **kwargs):
    """Build the label encoder.

    Parameters
    ----------
    encoder_type : None or str
        ``None`` creates no child bricks, ``'lookup'`` creates only the
        lookup table, ``'bidirectional'`` creates the lookup table and a
        bidirectional gated recurrent encoder on top of it.
    num_characters : int
        First argument of the lookup table.
    input_dim : int
        Second argument of the lookup table and input dimension of the
        recurrent transition.
    encoder_dim : int
        ``dim`` of the ``GatedRecurrent`` transition.

    """
    # 'lookup' is handled by the branches below, so it must be accepted
    # here; previously the assertion made that mode unreachable.
    assert encoder_type in [None, 'lookup', 'bidirectional']
    self.encoder_type = encoder_type
    super(Encoder, self).__init__(**kwargs)

    self.children = []

    if encoder_type in ['lookup', 'bidirectional']:
        self.embed_label = LookupTable(
            num_characters, input_dim, name='embed_label')
        self.children += [self.embed_label]
    else:
        # If there is no encoder.
        assert num_characters == input_dim

    if encoder_type == 'bidirectional':
        transition = RecurrentWithFork(
            GatedRecurrent(dim=encoder_dim).apply,
            input_dim, name='encoder_transition')
        self.encoder = Bidirectional(transition, name='encoder')
        self.children.append(self.encoder)
def bilstm_layer(in_dim, inp, h_dim, n):
    """Create, initialize and apply one Linear + bidirectional LSTM layer.

    Brick names are built from the layer index ``n`` and ``inp.name``.

    Parameters
    ----------
    in_dim : int
        Input dimension of the linear projection.
    inp : variable
        Input variable; its ``name`` attribute is used in brick names.
    h_dim : int
        LSTM dimension; the projection outputs ``4 * h_dim`` values.
    n : int
        Layer index used in brick names.

    Returns
    -------
    The first output of the bidirectional brick's ``apply``.

    """
    suffix = str(n) + inp.name
    projection = Linear(input_dim=in_dim, output_dim=h_dim * 4,
                        name='linear' + suffix)
    recurrent = LSTM(dim=h_dim, name='lstm' + suffix)
    bidirectional = Bidirectional(prototype=recurrent)
    bidirectional.name = 'bilstm' + suffix
    initialize([projection, bidirectional])
    projected = projection.apply(inp)
    outputs = bidirectional.apply(projected)
    return outputs[0]
def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
             **kwargs):
    """Create the child bricks of the decimator.

    Parameters
    ----------
    vocab_size : int
        Stored on the brick as ``vocab_size``.
    embedding_dim : int
        Stored as ``embedding_dim``; also the input dimension of the
        ``RecurrentWithFork`` inside the bidirectional brick.
    dgru_state_dim : int
        ``dim`` of every ``DGRU`` in the stack; the bidirectional brick
        uses ``dgru_state_dim // 2``.
    dgru_depth : int
        Number of ``DGRU`` layers in the recurrent stack.

    """
    super(Decimator, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.dgru_state_dim = dgru_state_dim
    self.dgru_depth = dgru_depth

    # The lookup table is created without dimensions here.
    self.lookup = LookupTable(name='embeddings')

    # representation
    self.dgru = RecurrentStack(
        [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
         for _ in range(dgru_depth)],
        skip_connections=True)

    # importance of this representation
    self.bidir_w = Bidirectional(
        RecurrentWithFork(
            DGRU(activation=Tanh(), dim=self.dgru_state_dim // 2),
            self.embedding_dim,
            name='src_word_with_fork'),
        name='bidir_src_word_encoder')

    self.gru_fork = Fork(
        [name for name in self.dgru.apply.sequences if name != 'mask'],
        prototype=Linear(), name='gru_fork')

    # map to a energy scalar
    self.wl = Linear(input_dim=dgru_state_dim, output_dim=1)

    self.children = [
        self.lookup, self.dgru, self.gru_fork, self.bidir_w, self.wl
    ]
class TestBidirectional(unittest.TestCase):
    """Compare ``Bidirectional`` with a standalone ``Recurrent`` brick.

    Both bricks are initialized independently with ``Orthogonal`` weights
    and the two halves of the bidirectional output are checked against a
    plain run and a run on reversed input.
    """

    def setUp(self):
        """Create and initialize the bricks and build the test inputs."""
        prototype = Recurrent(dim=3, activation=Tanh())
        self.bidir = Bidirectional(weights_init=Orthogonal(),
                                   prototype=prototype)
        self.simple = Recurrent(dim=3, weights_init=Orthogonal(),
                                activation=Tanh())
        self.bidir.initialize()
        self.simple.initialize()

        # 24 permutations of (0, 1, 2, 3), scaled and repeated over 3 features.
        permutations = numpy.asarray(
            list(itertools.permutations(range(4))), dtype=floatX)
        self.x_val = 0.1 * permutations
        self.x_val = (numpy.ones((24, 4, 3), dtype=floatX) *
                      self.x_val[..., None])

        # Mask out the second half of the rows for column 3.
        mask = numpy.ones((24, 4), dtype=floatX)
        mask[12:24, 3] = 0
        self.mask_val = mask

    def test(self):
        """Check both halves of the bidirectional output."""
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        bidir_output = self.bidir.apply(x, mask=mask)
        simple_output = self.simple.apply(x, mask=mask)
        calc_bidir = theano.function([x, mask], [bidir_output])
        calc_simple = theano.function([x, mask], [simple_output])

        h_bidir = calc_bidir(self.x_val, self.mask_val)[0]
        h_simple = calc_simple(self.x_val, self.mask_val)[0]
        reversed_x = self.x_val[::-1]
        reversed_mask = self.mask_val[::-1]
        h_simple_rev = calc_simple(reversed_x, reversed_mask)[0]

        # First 3 features: plain run; remaining features: reversed run.
        assert_allclose(h_simple, h_bidir[..., :3], rtol=1e-04)
        assert_allclose(h_simple_rev, h_bidir[::-1, ..., 3:], rtol=1e-04)
def bilstm_layer(in_dim, inp, h_dim, n, pref=""):
    """Create, initialize and apply one Linear + bidirectional LSTM layer.

    Brick names are built from the layer index ``n`` followed by ``pref``.

    Parameters
    ----------
    in_dim : int
        Input dimension of the linear projection.
    inp : variable
        Input variable fed to the projection.
    h_dim : int
        LSTM dimension; the projection outputs ``4 * h_dim`` values.
    n : int
        Layer index used in brick names.
    pref : str, optional
        Text appended to every brick name. Empty by default.

    Returns
    -------
    The first output of the bidirectional brick's ``apply``.

    """
    suffix = str(n) + pref
    projection = Linear(input_dim=in_dim, output_dim=h_dim * 4,
                        name='linear' + suffix)
    recurrent = LSTM(dim=h_dim, name='lstm' + suffix)
    bidirectional = Bidirectional(prototype=recurrent)
    bidirectional.name = 'bilstm' + suffix
    initialize([projection, bidirectional])
    projected = projection.apply(inp)
    outputs = bidirectional.apply(projected)
    return outputs[0]
def example5():
    """Run a bidirectional and a standalone simple recurrent network.

    Adapted from a unit test in Blocks. Both children of the
    bidirectional brick receive the first parameter of the standalone
    network. The bidirectional output, the plain output and the output
    on reversed input are printed.
    """
    prototype = SimpleRecurrent(dim=3, activation=Tanh())
    bidir = Bidirectional(weights_init=Orthogonal(), prototype=prototype)
    simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                             activation=Tanh(), seed=1)
    # Only allocate the bidirectional brick: its values are overwritten.
    bidir.allocate()
    simple.initialize()
    for direction in (0, 1):
        bidir.children[direction].parameters[0].set_value(
            simple.parameters[0].get_value())

    # Symbolic inputs and compiled functions
    x = tensor.tensor3('x')
    mask = tensor.matrix('mask')
    calc_bidir = theano.function([x, mask], [bidir.apply(x, mask=mask)])
    calc_simple = theano.function([x, mask], [simple.apply(x, mask=mask)])

    # Numeric inputs: 24 permutations of (0, 1, 2, 3) over 3 features
    float_type = theano.config.floatX
    permutations = np.asarray(list(itertools.permutations(range(4))),
                              dtype=float_type)
    x_val = 0.1 * permutations
    x_val = np.ones((24, 4, 3), dtype=float_type) * x_val[..., None]
    mask_val = np.ones((24, 4), dtype=float_type)
    mask_val[12:24, 3] = 0

    h_bidir = calc_bidir(x_val, mask_val)[0]
    h_simple = calc_simple(x_val, mask_val)[0]
    h_simple_rev = calc_simple(x_val[::-1], mask_val[::-1])[0]

    for result in (h_bidir, h_simple, h_simple_rev):
        print(result)
def setUp(self):
    """Create and initialize both bricks and build the test inputs."""
    prototype = Recurrent(dim=3, activation=Tanh())
    self.bidir = Bidirectional(weights_init=Orthogonal(),
                               prototype=prototype)
    self.simple = Recurrent(dim=3, weights_init=Orthogonal(),
                            activation=Tanh())
    self.bidir.initialize()
    self.simple.initialize()

    # 24 permutations of (0, 1, 2, 3), scaled and repeated over 3 features.
    permutations = numpy.asarray(
        list(itertools.permutations(range(4))), dtype=floatX)
    self.x_val = 0.1 * permutations
    self.x_val = (numpy.ones((24, 4, 3), dtype=floatX) *
                  self.x_val[..., None])

    # Mask out the second half of the rows for column 3.
    mask = numpy.ones((24, 4), dtype=floatX)
    mask[12:24, 3] = 0
    self.mask_val = mask
def __init__(self, dimension, alphabet_size, **kwargs):
    """Assemble the lookup, fork, encoder and sequence generator.

    Parameters
    ----------
    dimension : int
        Dimension used for the recurrent bricks, the lookup table, the
        fork outputs and the attention match dimension.
    alphabet_size : int
        Lookup table length and readout dimension.

    """
    super(WordReverser, self).__init__(**kwargs)

    # Encoder side
    encoder = Bidirectional(
        SimpleRecurrent(dim=dimension, activation=Tanh()))
    sequence_names = [name
                      for name in encoder.prototype.apply.sequences
                      if name != 'mask']
    fork = Fork(sequence_names)
    fork.input_dim = dimension
    fork.output_dims = [dimension for _ in fork.input_names]
    lookup = LookupTable(alphabet_size, dimension)

    # Generator side
    transition = SimpleRecurrent(
        activation=Tanh(), dim=dimension, name="transition")
    # The attended dimension is twice ``dimension``.
    attention = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=2 * dimension,
        match_dim=dimension,
        name="attention")
    readout_sources = [transition.apply.states[0],
                       attention.take_glimpses.outputs[0]]
    readout = Readout(
        readout_dim=alphabet_size,
        source_names=readout_sources,
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(alphabet_size, dimension),
        name="readout")
    generator = SequenceGenerator(
        readout=readout, transition=transition, attention=attention,
        name="generator")

    self.lookup = lookup
    self.fork = fork
    self.encoder = encoder
    self.generator = generator
    self.children = [lookup, fork, encoder, generator]
def __init__(self, dimen, vocab_size, **kwargs):
    """Assemble the lookup, fork, encoder and sequence generator.

    Parameters
    ----------
    dimen : int
        Dimension used for the recurrent bricks, the lookup table and
        the attention match dimension.
    vocab_size : int
        Lookup table length and readout dimension.
    **kwargs
        Forwarded to the base class initializer.

    """
    # The base initializer must run before children are assigned. It is
    # called through ``super`` without an explicit ``self``: the bound
    # call already supplies the instance, so passing it again would hand
    # the object to the base class as an extra positional argument.
    super(MorphGen, self).__init__(**kwargs)

    # The encoder
    encoder = Bidirectional(SimpleRecurrent(dim=dimen, activation=Tanh()))

    # One fork output per input sequence of the encoder prototype,
    # excluding the mask.
    fork = Fork([name for name in encoder.prototype.apply.sequences
                 if name != 'mask'])
    fork.input_dim = dimen
    fork.output_dims = [encoder.prototype.get_dim(name)
                        for name in fork.input_names]

    lookup = LookupTable(vocab_size, dimen)

    transition = SimpleRecurrent(dim=dimen, activation=Tanh(),
                                 name="transition")
    atten = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=2 * dimen,
        match_dim=dimen,
        name="attention")
    readout = Readout(
        readout_dim=vocab_size,
        source_names=[transition.apply.states[0],
                      atten.take_glimpses.outputs[0]],
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(vocab_size, dimen),
        name="readout")
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  attention=atten, name="generator")

    self.lookup = lookup
    self.fork = fork
    self.encoder = encoder
    self.generator = generator
    self.children = [lookup, fork, encoder, generator]
def setUp(self):
    """Create the bricks, tie their weights and build the test inputs."""
    prototype = SimpleRecurrent(dim=3, activation=Tanh())
    self.bidir = Bidirectional(weights_init=Orthogonal(),
                               prototype=prototype)
    self.simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                                  activation=Tanh(), seed=1)
    # The bidirectional brick is only allocated; both children then
    # receive the first parameter of the initialized standalone network.
    self.bidir.allocate()
    self.simple.initialize()
    for direction in (0, 1):
        self.bidir.children[direction].parameters[0].set_value(
            self.simple.parameters[0].get_value())

    # 24 permutations of (0, 1, 2, 3), scaled and repeated over 3 features.
    float_type = theano.config.floatX
    permutations = numpy.asarray(
        list(itertools.permutations(range(4))), dtype=float_type)
    self.x_val = 0.1 * permutations
    self.x_val = (numpy.ones((24, 4, 3), dtype=float_type) *
                  self.x_val[..., None])

    # Mask out the second half of the rows for column 3.
    mask = numpy.ones((24, 4), dtype=float_type)
    mask[12:24, 3] = 0
    self.mask_val = mask
def example5():
    """Run a bidirectional and a standalone simple recurrent network.

    Adapted from a unit test in Blocks. The first parameter of the
    standalone network is copied into both children of the bidirectional
    brick. Three arrays are printed: the bidirectional output, the plain
    output and the output on reversed input.
    """
    float_type = theano.config.floatX

    bidir = Bidirectional(
        weights_init=Orthogonal(),
        prototype=SimpleRecurrent(dim=3, activation=Tanh()))
    simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                             activation=Tanh(), seed=1)
    bidir.allocate()
    simple.initialize()
    forward, backward = bidir.children[0], bidir.children[1]
    forward.parameters[0].set_value(simple.parameters[0].get_value())
    backward.parameters[0].set_value(simple.parameters[0].get_value())

    # Initialize theano variables and functions
    x = tensor.tensor3('x')
    mask = tensor.matrix('mask')
    bidir_output = bidir.apply(x, mask=mask)
    simple_output = simple.apply(x, mask=mask)
    calc_bidir = theano.function([x, mask], [bidir_output])
    calc_simple = theano.function([x, mask], [simple_output])

    # Testing time
    x_val = 0.1 * np.asarray(list(itertools.permutations(range(4))),
                             dtype=float_type)
    x_val = np.ones((24, 4, 3), dtype=float_type) * x_val[..., None]
    mask_val = np.ones((24, 4), dtype=float_type)
    mask_val[12:24, 3] = 0

    h_bidir = calc_bidir(x_val, mask_val)[0]
    h_simple = calc_simple(x_val, mask_val)[0]
    h_simple_rev = calc_simple(x_val[::-1], mask_val[::-1])[0]

    print(h_bidir)
    print(h_simple)
    print(h_simple_rev)
def __init__(self, enc_transition, dims, dim_input, subsample, **kwargs):
    """Stack one bidirectional recurrent layer per entry of ``dims``.

    Parameters
    ----------
    enc_transition : callable
        Called as ``enc_transition(dim=..., activation=Tanh())`` to
        create the transition of each layer.
    dims : list of int
        Dimension of each layer.
    dim_input : int
        Input dimension of the first layer.
    subsample
        Stored on the brick as ``subsample``.

    """
    super(Encoder, self).__init__(**kwargs)
    self.subsample = subsample

    # Every layer after the first is fed twice the dimension of the
    # layer below it.
    dims_under = [dim_input] + list(2 * numpy.array(dims))
    for layer_num, dim in enumerate(dims):
        dim_under = dims_under[layer_num]
        transition = enc_transition(dim=dim, activation=Tanh())
        with_fork = RecurrentWithFork(transition.apply, dim_under,
                                      name='with_fork')
        layer = Bidirectional(with_fork,
                              name='bidir{}'.format(layer_num))
        self.children.append(layer)
def __init__(self, dimension, input_size, embed_input=False, **kwargs):
    """Create the embedder, fork and bidirectional LSTM encoder.

    Parameters
    ----------
    dimension : int
        Output dimension of the embedder and dimension of the LSTM.
    input_size : int
        First argument of the embedder brick.
    embed_input : bool, optional
        When true the embedder is a ``LookupTable``, otherwise a
        ``Linear`` brick.

    """
    super(LSTMEncoder, self).__init__(**kwargs)

    embedder_class = LookupTable if embed_input else Linear
    self.embedder = embedder_class(input_size, dimension)

    # NOTE(review): the fork declares ``output_dims=[dimension]`` while
    # its prototype is ``Linear(dimension, 4 * dimension)`` - confirm
    # which output size is intended.
    fork_prototype = Linear(dimension, 4 * dimension)
    self.fork = Fork(['inputs'], dimension, output_dims=[dimension],
                     prototype=fork_prototype)

    self.encoder = Bidirectional(LSTM(dim=dimension, activation=Tanh()))
    self.children = [self.encoder, self.embedder, self.fork]
def __init__(self, dimension, input_size, embed_input=False, **kwargs):
    """Create the embedder, transform, fork and bidirectional encoder.

    Parameters
    ----------
    dimension : int
        Output dimension of the embedder and dimension of the other
        bricks.
    input_size : int
        First argument of the embedder brick.
    embed_input : bool, optional
        When true the embedder is a ``LookupTable``, otherwise a
        ``Linear`` brick.

    """
    super(SimpleEncoder, self).__init__(**kwargs)

    embedder_class = LookupTable if embed_input else Linear
    self.embedder = embedder_class(input_size, dimension)

    self.transform = MLP([Tanh()], [dimension, dimension])

    fork_prototype = Linear(dimension, dimension)
    self.fork = Fork(['inputs'], dimension, output_dims=[dimension],
                     prototype=fork_prototype)

    recurrent = SimpleRecurrent(dim=dimension, activation=Tanh())
    self.encoder = Bidirectional(recurrent)
    self.children = [self.encoder, self.embedder, self.transform,
                     self.fork]
def __init__(self, enc_transition, dims, dim_input, subsample, bidir,
             **kwargs):
    """Stack one recurrent layer per entry of ``dims``.

    Parameters
    ----------
    enc_transition : callable
        Called as ``enc_transition(dim=..., activation=Tanh())`` to
        create the transition of each layer.
    dims : list of int
        Dimension of each layer.
    dim_input : int
        Input dimension of the first layer.
    subsample
        Stored on the brick as ``subsample``.
    bidir : bool
        When true every layer is wrapped in ``Bidirectional`` and the
        dimensions passed upwards are doubled.

    """
    super(Encoder, self).__init__(**kwargs)
    self.subsample = subsample

    # Factor applied to a layer's dimension to get what the next layer
    # (and the caller, via ``dim_encoded``) is told to expect.
    directions = 2 if bidir else 1
    dims_under = [dim_input] + list(directions * numpy.array(dims))

    for layer_num, dim in enumerate(dims):
        dim_under = dims_under[layer_num]
        transition = enc_transition(dim=dim, activation=Tanh())
        layer = RecurrentWithFork(transition.apply, dim_under,
                                  name='with_fork{}'.format(layer_num))
        if bidir:
            layer = Bidirectional(layer,
                                  name='bidir{}'.format(layer_num))
        self.children.append(layer)

    self.dim_encoded = directions * dims[-1]
class Encoder(Initializable):
    """Optional label encoder.

    Depending on ``encoder_type`` the input is returned unchanged
    (``None``), passed through a lookup table (``'lookup'``), or passed
    through a lookup table followed by a bidirectional gated recurrent
    encoder (``'bidirectional'``).
    """

    def __init__(self, encoder_type, num_characters, input_dim, encoder_dim,
                 **kwargs):
        """Build the child bricks required by ``encoder_type``.

        Parameters
        ----------
        encoder_type : None or str
            One of ``None``, ``'lookup'`` or ``'bidirectional'``.
        num_characters : int
            First argument of the lookup table.
        input_dim : int
            Second argument of the lookup table and input dimension of
            the recurrent transition.
        encoder_dim : int
            ``dim`` of the ``GatedRecurrent`` transition.

        """
        # 'lookup' is supported both below and in ``apply``, so it must
        # be accepted here; previously the assertion made it unreachable.
        assert encoder_type in [None, 'lookup', 'bidirectional']
        self.encoder_type = encoder_type
        super(Encoder, self).__init__(**kwargs)

        self.children = []

        if encoder_type in ['lookup', 'bidirectional']:
            self.embed_label = LookupTable(
                num_characters, input_dim, name='embed_label')
            self.children += [self.embed_label]
        else:
            # If there is no encoder.
            assert num_characters == input_dim

        if encoder_type == 'bidirectional':
            transition = RecurrentWithFork(
                GatedRecurrent(dim=encoder_dim).apply,
                input_dim, name='encoder_transition')
            self.encoder = Bidirectional(transition, name='encoder')
            self.children.append(self.encoder)

    @application
    def apply(self, x, x_mask=None):
        """Encode ``x`` according to ``encoder_type``.

        Parameters
        ----------
        x : variable
            Input; returned unchanged when ``encoder_type`` is ``None``.
        x_mask : variable, optional
            Passed as the second argument to the bidirectional encoder.

        """
        if self.encoder_type is None:
            return x
        if self.encoder_type in ['lookup', 'bidirectional']:
            embed_x = self.embed_label.apply(x)
        if self.encoder_type == 'lookup':
            encoded_x = embed_x
        if self.encoder_type == 'bidirectional':
            encoded_x = self.encoder.apply(embed_x, x_mask)
        return encoded_x
def __init__(self, transition, dim_input, dims_hidden, rec_weights_init,
             ff_weights_init, biases_init, **kwargs):
    """Stack one bidirectional layer per entry of ``dims_hidden``.

    Parameters
    ----------
    transition : callable
        Called as ``transition(dim=..., activation=Tanh())`` to create
        the transition of each layer.
    dim_input : int
        Input dimension of the first layer.
    dims_hidden : list of int
        Hidden dimension of each layer.
    rec_weights_init, ff_weights_init, biases_init
        Forwarded unchanged to every ``RecurrentWithFork``.

    """
    super(DeepBidirectional, self).__init__(**kwargs)

    # Every layer after the first is fed twice the hidden dimension of
    # the layer below it.
    input_dims = [dim_input] + list(2 * np.array(dims_hidden))
    for layer_num, hidden_dim in enumerate(dims_hidden):
        with_fork = RecurrentWithFork(
            transition=transition(dim=hidden_dim, activation=Tanh()),
            input_dim=input_dims[layer_num],
            hidden_dim=hidden_dim,
            rec_weights_init=rec_weights_init,
            ff_weights_init=ff_weights_init,
            biases_init=biases_init,
            name='with_fork')
        layer = Bidirectional(with_fork,
                              name='bidir{}'.format(layer_num))
        self.children.append(layer)
def setUp(self):
    """Build a stack of three bidirectional layers and the test inputs."""
    prototype = SimpleRecurrent(dim=3, activation=Tanh())
    self.layers = []
    for _ in range(3):
        self.layers.append(
            Bidirectional(weights_init=Orthogonal(), prototype=prototype))
    self.stack = RecurrentStack(self.layers)

    # Give every fork of the stack identity weights and zero biases.
    for fork in self.stack.forks:
        fork.weights_init = Identity(1)
        fork.biases_init = Constant(0)
    self.stack.initialize()

    # 24 permutations of (0, 1, 2, 3), scaled and repeated over 3 features.
    float_type = theano.config.floatX
    permutations = numpy.asarray(
        list(itertools.permutations(range(4))), dtype=float_type)
    self.x_val = 0.1 * permutations
    self.x_val = (numpy.ones((24, 4, 3), dtype=float_type) *
                  self.x_val[..., None])

    # Mask out the second half of the rows for column 3.
    mask = numpy.ones((24, 4), dtype=float_type)
    mask[12:24, 3] = 0
    self.mask_val = mask
def __init__(self, src_vocab_size, embedding_dim, dgru_state_dim,
             state_dim, src_dgru_depth, bidir_encoder_depth, **kwargs):
    """Create the decimator and a stack of bidirectional GRU layers.

    Parameters
    ----------
    src_vocab_size, embedding_dim, dgru_state_dim, src_dgru_depth
        Passed, in that order, to ``Decimator``.
    state_dim : int
        ``dim`` of the ``GatedRecurrent`` transition.
    bidir_encoder_depth : int
        Total number of bidirectional layers; layers beyond the first
        are deep copies of the first one.

    """
    super(BidirectionalEncoder, self).__init__(**kwargs)
    self.state_dim = state_dim
    self.dgru_state_dim = dgru_state_dim

    self.decimator = Decimator(src_vocab_size, embedding_dim,
                               dgru_state_dim, src_dgru_depth)
    first_transition = GatedRecurrent(activation=Tanh(), dim=state_dim)
    self.bidir = Bidirectional(
        RecurrentWithFork(first_transition, dgru_state_dim,
                          name='with_fork'),
        name='bidir0')
    self.children = [self.decimator, self.bidir]

    for layer_n in range(1, bidir_encoder_depth):
        layer = copy.deepcopy(self.bidir)
        self.children.append(layer)
        # The children of a copied layer get twice the state dimension
        # as their input dimension.
        for child in layer.children:
            child.input_dim = 2 * state_dim
        layer.name = 'bidir{}'.format(layer_n)
def __init__(self, dimension, input_size, rnn_type=None, embed_input=False,
             **kwargs):
    """Create the embedder, fork and bidirectional encoder.

    Parameters
    ----------
    dimension : int
        Output dimension of the embedder and dimension of the recurrent
        brick and fork.
    input_size : int
        First argument of the embedder brick.
    rnn_type : callable, optional
        Called as ``rnn_type(dim=..., activation=Tanh())``. Defaults to
        ``SimpleRecurrent`` when ``None``.
    embed_input : bool, optional
        When true the embedder is a ``LookupTable``, otherwise a
        ``Linear`` brick.

    """
    super(Encoder, self).__init__(**kwargs)

    recurrent_class = SimpleRecurrent if rnn_type is None else rnn_type
    embedder_class = LookupTable if embed_input else Linear
    self.embedder = embedder_class(input_size, dimension)

    self.encoder = Bidirectional(
        recurrent_class(dim=dimension, activation=Tanh()))

    # One fork output per input sequence of the prototype, mask excluded.
    sequence_names = [name
                      for name in self.encoder.prototype.apply.sequences
                      if name != 'mask']
    self.fork = Fork(sequence_names)
    self.fork.input_dim = dimension
    self.fork.output_dims = [dimension for _ in self.fork.input_names]

    self.children = [self.fork, self.encoder, self.embedder]
def main(num_epochs=100):
    """Train a bidirectional LSTM binary classifier on the IMDB dataset.

    Examples are sorted by the length of their ``'features'`` entry, the
    model is a lookup table, a bidirectional LSTM, mean pooling over the
    first axis and a sigmoid score layer, and training runs in a Blocks
    ``MainLoop`` with monitoring and plotting extensions.

    Parameters
    ----------
    num_epochs : int, optional
        Number of epochs after which training finishes.

    """
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    y = tensor.imatrix('targets')
    x_int = x.astype(dtype='int32').T - 2

    # Sort every indexable of the dataset by feature length.
    train_dataset = IMDB()
    features_index = train_dataset.sources.index('features')
    idx_sort = numpy.argsort(
        [len(s) for s in train_dataset.indexables[features_index]])
    n_voc = len(train_dataset.dict.keys())
    for idx in range(len(train_dataset.sources)):
        train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort]

    n_h = 10
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4 * n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()

    rnn = Bidirectional(LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    ))
    rnn.initialize()

    score_layer = Linear(
        input_dim=2 * n_h,
        output_dim=1,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    # Embeddings are multiplied by the transposed mask.
    embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T)
    rnn_out = rnn.apply(embedding)
    rnn_out_mean_pooled = tensor.mean(rnn_out[0], axis=0)

    probs = Sigmoid().apply(score_layer.apply(rnn_out_mean_pooled))

    cost = - (y * tensor.log(probs) +
              (1 - y) * tensor.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) +
                         (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            components=[StepClipping(threshold=10.), Adam()]
        )
    )

    def padded_stream(examples):
        """Return a padded, batch-wise shuffled stream over ``examples``."""
        return Padding(
            data_stream=DataStream(
                dataset=train_dataset,
                iteration_scheme=BatchwiseShuffledScheme(
                    examples=examples,
                    batch_size=10,
                )
            ),
            mask_sources=('features',)
        )

    # All three streams read fixed index ranges of the same dataset.
    train_data_stream = padded_stream(range(100))
    valid_data_stream = padded_stream(range(100, 110))
    test_data_stream = padded_stream(range(110, 120))

    model = Model(cost)

    monitored = [cost, misclassification]
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring(monitored, test_data_stream, prefix='test'),
        DataStreamMonitoring(monitored, valid_data_stream, prefix='valid'),
        TrainingDataMonitoring(monitored, prefix='train', after_epoch=True),
    ]

    plotters = [Plotter(
        channels=[['train_cost', 'train_misclassification',
                   'valid_cost', 'valid_misclassification']],
        titles=['Costs'])]
    extensions.append(PlotManager('IMDB classification example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
class Decimator(Initializable):
    """Source word encoder, mapping a character-level word to a vector.

    This encoder is able to learn the morphology.
    For compatibility with previous version, we call it Decimator.

    """

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        """Create the child bricks of the decimator.

        Parameters
        ----------
        vocab_size : int
            Length pushed to the lookup table at allocation time.
        embedding_dim : int
            Dimension pushed to the lookup table and input dimension of
            the forks.
        dgru_state_dim : int
            ``dim`` of every ``DGRU`` in the stack; the bidirectional
            brick uses ``dgru_state_dim // 2``.
        dgru_depth : int
            Number of ``DGRU`` layers in the recurrent stack.

        """
        super(Decimator, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.dgru_depth = dgru_depth

        # Dimensions are filled in by ``_push_allocation_config``.
        self.lookup = LookupTable(name='embeddings')

        # representation
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_depth)],
            skip_connections=True)

        # importance of this representation
        self.bidir_w = Bidirectional(
            RecurrentWithFork(
                DGRU(activation=Tanh(), dim=self.dgru_state_dim // 2),
                self.embedding_dim,
                name='src_word_with_fork'),
            name='bidir_src_word_encoder')

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')

        # map to a energy scalar
        self.wl = Linear(input_dim=dgru_state_dim, output_dim=1)

        self.children = [
            self.lookup, self.dgru, self.gru_fork, self.bidir_w, self.wl
        ]

    def _push_allocation_config(self):
        """Propagate the stored dimensions to the lookup table and fork."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation', 'weight'])
    def apply(self, char_seq, sample_matrix, char_aux):
        """Return the sampled representation and the weights.

        ``char_aux`` is used as the mask of both recurrent bricks. The
        recurrent stack output is scaled by the exponentiated linear
        projection of the bidirectional output, combined with
        ``sample_matrix`` through a batched dot product and passed
        through ``tanh``.
        """
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True),
            {'mask': char_aux}))
        wgru_out = tensor.exp(
            self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))

        # With more than one layer only the last element is kept.
        if self.dgru_depth > 1:
            gru_out = gru_out[-1]

        gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
        sampled_representation = tensor.tanh(
            tensor.batched_dot(sample_matrix,
                               gru_out.dimshuffle([1, 0, 2])))
        return sampled_representation.dimshuffle([1, 0, 2]), wgru_out

    def get_dim(self, name):
        """Return the dimension of ``name``.

        ``'output'`` maps to ``dgru_state_dim``; any other name is
        delegated to the base class.
        """
        if name == 'output':
            return self.dgru_state_dim
        # The base-class result must be returned; without ``return`` every
        # other name silently resolved to ``None``.
        return super(Decimator, self).get_dim(name)
so let's think about sizes of the arrays... """ x = tensor.matrix('tokens', dtype="int32") x_mask = tensor.matrix('tokens_mask', dtype=floatX) #rnn.apply(inputs=input_to_hidden.apply(x), mask=x_mask) lookup = LookupTable(vocab_size, embedding_dim) x_extra = tensor.tensor3('extras', dtype=floatX) rnn = Bidirectional( SimpleRecurrent( dim=hidden_dim, activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), ), ) ### Will need to reshape the rnn outputs to produce suitable input here... gather = Linear(name='hidden_to_output', input_dim=hidden_dim * 2, output_dim=labels_size, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) p_labels = Softmax() ## Let's initialize the variables lookup.allocate()
Comments in google-groups:blocks indicate that a reshaping has to be done, so let's think about sizes of the arrays... """ x = tensor.matrix('tokens', dtype="int32") x_mask = tensor.matrix('tokens_mask', dtype=floatX) #rnn.apply(inputs=input_to_hidden.apply(x), mask=x_mask) lookup = LookupTable(vocab_size, embedding_dim) x_extra = tensor.tensor3('extras', dtype=floatX) rnn = Bidirectional( SimpleRecurrent(dim=hidden_dim, activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), ), ) ### Will need to reshape the rnn outputs to produce suitable input here... gather = Linear(name='hidden_to_output', input_dim=hidden_dim*2, output_dim=labels_size, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0) ) p_labels = Softmax() ## Let's initialize the variables
def main(model_path, recurrent_type):
    """Train a character-level recurrent model on One Billion Word.

    Parameters
    ----------
    model_path : str
        Path passed to the ``Dump`` extension.
    recurrent_type : str
        ``'lstm'`` or ``'simple'``.

    Raises
    ------
    ValueError
        If ``recurrent_type`` is neither ``'lstm'`` nor ``'simple'``.

    """
    # Data pipeline: filter, add targets, batch, pad, transpose.
    dataset_options = dict(dictionary=char2code, level="character",
                           preprocess=_lower)
    dataset = OneBillionWord("training", [99], **dataset_options)
    data_stream = dataset.get_example_stream()
    data_stream = Filter(data_stream, _filter_long)
    data_stream = Mapping(data_stream, _make_target,
                          add_sources=('target',))
    data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(100))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    target = tensor.lmatrix('target')

    dim = 100
    lookup = LookupTable(len(all_chars), dim,
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0.))

    if recurrent_type == 'lstm':
        # Floor division keeps the dimension an integer on Python 3 too.
        # NOTE(review): this branch is not wrapped in Bidirectional while
        # the Linear brick below takes 2 * dim inputs - confirm it works.
        rnn = LSTM(dim // 4, Tanh(),
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0.))
    elif recurrent_type == 'simple':
        rnn = SimpleRecurrent(dim, Tanh())
        rnn = Bidirectional(rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0.))
    else:
        raise ValueError('Not known RNN type')
    rnn.initialize()
    lookup.initialize()
    y_hat = rnn.apply(lookup.apply(features), mask=features_mask)

    print(len(all_chars))
    linear = Linear(2 * dim, len(all_chars),
                    weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0.))
    linear.initialize()
    y_hat = linear.apply(y_hat)

    # Softmax is applied on a 2D view and the result reshaped back.
    seq_length = y_hat.shape[0]
    batch_size = y_hat.shape[1]
    y_hat = Softmax().apply(
        y_hat.reshape((seq_length * batch_size, -1))).reshape(y_hat.shape)

    cost = CategoricalCrossEntropy().apply(
        target.flatten(),
        y_hat.reshape((-1, len(all_chars)))) * seq_length * batch_size
    cost.name = 'cost'
    cost_per_character = cost / features_mask.sum()
    cost_per_character.name = 'cost_per_character'

    cg = ComputationGraph([cost, cost_per_character])
    model = Model(cost)
    algorithm = GradientDescent(step_rule=Adam(), cost=cost,
                                params=cg.parameters)

    train_monitor = TrainingDataMonitoring(
        [cost, cost_per_character], prefix='train',
        after_batch=True)
    extensions = [train_monitor, Printing(every_n_batches=40),
                  Dump(model_path, every_n_batches=200)]
    main_loop = MainLoop(model=model, algorithm=algorithm,
                         data_stream=data_stream, extensions=extensions)
    main_loop.run()
def main(mode, save_path, num_batches, from_dump):
    """Train the word-reversing model or sample from a saved one.

    Parameters
    ----------
    mode : str
        ``"train"`` builds and trains the model; ``"test"`` loads the
        bricks from ``save_path`` and samples interactively. Any other
        value does nothing.
    save_path : str
        Where the main loop is serialized in train mode, and the file
        the bricks are loaded from in test mode.
    num_batches : int
        Number of batches after which training finishes.
    from_dump
        When truthy, passed to ``LoadFromDump`` as the first extension.

    """
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99], char2code,
                                level="character",
                                preprocess=str.lower
                            ).get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(
            GatedRecurrent(dim=dimension, activation=Tanh()),
            weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork(
            [name for name in encoder.prototype.apply.sequences
             if name != 'mask'],
            weights_init=IsotropicGaussian(0.1),
            biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension, dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(
            activation=Tanh(),
            dim=dimension, attended_dim=2 * dimension, name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension, name="attention")
        readout = LinearReadout(
            readout_dim=readout_dimension, source_names=["states"],
            emitter=SoftmaxEmitter(name="emitter"),
            feedbacker=LookupFeedback(readout_dimension, dimension),
            name="readout")
        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
            name="generator")
        # Push the generator's initialization config first, then
        # override the transition's weight initialization.
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in params.items()],
                        width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(
                **dict_union(
                    fork.apply(lookup.lookup(chars), return_dict=True),
                    mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(
            VariableFilter(application=readout.readout, name="output")(
                cg.variables),
            singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(
            cost=cost, step_rule=CompositeRule([GradientClipping(10.0),
                                                SteepestDescent(0.01)]))

        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) +
            [Timing(),
             TrainingDataMonitoring(observables, after_every_batch=True),
             TrainingDataMonitoring(observables, prefix="average",
                                    every_n_batches=10),
             # Also stop as soon as the gradient norm becomes NaN.
             FinishAfter(after_n_batches=num_batches).add_condition(
                 "after_batch",
                 lambda log:
                     math.isnan(log.current_row.total_gradient_norm)),
             Plot(os.path.basename(save_path),
                  [["average_" + cost.name],
                   ["average_" + cost_per_character.name]],
                  every_n_batches=10),
             SerializeMainLoop(save_path, every_n_batches=500,
                               save_separately=["model", "log"]),
             Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "test":
        # NOTE: unpickling with dill can execute arbitrary code; only
        # load files from a trusted source.
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0], batch_size=chars.shape[1],
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        # Use the module logger like every other message in this function.
        logger.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                # Cut the sample after the first end-of-sequence code.
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            # Highest cost first.
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
def _lstm_bias_init(dim):
    """Return the length ``4 * dim`` bias vector handed to the LSTM bricks.

    The vector is zero everywhere except its third block of ``dim``
    entries, which is filled with 1.0.
    """
    return np.asarray([0.0] * (2 * dim) + [1.0] * dim + [0.0] * dim)


def main(config):
    """Build, initialize and train the attention-based encoder-decoder.

    Parameters
    ----------
    config : dict
        Experiment configuration.  Keys read directly in this function:
        ``train_src``, ``dev_src``, ``test_src``, ``train_tgt``,
        ``dev_tgt``, ``embed_src``, ``embed_tgt``, ``hidden_src``,
        ``hidden_tgt``, ``dropout``, ``finish_after``, ``save_freq``,
        ``saveto``, ``reload``, ``step_clipping`` and ``step_rule``.
        The whole dict is also forwarded to ``get_train_stream``.

    The function returns nothing; it runs the Blocks main loop until the
    ``FinishAfter`` extension stops it.
    """
    vocab_src, _ = text_to_dict([config['train_src'],
                                 config['dev_src'],
                                 config['test_src']])
    # `cabvo` is the second value returned for the target side; it is only
    # handed to the Sampler extensions as `vocab` (presumably the reverse,
    # index -> token mapping -- confirm against text_to_dict).
    vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
                                     config['dev_tgt']])

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')

    source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
                                      [1, 4, 8, 4, 8, 4, 8]]
    source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
                                           [1, 0, 1, 0, 1, 0, 1]]
    target_sentence.tag.test_value = [[0, 1, 1, 5],
                                      [2, 0, 1, 0]]
    target_sentence_mask.tag.test_value = [[0, 1, 1, 0],
                                           [1, 1, 1, 0]]

    logger.info('Building RNN encoder-decoder')

    # --- Encoder ---------------------------------------------------------
    embedder = LookupTable(
        length=len(vocab_src),
        dim=config['embed_src'],
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='embedder')
    # Projects embeddings to 4 * hidden_src, the input width fed to the LSTM.
    transformer = Linear(
        config['embed_src'],
        config['hidden_src'] * 4,
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='transformer')

    encoder = Bidirectional(
        LSTM(
            dim=config['hidden_src'],
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(_lstm_bias_init(config['hidden_src']))),
        name='encoderBiLSTM')
    encoder.prototype.weights_init = Orthogonal()

    # --- Decoder ---------------------------------------------------------
    transition = LSTM2GO(
        attended_dim=config['hidden_tgt'],
        dim=config['hidden_tgt'],
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(_lstm_bias_init(config['hidden_tgt'])),
        name='decoderLSTM')

    attention = SequenceContentAttention(
        state_names=transition.apply.states,  # default activation is Tanh
        state_dims=[config['hidden_tgt']],
        # The encoder output is attended with width 2 * hidden_src.
        attended_dim=config['hidden_src'] * 2,
        match_dim=config['hidden_tgt'],
        name="attention")

    readout = Readout(
        source_names=['states',
                      'feedback',
                      attention.take_glimpses.outputs[0]],
        readout_dim=len(vocab_tgt),
        emitter=SoftmaxEmitter(name='emitter'),
        feedback_brick=LookupFeedback(
            num_outputs=len(vocab_tgt),
            feedback_dim=config['embed_tgt'],
            name='feedback'),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=config['hidden_tgt'], name='softmax_bias').apply,
            Linear(input_dim=config['hidden_tgt'],
                   output_dim=config['embed_tgt'],
                   use_bias=False,
                   name='softmax0').apply,
            Linear(input_dim=config['embed_tgt'], name='softmax1').apply]),
        merged_dim=config['hidden_tgt'])

    decoder = SequenceGenerator(
        readout=readout,
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator",
        fork=Fork(
            [name for name in transition.apply.sequences if name != 'mask'],
            prototype=Linear()),
        add_contexts=True)
    decoder.transition.weights_init = Orthogonal()

    # Initialize model
    logger.info('Initializing model')
    embedder.initialize()
    transformer.initialize()
    encoder.initialize()
    decoder.initialize()

    # Apply model
    embedded = embedder.apply(source_sentence)
    transformed = transformer.apply(embedded)
    encoded = encoder.apply(transformed)[0]

    # Sampling graph: an all-ones attended mask is used here rather than
    # source_sentence_mask.
    generated = decoder.generate(
        n_steps=2 * source_sentence.shape[1],
        batch_size=source_sentence.shape[0],
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=tensor.ones(source_sentence.shape).T)
    # Single-argument call form prints the same text on Python 2 and 3.
    print('Generated:  {}'.format(generated))

    # Indices into the generate() outputs: 2 / 5 for the LSTM transition
    # (the GRU variant used 1 / 4).
    samples = generated[2]
    samples.name = 'samples'
    samples_cost = generated[5]
    # Name the variable; the original rebound it to a plain string instead.
    samples_cost.name = 'sampling_cost'

    cost = decoder.cost(
        mask=target_sentence_mask.T,
        outputs=target_sentence.T,
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=source_sentence_mask.T)
    cost.name = 'target_cost'
    cost.tag.aggregation_scheme = TakeLast(cost)
    model = Model(cost)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Apply dropout for regularization
    if config['dropout'] < 1.0:
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])
        # NOTE(review): the training algorithm below is still built from
        # `cost`, not from the outputs of the dropout graph, so only
        # `cg.parameters` of the new graph is used -- confirm this is
        # intended.

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # str() keeps the width format spec valid for tuples on Python 3.
        logger.info(' {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    printchildren(embedder, 1)
    printchildren(transformer, 1)
    printchildren(encoder, 1)
    printchildren(decoder, 1)

    # Training data
    train_stream = get_train_stream(config,
                                    [config['train_src']],
                                    [config['train_tgt']],
                                    vocab_src, vocab_tgt)
    dev_stream = get_dev_stream([config['dev_src']],
                                [config['dev_tgt']],
                                vocab_src, vocab_tgt)
    test_stream = get_test_stream([config['test_src']], vocab_src)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        ProgressBar(),
        TrainingDataMonitoring([cost], prefix="tra", after_batch=True),
        DataStreamMonitoring(variables=[cost],
                             data_stream=dev_stream,
                             prefix="dev",
                             after_batch=True),
        Sampler(
            model=Model(samples),
            data_stream=dev_stream,
            vocab=cabvo,
            saveto=config['saveto'] + 'dev',
            every_n_batches=config['save_freq']),
        Sampler(
            model=Model(samples),
            data_stream=test_stream,
            vocab=cabvo,
            saveto=config['saveto'] + 'test',
            after_n_batches=1,
            on_resumption=True,
            before_training=True),
        Plotter(saveto=config['saveto'], after_batch=True),
        Printing(after_batch=True),
        Checkpoint(
            path=config['saveto'],
            parameters=cg.parameters,
            save_main_loop=False,
            every_n_batches=config['save_freq'])]

    if BOKEH_AVAILABLE:
        # The extension must be registered to take part in the main loop;
        # the original constructed it and discarded the instance.
        # NOTE(review): the monitored channel may be logged under the "tra"
        # prefix rather than as plain 'target_cost' -- confirm the name.
        extensions.append(
            Plot('Training cost',
                 channels=[['target_cost']],
                 after_batch=True))

    if config['reload']:
        extensions.append(Load(path=config['saveto'],
                               load_iteration_state=False,
                               load_log=False))
    else:
        # Fresh run: create / truncate the '<saveto>.txt' file.
        with open(config['saveto'] + '.txt', 'w'):
            pass

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # SECURITY NOTE: `step_rule` is evaluated as Python source to look up the
    # step-rule class by name; only use configuration from a trusted source.
    step_rule_class = eval(config['step_rule'])
    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 step_rule_class()]))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=model,
        algorithm=algorithm,
        data_stream=train_stream,
        extensions=extensions)
    main_loop.run()