def __init__(self, readout, transition, attention=None, fork_inputs=None,
             add_contexts=True, **kwargs):
    """Build the generator around an attention-wrapped transition.

    Args:
        readout: Forwarded unchanged to the parent constructor.
        transition: Recurrent transition; it is wrapped before being
            forwarded to the parent constructor.
        attention: When truthy, the transition is wrapped in
            ``AttentionRecurrent`` together with a ``Distribute`` brick
            built from ``fork_inputs`` and the first output name of
            ``attention.take_glimpses``. Otherwise the transition is
            wrapped in ``FakeAttentionRecurrent``.
        fork_inputs: Names handed to ``Fork``. When falsy, every name in
            ``transition.apply.sequences`` other than ``'mask'`` is used.
        add_contexts: Forwarded to ``AttentionRecurrent``.
        **kwargs: Forwarded to the parent constructor.
    """
    if not fork_inputs:
        # Default: fork into every sequence input except the mask.
        fork_inputs = [seq_name
                       for seq_name in transition.apply.sequences
                       if seq_name != 'mask']
    fork = Fork(fork_inputs)

    if not attention:
        wrapped = FakeAttentionRecurrent(
            transition, name="with_fake_attention")
    else:
        glimpse_name = attention.take_glimpses.outputs[0]
        distribute = Distribute(fork_inputs, glimpse_name)
        wrapped = AttentionRecurrent(
            transition, attention, distribute,
            add_contexts=add_contexts, name="att_trans")

    super(SequenceGenerator, self).__init__(
        readout, wrapped, fork, **kwargs)
def __init__(self, trg_space_idx, readout, transition, attention=None,
             transition_depth=1, igru_depth=1, trg_dgru_depth=1,
             add_contexts=True, **kwargs):
    """Store the depth settings and wrap the transition with attention.

    Args:
        trg_space_idx: Stored on the instance as ``trg_space_idx``.
        readout: Forwarded unchanged to the parent constructor.
        transition: Recurrent transition, wrapped in
            ``AttentionRecurrent`` before being forwarded.
        attention: Forwarded to ``AttentionRecurrent``.
        transition_depth: Stored on the instance.
        igru_depth: Stored; also the number of entries generated for
            ``igru_states_name``.
        trg_dgru_depth: Stored; also the number of entries generated for
            ``feedback_name``.
        add_contexts: Forwarded to ``AttentionRecurrent``.
        **kwargs: Forwarded to the parent constructor. A ``'fork'`` entry
            is filled in when the caller did not provide one.
    """
    self.trg_space_idx = trg_space_idx
    self.transition_depth = transition_depth
    self.igru_depth = igru_depth
    self.trg_dgru_depth = trg_dgru_depth

    # One name per layer: "<prefix><separator><layer index>".
    igru_prefix = 'igru_states' + RECURRENTSTACK_SEPARATOR
    feedback_prefix = 'feedback' + RECURRENTSTACK_SEPARATOR
    self.igru_states_name = [igru_prefix + str(layer)
                             for layer in range(self.igru_depth)]
    self.feedback_name = [feedback_prefix + str(layer)
                          for layer in range(self.trg_dgru_depth)]

    # The default fork covers every sequence input whose name does not
    # contain "mask"; a caller-supplied 'fork' takes precedence.
    default_fork = Fork([seq_name
                         for seq_name in transition.apply.sequences
                         if 'mask' not in seq_name])
    kwargs.setdefault('fork', default_fork)

    # NOTE(review): `attention` defaults to None but is passed to
    # AttentionRecurrent unconditionally -- confirm None is acceptable.
    wrapped = AttentionRecurrent(
        transition, attention,
        add_contexts=add_contexts, name="att_trans")
    super(SequenceGeneratorDCNMT, self).__init__(
        readout, wrapped, **kwargs)
def __init__(self, readout, transition, attention=None,
             add_contexts=True, **kwargs):
    """Build the generator, wrapping the transition with (fake) attention.

    Args:
        readout: Forwarded unchanged to the parent constructor.
        transition: Recurrent transition; it is wrapped before being
            forwarded to the parent constructor.
        attention: When truthy the transition is wrapped in
            ``AttentionRecurrent``; otherwise in
            ``FakeAttentionRecurrent``.
        add_contexts: Forwarded to ``AttentionRecurrent``.
        **kwargs: Forwarded to the parent constructor. A ``'fork'`` entry
            is filled in when the caller did not provide one.
    """
    # The default fork covers every sequence input whose name does not
    # contain "mask"; a caller-supplied 'fork' takes precedence.
    sequence_names = transition.apply.sequences
    default_fork = Fork([seq_name for seq_name in sequence_names
                         if 'mask' not in seq_name])
    kwargs.setdefault('fork', default_fork)

    if not attention:
        wrapped = FakeAttentionRecurrent(
            transition, name="with_fake_attention")
    else:
        wrapped = AttentionRecurrent(
            transition, attention,
            add_contexts=add_contexts, name="att_trans")

    super(SequenceGenerator, self).__init__(readout, wrapped, **kwargs)
def __init__(self, readout, transition, attention, add_contexts=True,
             **kwargs):
    """Build the generator around an ``AttentionRecurrent`` transition.

    Args:
        readout: Forwarded unchanged to the parent constructor.
        transition: Recurrent transition, wrapped in
            ``AttentionRecurrent`` before being forwarded.
        attention: Forwarded to ``AttentionRecurrent``.
        add_contexts: Forwarded to ``AttentionRecurrent``.
        **kwargs: Forwarded to the parent constructor. A ``'fork'`` entry
            is filled in when the caller did not provide one.
    """
    # The default fork covers every sequence input whose name does not
    # contain "mask"; a caller-supplied 'fork' takes precedence.
    default_fork = Fork([seq_name
                         for seq_name in transition.apply.sequences
                         if 'mask' not in seq_name])
    kwargs.setdefault('fork', default_fork)

    wrapped = AttentionRecurrent(
        transition, attention,
        add_contexts=add_contexts, name="att_trans")
    super(InitialContextSequenceGenerator, self).__init__(
        readout, wrapped, **kwargs)
def __init__(self, trg_space_idx, readout, transition, attention=None,
             transition_layers=1, add_contexts=True, **kwargs):
    """Store the layer settings and wrap the transition with attention.

    Args:
        trg_space_idx: Stored on the instance as ``trg_space_idx``.
        readout: Forwarded unchanged to the parent constructor.
        transition: Recurrent transition, wrapped in
            ``AttentionRecurrent`` before being forwarded.
        attention: Forwarded to ``AttentionRecurrent``.
        transition_layers: Stored on the instance.
        add_contexts: Forwarded to ``AttentionRecurrent``.
        **kwargs: Forwarded to the parent constructor. A ``'fork'`` entry
            is filled in when the caller did not provide one.
    """
    self.trg_space_idx = trg_space_idx
    self.transition_layers = transition_layers

    # The default fork covers every sequence input whose name does not
    # contain "mask"; a caller-supplied 'fork' takes precedence.
    default_fork = Fork([seq_name
                         for seq_name in transition.apply.sequences
                         if 'mask' not in seq_name])
    kwargs.setdefault('fork', default_fork)

    # NOTE(review): `attention` defaults to None but is passed to
    # AttentionRecurrent unconditionally -- confirm None is acceptable.
    wrapped = AttentionRecurrent(
        transition, attention,
        add_contexts=add_contexts, name="att_trans")
    super(SequenceGeneratorDCNMT, self).__init__(
        readout, wrapped, **kwargs)
def __init__(self, base_encoder, state_dim=1000, self_attendable=False,
             **kwargs):
    """Set up the attention-driven recurrent transition of the annotator.

    Args:
        base_encoder (Brick): Low level encoder network which produces
            the annotations to attend to.
        state_dim (int): Base size of the recurrent layer. The value is
            doubled before use: ``self.state_dim`` and the dimensions
            given to the recurrent core and the attention are
            ``2 * state_dim``.
        self_attendable (bool): If true, a
            ``SelfAttendableContentAttention`` (built with
            ``num_steps=10``) is used, so the annotator can attend to its
            own previous states. If false, a ``SequenceContentAttention``
            is used and only the base annotations can be attended.
        **kwargs: Forwarded to the parent constructor.
    """
    super(HierarchicalAnnotator, self).__init__(**kwargs)
    self.state_dim = state_dim * 2
    self.base_encoder = base_encoder
    self.self_attendable = self_attendable

    recurrent_core = GatedRecurrent(activation=Tanh(), dim=self.state_dim)

    # Both attention flavours share these constructor arguments.
    attention_args = dict(
        state_names=recurrent_core.apply.states,
        attended_dim=self.state_dim,
        match_dim=self.state_dim,
        name="hier_attention")
    if self_attendable:
        self.attention = SelfAttendableContentAttention(
            num_steps=10, **attention_args)
    else:
        self.attention = SequenceContentAttention(**attention_args)

    self.transition = AttentionRecurrent(
        recurrent_core, self.attention, name="hier_att_trans")
    self.children = [self.transition]
def test_attention_recurrent():
    """Run ``AttentionRecurrent`` on random masked data and check outputs.

    The random draws come from a single ``RandomState(1234)`` and the sums
    at the end are frozen reference values, so the order of the random
    draws below must not change.
    """
    rng = numpy.random.RandomState(1234)

    dim, batch_size, input_length = 5, 4, 20
    attended_dim, attended_length = 10, 15

    wrapped = SimpleRecurrent(dim, Identity())
    attention = SequenceContentAttention(
        state_names=wrapped.apply.states,
        attended_dim=attended_dim,
        match_dim=attended_dim)
    recurrent = AttentionRecurrent(wrapped, attention, seed=1234)
    recurrent.weights_init = IsotropicGaussian(0.5)
    recurrent.biases_init = Constant(0)
    recurrent.initialize()

    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    inputs = tensor.tensor3("inputs")
    inputs_mask = tensor.matrix("inputs_mask")
    outputs = recurrent.apply(
        inputs=inputs, mask=inputs_mask,
        attended=attended, attended_mask=attended_mask)
    states, glimpses, weights = outputs
    for symbolic in (states, glimpses, weights):
        assert symbolic.ndim == 3

    def random_values(shape):
        """Uniform random array of the given shape, cast to floatX."""
        return rng.uniform(size=shape).astype(floatX)

    def random_mask(length, n_columns):
        """Mask of ones where each column is zeroed from a random step on.

        At least the first step of every column stays set, which makes
        the mask look like real variable-length data.
        """
        mask = numpy.ones((length, n_columns), dtype=floatX)
        for column in range(n_columns):
            cutoff = 1 + rng.randint(0, length - 1)
            mask[cutoff:, column] = 0.0
        return mask

    # Draw order matters for the frozen sums: values, mask, values, mask.
    input_vals = random_values((input_length, batch_size, dim))
    input_mask_vals = random_mask(input_length, batch_size)
    attended_vals = random_values(
        (attended_length, batch_size, attended_dim))
    attended_mask_vals = random_mask(attended_length, batch_size)

    func = theano.function(
        [inputs, inputs_mask, attended, attended_mask],
        [states, glimpses, weights])
    states_vals, glimpses_vals, weight_vals = func(
        input_vals, input_mask_vals, attended_vals, attended_mask_vals)

    assert states_vals.shape == (input_length, batch_size, dim)
    assert glimpses_vals.shape == (input_length, batch_size, attended_dim)

    n_shared = len(ComputationGraph(outputs).shared_variables)
    n_params = len(Selector(recurrent).get_params())
    assert n_shared == n_params

    # 1 where the attended mask is 0, and 0 where the mask is 1.
    masked_out = 1 - attended_mask_vals.T

    # Weights at positions where the attended mask is 0 must be exactly 0.
    assert numpy.all(weight_vals * masked_out == 0)
    # Weights at positions where the attended mask is 1 must be non-zero.
    assert numpy.all(abs(weight_vals + masked_out) > 1e-5)
    # Weights from different steps should be noticeably different.
    assert abs(weight_vals[0] - weight_vals[1]).sum() > 1e-2

    # From the step where an item's input mask ends, its weights must stay
    # equal to the weights at that step.
    for item in range(batch_size):
        last = int(input_mask_vals[:, item].sum())
        for step in range(last, input_length):
            assert_allclose(weight_vals[last, item],
                            weight_vals[step, item])

    # Frozen reference sums.
    assert_allclose(weight_vals.sum(), input_length * batch_size, 1e-5)
    assert_allclose(states_vals.sum(), 113.429, rtol=1e-5)
    assert_allclose(glimpses_vals.sum(), 415.901, rtol=1e-5)
def test_with_attention():
    """Shape checks for ``AttentionRecurrent`` and ``SequenceGenerator``.

    The first half applies an ``AttentionRecurrent`` directly to all-zero
    inputs; the second half builds a ``SequenceGenerator`` from the same
    transition and attention and checks its ``cost`` and ``generate``
    outputs. Only shapes (and the number of generated results) are
    asserted.
    """
    inp_dim, inp_len = 2, 10
    attended_dim, attended_len = 3, 11
    batch_size = 4
    n_steps = 30

    transition = TestTransition(
        dim=inp_dim, attended_dim=attended_dim, activation=Identity())
    attention = SequenceContentAttention(
        transition.apply.states, match_dim=inp_dim, name="attention")
    att_trans = AttentionRecurrent(
        transition, attention, add_contexts=False)
    att_trans.weights_init = IsotropicGaussian(0.01)
    att_trans.biases_init = Constant(0)
    att_trans.initialize()

    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    inputs = tensor.tensor3("inputs")
    inputs_mask = tensor.matrix("inputs_mask")
    states, glimpses, weights = att_trans.apply(
        inputs=inputs, mask=inputs_mask,
        attended=attended, attended_mask=attended_mask)
    for symbolic in (states, glimpses, weights):
        assert symbolic.ndim == 3

    # All-zero data with all-one masks.
    input_vals = numpy.zeros(
        (inp_len, batch_size, inp_dim), dtype=floatX)
    input_mask_vals = numpy.ones((inp_len, batch_size), dtype=floatX)
    attended_vals = numpy.zeros(
        (attended_len, batch_size, attended_dim), dtype=floatX)
    attended_mask_vals = numpy.ones(
        (attended_len, batch_size), dtype=floatX)

    func = theano.function(
        [inputs, inputs_mask, attended, attended_mask],
        [states, glimpses, weights])
    states_vals, glimpses_vals, weight_vals = func(
        input_vals, input_mask_vals, attended_vals, attended_mask_vals)

    assert states_vals.shape == input_vals.shape
    assert glimpses_vals.shape == (inp_len, batch_size, attended_dim)
    assert weight_vals.shape == (inp_len, batch_size, attended_len)

    # Test SequenceGenerator using AttentionTransition.
    # NOTE(review): generator.initialize() is never called here and only
    # shapes are asserted below -- confirm that this is intentional.
    readout = LinearReadout(
        readout_dim=inp_dim, source_names=["states"],
        emitter=TestEmitter(name="emitter"), name="readout")
    generator = SequenceGenerator(
        readout,
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        add_contexts=False,
        name="generator")

    outputs = tensor.tensor3('outputs')
    costs = generator.cost(
        outputs, attended=attended, attended_mask=attended_mask)
    costs_vals = costs.eval({
        outputs: input_vals,
        attended: attended_vals,
        attended_mask: attended_mask_vals,
    })
    assert costs_vals.shape == (inp_len, batch_size)

    results = generator.generate(
        n_steps=n_steps, batch_size=attended.shape[1],
        attended=attended, attended_mask=attended_mask)
    assert len(results) == 5

    generate = theano.function([attended, attended_mask], results)
    (states_vals, outputs_vals, glimpses_vals,
     weights_vals, costs_vals) = generate(attended_vals,
                                          attended_mask_vals)
    assert states_vals.shape == (n_steps, batch_size, inp_dim)
    assert states_vals.shape == outputs_vals.shape
    assert glimpses_vals.shape == (n_steps, batch_size, attended_dim)
    assert weights_vals.shape == (n_steps, batch_size, attended_len)
    assert costs_vals.shape == (n_steps, batch_size)
# Exploratory script: build an AttentionRecurrent around an existing
# `transition` and call its glimpse/state steps by hand.
# Relies on `n_steps`, `transition`, `numpy`, `initialization` and
# `SimpleSequenceAttention` being defined earlier in the file.

# Attended data laid out as seq_length * batch_size * features.
batch_size = 2
seq_length = n_steps
features = 3
attended_tr = (
    numpy.arange(batch_size * seq_length * features)
    .astype('float32')
    .reshape((seq_length, batch_size, features))
)

from theano import tensor, function
from blocks.bricks.attention import AttentionRecurrent

attended = tensor.tensor3('attended')
ssa = SimpleSequenceAttention(['states'], [3], 3)
ar = AttentionRecurrent(transition=transition, attention=ssa)
ar.weights_init = initialization.Constant(0.)
ar.biases_init = initialization.Constant(1.)
ar.initialize()

inputs = tensor.tensor3('inputs')
# ar.apply(attended = attended_tv, n_steps = n_steps, batch_size = 2)

# Manual walk through the recurrent pieces: initial state, glimpses,
# then the state update.
states, glimpses, step = ar.initial_states(1, attended=attended)
glimpses, step = ar.take_glimpses(
    attended=attended, states=states, glimpses=glimpses, step=step)
states = ar.compute_states(
    inputs=inputs, attended=attended, states=states,
    glimpses=glimpses, step=step)
distributed = ar.distribute.apply(inputs=inputs, glimpses=glimpses)
# Second state update, this time fed only the first slice of `inputs`.
states = ar.compute_states(
    states=states, inputs=inputs[0], glimpses=glimpses,
    step=step, attended=attended)