def build_model(alphabet_size, config):
    layers = config['lstm_layers']
    dimensions = [config['lstm_dim_' + str(i)] for i in range(layers)]
    uniform_width = config['lstm_init_width']

    stack = []
    for dim in dimensions:
        stack.append(LSTM(dim=dim, use_bias=True,
                          weights_init=Uniform(width=uniform_width),
                          forget_init=Constant(1.)))
    recurrent_stack = RecurrentStack(stack, name='transition')

    readout = Readout(readout_dim=alphabet_size,
                      source_names=['states#' + str(layers - 1)],
                      emitter=SoftmaxEmitter(name='emitter'),
                      feedback_brick=LookupFeedback(
                          alphabet_size, feedback_dim=alphabet_size,
                          name='feedback'),
                      name='readout')

    generator = SequenceGenerator(readout=readout,
                                  transition=recurrent_stack,
                                  weights_init=Uniform(width=uniform_width),
                                  biases_init=Constant(0),
                                  name='generator')
    generator.push_initialization_config()
    generator.initialize()

    x = tensor.lmatrix('features')
    mask = tensor.fmatrix('features_mask')
    cost_matrix = generator.cost_matrix(x, mask=mask)

    log2e = math.log(math.e, 2)
    if 'batch_length' in config:
        length = config['batch_length'] - config['batch_overlap']
        cost = log2e * aggregation.mean(cost_matrix[:, -length:].sum(),
                                        mask[:, -length:].sum())
    else:
        cost = log2e * aggregation.mean(cost_matrix[:, :].sum(),
                                        mask[:, :].sum())
    cost.name = 'bits_per_character'

    return generator, cost
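# A minimal sketch of a config dict that build_model above would accept.
# The keys are exactly those read in the function body; the values are
# illustrative assumptions, not tuned settings.
config = {
    'lstm_layers': 2,           # number of stacked LSTMs
    'lstm_dim_0': 512,          # one 'lstm_dim_i' entry per layer
    'lstm_dim_1': 512,
    'lstm_init_width': 0.08,    # width of the Uniform initialization
    'batch_length': 100,        # optional: with 'batch_overlap', restricts
    'batch_overlap': 50,        # the cost to the tail of each batch
}
generator, cost = build_model(alphabet_size=256, config=config)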
class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 theano_seed=None, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.theano_seed = theano_seed

        self.transition = GatedRecurrent(
            dim=state_dim, activation=Tanh(), name='decoder')

        readout = Readout(
            source_names=['states'],
            readout_dim=self.vocab_size,
            merged_dim=state_dim)

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'],
                      prototype=Linear()))

        self.children = [self.sequence_generator]

    @application(inputs=['representation', 'source_sentence_mask',
                         'target_sentence_mask', 'target_sentence'],
                 outputs=['cost'])
    def cost(self, representation, source_sentence_mask,
             target_sentence, target_sentence_mask):
        source_sentence_mask = source_sentence_mask.T
        target_sentence = target_sentence.T
        target_sentence_mask = target_sentence_mask.T

        cost = self.sequence_generator.cost_matrix(**{
            'mask': target_sentence_mask,
            'outputs': target_sentence})
        return (cost * target_sentence_mask).sum() / \
            target_sentence_mask.shape[1]
name="readout") seq_gen = SequenceGenerator(readout=readout, transition=rnn, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator") seq_gen.push_initialization_config() rnn.weights_init = Orthogonal() seq_gen.initialize() # z markov_tutorial x = tensor.lvector('features') x = x.reshape((x.shape[0], 1)) cost = aggregation.mean(seq_gen.cost_matrix(x[:, :]).sum(), x.shape[1]) cost.name = "negative log-likelihood" cost_cg = ComputationGraph(cost) print VariableFilter(roles=[WEIGHT])(cost_cg.variables) # theano.printing.pydotprint(cost, outfile="./pics/symbolic_graph_unopt.png", var_with_name_simple=True) algorithm = GradientDescent(cost=cost, parameters=list( Selector(seq_gen).get_parameters().values()), step_rule=Scale(0.001)) # AUDIOSCOPE OBSERVABLES (some) observables = [] observables += cost_cg.outputs observables.append(algorithm.total_step_norm)
generator.transition.push_initialization_config()
generator.initialize()

lookup.weights_init = IsotropicGaussian(0.001)
lookup.biases_init = Constant(0.0)
lookup.initialize()

# states = {}
states = [state for state in generator.transition.apply.outputs
          if state != "step"]
# ipdb.set_trace()
states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent))
          for name in states}

cost_matrix = generator.cost_matrix(x, attended=context, **states)
cost = cost_matrix.mean() + 0.0 * start_flag
cost.name = "nll"

cg = ComputationGraph(cost)
model = Model(cost)

transition_matrix = VariableFilter(
    theano_name_regex="state_to_state")(cg.parameters)
for matr in transition_matrix:
    matr.set_value(0.98 * numpy.eye(hidden_size_recurrent, dtype=floatX))

from play.utils import regex_final_value

extra_updates = []
class Decoder(Initializable):
    """Decoder of RNNsearch model."""

    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, theano_seed=None, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(
            attended_dim=state_dim, dim=state_dim,
            activation=Tanh(), name='decoder')

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim, name="attention")

        # Initialize the readout; note that SoftmaxEmitter emits -1 for
        # initial outputs, which is used by LookupFeedbackWMT15
        readout = Readout(
            source_names=['states', 'feedback',
                          self.attention.take_glimpses.outputs[0]],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1,
                                   theano_seed=theano_seed),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=state_dim, name='maxout_bias').apply,
                 Maxout(num_pieces=2, name='maxout').apply,
                 Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                        use_bias=False, name='softmax0').apply,
                 Linear(input_dim=embedding_dim, name='softmax1').apply]),
            merged_dim=state_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'],
                      prototype=Linear()))

        self.children = [self.sequence_generator]

    @application(inputs=['representation', 'source_sentence_mask',
                         'target_sentence_mask', 'target_sentence'],
                 outputs=['cost'])
    def cost(self, representation, source_sentence_mask,
             target_sentence, target_sentence_mask):
        source_sentence_mask = source_sentence_mask.T
        target_sentence = target_sentence.T
        target_sentence_mask = target_sentence_mask.T

        # Get the cost matrix
        cost = self.sequence_generator.cost_matrix(**{
            'mask': target_sentence_mask,
            'outputs': target_sentence,
            'attended': representation,
            'attended_mask': source_sentence_mask})

        return (cost * target_sentence_mask).sum() / \
            target_sentence_mask.shape[1]

    @application
    def generate(self, source_sentence, representation, **kwargs):
        return self.sequence_generator.generate(
            n_steps=2 * source_sentence.shape[1],
            batch_size=source_sentence.shape[0],
            attended=representation,
            attended_mask=tensor.ones(source_sentence.shape).T,
            **kwargs)
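# A hypothetical wiring of the Decoder above, assuming an encoder (not shown
# here) that produces `representation` with shape
# (source_length, batch, representation_dim). All names and dimensions below
# are illustrative assumptions.
decoder = Decoder(vocab_size=30000, embedding_dim=620,
                  state_dim=1000, representation_dim=2000)
decoder.weights_init = IsotropicGaussian(0.01)
decoder.biases_init = Constant(0)
decoder.initialize()

# Masks arrive as (batch, length) and are transposed inside cost().
source_sentence_mask = tensor.matrix('source_mask')
target_sentence = tensor.lmatrix('target')
target_sentence_mask = tensor.matrix('target_mask')
cost = decoder.cost(representation, source_sentence_mask,
                    target_sentence, target_sentence_mask)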
class NoLookupDecoder(Initializable):
    """This is the decoder implementation without embedding layer or
    softmax. The target sentence is represented as a sequence of vectors
    as defined by the sparse feature map.
    """

    def __init__(self, vocab_size, embedding_dim, state_dim, att_dim,
                 maxout_dim, representation_dim,
                 attention_strategy='content', attention_sources='s',
                 readout_sources='sfa', memory='none', memory_size=500,
                 seq_len=50, init_strategy='last', theano_seed=None,
                 **kwargs):
        """Creates a new decoder brick without embedding.

        Args:
            vocab_size (int): Target language vocabulary size
            embedding_dim (int): Size of feedback embedding layer
            state_dim (int): Number of hidden units
            att_dim (int): Size of attention match vector
            maxout_dim (int): Size of maxout layer
            representation_dim (int): Dimension of source annotations
            attention_strategy (string): Which attention should be used
                                         (cf. ``_initialize_attention``)
            attention_sources (string): Defines the sources used by the
                                        attention model: 's' for decoder
                                        states, 'f' for feedback
            readout_sources (string): Defines the sources used in the
                                      readout network: 's' for decoder
                                      states, 'f' for feedback, 'a' for
                                      attention (context vector)
            memory (string): Which external memory should be used
                             (cf. ``_initialize_attention``)
            memory_size (int): Size of the external memory structure
            seq_len (int): Maximum sentence length
            init_strategy (string): How to initialize the RNN state
                                    (cf. ``GRUInitialState``)
            theano_seed: Random seed
        """
        super(NoLookupDecoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(attended_dim=state_dim,
                                          init_strategy=init_strategy,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')

        # Initialize the attention mechanism
        att_dim = att_dim if att_dim > 0 else state_dim
        self.attention, src_names = _initialize_attention(
            attention_strategy, seq_len, self.transition,
            representation_dim, att_dim, attention_sources,
            readout_sources, memory, memory_size)

        # Initialize the readout
        maxout_dim = maxout_dim if maxout_dim > 0 else state_dim
        readout = Readout(
            source_names=src_names,
            readout_dim=embedding_dim,
            emitter=NoLookupEmitter(initial_output=-1,
                                    readout_dim=embedding_dim,
                                    cost_brick=SquaredError()),
            #                       cost_brick=CategoricalCrossEntropy()),
            feedback_brick=TrivialFeedback(output_dim=embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=maxout_dim, name='maxout_bias').apply,
                 Maxout(num_pieces=2, name='maxout').apply,
                 Linear(input_dim=maxout_dim / 2, output_dim=embedding_dim,
                        use_bias=False, name='softmax0').apply,
                 Logistic(name='softmax1').apply]),
            merged_dim=maxout_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'],
                      prototype=Linear()))

        self.children = [self.sequence_generator]

    @application(inputs=['representation', 'representation_mask',
                         'target_sentence_mask', 'target_sentence'],
                 outputs=['cost'])
    def cost(self, representation, representation_mask,
             target_sentence, target_sentence_mask):
        target_sentence = target_sentence.T
        target_sentence_mask = target_sentence_mask.T

        # Get the cost matrix
        cost = self.sequence_generator.cost_matrix(**{
            'mask': target_sentence_mask,
            'outputs': target_sentence,
            'attended': representation,
            'attended_mask': representation_mask})

        return (cost * target_sentence_mask).sum() / \
            target_sentence_mask.shape[1]

    @application
    def generate(self, source_shape, representation, **kwargs):
        return self.sequence_generator.generate(
            n_steps=2 * source_shape[1],
            batch_size=source_shape[0],
            attended=representation,
            attended_mask=tensor.ones(source_shape).T,
            **kwargs)
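# The return expression above sums per-step costs over time and averages over
# the batch (axis 1 of the mask). A small self-contained NumPy sketch of the
# same reduction, with made-up numbers:
import numpy

step_costs = numpy.array([[0.5, 1.0],
                          [0.7, 0.0]])   # (time, batch) per-step costs
step_mask = numpy.array([[1., 1.],
                         [1., 0.]])      # second sequence has length 1
batch_mean = (step_costs * step_mask).sum() / step_mask.shape[1]  # 1.1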
name="readout") generator = SequenceGenerator(readout=readout, transition=transition, name = "generator") generator.weights_init = IsotropicGaussian(0.01) generator.biases_init = Constant(0.) generator.push_initialization_config() generator.transition.biases_init = IsotropicGaussian(0.01,1) generator.transition.push_initialization_config() generator.initialize() cost_matrix = generator.cost_matrix(x) cost = cost_matrix.mean() cost.name = "sequence_log_likelihood" cg = ComputationGraph(cost) model = Model(cost) ################# # Algorithm ################# n_batches = 500 algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Adam(lr)]))
def test_sequence_generator_with_lm():
    floatX = theano.config.floatX
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim, activation=Tanh(),
                                weights_init=Orthogonal())
    language_model = SequenceGenerator(
        Readout(readout_dim=readout_dim, source_names=["states"],
                emitter=SoftmaxEmitter(theano_seed=1234),
                feedback_brick=LookupFeedback(readout_dim, dim,
                                              name='feedback')),
        SimpleRecurrent(dim, Tanh()),
        name='language_model')
    generator = SequenceGenerator(
        Readout(readout_dim=readout_dim,
                source_names=["states", "lm_states"],
                emitter=SoftmaxEmitter(theano_seed=1234),
                feedback_brick=LookupFeedback(readout_dim, feedback_dim)),
        transition,
        language_model=language_model,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.lmatrix('y')
    y.tag.test_value = numpy.zeros((15, batch_size), dtype='int64')
    mask = tensor.matrix('mask')
    mask.tag.test_value = numpy.ones((15, batch_size))

    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])
    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 483.153, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], cost)(y_test, m_test)
    assert_allclose(cost_val, 16.105, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join(
        [generator.name, generator.cost.name, 'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask],
                                      [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.61051, rtol=1e-5)

    # Test 'generate' method
    states, outputs, lm_states, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    cg = ComputationGraph([states, outputs, costs])
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs], updates=cg.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -4.88367, rtol=1e-5)
    assert_allclose(costs_val.sum(), 486.681, rtol=1e-5)
    assert outputs_val.sum() == 627

    # Test that the cost is agnostic to masking/padding
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]],
                      [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
def main(mode, save_path, steps, num_batches):
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", dim=dim,
                                    activation=Tanh())
        generator = SequenceGenerator(
            Readout(readout_dim=num_states, source_names=["states"],
                    emitter=SoftmaxEmitter(name="emitter"),
                    feedback_brick=LookupFeedback(
                        num_states, feedback_dim, name='feedback'),
                    name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(
            generator.cost_matrix(x[:, :]).sum(), x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[
                FinishAfter(after_n_batches=num_batches),
                TrainingDataMonitoring([cost], prefix="this_step",
                                       after_batch=True),
                TrainingDataMonitoring([cost], prefix="average",
                                       every_n_batches=100),
                Checkpoint(save_path, every_n_batches=500),
                Printing(every_n_batches=100)])
        main_loop.run()
    elif mode == "sample":
        main_loop = cPickle.load(open(save_path, "rb"))
        generator = main_loop.model

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)
        ).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(
            freqs, MarkovChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        assert False
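# Illustrative invocations of main above; the checkpoint path is an
# assumption. `steps` is only used in "sample" mode and `num_batches` only
# in "train" mode.
main("train", "markov_chain.pkl", steps=0, num_batches=1000)
main("sample", "markov_chain.pkl", steps=1000, num_batches=0)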
def main(mode, save_path, steps, num_batches, load_params):
    chars = (list(string.ascii_uppercase) + list(string.digits) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.items()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))

    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)
        ).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size,
                source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(batch_cost, features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in Selector(generator).get_parameters().items()], width=120)) train_stream = train_dataset.get_example_stream() train_stream = Mapping(train_stream, _truncate) train_stream = Batch(train_stream, iteration_scheme=ConstantScheme(batch_size)) train_stream = Padding(train_stream) train_stream = Mapping(train_stream, _transpose) parameters = model.get_parameter_dict() maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values()) algorithm = GradientDescent( cost=batch_cost, parameters=parameters.values(), step_rule=CompositeRule([StepClipping(1000.), AdaDelta(epsilon=1e-8) #, Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects) ])) ft = features[:6, 0] ft.name = 'feature_example' observables = [cost, ppl, char_cost, length, bits_per_char] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' observables.append(stats) track_the_best_bpc = TrackTheBest('valid_bits_per_char') root_path, extension = os.path.splitext(save_path) this_step_monitoring = TrainingDataMonitoring( observables + [ft], prefix="this_step", after_batch=True) average_monitoring = TrainingDataMonitoring( observables + [algorithm.total_step_norm, algorithm.total_gradient_norm], prefix="average", every_n_batches=10) valid_monitoring = DataStreamMonitoring( observables, prefix="valid", every_n_batches=1500, before_training=False, data_stream=valid_stream) main_loop = MainLoop( algorithm=algorithm, data_stream=train_stream, model=model, extensions=[ this_step_monitoring, average_monitoring, valid_monitoring, track_the_best_bpc, Checkpoint(save_path, ), Checkpoint(save_path, every_n_batches=500, save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_bpc.notification_name), (root_path + "_best" + extension,)), Timing(after_batch=True), Printing(every_n_batches=10), Plot(root_path, [[average_monitoring.record_name(cost), valid_monitoring.record_name(cost)], [average_monitoring.record_name(algorithm.total_step_norm)], [average_monitoring.record_name(algorithm.total_gradient_norm)], [average_monitoring.record_name(ppl), valid_monitoring.record_name(ppl)], [average_monitoring.record_name(char_cost), valid_monitoring.record_name(char_cost)], [average_monitoring.record_name(bits_per_char), valid_monitoring.record_name(bits_per_char)]], every_n_batches=10) ]) main_loop.run() elif mode == 'evaluate': with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f: raw_words = [line.split()[1:-1] for line in f.readlines()] words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>'] for c in w] for w in raw_words] max_word_length = max([len(w) for w in words]) initial_states = tensor.matrix('init_states') cost_matrix_step = generator.cost_matrix(features, mask=features_mask, states=initial_states) cg = ComputationGraph(cost_matrix_step) states = cg.auxiliary_variables[-2] compute_cost = theano.function([features, features_mask, initial_states], [cost_matrix_step.sum(axis=0), states]) cost_matrix = generator.cost_matrix(features, mask=features_mask) initial_cg = ComputationGraph(cost_matrix) initial_states = 
initial_cg.auxiliary_variables[-2] total_word_cost = 0 num_words = 0 examples = numpy.zeros((max_word_length + 1, len(words)), dtype='int64') all_masks = numpy.zeros((max_word_length + 1, len(words)), dtype=floatX) for i, word in enumerate(words): examples[:len(word), i] = word all_masks[:len(word), i] = 1. single_space = numpy.array([char_to_ind[' ']])[:, None] for batch in valid_stream.get_epoch_iterator(): for example, mask in equizip(batch[0].T, batch[1].T): example = example[:(mask.sum())] spc_inds = list(numpy.where(example == char_to_ind[" "])[0]) state = generator.transition.transition.initial_states_.get_value()[None, :] for i, j in equizip([-1] + spc_inds, spc_inds + [-1]): word = example[(i+1):j, None] word_cost, states = compute_cost( word, numpy.ones_like(word, dtype=floatX), state) state = states[-1] costs = numpy.exp(-compute_cost( examples, all_masks, numpy.tile(state, [examples.shape[1], 1]))[0]) _, space_states = compute_cost( single_space, numpy.ones_like(single_space, dtype=floatX), state) state = space_states[-1] word_prob = numpy.exp(-word_cost) total_word_cost += word_cost + numpy.log(numpy.sum(costs)) num_words += 1 print(word_prob) print(numpy.sum(costs)) print("Average cost", total_word_cost / num_words) print("PPL", numpy.exp(total_word_cost / num_words)) print("Word-level perplexity") print(total_word_cost / num_words) else: assert False
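# In the 'evaluate' branch above, a word's probability is its exponentiated
# negative sequence cost renormalized over the lexicon. The accumulated
# quantity is equivalent to this helper (a sketch, assuming the arrays
# returned by compute_cost above):
import numpy

def word_neg_log_prob(word_cost, lexicon_costs):
    """-log P(word | history) = cost(word) + log sum_v exp(-cost(v))."""
    return word_cost + numpy.log(numpy.sum(numpy.exp(-lexicon_costs)))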
def main_rnn(config):
    x = tensor.tensor3('features')
    y = tensor.matrix('targets')

    # if 'LSTM' in config['model']:
    #     from models import getLSTMstack
    #     y_hat = getLSTMstack(input_dim=13, input_var=x,
    #                          depth=int(config['model'][-1]))
    # else:
    #     raise Exception("These are not the LSTM we are looking for")
    # y_hat = model.apply(x)

    emitter = TestEmitter()
    # emitter = TrivialEmitter(readout_dim=config['lstm_hidden_size'])
    # cost_func = SquaredError()

    # @application
    # def qwe(self, readouts, outputs=None):
    #     print(type(self), type(readouts))
    #     x = cost_func.apply(readouts, outputs)
    #     return x

    print(type(emitter.cost))
    # emitter.cost = qwe
    # print(type(qwe))

    steps = 2
    n_samples = config['target_size']

    transition = [LSTM(config['lstm_hidden_size']) for _ in range(4)]
    transition = RecurrentStack(transition, name="transition",
                                skip_connections=False)
    source_names = [name for name in transition.apply.states
                    if 'states' in name]

    readout = Readout(emitter, readout_dim=config['lstm_hidden_size'],
                      source_names=source_names, feedback_brick=None,
                      merge=None, merge_prototype=None,
                      post_merge=None, merged_dim=None)

    seqgen = SequenceGenerator(readout, transition, attention=None,
                               add_contexts=False)
    seqgen.weights_init = IsotropicGaussian(0.01)
    seqgen.biases_init = Constant(0.)
    seqgen.push_initialization_config()

    seqgen.transition.biases_init = IsotropicGaussian(0.01, 1)
    seqgen.transition.push_initialization_config()
    seqgen.initialize()

    states = seqgen.transition.apply.outputs
    print('states', states)
    states = {name: shared_floatx_zeros(
        (n_samples, config['lstm_hidden_size'])) for name in states}

    cost_matrix = seqgen.cost_matrix(x, **states)
    cost = cost_matrix.mean()
    cost.name = "nll"

    cg = ComputationGraph(cost)
    model = Model(cost)

    # Cost
    # cost = SquaredError().apply(y_hat, y)
    # cost = CategoricalCrossEntropy().apply(T.flatten(), Y)

    # For sampling
    # cg = ComputationGraph(seqgen.generate(n_steps=steps,
    #                                       batch_size=n_samples,
    #                                       iterate=True))

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=config['learning_rate']))

    # Getting the stream
    train_stream = MFCC.get_stream(config['batch_size'],
                                   config['source_size'],
                                   config['target_size'],
                                   config['num_examples'])

    # Monitoring stuff
    extensions = [Timing(),
                  FinishAfter(after_n_batches=config['num_batches']),
                  # DataStreamMonitoring([cost, error_rate], test_stream,
                  #                      prefix="test"),
                  TrainingDataMonitoring([cost], prefix="train",
                                         every_n_batches=1),
                  # Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=1)]

    main_loop = MainLoop(algorithm, train_stream,
                         # model=model,
                         extensions=extensions)
    main_loop.run()
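# An illustrative config for main_rnn above; the keys are those read in the
# function body, the values are assumptions.
config = {
    'model': 'LSTM4',        # only inspected in the commented-out branch
    'lstm_hidden_size': 256,
    'target_size': 100,
    'source_size': 39,       # e.g. an MFCC feature dimension
    'batch_size': 32,
    'num_examples': 1000,
    'learning_rate': 1e-4,
    'num_batches': 100,
}
main_rnn(config)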
                  source_names=source_names,
                  emitter=emitter,
                  feedback_brick=feedback,
                  name="readout")

generator = SequenceGenerator(readout=readout,
                              transition=transition,
                              name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.initialize()

# ipdb.set_trace()
cost_matrix = generator.cost_matrix(x, x_mask)
cost = cost_matrix.sum() / x_mask.sum()
cost.name = "sequence_log_likelihood"

##############
# Test with first batch
##############
x_tr, x_mask_tr = next(data_stream.get_epoch_iterator())
f1 = function([x, x_mask], cost)
# print(f1(x_tr, x_mask_tr))
# ipdb.set_trace()

################
# Optimization Algorithm
class PyramidLayer(Initializable):
    """Basic unit for the pyramid model."""

    def __init__(self, batch_size, frame_size, k, depth, size, **kwargs):
        super(PyramidLayer, self).__init__(**kwargs)

        target_size = frame_size * k

        depth_x = depth
        hidden_size_mlp_x = 32 * size

        depth_transition = depth - 1

        depth_theta = depth
        hidden_size_mlp_theta = 32 * size
        hidden_size_recurrent = 32 * size * 3

        depth_context = depth
        hidden_size_mlp_context = 32 * size
        context_size = 32 * size

        activations_x = [Rectifier()] * depth_x
        dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
                 [4 * hidden_size_recurrent]

        activations_theta = [Rectifier()] * depth_theta
        dims_theta = [hidden_size_recurrent] + \
                     [hidden_size_mlp_theta] * depth_theta

        activations_context = [Rectifier()] * depth_context
        dims_context = [frame_size] + \
                       [hidden_size_mlp_context] * (depth_context - 1) + \
                       [context_size]

        mlp_x = MLP(activations=activations_x, dims=dims_x, name="mlp_x")
        feedback = DeepTransitionFeedback(mlp=mlp_x)

        transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                     use_bias=True,
                                     name="gru_{}".format(i))
                      for i in range(depth_transition)]
        transition = RecurrentStack(transition, name="transition",
                                    skip_connections=True)
        self.transition = transition

        mlp_theta = MLP(activations=activations_theta,
                        dims=dims_theta, name="mlp_theta")
        mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k,
                         const=0.00001, name="gmm_wrap")
        gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                                 output_size=frame_size, k=k)

        source_names = [name for name in transition.apply.states
                        if 'states' in name]

        attention = SimpleSequenceAttention(
            state_names=source_names,
            state_dims=[hidden_size_recurrent],
            attended_dim=context_size,
            name="attention")

        # ipdb.set_trace()
        # Verify source names
        readout = Readout(
            readout_dim=hidden_size_recurrent,
            source_names=source_names + ['feedback'] + ['glimpses'],
            emitter=gmm_emitter,
            feedback_brick=feedback,
            name="readout")

        self.generator = SequenceGenerator(readout=readout,
                                           transition=transition,
                                           attention=attention,
                                           name="generator")

        self.mlp_context = MLP(activations=activations_context,
                               dims=dims_context)

        self.children = [self.generator, self.mlp_context]
        self.final_states = []

    def monitoring_vars(self, cg):
        readout = self.generator.readout
        readouts = VariableFilter(
            applications=[readout.readout],
            name_regex="output")(cg.variables)[0]

        mu, sigma, coeff = readout.emitter.components(readouts)

        min_sigma = sigma.min().copy(name="sigma_min")
        mean_sigma = sigma.mean().copy(name="sigma_mean")
        max_sigma = sigma.max().copy(name="sigma_max")

        min_mu = mu.min().copy(name="mu_min")
        mean_mu = mu.mean().copy(name="mu_mean")
        max_mu = mu.max().copy(name="mu_max")

        monitoring_vars = [mean_sigma, min_sigma, min_mu,
                           max_mu, mean_mu, max_sigma]
        return monitoring_vars

    @application
    def cost(self, x, context, **kwargs):
        cost_matrix = self.generator.cost_matrix(
            x, attended=self.mlp_context.apply(context), **kwargs)
        return cost_matrix.mean()

    @application
    def generate(self, context):
        return self.generator.generate(
            attended=self.mlp_context.apply(context),
            n_steps=context.shape[0],
            batch_size=context.shape[1],
            iterate=True)
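# A hypothetical instantiation of PyramidLayer; all sizes are illustrative
# assumptions, and x/context stand for symbolic (time, batch, dim) inputs.
layer = PyramidLayer(batch_size=64, frame_size=257, k=20, depth=4, size=10,
                     name="pyramid_layer")
layer.weights_init = IsotropicGaussian(0.01)
layer.biases_init = Constant(0.)
layer.initialize()
cost = layer.cost(x, context)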
generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.push_initialization_config()

generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()
generator.initialize()

states = {}
states = generator.transition.apply.outputs
states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent))
          for name in states}

cost_matrix = generator.cost_matrix(x, **states)
# cost_matrix = cost_matrix * voiced

from theano import function

cost = cost_matrix.mean() + 0. * start_flag
cost.name = "nll"

cg = ComputationGraph(cost)
model = Model(cost)

transition_matrix = VariableFilter(
    theano_name_regex="state_to_state")(cg.parameters)
for matr in transition_matrix:
    matr.set_value(0.98 * numpy.eye(hidden_size_recurrent, dtype=floatX))
def test_sequence_generator():
    """Test a sequence generator with no contexts and continuous outputs.

    Such sequence generators can be used to model e.g. dynamical systems.

    """
    rng = numpy.random.RandomState(1234)

    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = SimpleRecurrent(activation=Tanh(), dim=dim,
                                 weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=output_dim, source_names=["states"],
                emitter=TestEmitter()),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0.0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.tensor3('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    y_test = rng.uniform(size=(n_steps, batch_size,
                               output_dim)).astype(floatX)
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = theano.function([y, mask], [costs])(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 115.593, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 3.8531, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join(
        [generator.name, generator.cost.name, 'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask],
                                      [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 0.38531, rtol=1e-5)

    # Test 'generate' method
    states, outputs, costs = [
        variable.eval() for variable in generator.generate(
            states=rng.uniform(size=(batch_size, dim)).astype(floatX),
            iterate=True, batch_size=batch_size, n_steps=n_steps)]
    assert states.shape == (n_steps, batch_size, dim)
    assert outputs.shape == (n_steps, batch_size, output_dim)
    assert costs.shape == (n_steps, batch_size)
    assert_allclose(outputs.sum(), -0.33683, rtol=1e-5)
    assert_allclose(states.sum(), 15.7909, rtol=1e-5)
    # There is no generation cost in this case,
    # since generation is deterministic
    assert_allclose(costs.sum(), 0.0)
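# TestEmitter is not shown in this snippet. A plausible minimal definition
# consistent with the test (deterministic emission, hence the zero generation
# cost asserted above) might look as follows; this is an assumption, not
# necessarily the brick actually used.
from blocks.bricks.base import application
from blocks.bricks.sequence_generators import AbstractEmitter

class TestEmitter(AbstractEmitter):
    """Deterministic emitter: emit the readouts, squared-error cost."""
    @application
    def emit(self, readouts):
        return readouts

    @application
    def cost(self, readouts, outputs):
        return ((readouts - outputs) ** 2).sum(axis=-1)

    @application
    def initial_outputs(self, batch_size):
        return tensor.zeros((batch_size, self.readout_dim))

    def get_dim(self, name):
        if name == 'outputs':
            return self.readout_dim
        return super(TestEmitter, self).get_dim(name)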
name="readout") generator = SequenceGenerator(readout=readout, transition=transition, name = "generator") generator.weights_init = IsotropicGaussian(0.01) generator.biases_init = Constant(0.001) generator.push_initialization_config() #generator.transition.weights_init = initialization.Identity(0.98) #generator.transition.biases_init = IsotropicGaussian(0.01,0.9) generator.transition.push_initialization_config() generator.initialize() cost_matrix = generator.cost_matrix(x, x_mask) cost = cost_matrix.sum(axis = 0).mean() cost.name = "nll" cg = ComputationGraph(cost) model = Model(cost) transition_matrix = VariableFilter( theano_name_regex = "state_to_state")(cg.parameters) for matr in transition_matrix: matr.set_value(0.98*np.eye(hidden_size_recurrent, dtype = floatX)) readouts = VariableFilter( applications = [generator.readout.readout], name_regex = "output")(cg.variables)[0] mean, sigma, corr, weight, penup = emitter.components(readouts)
generator = SequenceGenerator(readout=readout,
                              attention=attention,
                              transition=transition,
                              name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.001)
generator.push_initialization_config()

lookup.weights_init = IsotropicGaussian(0.01)
lookup.biases_init = Constant(0.001)
lookup.initialize()

# generator.transition.weights_init = initialization.Identity(0.98)
# generator.transition.biases_init = IsotropicGaussian(0.01, 0.9)
generator.transition.push_initialization_config()
generator.initialize()

cost_matrix = generator.cost_matrix(x, x_mask, attended=embed,
                                    attended_mask=context_mask)
cost = cost_matrix.sum(axis=0).mean()
cost.name = "nll"

cg = ComputationGraph(cost)
model = Model(cost)

transition_matrix = VariableFilter(
    theano_name_regex="state_to_state")(cg.parameters)
for matr in transition_matrix:
    matr.set_value(0.98 * np.eye(hidden_size_recurrent, dtype=floatX))

readouts = VariableFilter(
    applications=[generator.readout.readout],
    name_regex="output")(cg.variables)[0]

mean, sigma, corr, weight, penup = emitter.components(readouts)

emit = generator.generate(
def test_with_attention():
    """Test a sequence generator with continuous outputs and attention."""
    rng = numpy.random.RandomState(1234)

    inp_dim = 2
    inp_len = 10
    attended_dim = 3
    attended_len = 11
    batch_size = 4
    n_steps = 30

    # For values
    def rand(size):
        return rng.uniform(size=size).astype(floatX)

    # For masks
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=floatX)
        # To make it look like real data
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    output_vals = rand((inp_len, batch_size, inp_dim))
    output_mask_vals = generate_mask(inp_len, batch_size)
    attended_vals = rand((attended_len, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_len, batch_size)

    transition = TestTransition(dim=inp_dim, attended_dim=attended_dim,
                                activation=Identity())
    attention = SequenceContentAttention(
        state_names=transition.apply.states, match_dim=inp_dim)
    generator = SequenceGenerator(
        Readout(readout_dim=inp_dim,
                source_names=[transition.apply.states[0],
                              attention.take_glimpses.outputs[0]],
                emitter=TestEmitter()),
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        add_contexts=False, seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    outputs = tensor.tensor3('outputs')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(outputs, mask,
                                  attended=attended,
                                  attended_mask=attended_mask)
    costs_vals = costs.eval({outputs: output_vals,
                             mask: output_mask_vals,
                             attended: attended_vals,
                             attended_mask: attended_mask_vals})
    assert costs_vals.shape == (inp_len, batch_size)
    assert_allclose(costs_vals.sum(), 13.5042, rtol=1e-5)

    # Test 'generate' method
    results = generator.generate(n_steps=n_steps,
                                 batch_size=attended.shape[1],
                                 attended=attended,
                                 attended_mask=attended_mask)
    assert len(results) == 5
    states_vals, outputs_vals, glimpses_vals, weights_vals, costs_vals = (
        theano.function([attended, attended_mask], results)
        (attended_vals, attended_mask_vals))
    assert states_vals.shape == (n_steps, batch_size, inp_dim)
    assert states_vals.shape == outputs_vals.shape
    assert glimpses_vals.shape == (n_steps, batch_size, attended_dim)
    assert weights_vals.shape == (n_steps, batch_size, attended_len)
    assert costs_vals.shape == (n_steps, batch_size)
    assert_allclose(states_vals.sum(), 23.4172, rtol=1e-5)
    # There is no generation cost in this case,
    # since generation is deterministic
    assert_allclose(costs_vals.sum(), 0.0, rtol=1e-5)
    assert_allclose(weights_vals.sum(), 120.0, rtol=1e-5)
    assert_allclose(glimpses_vals.sum(), 199.2402, rtol=1e-5)
    assert_allclose(outputs_vals.sum(), -11.6008, rtol=1e-5)
generator.transition.push_initialization_config()
generator.initialize()

states = {}
states = generator.transition.apply.outputs
states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent))
          for name in states}

x_tr = next(data_stream.get_epoch_iterator())
# ipdb.set_trace()

print(function([f0, voiced],
               mlp_context.apply(context))(x_tr[0], x_tr[2]).shape)

cost_matrix = generator.cost_matrix(
    x, attended=mlp_context.apply(context))  # , **states)

print(function([f0, x, voiced],
               cost_matrix)(x_tr[0], x_tr[1], x_tr[2]).shape)

cost = cost_matrix.mean() + 0. * start_flag
cost.name = "nll"

cg = ComputationGraph(cost)
model = Model(cost)

transition_matrix = VariableFilter(
    theano_name_regex="state_to_state")(cg.parameters)
for matr in transition_matrix:
    matr.set_value(0.98 * numpy.eye(hidden_size_recurrent, dtype=floatX))

from play.utils import regex_final_value
def test_integer_sequence_generator():
    """Test a sequence generator with integer outputs.

    Such sequence generators can be used to e.g. model language.

    """
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim, activation=Tanh(),
                                weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=readout_dim, source_names=["states"],
                emitter=SoftmaxEmitter(theano_seed=1234),
                feedback_brick=LookupFeedback(readout_dim,
                                              feedback_dim)),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])
    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 482.827, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 16.0942, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join(
        [generator.name, generator.cost.name, 'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask],
                                      [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5)

    # Test 'generate' method
    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    cg = ComputationGraph([states, outputs, costs])
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs], updates=cg.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -17.91811, rtol=1e-5)
    assert_allclose(costs_val.sum(), 482.863, rtol=1e-5)
    assert outputs_val.sum() == 630

    # Test that the cost is agnostic to masking/padding
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]],
                      [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
name="readout") generator = SequenceGenerator(readout=readout, transition=transition, name="generator") generator.weights_init = IsotropicGaussian(0.01) generator.biases_init = Constant(0.) generator.push_initialization_config() generator.transition.biases_init = IsotropicGaussian(0.01, 1) generator.transition.push_initialization_config() generator.initialize() cost_matrix = generator.cost_matrix(x) cost = cost_matrix.mean() cost.name = "sequence_log_likelihood" cg = ComputationGraph(cost) model = Model(cost) ################# # Algorithm ################# n_batches = 500 algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule(
generator.push_initialization_config()

generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()
generator.initialize()

states = {}
states = generator.transition.apply.outputs
states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent))
          for name in states}

cost_matrix = generator.cost_matrix(x, **states)
cost = cost_matrix.mean() + 0. * start_flag
cost.name = "nll"

cg = ComputationGraph(cost)
model = Model(cost)

transition_matrix = VariableFilter(
    theano_name_regex="state_to_state")(cg.parameters)
for matr in transition_matrix:
    matr.set_value(0.98 * numpy.eye(hidden_size_recurrent, dtype=floatX))

from play.utils import regex_final_value

extra_updates = []
for name, var in states.items():
generator = SequenceGenerator(readout=readout,
                              transition=transition,
                              attention=attention,
                              name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.initialize()

mlp_context.weights_init = IsotropicGaussian(0.01)
mlp_context.biases_init = Constant(0.)
mlp_context.initialize()

# ipdb.set_trace()
cost_matrix = generator.cost_matrix(
    x, x_mask, attended=mlp_context.apply(context))
cost = cost_matrix.sum() / x_mask.sum()
cost.name = "sequence_log_likelihood"

cg = ComputationGraph(cost)
model = Model(cost)

#################
# Algorithm
#################

algorithm = GradientDescent(
    cost=cost, parameters=cg.parameters,
    step_rule=CompositeRule([StepClipping(10.0), Adam(lr)]))

train_monitor = TrainingDataMonitoring(
class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        self.transition = GRUInitialState(
            attended_dim=state_dim, dim=state_dim,
            activation=Tanh(), name='decoder')
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim, name="attention")

        readout = Readout(
            source_names=['states', 'feedback',
                          self.attention.take_glimpses.outputs[0]],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=state_dim, name='maxout_bias').apply,
                 Maxout(num_pieces=2, name='maxout').apply,
                 Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                        use_bias=False, name='softmax0').apply,
                 Linear(input_dim=embedding_dim, name='softmax1').apply]),
            merged_dim=state_dim)

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'],
                      prototype=Linear()))

        self.children = [self.sequence_generator]

    @application(inputs=['representation', 'source_sentence_mask',
                         'target_sentence_mask', 'target_sentence'],
                 outputs=['cost'])
    def cost(self, representation, source_sentence_mask,
             target_sentence, target_sentence_mask):
        source_sentence_mask = source_sentence_mask.T
        target_sentence = target_sentence.T
        target_sentence_mask = target_sentence_mask.T

        # Get the cost matrix
        cost = self.sequence_generator.cost_matrix(**{
            'mask': target_sentence_mask,
            'outputs': target_sentence,
            'attended': representation,
            'attended_mask': source_sentence_mask})

        return (cost * target_sentence_mask).sum() / \
            target_sentence_mask.shape[1]

    @application
    def generate(self, source_sentence, representation):
        return self.sequence_generator.generate(
            n_steps=2 * source_sentence.shape[1],
            batch_size=source_sentence.shape[0],
            attended=representation,
            attended_mask=tensor.ones(source_sentence.shape).T)
name="readout") seq_gen = SequenceGenerator(readout=readout, transition=rnn, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator") seq_gen.push_initialization_config() rnn.weights_init = Orthogonal() seq_gen.initialize() # z markov_tutorial x = tensor.lvector('features') x = x.reshape( (x.shape[0], 1) ) cost = aggregation.mean(seq_gen.cost_matrix(x[:,:]).sum(), x.shape[1]) cost.name = "sequence_log_likelihood" cost_cg = ComputationGraph(cost) # theano.printing.pydotprint(cost, outfile="./pics/symbolic_graph_unopt.png", var_with_name_simple=True) algorithm = GradientDescent( cost=cost, parameters=list(Selector(seq_gen).get_parameters().values()), step_rule=Scale(0.001)) # AUDIOSCOPE OBSERVABLES (some) observables = [] observables += cost_cg.outputs observables.append(algorithm.total_step_norm) observables.append(algorithm.total_gradient_norm)