def __init__(self, feature_size, embedding_dim, state_dim, **kwargs):
    super(BidirectionalPhonemeAudioEncoder, self).__init__(**kwargs)
    self.feature_size = feature_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    self.audio_embedding = BidirectionalWMT15(
        GatedRecurrent(activation=Tanh(), dim=state_dim),
        name="audio_embeddings")
    self.audio_fwd_fork = Fork(
        [name for name in self.audio_embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='audio_fwd_fork')
    self.audio_back_fork = Fork(
        [name for name in self.audio_embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='audio_back_fork')

    self.phoneme_embedding = BidirectionalWMT15(
        GatedRecurrent(activation=Tanh(), dim=state_dim),
        name="phoneme_embeddings")
    self.phoneme_fwd_fork = Fork(
        [name for name in self.phoneme_embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='phoneme_fwd_fork')
    self.phoneme_back_fork = Fork(
        [name for name in self.phoneme_embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='phoneme_back_fork')

    self.words_embedding = BidirectionalWMT15(
        GatedRecurrent(activation=Tanh(), dim=state_dim),
        name="words_embeddings")
    self.words_fwd_fork = Fork(
        [name for name in self.words_embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='words_fwd_fork')
    self.words_back_fork = Fork(
        [name for name in self.words_embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='words_back_fork')

    self.children = [self.phoneme_embedding, self.audio_embedding,
                     self.words_embedding,
                     self.phoneme_fwd_fork, self.phoneme_back_fork,
                     self.audio_fwd_fork, self.audio_back_fork,
                     self.words_fwd_fork, self.words_back_fork]
def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
    super(BidirectionalPhonesEncoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    self.lookup = LookupTable(name='phones_embeddings')
    self.embedding = BidirectionalWMT15(
        GatedRecurrent(activation=Tanh(), dim=state_dim),
        name="audio_embeddings")
    self.embedding_fwd_fork = Fork(
        [name for name in self.embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='embedding_fwd_fork')
    self.embedding_back_fork = Fork(
        [name for name in self.embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='embedding_back_fork')

    self.bidir = BidirectionalWMT15(
        GatedRecurrent(activation=Tanh(), dim=state_dim),
        name="audio_representation")
    self.fwd_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='fwd_fork')
    self.back_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='back_fork')

    self.children = [self.lookup, self.bidir, self.embedding,
                     self.fwd_fork, self.back_fork,
                     self.embedding_fwd_fork, self.embedding_back_fork]
def setUp(self):
    self.gated = GatedRecurrent(
        dim=3, activation=Tanh(), gate_activation=Tanh(),
        weights_init=Constant(2))
    self.gated.initialize()
    self.reset_only = GatedRecurrent(
        dim=3, activation=Tanh(), gate_activation=Tanh(),
        weights_init=IsotropicGaussian(), seed=1)
    self.reset_only.initialize()
def setUp(self):
    self.gated = GatedRecurrent(
        dim=3, weights_init=Constant(2),
        activation=Tanh(), gate_activation=Tanh())
    self.gated.initialize()
    self.reset_only = GatedRecurrent(
        dim=3, weights_init=IsotropicGaussian(),
        activation=Tanh(), gate_activation=Tanh(),
        use_update_gate=False, rng=numpy.random.RandomState(1))
    self.reset_only.initialize()
def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3

    fork = Fork(input_dim=dim, output_dims=[dim, dim * 2],
                name='fork', output_names=["linear", "gates"],
                weights_init=initialization.Identity(),
                biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim,
                         weights_init=initialization.Identity(),
                         biases_init=Constant(0))
    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=dim, output_dim=dim,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
def __init__(
        self, encoder_type, num_characters, input_dim, encoder_dim,
        **kwargs):
    assert encoder_type in [None, 'bidirectional']
    self.encoder_type = encoder_type
    super(Encoder, self).__init__(**kwargs)
    self.children = []

    if encoder_type in ['lookup', 'bidirectional']:
        self.embed_label = LookupTable(
            num_characters, input_dim, name='embed_label')
        self.children += [self.embed_label]
    else:
        # If there is no encoder.
        assert num_characters == input_dim

    if encoder_type == 'bidirectional':
        transition = RecurrentWithFork(
            GatedRecurrent(dim=encoder_dim).apply,
            input_dim, name='encoder_transition')
        self.encoder = Bidirectional(transition, name='encoder')
        self.children.append(self.encoder)
def __init__(self, embedding_dim, state_dim, **kwargs):
    super(BidirectionalEncoder, self).__init__(**kwargs)
    # Dimension of the word embeddings taken as input
    self.embedding_dim = embedding_dim
    # Hidden state dimension
    self.state_dim = state_dim

    # The bidirectional GRU
    self.bidir = BidirectionalFromDict(
        GatedRecurrent(activation=Tanh(), dim=state_dim))

    # Forks to administer the inputs of GRU gates
    self.fwd_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='fwd_fork')
    self.back_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='back_fork')

    self.children = [self.bidir, self.fwd_fork, self.back_fork]
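# --- Usage sketch (illustrative only, not from the original module) ---
# The forks above are created without dimensions; in a Blocks NMT-style setup
# they are typically configured before initialization. The concrete sizes
# (620/1000) are hypothetical, and this assumes BidirectionalFromDict exposes
# the GatedRecurrent transition as `prototype`, as the list comprehensions
# above already rely on.
from blocks.initialization import Constant, IsotropicGaussian

encoder = BidirectionalEncoder(embedding_dim=620, state_dim=1000)
encoder.weights_init = IsotropicGaussian(0.01)
encoder.biases_init = Constant(0)
for fork in (encoder.fwd_fork, encoder.back_fork):
    # 'inputs' expects state_dim features, 'gate_inputs' expects 2 * state_dim.
    fork.input_dim = encoder.embedding_dim
    fork.output_dims = [encoder.bidir.prototype.get_dim(name)
                        for name in fork.output_names]
encoder.initialize()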
def __init__(self, hidden_size_recurrent, k, **kwargs):
    super(Scribe, self).__init__(**kwargs)
    readout_size = 6 * k + 1

    transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                 name="gru_{}".format(i))
                  for i in range(3)]
    transition = RecurrentStack(transition,
                                name="transition",
                                skip_connections=True)

    emitter = BivariateGMMEmitter(k=k)

    source_names = [name for name in transition.apply.states
                    if 'states' in name]

    readout = Readout(
        readout_dim=readout_size,
        source_names=source_names,
        emitter=emitter,
        name="readout")

    self.generator = SequenceGenerator(
        readout=readout,
        transition=transition,
        name="generator")

    self.children = [self.generator]
def __init__(self, embedding_dim, state_dim, **kwargs):
    """Constructor. Note that this implementation only supports
    single layer architectures.

    Args:
        embedding_dim (int): Dimensionality of the word vectors defined
                             by the sparse feature map.
        state_dim (int): Size of the recurrent layer.
    """
    super(NoLookupEncoder, self).__init__(**kwargs)
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.bidir = BidirectionalWMT15(
        GatedRecurrent(activation=Tanh(), dim=state_dim))
    self.fwd_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='fwd_fork')
    self.back_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='back_fork')
    self.children = [self.bidir, self.fwd_fork, self.back_fork]
def gru_layer(dim, h, n):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n), input_dim=dim,
                output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    return gru.apply(linear, gates)
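# --- Usage sketch (illustrative only, not from the original source) ---
# Stacking two of the helper layers above yields a 2-layer GRU; the second
# call consumes the states of the first. `dim` is a hypothetical feature size
# and the input is expected as (time, batch, dim), matching the fork's
# input_dim. The `initialize` helper is assumed from the surrounding module.
from theano import tensor

dim = 100
x = tensor.tensor3('x')
h1 = gru_layer(dim, x, 0)
h2 = gru_layer(dim, h1, 1)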
def test_sequence_generator():
    # Disclaimer: here we only check shapes, not values.
    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(name="transition", activation=Tanh(),
                                dim=dim, weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=output_dim, source_names=["states"],
                      emitter=TestEmitter(name="emitter"), name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.initialize()

    y = tensor.tensor3('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask], [costs])(
        numpy.zeros((n_steps, batch_size, output_dim), dtype=floatX),
        numpy.ones((n_steps, batch_size), dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = [
        variable.eval() for variable in generator.generate(
            iterate=True, batch_size=batch_size, n_steps=n_steps)]
    assert states.shape == (n_steps, batch_size, dim)
    assert outputs.shape == (n_steps, batch_size, output_dim)
    assert costs.shape == (n_steps, batch_size)
def __init__(self, k=20, rec_h_dim=400, att_size=10, num_letters=68,
             sampling_bias=0., attention_type="graves", epsilon=1e-6,
             attention_alignment=1., **kwargs):
    super(Scribe, self).__init__(**kwargs)

    # For now only softmax and graves are supported.
    assert attention_type in ["graves", "softmax"]

    readouts_dim = 1 + 6 * k

    self.k = k
    self.rec_h_dim = rec_h_dim
    self.att_size = att_size
    self.num_letters = num_letters
    self.sampling_bias = sampling_bias
    self.attention_type = attention_type
    self.epsilon = epsilon
    self.attention_alignment = attention_alignment

    self.cell1 = GatedRecurrent(dim=rec_h_dim, name='cell1')

    self.inp_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                          input_dim=3,
                          output_dims=[rec_h_dim, 2 * rec_h_dim],
                          name='inp_to_h1')

    self.h1_to_readout = Linear(input_dim=rec_h_dim,
                                output_dim=readouts_dim,
                                name='h1_to_readout')

    self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                          input_dim=rec_h_dim,
                          output_dims=[att_size] * 3,
                          name='h1_to_att')

    self.att_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                          input_dim=num_letters,
                          output_dims=[rec_h_dim, 2 * rec_h_dim],
                          name='att_to_h1')

    self.att_to_readout = Linear(input_dim=num_letters,
                                 output_dim=readouts_dim,
                                 name='att_to_readout')

    self.emitter = BivariateGMMEmitter(k=k, sampling_bias=sampling_bias)

    self.children = [
        self.cell1, self.inp_to_h1, self.h1_to_readout, self.h1_to_att,
        self.att_to_h1, self.att_to_readout, self.emitter]
def gru_layer(dim, h, n, x_mask, first, **kwargs):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n), input_dim=dim,
                output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    if first:
        gruApply = gru.apply(linear, gates, mask=x_mask, **kwargs)
    else:
        gruApply = gru.apply(linear, gates, **kwargs)
    return gruApply
def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
             state_dim, **kwargs):
    """Sole constructor.

    Args:
        vocab_size (int): Source vocabulary size
        embedding_dim (int): Dimension of the embedding layer
        n_layers (int): Number of layers. Layers share the same weight
                        matrices.
        skip_connections (bool): Skip connections connect the source
                                 word embeddings directly with deeper
                                 layers to propagate the gradient more
                                 efficiently
        state_dim (int): Number of hidden units in the recurrent layers.
    """
    super(DeepBidirectionalEncoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.n_layers = n_layers
    self.state_dim = state_dim
    self.skip_connections = skip_connections

    self.lookup = LookupTable(name='embeddings')
    self.bidirs = []
    self.fwd_forks = []
    self.back_forks = []
    for i in xrange(self.n_layers):
        bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name='bidir%d' % i)
        self.bidirs.append(bidir)
        self.fwd_forks.append(Fork(
            [name for name in bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='fwd_fork%d' % i))
        self.back_forks.append(Fork(
            [name for name in bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='back_fork%d' % i))
    self.children = [self.lookup] \
        + self.bidirs \
        + self.fwd_forks \
        + self.back_forks
def __init__(self, dimension, input_size, embed_input=False, **kwargs):
    super(GRUEncoder, self).__init__(**kwargs)
    if embed_input:
        self.embedder = LookupTable(input_size, dimension)
    else:
        self.embedder = Linear(input_size, dimension)
    self.fork = Fork(['inputs', 'gate_inputs'], dimension,
                     output_dims=[dimension, 2 * dimension],
                     prototype=Linear())
    encoder = Bidirectional(
        GatedRecurrent(dim=dimension, activation=Tanh()))
    self.encoder = encoder
    self.children = [encoder, self.embedder, self.fork]
def __init__(self, inner_input_dim, outer_input_dim, inner_dim, **kwargs):
    self.inner_gru = GatedRecurrent(dim=inner_dim, name='inner_gru')
    self.inner_input_fork = Fork(
        output_names=[name for name in self.inner_gru.apply.sequences
                      if 'mask' not in name],
        input_dim=inner_input_dim, name='inner_input_fork')
    self.outer_input_fork = Fork(
        output_names=[name for name in self.inner_gru.apply.sequences
                      if 'mask' not in name],
        input_dim=outer_input_dim, name='inner_outer_fork')

    super(InnerRecurrent, self).__init__(**kwargs)

    self.children = [
        self.inner_gru, self.inner_input_fork, self.outer_input_fork]
def test_integer_sequence_generator():
    # Disclaimer: here we only check shapes, not values.
    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(name="transition", activation=Tanh(),
                                dim=dim, weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=readout_dim, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(readout_dim, feedback_dim),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.initialize()

    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask], [costs])(
        numpy.zeros((n_steps, batch_size), dtype='int64'),
        numpy.ones((n_steps, batch_size), dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=costs.owner.inputs[0].owner.tag.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
def __init__(self, hidden_dim, activation=None, gate_activation=None,
             state_to_state_init=None, state_to_update_init=None,
             state_to_reset_init=None,
             input_to_state_transform=None, input_to_update_transform=None,
             input_to_reset_transform=None, **kwargs):
    super(GatedRecurrentFull, self).__init__(**kwargs)
    self.hidden_dim = hidden_dim

    self.state_to_state_init = state_to_state_init
    self.state_to_update_init = state_to_update_init
    self.state_to_reset_init = state_to_reset_init

    self.input_to_state_transform = input_to_state_transform
    self.input_to_update_transform = input_to_update_transform
    self.input_to_reset_transform = input_to_reset_transform
    self.input_to_state_transform.name += "_input_to_state_transform"
    self.input_to_update_transform.name += "_input_to_update_transform"
    self.input_to_reset_transform.name += "_input_to_reset_transform"

    self.use_mine = True
    if self.use_mine:
        self.rnn = GatedRecurrentFast(
            weights_init=Constant(np.nan),
            dim=self.hidden_dim,
            activation=activation,
            gate_activation=gate_activation)
    else:
        self.rnn = GatedRecurrent(
            weights_init=Constant(np.nan),
            dim=self.hidden_dim,
            activation=activation,
            gate_activation=gate_activation)

    self.children = [
        self.rnn,
        self.input_to_state_transform,
        self.input_to_update_transform,
        self.input_to_reset_transform]
    self.children.extend(self.rnn.children)
def __init__(self, src_vocab_size, embedding_dim, dgru_state_dim, state_dim,
             src_dgru_depth, bidir_encoder_depth, **kwargs):
    super(BidirectionalEncoder, self).__init__(**kwargs)
    self.state_dim = state_dim
    self.dgru_state_dim = dgru_state_dim
    self.decimator = Decimator(src_vocab_size, embedding_dim,
                               dgru_state_dim, src_dgru_depth)
    self.bidir = Bidirectional(
        RecurrentWithFork(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            dgru_state_dim, name='with_fork'),
        name='bidir0')

    self.children = [self.decimator, self.bidir]
    for layer_n in range(1, bidir_encoder_depth):
        self.children.append(copy.deepcopy(self.bidir))
        for child in self.children[-1].children:
            child.input_dim = 2 * state_dim
        self.children[-1].name = 'bidir{}'.format(layer_n)
def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True,
             **kwargs):
    super(Encoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.reverse = reverse

    self.lookup = LookupTable(name='embeddings')
    self.transition = GatedRecurrent(Tanh(), name='encoder_transition')
    self.fork = Fork(
        [name for name in self.transition.apply.sequences
         if name != 'mask'],
        prototype=Linear())

    self.children = [self.lookup, self.transition, self.fork]
def __init__(self, base_encoder, state_dim=1000, self_attendable=False,
             **kwargs):
    """Constructor.

    Args:
        base_encoder (Brick): Low level encoder network which produces
                              annotations to attend to
        state_dim (int): Size of the recurrent layer.
        self_attendable (bool): If true, the annotator can attend to its
                                own previous states. If false it can
                                only attend to base annotations
    """
    super(HierarchicalAnnotator, self).__init__(**kwargs)
    self.state_dim = state_dim * 2
    self.base_encoder = base_encoder
    self.self_attendable = self_attendable
    trans_core = GatedRecurrent(activation=Tanh(), dim=self.state_dim)
    if self_attendable:
        self.attention = SelfAttendableContentAttention(
            state_names=trans_core.apply.states,
            attended_dim=self.state_dim,
            match_dim=self.state_dim,
            num_steps=10,
            name="hier_attention")
    else:
        self.attention = SequenceContentAttention(
            state_names=trans_core.apply.states,
            attended_dim=self.state_dim,
            match_dim=self.state_dim,
            name="hier_attention")
    self.transition = AttentionRecurrent(trans_core,
                                         self.attention,
                                         name="hier_att_trans")
    self.children = [self.transition]
def __init__(self, batch_size, frame_size, k, depth, size, **kwargs):
    super(PyramidLayer, self).__init__(**kwargs)

    target_size = frame_size * k

    depth_x = depth
    hidden_size_mlp_x = 32 * size

    depth_transition = depth - 1

    depth_theta = depth
    hidden_size_mlp_theta = 32 * size
    hidden_size_recurrent = 32 * size * 3

    depth_context = depth
    hidden_size_mlp_context = 32 * size
    context_size = 32 * size

    activations_x = [Rectifier()] * depth_x
    dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
             [4 * hidden_size_recurrent]

    activations_theta = [Rectifier()] * depth_theta
    dims_theta = [hidden_size_recurrent] + \
                 [hidden_size_mlp_theta] * depth_theta

    activations_context = [Rectifier()] * depth_context
    dims_context = [frame_size] + \
        [hidden_size_mlp_context] * (depth_context - 1) + \
        [context_size]

    mlp_x = MLP(activations=activations_x,
                dims=dims_x,
                name="mlp_x")

    feedback = DeepTransitionFeedback(mlp=mlp_x)

    transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                 use_bias=True,
                                 name="gru_{}".format(i))
                  for i in range(depth_transition)]
    transition = RecurrentStack(transition,
                                name="transition",
                                skip_connections=True)
    self.transition = transition

    mlp_theta = MLP(activations=activations_theta,
                    dims=dims_theta,
                    name="mlp_theta")

    mlp_gmm = GMMMLP(mlp=mlp_theta,
                     dim=target_size,
                     k=k,
                     const=0.00001,
                     name="gmm_wrap")

    gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                             output_size=frame_size,
                             k=k)

    source_names = [name for name in transition.apply.states
                    if 'states' in name]

    attention = SimpleSequenceAttention(
        state_names=source_names,
        state_dims=[hidden_size_recurrent],
        attended_dim=context_size,
        name="attention")

    # ipdb.set_trace()  # Verify source names
    readout = Readout(
        readout_dim=hidden_size_recurrent,
        source_names=source_names + ['feedback'] + ['glimpses'],
        emitter=gmm_emitter,
        feedback_brick=feedback,
        name="readout")

    self.generator = SequenceGenerator(readout=readout,
                                       transition=transition,
                                       attention=attention,
                                       name="generator")

    self.mlp_context = MLP(activations=activations_context,
                           dims=dims_context)

    self.children = [self.generator, self.mlp_context]
    self.final_states = []
activations_x = [Rectifier()] * depth_x
dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
         [hidden_size_recurrent]

activations_theta = [Rectifier()] * depth_theta
dims_theta = [hidden_size_recurrent] + \
             [hidden_size_mlp_theta] * depth_theta

mlp_x = MLP(activations=activations_x, dims=dims_x)
feedback = DeepTransitionFeedback(mlp=mlp_x)

transition = [
    GatedRecurrent(dim=hidden_size_recurrent, name="gru_{}".format(i))
    for i in range(depth_recurrent)]
transition = RecurrentStack(transition,
                            name="transition",
                            skip_connections=True)

mlp_theta = MLP(activations=activations_theta, dims=dims_theta)
mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001)
emitter = GMMEmitter(gmmmlp=mlp_gmm,
                     output_size=frame_size,
                     k=k,
                     name="emitter")
def main(mode, save_path, num_batches, from_dump):
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets",),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99], char2code,
                                level="character",
                                preprocess=str.lower)
                            .get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(
            GatedRecurrent(dim=dimension, activation=Tanh()),
            weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([name for name in encoder.prototype.apply.sequences
                     if name != 'mask'],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension, dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension, attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension, name="attention")
        readout = LinearReadout(
            readout_dim=readout_dimension, source_names=["states"],
            emitter=SoftmaxEmitter(name="emitter"),
            feedbacker=LookupFeedback(readout_dimension, dimension),
            name="readout")
        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(
                **dict_union(
                    fork.apply(lookup.lookup(chars), return_dict=True),
                    mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(
            VariableFilter(application=readout.readout,
                           name="output")(cg.variables),
            singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(
            cost=cost,
            step_rule=CompositeRule([GradientClipping(10.0),
                                     SteepestDescent(0.01)]))

        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(observables, prefix="average",
                                       every_n_batches=10),
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch",
                    lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path, every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "test":
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0], batch_size=chars.shape[1],
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
def train():
    if os.path.isfile('trainingdata.tar'):
        with open('trainingdata.tar', 'rb') as f:
            main = load(f)
    else:
        hidden_size = 512
        filename = 'warpeace.hdf5'

        encoder = HDF5CharEncoder('warpeace_input.txt', 1000)
        encoder.write(filename)
        alphabet_len = encoder.length

        x = theano.tensor.lmatrix('x')

        readout = Readout(
            readout_dim=alphabet_len,
            feedback_brick=LookupFeedback(alphabet_len, hidden_size,
                                          name='feedback'),
            source_names=['states'],
            emitter=RandomSoftmaxEmitter(),
            name='readout')

        transition = GatedRecurrent(
            activation=Tanh(),
            dim=hidden_size)
        transition.weights_init = IsotropicGaussian(0.01)

        gen = SequenceGenerator(readout=readout,
                                transition=transition,
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                name='sequencegenerator')
        gen.push_initialization_config()
        gen.initialize()

        cost = gen.cost(outputs=x)
        cost.name = 'cost'

        cg = ComputationGraph(cost)

        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=Scale(0.5))

        train_set = encoder.get_dataset()
        train_stream = DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(
                train_set.num_examples, batch_size=128))

        main = MainLoop(
            model=Model(cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(),
                Printing(),
                Checkpoint('trainingdata.tar', every_n_epochs=10),
                ShowOutput(every_n_epochs=10)])

    main.run()
def test_integer_sequence_generator(): """Test a sequence generator with integer outputs. Such sequence generators can be used to e.g. model language. """ rng = numpy.random.RandomState(1234) readout_dim = 5 feedback_dim = 3 dim = 20 batch_size = 30 n_steps = 10 transition = GatedRecurrent(dim=dim, activation=Tanh(), weights_init=Orthogonal()) generator = SequenceGenerator( Readout(readout_dim=readout_dim, source_names=["states"], emitter=SoftmaxEmitter(theano_seed=1234), feedback_brick=LookupFeedback(readout_dim, feedback_dim)), transition, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), seed=1234) generator.initialize() # Test 'cost_matrix' method y = tensor.lmatrix('y') mask = tensor.matrix('mask') costs = generator.cost_matrix(y, mask) assert costs.ndim == 2 costs_fun = theano.function([y, mask], [costs]) y_test = rng.randint(readout_dim, size=(n_steps, batch_size)) m_test = numpy.ones((n_steps, batch_size), dtype=floatX) costs_val = costs_fun(y_test, m_test)[0] assert costs_val.shape == (n_steps, batch_size) assert_allclose(costs_val.sum(), 482.827, rtol=1e-5) # Test 'cost' method cost = generator.cost(y, mask) assert cost.ndim == 0 cost_val = theano.function([y, mask], [cost])(y_test, m_test) assert_allclose(cost_val, 16.0942, rtol=1e-5) # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method cg = ComputationGraph([cost]) var_filter = VariableFilter(roles=[AUXILIARY]) aux_var_name = '_'.join([generator.name, generator.cost.name, 'per_sequence_element']) cost_per_el = [el for el in var_filter(cg.variables) if el.name == aux_var_name][0] assert cost_per_el.ndim == 0 cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test) assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5) # Test generate states, outputs, costs = generator.generate( iterate=True, batch_size=batch_size, n_steps=n_steps) cg = ComputationGraph(states + outputs + costs) states_val, outputs_val, costs_val = theano.function( [], [states, outputs, costs], updates=cg.updates)() assert states_val.shape == (n_steps, batch_size, dim) assert outputs_val.shape == (n_steps, batch_size) assert outputs_val.dtype == 'int64' assert costs_val.shape == (n_steps, batch_size) assert_allclose(states_val.sum(), -17.91811, rtol=1e-5) assert_allclose(costs_val.sum(), 482.863, rtol=1e-5) assert outputs_val.sum() == 630 # Test masks agnostic results of cost cost1 = costs_fun([[1], [2]], [[1], [1]])[0] cost2 = costs_fun([[3, 1], [4, 2], [2, 0]], [[1, 1], [1, 1], [1, 0]])[0] assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
def __init__(self, vocab_size, embedding_dim, dgru_state_dim, igru_state_dim,
             state_dim, representation_dim, transition_depth, trg_igru_depth,
             trg_dgru_depth, trg_space_idx, trg_bos, theano_seed=None,
             **kwargs):
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.dgru_state_dim = dgru_state_dim
    self.igru_state_dim = igru_state_dim
    self.state_dim = state_dim
    self.trg_space_idx = trg_space_idx
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = RecurrentStack(
        [GRUInitialState(attended_dim=state_dim,
                         dim=state_dim,
                         activation=Tanh(),
                         name='decoder_gru_withinit')] +
        [GatedRecurrent(dim=state_dim,
                        activation=Tanh(),
                        name='decoder_gru' + str(i))
         for i in range(1, transition_depth)],
        skip_connections=False)

    # Initialize the attention mechanism
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim, name="attention")

    self.interpolator = Interpolator(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        igru_state_dim=igru_state_dim,
        igru_depth=trg_igru_depth,
        trg_dgru_depth=trg_dgru_depth,
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=trg_bos,
                               theano_seed=theano_seed),
        feedback_brick=TargetWordEncoder(vocab_size, embedding_dim,
                                         self.dgru_state_dim,
                                         trg_dgru_depth))

    # Build sequence generator accordingly
    self.sequence_generator = SequenceGeneratorDCNMT(
        trg_space_idx=self.trg_space_idx,
        readout=self.interpolator,
        transition=self.transition,
        attention=self.attention,
        transition_depth=transition_depth,
        igru_depth=trg_igru_depth,
        trg_dgru_depth=trg_dgru_depth,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'],
                  prototype=Linear()))

    self.children = [self.sequence_generator]
def __init__(self, batch_size, frame_size, k, depth, size, **kwargs):
    super(SimplePyramidLayer, self).__init__(**kwargs)

    target_size = frame_size * k

    depth_x = depth
    hidden_size_mlp_x = 32 * size

    depth_transition = depth - 1

    depth_theta = depth
    hidden_size_mlp_theta = 32 * size
    hidden_size_recurrent = 32 * size * 3

    activations_x = [Rectifier()] * depth_x
    dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
             [4 * hidden_size_recurrent]

    activations_theta = [Rectifier()] * depth_theta
    dims_theta = [hidden_size_recurrent] + \
                 [hidden_size_mlp_theta] * depth_theta

    self.mlp_x = MLP(activations=activations_x,
                     dims=dims_x,
                     name="mlp_x")

    transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                 use_bias=True,
                                 name="gru_{}".format(i))
                  for i in range(depth_transition)]
    self.transition = RecurrentStack(transition,
                                     name="transition",
                                     skip_connections=True)

    mlp_theta = MLP(activations=activations_theta,
                    dims=dims_theta,
                    name="mlp_theta")

    mlp_gmm = GMMMLP(mlp=mlp_theta,
                     dim=target_size,
                     k=k,
                     const=0.00001,
                     name="gmm_wrap")

    self.gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                                  output_size=frame_size,
                                  k=k)

    normal_inputs = [name for name in self.transition.apply.sequences
                     if 'mask' not in name]
    self.fork = Fork(normal_inputs,
                     input_dim=4 * hidden_size_recurrent,
                     output_dims=self.transition.get_dims(normal_inputs))

    self.children = [self.mlp_x, self.transition,
                     self.gmm_emitter, self.fork]
def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
             state_dim, **kwargs):
    """Sole constructor.

    Args:
        vocab_size (int): Source vocabulary size
        embedding_dim (int): Dimension of the embedding layer
        n_layers (int): Number of layers. Layers share the same weight
                        matrices.
        skip_connections (bool): Skip connections connect the source
                                 word embeddings directly with deeper
                                 layers to propagate the gradient more
                                 efficiently
        state_dim (int): Number of hidden units in the recurrent layers.
    """
    super(BidirectionalEncoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.n_layers = n_layers
    self.state_dim = state_dim
    self.skip_connections = skip_connections

    self.lookup = LookupTable(name='embeddings')
    if self.n_layers >= 1:
        self.bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim))
        self.fwd_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='fwd_fork')
        self.back_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='back_fork')
        self.children = [self.lookup, self.bidir,
                         self.fwd_fork, self.back_fork]
        if self.n_layers > 1:  # Deep encoder
            self.mid_fwd_fork = Fork(
                [name for name in self.bidir.prototype.apply.sequences
                 if name != 'mask'],
                prototype=Linear(), name='mid_fwd_fork')
            self.mid_back_fork = Fork(
                [name for name in self.bidir.prototype.apply.sequences
                 if name != 'mask'],
                prototype=Linear(), name='mid_back_fork')
            self.children.append(self.mid_fwd_fork)
            self.children.append(self.mid_back_fork)
    elif self.n_layers == 0:
        self.embedding_dim = state_dim * 2
        self.children = [self.lookup]
    else:
        logging.fatal("Number of encoder layers must be non-negative")
def __init__(
        self,
        input_dim=420,               # Dimension of the text labels
        output_dim=63,               # Dimension of vocoder frame
        rnn_h_dim=1024,              # Size of rnn hidden state
        readouts_dim=1024,           # Size of readouts (summary of rnn)
        weak_feedback=False,         # Feedback to the top rnn layer
        full_feedback=False,         # Feedback to all rnn layers
        feedback_noise_level=None,   # Amount of noise in feedback
        layer_norm=False,            # Use simple normalization?
        use_speaker=False,           # Condition on the speaker id?
        num_speakers=21,             # How many speakers there are?
        speaker_dim=128,             # Size of speaker embedding
        which_cost='MSE',            # Train with MSE or GMM
        k_gmm=20,                    # How many components in the GMM
        sampling_bias=0,             # Make samples more likely (Graves13)
        epsilon=1e-5,                # Numerical stabilities
        num_characters=43,           # How many chars in the labels
        attention_type='graves',     # graves or softmax
        attention_size=10,           # Number of gaussians in the attention
        attention_alignment=1.,      # Audio steps per letter at initialization
        sharpening_coeff=1.,
        timing_coeff=1.,
        encoder_type=None,
        encoder_dim=128,
        **kwargs):
    super(Parrot, self).__init__(**kwargs)

    self.input_dim = input_dim
    self.output_dim = output_dim
    self.rnn_h_dim = rnn_h_dim
    self.readouts_dim = readouts_dim
    self.layer_norm = layer_norm
    self.which_cost = which_cost
    self.use_speaker = use_speaker
    self.full_feedback = full_feedback
    self.feedback_noise_level = feedback_noise_level
    self.epsilon = epsilon

    self.num_characters = num_characters
    self.attention_type = attention_type
    self.attention_alignment = attention_alignment
    self.attention_size = attention_size
    self.sharpening_coeff = sharpening_coeff
    self.timing_coeff = timing_coeff

    self.encoder_type = encoder_type
    self.encoder_dim = encoder_dim

    self.encoded_input_dim = input_dim
    if self.encoder_type == 'bidirectional':
        self.encoded_input_dim = 2 * encoder_dim

    if self.feedback_noise_level is not None:
        self.noise_level_var = tensor.scalar('feedback_noise_level')

    self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1')
    self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2')
    self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3')

    self.h1_to_readout = Linear(input_dim=rnn_h_dim,
                                output_dim=readouts_dim,
                                name='h1_to_readout')
    self.h2_to_readout = Linear(input_dim=rnn_h_dim,
                                output_dim=readouts_dim,
                                name='h2_to_readout')
    self.h3_to_readout = Linear(input_dim=rnn_h_dim,
                                output_dim=readouts_dim,
                                name='h3_to_readout')

    self.h1_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                         input_dim=rnn_h_dim,
                         output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                         name='h1_to_h2')
    self.h1_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                         input_dim=rnn_h_dim,
                         output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                         name='h1_to_h3')
    self.h2_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                         input_dim=rnn_h_dim,
                         output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                         name='h2_to_h3')

    if which_cost == 'MSE':
        self.readout_to_output = Linear(input_dim=readouts_dim,
                                        output_dim=output_dim,
                                        name='readout_to_output')
    elif which_cost == 'GMM':
        self.sampling_bias = sampling_bias
        self.k_gmm = k_gmm
        self.readout_to_output = Fork(
            output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
            input_dim=readouts_dim,
            output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm],
            name='readout_to_output')

    self.encoder = Encoder(encoder_type, num_characters, input_dim,
                           encoder_dim, name='encoder')

    self.children = [
        self.encoder, self.rnn1, self.rnn2, self.rnn3,
        self.h1_to_readout, self.h2_to_readout, self.h3_to_readout,
        self.h1_to_h2, self.h1_to_h3, self.h2_to_h3,
        self.readout_to_output]

    self.inp_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                          input_dim=self.encoded_input_dim,
                          output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                          name='inp_to_h1')
    self.inp_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                          input_dim=self.encoded_input_dim,
                          output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                          name='inp_to_h2')
    self.inp_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                          input_dim=self.encoded_input_dim,
                          output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                          name='inp_to_h3')
    self.children += [self.inp_to_h1, self.inp_to_h2, self.inp_to_h3]

    self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                          input_dim=rnn_h_dim,
                          output_dims=[attention_size] * 3,
                          name='h1_to_att')
    self.att_to_readout = Linear(input_dim=self.encoded_input_dim,
                                 output_dim=readouts_dim,
                                 name='att_to_readout')
    self.children += [self.h1_to_att, self.att_to_readout]

    if use_speaker:
        self.num_speakers = num_speakers
        self.speaker_dim = speaker_dim
        self.embed_speaker = LookupTable(num_speakers, speaker_dim)

        self.speaker_to_h1 = Fork(
            output_names=['rnn1_inputs', 'rnn1_gates'],
            input_dim=speaker_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='speaker_to_h1')
        self.speaker_to_h2 = Fork(
            output_names=['rnn2_inputs', 'rnn2_gates'],
            input_dim=speaker_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='speaker_to_h2')
        self.speaker_to_h3 = Fork(
            output_names=['rnn3_inputs', 'rnn3_gates'],
            input_dim=speaker_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='speaker_to_h3')

        self.speaker_to_readout = Linear(input_dim=speaker_dim,
                                         output_dim=readouts_dim,
                                         name='speaker_to_readout')

        if which_cost == 'MSE':
            self.speaker_to_output = Linear(input_dim=speaker_dim,
                                            output_dim=output_dim,
                                            name='speaker_to_output')
        elif which_cost == 'GMM':
            self.speaker_to_output = Fork(
                output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                input_dim=speaker_dim,
                output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                name='speaker_to_output')

        self.children += [
            self.embed_speaker, self.speaker_to_h1, self.speaker_to_h2,
            self.speaker_to_h3, self.speaker_to_readout,
            self.speaker_to_output]

    if full_feedback:
        self.out_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                              input_dim=output_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='out_to_h2')
        self.out_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                              input_dim=output_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='out_to_h3')
        self.children += [self.out_to_h2, self.out_to_h3]
        weak_feedback = True

    self.weak_feedback = weak_feedback

    if weak_feedback:
        self.out_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                              input_dim=output_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='out_to_h1')
        self.children += [self.out_to_h1]