def example2(): """GRU""" x = tensor.tensor3('x') dim = 3 fork = Fork(input_dim=dim, output_dims=[dim, dim*2],name='fork',output_names=["linear","gates"], weights_init=initialization.Identity(),biases_init=Constant(0)) gru = GatedRecurrent(dim=dim, weights_init=initialization.Identity(),biases_init=Constant(0)) fork.initialize() gru.initialize() linear, gate_inputs = fork.apply(x) h = gru.apply(linear, gate_inputs) f = theano.function([x], h) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) doubler = Linear( input_dim=dim, output_dim=dim, weights_init=initialization.Identity(2), biases_init=initialization.Constant(0)) doubler.initialize() lin, gate = fork.apply(doubler.apply(x)) h_doubler = gru.apply(lin,gate) f = theano.function([x], h_doubler) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
def example2(): """GRU""" x = tensor.tensor3('x') dim = 3 fork = Fork(input_dim=dim, output_dims=[dim, dim * 2], name='fork', output_names=["linear", "gates"], weights_init=initialization.Identity(), biases_init=Constant(0)) gru = GatedRecurrent(dim=dim, weights_init=initialization.Identity(), biases_init=Constant(0)) fork.initialize() gru.initialize() linear, gate_inputs = fork.apply(x) h = gru.apply(linear, gate_inputs) f = theano.function([x], h) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) doubler = Linear(input_dim=dim, output_dim=dim, weights_init=initialization.Identity(2), biases_init=initialization.Constant(0)) doubler.initialize() lin, gate = fork.apply(doubler.apply(x)) h_doubler = gru.apply(lin, gate) f = theano.function([x], h_doubler) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
def build_fork_lookup(vocab_size, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    time_length = 5
    mini_batch_size = 2
    skip_connections = True
    layers = 3

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    print(output_names)
    print(output_dims)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)
    fork.initialize()

    f = theano.function([x], pre_rnn)
    return f
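def _demo_fork_lookup():
    # Hypothetical smoke test for build_fork_lookup above: the vocabulary
    # size and the 2 x 5 batch of token indices are arbitrary illustrative
    # values, and `args` is unused by that helper.
    f = build_fork_lookup(vocab_size=10, args=None)
    batch = np.ones((2, 5), dtype='int64')
    # One (Batch x Time x embedding_dim) array per forked output
    for layer_input in f(batch):
        print(layer_input.shape)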
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
def get_prernn(args):

    # time x batch
    x_mask = tensor.fmatrix('mask')

    # Compute the state dim
    if args.rnn_type == 'lstm':
        state_dim = 4 * args.state_dim
    else:
        state_dim = args.state_dim

    # Prepare the arguments for the fork
    output_names = []
    output_dims = []
    for d in range(args.layers):
        if d > 0:
            suffix = RECURRENTSTACK_SEPARATOR + str(d)
        else:
            suffix = ''
        if d == 0 or args.skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    # Prepare the brick to be forked (LookupTable or Linear)
    # Check if the dataset provides indices (in the case of a
    # fixed vocabulary, x is 2D tensor) or if it gives raw values
    # (x is 3D tensor)
    if has_indices(args.dataset):
        features = args.mini_batch_size
        x = tensor.lmatrix('features')
        vocab_size = get_output_size(args.dataset)
        lookup = LookupTable(length=vocab_size, dim=state_dim)
        lookup.weights_init = initialization.IsotropicGaussian(0.1)
        lookup.biases_init = initialization.Constant(0)
        forked = FeedforwardSequence([lookup.apply])
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x, dtype=floatX)

    else:
        x = tensor.tensor3('features', dtype=floatX)
        if args.used_inputs is not None:
            x = tensor.set_subtensor(
                x[args.used_inputs:, :, :],
                tensor.zeros_like(x[args.used_inputs:, :, :], dtype=floatX))
        features = get_output_size(args.dataset)
        forked = Linear(input_dim=features, output_dim=state_dim)
        forked.weights_init = initialization.IsotropicGaussian(0.1)
        forked.biases_init = initialization.Constant(0)

        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX)

    # Define the fork
    fork = Fork(output_names=output_names, input_dim=features,
                output_dims=output_dims, prototype=forked)
    fork.initialize()

    # Apply the fork
    prernn = fork.apply(x)

    # Give a name to the input of each layer
    if args.skip_connections:
        for t in range(len(prernn)):
            prernn[t].name = "pre_rnn_" + str(t)
    else:
        prernn.name = "pre_rnn"

    return prernn, x_mask
def build_fork_lookup(vocab_size, time_length, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i) for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn

    print(kwargs)
    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)
    return f_pre_rnn, f_h
def main(mode, save_path, num_batches, from_dump):
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets",),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99], char2code,
                                level="character",
                                preprocess=str.lower).get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(
            GatedRecurrent(dim=dimension, activation=Tanh()),
            weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([name for name in encoder.prototype.apply.sequences
                     if name != 'mask'],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension, dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension, attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension, name="attention")
        readout = LinearReadout(readout_dim=readout_dimension,
                                source_names=["states"],
                                emitter=SoftmaxEmitter(name="emitter"),
                                feedbacker=LookupFeedback(readout_dimension,
                                                          dimension),
                                name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      weights_init=IsotropicGaussian(0.1),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True),
                             mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(
            VariableFilter(application=readout.readout,
                           name="output")(cg.variables),
            singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(
            cost=cost, step_rule=CompositeRule([GradientClipping(10.0),
                                                SteepestDescent(0.01)]))

        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(observables, prefix="average",
                                       every_n_batches=10),
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch",
                    lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path, every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()

    elif mode == "test":
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0], batch_size=chars.shape[1],
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
def build_model_soft(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]
    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [LSTM(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        init_cells[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states and cells
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]

    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    # Don't initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
def main():
    nvis, nhid, nlat, learn_prior = 784, 200, 100, False
    theano_rng = MRG_RandomStreams(134663)

    # Initialize prior
    prior_mu = shared_floatx(numpy.zeros(nlat), name='prior_mu')
    prior_log_sigma = shared_floatx(numpy.zeros(nlat), name='prior_log_sigma')
    if learn_prior:
        add_role(prior_mu, PARAMETER)
        add_role(prior_log_sigma, PARAMETER)

    # Initialize encoding network
    encoding_network = MLP(activations=[Rectifier()],
                           dims=[nvis, nhid],
                           weights_init=IsotropicGaussian(std=0.001),
                           biases_init=Constant(0))
    encoding_network.initialize()
    encoding_parameter_mapping = Fork(
        output_names=['mu_phi', 'log_sigma_phi'], input_dim=nhid,
        output_dims=dict(mu_phi=nlat, log_sigma_phi=nlat),
        prototype=Linear(),
        weights_init=IsotropicGaussian(std=0.001),
        biases_init=Constant(0))
    encoding_parameter_mapping.initialize()

    # Initialize decoding network
    decoding_network = MLP(activations=[Rectifier()],
                           dims=[nlat, nhid],
                           weights_init=IsotropicGaussian(std=0.001),
                           biases_init=Constant(0))
    decoding_network.initialize()
    decoding_parameter_mapping = Linear(
        input_dim=nhid, output_dim=nvis, name='mu_theta',
        weights_init=IsotropicGaussian(std=0.001),
        biases_init=Constant(0))
    decoding_parameter_mapping.initialize()

    # Encode / decode
    x = tensor.matrix('features')
    h_phi = encoding_network.apply(x)
    mu_phi, log_sigma_phi = encoding_parameter_mapping.apply(h_phi)
    epsilon = theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype)
    epsilon.name = 'epsilon'
    z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
    z.name = 'z'
    h_theta = decoding_network.apply(z)
    mu_theta = decoding_parameter_mapping.apply(h_theta)

    # Compute cost
    kl_term = (
        prior_log_sigma - log_sigma_phi
        + 0.5 * (
            tensor.exp(2 * log_sigma_phi) + (mu_phi - prior_mu) ** 2
        ) / tensor.exp(2 * prior_log_sigma)
        - 0.5
    ).sum(axis=1)
    kl_term.name = 'kl_term'
    kl_term_mean = kl_term.mean()
    kl_term_mean.name = 'avg_kl_term'
    reconstruction_term = -(
        x * tensor.nnet.softplus(-mu_theta) +
        (1 - x) * tensor.nnet.softplus(mu_theta)).sum(axis=1)
    reconstruction_term.name = 'reconstruction_term'
    reconstruction_term_mean = -reconstruction_term.mean()
    reconstruction_term_mean.name = 'avg_reconstruction_term'
    cost = -(reconstruction_term - kl_term).mean()
    cost.name = 'nll_upper_bound'

    # Datasets and data streams
    mnist_train = MNIST(
        'train', start=0, stop=50000, binary=True, sources=('features',))
    train_loop_stream = DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 100))
    train_monitor_stream = DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 500))
    mnist_valid = MNIST(
        'train', start=50000, stop=60000, binary=True, sources=('features',))
    valid_monitor_stream = DataStream(
        dataset=mnist_valid,
        iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500))
    mnist_test = MNIST('test', binary=True, sources=('features',))
    test_monitor_stream = DataStream(
        dataset=mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 500))

    # Get parameters
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule)
    monitored_quantities = [cost, reconstruction_term_mean, kl_term_mean]
    main_loop = MainLoop(
        model=None, data_stream=train_loop_stream, algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=200),
            DataStreamMonitoring(
                monitored_quantities, train_monitor_stream, prefix="train"),
            DataStreamMonitoring(
                monitored_quantities, valid_monitor_stream, prefix="valid"),
            DataStreamMonitoring(
                monitored_quantities, test_monitor_stream, prefix="test"),
            Printing()])
    main_loop.run()
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    x = m.mean() + x  # mask is not always needed, but this keeps it in the graph

    #embedding_size = 300
    #glove_version = "glove.6B.300d.txt"
    embedding_size = 50
    glove_version = "vectors.6B.50d.txt"
    wstd = 0.02

    conv1 = Conv1D(filter_length=5, num_filters=128, input_dim=embedding_size,
                   weights_init=IsotropicGaussian(std=wstd),
                   biases_init=Constant(0.0))
    conv1.initialize()
    o = conv1.apply(x)
    o = Rectifier(name="conv1red").apply(o)
    o = MaxPooling1D(pooling_length=5
                     #, step=2
                     ).apply(o)

    conv2 = Conv1D(filter_length=5, num_filters=128, input_dim=128,
                   weights_init=IsotropicGaussian(std=wstd),
                   biases_init=Constant(0.0),
                   step=3,
                   name="conv2")
    conv2.initialize()
    o = conv2.apply(o)
    o = Rectifier(name="conv2rec").apply(o)

    conv2 = Conv1D(filter_length=5, num_filters=128, input_dim=128,
                   weights_init=IsotropicGaussian(std=wstd),
                   biases_init=Constant(0.0),
                   step=3,
                   name="conv3")
    conv2.initialize()
    o = conv2.apply(o)
    o = Rectifier(name="conv3rec").apply(o)

    fork = Fork(weights_init=IsotropicGaussian(0.02),
                biases_init=Constant(0.),
                input_dim=128,
                output_dims=[128] * 3,
                output_names=['inputs', 'reset_inputs', 'update_inputs'])
    fork.initialize()

    inputs, reset_inputs, update_inputs = fork.apply(o)

    out = o.mean(axis=1)

    #gru = GatedRecurrent(dim=128,
    #                     weights_init=IsotropicGaussian(0.02),
    #                     biases_init=IsotropicGaussian(0.0))
    #gru.initialize()
    #states = gru.apply(inputs=inputs, reset_inputs=reset_inputs,
    #                   update_inputs=update_inputs)
    #out = states[:, -1, :]

    hidden = Linear(
        input_dim=128,
        output_dim=128,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.))
    hidden.initialize()
    o = hidden.apply(out)
    o = Rectifier().apply(o)

    #hidden = Linear(
    #    input_dim=128,
    #    output_dim=128,
    #    weights_init=IsotropicGaussian(std=0.02),
    #    biases_init=Constant(0.),
    #    name="hiddenmap2")
    #hidden.initialize()
    #o = hidden.apply(o)
    #o = Rectifier(name="rec2").apply(o)

    score_layer = Linear(
        input_dim=128,
        output_dim=1,
        weights_init=IsotropicGaussian(std=wstd),
        biases_init=Constant(0.),
        name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)

    probs = Sigmoid().apply(o)

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1).shape.eval(
    #    {x: np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #     m: np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
    #    m: np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
    #    m: np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=10),
            AdaM(),
            #AdaDelta(),
        ]))

    # ========
    print("setting up data")
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    batch_size = 16

    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set)
        n_train = dataset.num_examples
        stream = DataStream(
            dataset=dataset,
            iteration_scheme=ShuffledScheme(
                examples=n_train,
                batch_size=batch_size))
        print("loading glove")
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
            data_stream=glove,
            mask_sources=('features',))
        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    train_stream = ServerDataStream(('features', 'features_mask', 'targets'),
                                    port=train_port)
    test_stream = ServerDataStream(('features', 'features_mask', 'targets'),
                                   port=test_port)

    print("setting up model")
    #import ipdb
    #ipdb.set_trace()

    n_examples = 25000
    #======
    model = Model(cost)

    extensions = []
    extensions.append(EpochProgress(
        batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_stream,
        prefix='test',
        after_epoch=True))
    extensions.append(Timing())
    extensions.append(Printing())

    #extensions.append(Plot("norms",
    #                       channels=[['train_lstm_norm', 'train_pre_norm']],
    #                       after_epoch=True))
    extensions.append(Plot(
        theano.config.device + "_result",
        channels=[['test_misclassification', 'train_misclassification']],
        after_epoch=True))

    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)
    main_loop.run()