def build_model_vanilla(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensors, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [SimpleRecurrent(dim=args.state_dim, activation=Tanh())
                   for _ in range(args.layers)]

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # We have:
    # h = [state, state_1, state_2, ...] if args.layers > 1
    # h = state                          if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    hidden_states = []
    if args.layers > 1:
        # Save all the last states
        for d in range(args.layers):
            # TODO: correct the bug
            # h[d] = h[d] * x_mask
            last_states[d] = h[d][-1, :, :]
            h[d].name = "hidden_state_" + str(d)
            hidden_states.append(h[d])
        if args.skip_connections or args.skip_output:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        # TODO: correct the bug
        # hidden_states.append(h * x_mask)
        hidden_states.append(h)
        hidden_states[0].name = "hidden_state_0"
        # Note: if a mask is used, updating the initial state
        # with the last state no longer makes sense.
        last_states[0] = h[-1, :, :]

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, hidden_states

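# A minimal, self-contained sketch of the state-carrying mechanism behind
# the `updates` lists returned by the builders above: each
# (shared_init, last_state) pair makes the compiled function copy the final
# hidden state of one batch into the initial state of the next.
# `_sketch_stateful_updates` is an illustrative helper, not used by the
# model builders; a simple sum stands in for the RNN step.
def _sketch_stateful_updates():
    import numpy
    import theano
    from theano import tensor

    init = theano.shared(numpy.zeros(3, dtype=theano.config.floatX))
    x = tensor.vector('x')
    new_state = init + x  # stand-in for one RNN step over a batch
    # The (shared, new_value) pair passed as `updates` writes the last
    # state back into the shared initial state after every call.
    step = theano.function([x], new_state, updates=[(init, new_state)])
    step(numpy.ones(3, dtype=theano.config.floatX))
    return init.get_value()  # now [1., 1., 1.]: seeds the next call
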
def build_model_cw(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensors, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    if args.module_order == "fast_in_slow":
        transitions = [ClockworkBase(dim=args.state_dim,
                                     activation=Tanh(),
                                     period=2 ** i)
                       for i in range(args.layers)]
    elif args.module_order == "slow_in_fast":
        transitions = [ClockworkBase(dim=args.state_dim,
                                     activation=Tanh(),
                                     period=2 ** (args.layers - i - 1))
                       for i in range(args.layers)]
    else:
        raise ValueError("Unknown module_order: " + args.module_order)

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # In the Clockwork case:
    # h = [state, time, state_1, time_1, ...]
    h = h[::2]

    # Now we have:
    # h = [state, state_1, state_2, ...] if args.layers > 1
    # h = [state]                        if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    hidden_states = []
    if args.layers > 1:
        # Save all the last states
        for d in range(args.layers):
            # TODO: correct the bug
            # h[d] = h[d] * x_mask
            last_states[d] = h[d][-1, :, :]
            h[d].name = "hidden_state_" + str(d)
            hidden_states.append(h[d])
        h = tensor.concatenate(h, axis=2)
    else:
        h = h[0] * x_mask
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state_all"

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, hidden_states

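# A small illustrative helper (not used by the model builders) showing the
# update schedule implied by the "fast_in_slow" periods above, assuming the
# usual clockwork rule that a module with period p only updates its state
# at time steps t with t % p == 0.
def _sketch_clockwork_schedule(layers=3, steps=8):
    schedule = {}
    for t in range(steps):
        # Layer i has period 2 ** i, so layer 0 ticks at every step while
        # deeper layers tick exponentially more rarely.
        schedule[t] = [i for i in range(layers) if t % (2 ** i) == 0]
    return schedule  # e.g. {0: [0, 1, 2], 1: [0], 2: [0, 1], ...}
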
def build_model_lstm(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensors, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [LSTM(dim=args.state_dim, activation=Tanh())
                   for _ in range(args.layers)]

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(mask=x_mask, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1, ...]
    last_states = {}
    last_cells = {}
    hidden_states = []
    for d in range(args.layers):
        # TODO: correct the bug
        # h[5 * d] = h[5 * d] * x_mask
        # h[5 * d + 1] = h[5 * d + 1] * x_mask
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]
        h[5 * d].name = "hidden_state_" + str(d)
        h[5 * d + 1].name = "hidden_cell_" + str(d)
        hidden_states.extend([h[5 * d], h[5 * d + 1]])

    # The updates of the hidden states and cells
    # Note: if a mask is used, updating the initial state
    # with the last state no longer makes sense.
    updates = []
    for d in range(args.layers):
        # inits[0] holds the initial states, inits[1] the initial cells
        updates.append((inits[0][d], last_states[d]))
        updates.append((inits[1][d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1, ...]
    # Extract the gate values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]
    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have:
    # h = [state, state_1, state_2, ...] if args.layers > 1
    # h = [state]                        if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if args.layers > 1:
        if args.skip_connections or args.skip_output:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state_all"

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, gate_values, hidden_states

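# A small illustrative helper (not used by the model builders) that mocks
# the output layout assumed by build_model_lstm: RecurrentStack is expected
# to return five sequences per LSTM layer, in the order state, cell, in,
# forget, out, so the stride-5 slices separate states, cells, and gates.
def _sketch_lstm_output_layout():
    h = ['state_0', 'cell_0', 'in_0', 'forget_0', 'out_0',
         'state_1', 'cell_1', 'in_1', 'forget_1', 'out_1']
    assert h[::5] == ['state_0', 'state_1']      # hidden states
    assert h[1::5] == ['cell_0', 'cell_1']       # memory cells
    assert h[2::5] == ['in_0', 'in_1']           # input gates
    assert h[3::5] == ['forget_0', 'forget_1']   # forget gates
    assert h[4::5] == ['out_0', 'out_1']         # output gates
    return h[::5]
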
def build_model_soft(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensors, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [SimpleRecurrent(dim=args.state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * args.state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(args.state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        raise ValueError("Unknown mlp_activation: " + args.mlp_activation)

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(args.layers - 1):
        mlp = MLP(activations=activations,
                  dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=args.state_dim, mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, ...]

    # Extract the gate values
    gate_values = h[2::2]

    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, ...]
    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    hidden_states = []
    for d in range(args.layers):
        h[d] = h[d] * x_mask
        last_states[d] = h[d][-1, :, :]
        h[d].name = "hidden_state_" + str(d)
        hidden_states.append(h[d])

    # Concatenate all the states
    if args.layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state_all"

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, cross_entropy = get_costs(presoft, args)

    return cost, cross_entropy, updates, gate_values, hidden_states

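# A small illustrative helper (not used by the model builders) that mocks
# the interleaved output layout assumed by build_model_soft: the first
# layer is a plain SimpleRecurrent, and each SoftGatedRecurrent layer after
# it contributes a (state, gate_value) pair, so the slicing above splits
# the list back into states and gate values.
def _sketch_soft_output_layout():
    h = ['state', 'state_1', 'gate_1', 'state_2', 'gate_2']
    gate_values = h[2::2]        # every gate value
    states = [h[0]] + h[1::2]    # every state, in layer order
    assert gate_values == ['gate_1', 'gate_2']
    assert states == ['state', 'state_1', 'state_2']
    return states, gate_values
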