def step(self, h, k_pre, w_pre, seq_cond, seq_cond_mask, mask=None):
    # act: (batch_size, 3*n_mixt)
    act = T.exp(T.dot(h, self.w_cond) + self.b_cond)
    a = act[:, :self.n_mixt]
    b = act[:, self.n_mixt:2*self.n_mixt]
    k = k_pre + self.position_gap * act[:, -self.n_mixt:]

    # u: (length_cond_sequence, 1, 1)
    u = T.shape_padright(T.arange(seq_cond.shape[0], dtype=floatX), 1)

    # phi: (length_cond_sequence, batch_size, n_mixt)
    temp = ((-b[:, 0] * (k[:, 0] - u) ** 2) * seq_cond_mask
            - 1000 * (1 - seq_cond_mask))
    phi = T.nnet.softmax(temp.T).T

    # phi: (length_cond_sequence, batch_size)
    phi *= seq_cond_mask

    # w: (batch_size, condition_n_features)
    w = T.sum(T.shape_padright(phi) * seq_cond, axis=0)

    if mask:
        k = mask[:, None]*k + (1-mask[:, None])*k_pre
        w = mask[:, None]*w + (1-mask[:, None])*w_pre

    if self.grad_clip:
        w = grad_clip(w, -self.grad_clip, self.grad_clip)

    return a, k, phi, w

def get_output_for(self, input, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.dot(input, self.W)
    if self.b is not None:
        activation = activation + self.b.dimshuffle('x', 0)
    if self.grad_clipping:
        activation = grad_clip(activation, -self.grad_clipping,
                               self.grad_clipping)
    return self.nonlinearity(activation)

# def sgd_with_grad_clipping(loss_or_grads, params, learning_rate, rescale):
#     grads = lasagne.updates.get_or_compute_grads(loss_or_grads, params)
#     updates = OrderedDict()
#
#     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
#     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
#     grad_norm = T.sqrt(grad_norm)
#     scaling_num = rescale
#     scaling_den = T.maximum(rescale, grad_norm)
#     for n, (param, grad) in enumerate(zip(params, grads)):
#         grad = T.switch(not_finite, 0.1 * param,
#                         grad * (scaling_num / scaling_den))
#         updates[param] = param - learning_rate * grad
#     return updates

# max_norm = 5.0
# grads = theano.gradient(loss, params)
# grads = [lasagne.updates.norm_constraint(grad, max_norm, range(grad.ndim))
#          for grad in grads]
# updates = lasagne.updates.whatever(grads, params)

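# The commented-out helper above sketches norm-based gradient rescaling. A
# compact version of the same idea, using Lasagne's public
# total_norm_constraint; the function name and the max_norm value here are
# illustrative, not part of the original project.
import theano.tensor as T
import lasagne

def sgd_with_total_norm_clip(loss, params, learning_rate, max_norm=5.0):
    grads = T.grad(loss, params)
    # rescale all gradients together so their joint L2 norm is at most max_norm
    grads = lasagne.updates.total_norm_constraint(grads, max_norm)
    return lasagne.updates.sgd(grads, params, learning_rate)
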
def define_train_test_funcs(self):
    # pYs = T.reshape(self.activation, (self.mask_y.shape[0] * self.batch_size, self.out_size))
    pYs_y = T.reshape(self.activation_dim_y,
                      (self.mask_y.shape[0] * self.batch_size, self.dim_y))
    pYs_pos = T.reshape(self.activation_dim_pos,
                        (self.mask_y.shape[0] * self.batch_size, self.dim_pos))

    # tYs = T.reshape(self.X, (self.mask.shape[0] * self.batch_size, self.out_size))
    tYs_y = T.reshape(self.Y_y,
                      (self.mask_y.shape[0] * self.batch_size, self.dim_y))
    tYs_pos = T.reshape(self.Y_pos,
                        (self.mask_y.shape[0] * self.batch_size, self.dim_pos))
    # tYs = T.reshape(self.Y, (self.mask_y.shape[0] * self.batch_size, self.out_size))

    cost_y = self.categorical_crossentropy(pYs_y, tYs_y)
    cost_pos = self.categorical_crossentropy(pYs_pos, tYs_pos)
    cost = cost_y + cost_pos

    gparams = []
    for param in self.params:
        gparam = T.grad(grad_clip(cost, -5.0, +5.0), param)
        gparams.append(gparam)

    lr = T.scalar("lr")
    # eval(): string to function
    optimizer = eval(self.optimizer)
    updates = optimizer(self.params, gparams, lr)
    # updates = sgd(self.params, gparams, lr)
    # updates = momentum(self.params, gparams, lr)
    # updates = rmsprop(self.params, gparams, lr)
    # updates = adagrad(self.params, gparams, lr)
    # updates = adadelta(self.params, gparams, lr)
    # updates = adam(self.params, gparams, lr)

    self.train = theano.function(
        inputs=[self.X, self.Y_y, self.Y_pos, self.mask, self.mask_y, lr,
                self.batch_size],
        givens={self.is_train: np.cast['int32'](1)},
        outputs=[cost, cost_y, cost_pos,
                 self.activation_dim_y, self.activation_dim_pos],
        updates=updates)

def step(self, inputs, h_pre, k_pre, w_pre, seq_cond, seq_cond_mask,
         mask=None):
    """
    A single timestep.

    Parameters
    ----------
    inputs: (batch_size, n_in)
    h_pre: (batch_size, n_hidden)
    mask: (batch_size,)
    k_pre: (batch_size, n_mixt)
    w_pre: (batch_size, n_in_cond)
    seq_cond: (length_cond_sequence, batch_size, n_in_cond)
    seq_cond_mask: (length_cond_sequence, batch_size)
    """
    # inputs: (batch_size, n_in + n_in_cond)
    inputs = T.concatenate([inputs, w_pre], axis=1)

    # h: (batch_size, n_hidden)
    h = self.layer.step(inputs, h_pre, mask=mask, process_inputs=True)

    # act: (batch_size, 3*n_mixt)
    act = T.exp(T.dot(h, self.w_cond) + self.b_cond)
    a = act[:, :self.n_mixt]
    b = act[:, self.n_mixt:2 * self.n_mixt]
    k = k_pre + 0.1 * act[:, -self.n_mixt:]

    # u: (length_cond_sequence, 1, 1)
    u = T.shape_padright(T.arange(seq_cond.shape[0], dtype=floatX), 2)

    # phi: (length_cond_sequence, batch_size, n_mixt)
    phi = T.sum(a * T.exp(-b * (k - u)**2), axis=-1)

    # phi: (length_cond_sequence, batch_size)
    phi = phi * seq_cond_mask

    # w: (batch_size, n_chars)
    w = T.sum(T.shape_padright(phi) * seq_cond, axis=0)

    if mask:
        k = mask[:, None] * k + (1 - mask[:, None]) * k_pre
        w = mask[:, None] * w + (1 - mask[:, None]) * w_pre

    w = grad_clip(w, -100, 100)

    return h, a, k, phi, w

def test_grad_clip():
    x = theano.tensor.scalar()

    z = theano.tensor.grad(gradient.grad_clip(x, -1, 1) ** 2, x)
    z2 = theano.tensor.grad(x ** 2, x)

    f = theano.function([x], outputs=[z, z2])

    if theano.config.mode != "FAST_COMPILE":
        topo = f.maker.fgraph.toposort()
        assert not any([isinstance(node.op, gradient.GradClip)
                        for node in topo])

    out = f(2.0)
    assert np.allclose(out, (1, 4))
    assert not np.allclose(out[0], out[1])

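# For reference, grad_clip(x, lo, hi) is the identity on the forward pass and
# only clips the gradient that flows back through x. A minimal standalone
# sketch (the variable names are illustrative):
import numpy as np
import theano
import theano.tensor as T
from theano.gradient import grad_clip

w = theano.shared(3.0, name='w')
x = T.scalar('x')
h = grad_clip(w * x, -1.0, 1.0)   # forward value of w * x is unchanged
loss = h ** 2
g = T.grad(loss, w)               # upstream gradient through h is clipped to [-1, 1]
f = theano.function([x], [loss, g])
print(f(2.0))                     # ~[36.0, 2.0]; without grad_clip the gradient would be 24
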
def clipped_nesterov_momentum(loss, all_params, learning_rate, clip_range,
                              momentum=0.9):
    # Adapted from Lasagne/lasagne/updates.py
    all_grads = theano.grad(grad_clip(loss, clip_range[0], clip_range[1]),
                            all_params)
    updates = []

    for param_i, grad_i in zip(all_params, all_grads):
        mparam_i = theano.shared(np.zeros(param_i.get_value().shape,
                                          dtype=theano.config.floatX))
        v = momentum * mparam_i - learning_rate * grad_i  # new momentum
        w = param_i + momentum * v - learning_rate * grad_i  # new param values
        updates.append((mparam_i, v))
        updates.append((param_i, w))

    return updates

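# A usage sketch of how clipped_nesterov_momentum might be wired into a
# Theano training function; the toy least-squares loss, the shared weight
# vector and the clip range below are placeholders, not from the original
# project.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.vector('y')
w = theano.shared(np.zeros(5, dtype=theano.config.floatX), name='w')
loss = T.mean((T.dot(x, w) - y) ** 2)

updates = clipped_nesterov_momentum(loss, [w], learning_rate=0.01,
                                    clip_range=(-5.0, 5.0))
train_fn = theano.function([x, y], loss, updates=updates)
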
def step(self, inputs, h_pre, mask=None, process_inputs=False):
    """
    A single timestep.

    Parameters
    ----------
    inputs: (batch_size, n_in)
    h_pre: (batch_size, n_hidden)
    mask: (batch_size,)
    process_inputs: bool
        If true, will process the input.
        If possible, it is better to process the whole input sequence
        beforehand. But sometimes this is not suitable, for example at
        prediction time.
    """
    n_out = h_pre.shape[1]

    if process_inputs:
        inputs = self.precompute_inputs(inputs)

    h_input = T.dot(h_pre, self.w_hid)

    sig = T.nnet.sigmoid
    gates = sig(inputs[:, :2 * n_out] + h_input[:, :2 * n_out])
    r_gate = gates[:, :n_out]
    u_gate = gates[:, n_out:2 * n_out]
    h_new = T.tanh(inputs[:, 2 * n_out:] + r_gate * h_input[:, 2 * n_out:])

    h = (1 - u_gate) * h_pre + u_gate * h_new

    if mask:
        h = T.switch(mask[:, None], h, h_pre)

    if self.grad_clipping:
        h = grad_clip(h, -self.grad_clipping, self.grad_clipping)

    return h

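# The GRU step above is the kind of per-timestep kernel that theano.scan
# unrolls over a sequence. A self-contained sketch of the same layout, with
# local weights and illustrative sizes instead of the class attributes used
# above:
import numpy as np
import theano
import theano.tensor as T
from theano.gradient import grad_clip

floatX = theano.config.floatX
n_in, n_hid = 8, 16
rng = np.random.RandomState(0)

w_in = theano.shared(rng.normal(scale=0.1, size=(n_in, 3 * n_hid)).astype(floatX))
w_hid = theano.shared(rng.normal(scale=0.1, size=(n_hid, 3 * n_hid)).astype(floatX))

def gru_step(x_t, h_pre):
    # x_t is already projected to 3*n_hid, mirroring precompute_inputs above
    h_input = T.dot(h_pre, w_hid)
    gates = T.nnet.sigmoid(x_t[:, :2 * n_hid] + h_input[:, :2 * n_hid])
    r_gate, u_gate = gates[:, :n_hid], gates[:, n_hid:]
    h_new = T.tanh(x_t[:, 2 * n_hid:] + r_gate * h_input[:, 2 * n_hid:])
    h = (1 - u_gate) * h_pre + u_gate * h_new
    return grad_clip(h, -10, 10)   # clip gradients flowing between timesteps

seq = T.tensor3('seq')                      # (timesteps, batch, n_in)
x_proj = T.dot(seq, w_in)                   # precompute the input projection
h0 = T.zeros((seq.shape[1], n_hid))
hs, _ = theano.scan(gru_step, sequences=x_proj, outputs_info=h0)
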
def step(self, h, k_pre, w_pre, seq_cond, seq_cond_mask, mask=None):
    # act: (batch_size, 3*n_mixt)
    act = T.dot(h, self.w_cond) + self.b_cond

    if not self.normalize_att:
        act = T.exp(act)
        a = act[:, :self.n_mixt]
        b = act[:, self.n_mixt:2*self.n_mixt]
        k = k_pre + self.position_gap * act[:, -self.n_mixt:]
    else:
        a = T.nnet.softmax(act[:, :self.n_mixt])
        b = 2. + 2. * T.tanh(act[:, self.n_mixt:2 * self.n_mixt])
        k = k_pre + self.position_gap * (
            2. + 2. * T.tanh(act[:, self.n_mixt:2 * self.n_mixt]))

    # u: (length_cond_sequence, 1, 1)
    u = T.shape_padright(T.arange(seq_cond.shape[0], dtype=floatX), 2)

    # phi: (length_cond_sequence, batch_size, n_mixt)
    phi = T.sum(a * T.exp(-b * (k-u)**2), axis=-1)

    # phi: (length_cond_sequence, batch_size)
    phi *= seq_cond_mask
    # # TODO (not in Graves)
    # phi = phi * seq_cond_mask + -1000*(1-seq_cond_mask)
    # phi = T.nnet.softmax(phi.T).T * seq_cond_mask

    # w: (batch_size, condition_n_features)
    w = T.sum(T.shape_padright(phi) * seq_cond, axis=0)

    if mask:
        k = mask[:, None]*k + (1-mask[:, None])*k_pre
        w = mask[:, None]*w + (1-mask[:, None])*w_pre

    if self.grad_clip:
        w = grad_clip(w, -self.grad_clip, self.grad_clip)

    return a, k, phi, w

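# The attention steps above implement a Graves-style soft window: each of the
# n_mixt components places a Gaussian bump over positions u of the
# conditioning sequence, phi weights those positions, and w is the resulting
# summary vector. A NumPy sanity check of the same formula, with illustrative
# shapes and the batch dimension dropped for clarity:
import numpy as np

n_mixt, length_cond, n_feat = 3, 20, 5
rng = np.random.RandomState(0)

a = rng.rand(n_mixt)                # mixture weights
b = rng.rand(n_mixt)                # inverse widths
k = rng.rand(n_mixt) * length_cond  # window positions
seq_cond = rng.rand(length_cond, n_feat)

u = np.arange(length_cond)[:, None]                    # (length_cond, 1)
phi = np.sum(a * np.exp(-b * (k - u) ** 2), axis=-1)   # (length_cond,)
w = np.sum(phi[:, None] * seq_cond, axis=0)            # (n_feat,)
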
def __init__(self, inputs=None, hiddens=None, params=None, outdir='outputs/lstm/',
             activation='relu', gate_activation='sigmoid',
             mrg=RNG_MRG.MRG_RandomStreams(1),
             weights_init='uniform', weights_interval='montreal',
             weights_mean=0, weights_std=5e-3, bias_init=0.0,
             r_weights_init='identity', r_weights_interval='montreal',
             r_weights_mean=0, r_weights_std=5e-3, r_bias_init=0.0,
             direction='forward', clip_recurrent_grads=False):
    """
    Initialize an LSTM.

    Parameters
    ----------
    inputs : List of [tuple(shape, `Theano.TensorType`)]
        The dimensionality of the inputs for this model, and the routing information for the model
        to accept inputs from elsewhere. `inputs` variables are expected to be of the form
        (timesteps, batch, data). `shape` will be a monad tuple representing known sizes for each
        dimension in the `Theano.TensorType`. The length of `shape` should be equal to the number
        of dimensions in `Theano.TensorType`, where the shape element is an integer representing
        the size for its dimension, or None if the shape isn't known. For example, if you have a
        matrix with unknown batch size but fixed feature size of 784, `shape` would be (None, 784).
        The full form of `inputs` would be: [((None, 784), <TensorType(float32, matrix)>)].
    hiddens : int or Tuple of (shape, `Theano.TensorType`)
        Int for the number of hidden units to use, or a tuple of shape, expression to route the
        starting hidden values from elsewhere.
    params : Dict(string_name: theano SharedVariable), optional
        A dictionary of model parameters (shared theano variables) that you should use when
        constructing this model (instead of initializing your own shared variables). This parameter
        is useful when you want to have two versions of the model that use the same parameters -
        such as siamese networks or pretraining some weights.
    outdir : str
        The location to produce outputs from training or running the :class:`LSTM`. If None,
        nothing will be saved.
    activation : str or callable
        The nonlinear (or linear) activation to perform for the hidden units. This activation
        function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
        See opendeep.utils.activation for a list of available activation functions. Alternatively,
        you can pass your own function to be used as long as it is callable.
    gate_activation : str or callable
        The activation to perform for the hidden gates (default sigmoid).
        See opendeep.utils.activation for a list of available activation functions. Alternatively,
        you can pass your own function to be used as long as it is callable.
    mrg : random
        A random number generator that is used when adding noise.
        I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    weights_init : str
        Determines the method for initializing input-hidden model weights. See opendeep.utils.nnet
        for options.
    weights_interval : str or float
        If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        If Gaussian `weights_init`, the mean value to use.
    weights_std : float
        If Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
    r_weights_init : str
        Determines the method for initializing recurrent hidden-hidden model weights. See
        opendeep.utils.nnet for options.
    r_weights_interval : str or float
        If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    r_weights_mean : float
        If Gaussian `r_weights_init`, the mean value to use.
    r_weights_std : float
        If Gaussian `r_weights_init`, the standard deviation to use.
    r_bias_init : float
        The initial value to use for the recurrent bias parameter. Most often, the default of 0.0
        is preferred.
    direction : str
        The direction this recurrent model should go over its inputs. Can be 'forward', 'backward',
        or 'bidirectional'. In the case of 'bidirectional', it will make two passes over the
        sequence, computing two sets of hiddens and adding them together.
    clip_recurrent_grads : False or float, optional
        Whether to clip the gradients for the parameters that unroll over timesteps (such as the
        weights connecting previous hidden states to the current hidden state, and not the weights
        from current input to hiddens). If it is a float, the gradients for the weights will be
        hard clipped to the range `+-clip_recurrent_grads`.
    """
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(LSTM, self).__init__(**initial_parameters)

    ##################
    # specifications #
    ##################
    backward = direction.lower() == 'backward'
    bidirectional = direction.lower() == 'bidirectional'

    ########################
    # activation functions #
    ########################
    # recurrent hidden activation functions!
    self.hidden_activation_func = get_activation_function(activation)
    self.gate_activation_func = get_activation_function(gate_activation)

    ##########
    # inputs #
    ##########
    # inputs are expected to have the shape (n_timesteps, batch_size, data)
    if len(self.inputs) > 1:
        raise NotImplementedError("Expected 1 input, found %d. Please merge inputs before passing "
                                  "to the model!" % len(self.inputs))
    # self.inputs is a list of all the input expressions (we enforce only 1, so self.inputs[0] is the input)
    input_shape, self.input = self.inputs[0]
    if isinstance(input_shape, int):
        self.input_size = ((None,) * (self.input.ndim - 1)) + (input_shape,)
    else:
        self.input_size = input_shape
    assert self.input_size is not None, "Need to specify the shape for at least the last dimension of the input!"
    # input is 3D tensor of (timesteps, batch_size, data_dim)
    # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
    # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
    if self.input.ndim == 1:
        self.input = unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])
    elif self.input.ndim == 2:
        self.input = unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)
    elif self.input.ndim > 3:
        self.input = self.input.flatten(3)
        self.input_size = self.input_size[:2] + (prod(self.input_size[2:]),)

    ###########
    # hiddens #
    ###########
    # have only 1 hiddens
    assert len(self.hiddens) == 1, "Expected 1 `hiddens` param, found %d" % len(self.hiddens)
    self.hiddens = self.hiddens[0]
    # if hiddens is an int (hidden size parameter, not routing info)
    h_init = None
    if isinstance(self.hiddens, int):
        self.hidden_size = self.hiddens
    elif isinstance(self.hiddens, tuple):
        hidden_shape, h_init = self.hiddens
        if isinstance(hidden_shape, int):
            self.hidden_size = hidden_shape
        else:
            self.hidden_size = hidden_shape[-1]
    else:
        raise AssertionError("Hiddens need to be an int or tuple of (shape, theano_expression), found %s"
                             % type(self.hiddens))
    # output shape is going to be 3D with (timesteps, batch_size, hidden_size)
    self.output_size = (None, None, self.hidden_size)

    ##########################################################
    # parameters - make sure to deal with params dict input! #
    ##########################################################
    # all input-to-hidden weights
    W_c, W_i, W_f, W_o = [
        self.params.get(
            "W_%s" % sub,
            get_weights(weights_init=weights_init,
                        shape=(self.input_size[-1], self.hidden_size),
                        name="W_%s" % sub,
                        # if gaussian
                        mean=weights_mean,
                        std=weights_std,
                        # if uniform
                        interval=weights_interval))
        for sub in ['c', 'i', 'f', 'o']
    ]
    # all hidden-to-hidden weights
    U_c, U_i, U_f, U_o = [
        self.params.get(
            "U_%s" % sub,
            get_weights(weights_init=r_weights_init,
                        shape=(self.hidden_size, self.hidden_size),
                        name="U_%s" % sub,
                        # if gaussian
                        mean=r_weights_mean,
                        std=r_weights_std,
                        # if uniform
                        interval=r_weights_interval))
        for sub in ['c', 'i', 'f', 'o']
    ]
    # if bidirectional, make hidden-to-hidden weights again to go the opposite direction
    U_c_b, U_i_b, U_f_b, U_o_b = None, None, None, None
    if bidirectional:
        U_c_b, U_i_b, U_f_b, U_o_b = [
            self.params.get(
                "U_%s_b" % sub,
                get_weights(weights_init=r_weights_init,
                            shape=(self.hidden_size, self.hidden_size),
                            name="U_%s_b" % sub,
                            # if gaussian
                            mean=r_weights_mean,
                            std=r_weights_std,
                            # if uniform
                            interval=r_weights_interval))
            for sub in ['c', 'i', 'f', 'o']
        ]
    # biases
    b_c, b_i, b_f, b_o = [
        self.params.get(
            "b_%s" % sub,
            get_bias(shape=(self.hidden_size,),
                     name="b_%s" % sub,
                     init_values=r_bias_init))
        for sub in ['c', 'i', 'f', 'o']
    ]
    # clip gradients if we are doing that
    recurrent_params = [U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b]
    if clip_recurrent_grads:
        clip = abs(clip_recurrent_grads)
        U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b = [
            grad_clip(param, -clip, clip) if param is not None else None
            for param in recurrent_params
        ]

    # put all the parameters into our dictionary
    self.params = {
        "W_c": W_c, "W_i": W_i, "W_f": W_f, "W_o": W_o,
        "U_c": U_c, "U_i": U_i, "U_f": U_f, "U_o": U_o,
        "b_c": b_c, "b_i": b_i, "b_f": b_f, "b_o": b_o,
    }
    if bidirectional:
        self.params.update({
            "U_c_b": U_c_b, "U_i_b": U_i_b, "U_f_b": U_f_b, "U_o_b": U_o_b,
        })

    # make h_init the right sized tensor
    if h_init is None:
        h_init = zeros_like(dot(self.input[0], W_c))
    c_init = zeros_like(dot(self.input[0], W_c))

    ###############
    # computation #
    ###############
    # move some computation outside of scan to speed it up!
    x_c = dot(self.input, W_c) + b_c
    x_i = dot(self.input, W_i) + b_i
    x_f = dot(self.input, W_f) + b_f
    x_o = dot(self.input, W_o) + b_o

    # now do the recurrent stuff
    (self.hiddens, _), self.updates = scan(fn=self.recurrent_step,
                                           sequences=[x_c, x_i, x_f, x_o],
                                           outputs_info=[h_init, c_init],
                                           non_sequences=[U_c, U_i, U_f, U_o],
                                           go_backwards=backward,
                                           name="lstm_scan",
                                           strict=True)

    # if bidirectional, do the same in reverse!
    if bidirectional:
        (hiddens_b, _), updates_b = scan(fn=self.recurrent_step,
                                         sequences=[x_c, x_i, x_f, x_o],
                                         outputs_info=[h_init, c_init],
                                         non_sequences=[U_c_b, U_i_b, U_f_b, U_o_b],
                                         go_backwards=not backward,
                                         name="lstm_scan_back",
                                         strict=True)
        # flip the hiddens to be the right direction
        hiddens_b = hiddens_b[::-1]
        # update stuff
        self.updates.update(updates_b)
        self.hiddens += hiddens_b

    log.info("Initialized an LSTM!")

    # Calculate final hidden state value
    h = (1.0 - z) * h_t_1 + z * h_d

    y = (th.dot(Y, h) + b_y)

    return h, softmax(y)

# def predict(, x_vec):
#     return symbolic output of th pass

[h, out], _ = th.scan(gru_step, sequences=x, outputs_info=[h0, None])

# error = ((out - y)**2).sum()
error = t.nnet.categorical_crossentropy(out, y).sum()

# Implement adagrad and define symbolic updates which is a list of tuples
grads = t.grad(error, params)
# param_grads = grads
param_grads = [grad_clip(grad, -5, 5) for grad in grads]
# new_grad_hists = [g_hist + g ** 2 for g_hist, g in zip(grad_hists, param_grads)]
# param_updates = [
#     (param, param - gamma * param_grad/t.sqrt(g_hist + 1e-8))
#     for param, param_grad, g_hist in zip(params, param_grads, grad_hists)
# ]

# Implementing gradient clipping here
param_updates = [(param, param - 0.01 * param_grad)
                 for param, param_grad in zip(params, param_grads)]
updates = param_updates

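# A hedged completion of the commented-out AdaGrad idea above: accumulate the
# squared (clipped) gradients and scale each step by the running history. It
# assumes `grad_hists` (shared variables shaped like `params`) and a step
# size `gamma` are defined as in those commented lines; this is a sketch,
# not part of the original script.
new_grad_hists = [g_hist + g ** 2 for g_hist, g in zip(grad_hists, param_grads)]
adagrad_updates = [(param, param - gamma * g / t.sqrt(new_hist + 1e-8))
                   for param, g, new_hist in zip(params, param_grads, new_grad_hists)]
adagrad_updates += list(zip(grad_hists, new_grad_hists))
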
def __init__(self, inputs=None, hiddens=None, params=None, outdir='outputs/lstm/',
             activation='relu', gate_activation='sigmoid',
             mrg=RNG_MRG.MRG_RandomStreams(1),
             weights_init='uniform', weights_interval='glorot',
             weights_mean=0, weights_std=5e-3, bias_init=0.0,
             r_weights_init='identity', r_weights_interval='glorot',
             r_weights_mean=0, r_weights_std=5e-3, r_bias_init=0.0,
             direction='forward', clip_recurrent_grads=False):
    """
    Initialize an LSTM.

    Parameters
    ----------
    inputs : List of [tuple(shape, `Theano.TensorType`)]
        The dimensionality of the inputs for this model, and the routing information for the model
        to accept inputs from elsewhere. `inputs` variables are expected to be of the form
        (timesteps, batch, data). `shape` will be a monad tuple representing known sizes for each
        dimension in the `Theano.TensorType`. The length of `shape` should be equal to the number
        of dimensions in `Theano.TensorType`, where the shape element is an integer representing
        the size for its dimension, or None if the shape isn't known. For example, if you have a
        matrix with unknown batch size but fixed feature size of 784, `shape` would be (None, 784).
        The full form of `inputs` would be: [((None, 784), <TensorType(float32, matrix)>)].
    hiddens : int or Tuple of (shape, `Theano.TensorType`)
        Int for the number of hidden units to use, or a tuple of shape, expression to route the
        starting hidden values from elsewhere.
    params : Dict(string_name: theano SharedVariable), optional
        A dictionary of model parameters (shared theano variables) that you should use when
        constructing this model (instead of initializing your own shared variables). This parameter
        is useful when you want to have two versions of the model that use the same parameters -
        such as siamese networks or pretraining some weights.
    outdir : str
        The location to produce outputs from training or running the :class:`LSTM`. If None,
        nothing will be saved.
    activation : str or callable
        The nonlinear (or linear) activation to perform for the hidden units. This activation
        function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
        See opendeep.utils.activation for a list of available activation functions. Alternatively,
        you can pass your own function to be used as long as it is callable.
    gate_activation : str or callable
        The activation to perform for the hidden gates (default sigmoid).
        See opendeep.utils.activation for a list of available activation functions. Alternatively,
        you can pass your own function to be used as long as it is callable.
    mrg : random
        A random number generator that is used when adding noise.
        I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    weights_init : str
        Determines the method for initializing input-hidden model weights. See opendeep.utils.nnet
        for options.
    weights_interval : str or float
        If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        If Gaussian `weights_init`, the mean value to use.
    weights_std : float
        If Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
    r_weights_init : str
        Determines the method for initializing recurrent hidden-hidden model weights. See
        opendeep.utils.nnet for options.
    r_weights_interval : str or float
        If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    r_weights_mean : float
        If Gaussian `r_weights_init`, the mean value to use.
    r_weights_std : float
        If Gaussian `r_weights_init`, the standard deviation to use.
    r_bias_init : float
        The initial value to use for the recurrent bias parameter. Most often, the default of 0.0
        is preferred.
    direction : str
        The direction this recurrent model should go over its inputs. Can be 'forward', 'backward',
        or 'bidirectional'. In the case of 'bidirectional', it will make two passes over the
        sequence, computing two sets of hiddens and adding them together.
    clip_recurrent_grads : False or float, optional
        Whether to clip the gradients for the parameters that unroll over timesteps (such as the
        weights connecting previous hidden states to the current hidden state, and not the weights
        from current input to hiddens). If it is a float, the gradients for the weights will be
        hard clipped to the range `+-clip_recurrent_grads`.
    """
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(LSTM, self).__init__(**initial_parameters)

    ##################
    # specifications #
    ##################
    backward = direction.lower() == 'backward'
    bidirectional = direction.lower() == 'bidirectional'

    ########################
    # activation functions #
    ########################
    # recurrent hidden activation functions!
    self.hidden_activation_func = get_activation_function(activation)
    self.gate_activation_func = get_activation_function(gate_activation)

    ##########
    # inputs #
    ##########
    # inputs are expected to have the shape (n_timesteps, batch_size, data)
    if len(self.inputs) > 1:
        raise NotImplementedError("Expected 1 input, found %d. Please merge inputs before passing "
                                  "to the model!" % len(self.inputs))
    # self.inputs is a list of all the input expressions (we enforce only 1, so self.inputs[0] is the input)
    input_shape, self.input = self.inputs[0]
    if isinstance(input_shape, int):
        self.input_size = ((None,) * (self.input.ndim - 1)) + (input_shape,)
    else:
        self.input_size = input_shape
    assert self.input_size is not None, "Need to specify the shape for at least the last dimension of the input!"
    # input is 3D tensor of (timesteps, batch_size, data_dim)
    # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
    # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
    if self.input.ndim == 1:
        self.input = unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])
    elif self.input.ndim == 2:
        self.input = unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)
    elif self.input.ndim > 3:
        flat_in = Flatten((self.input_size, self.input), ndim=3)
        self.input = flat_in.get_outputs()
        self.input_size = flat_in.output_size

    ###########
    # hiddens #
    ###########
    # have only 1 hiddens
    assert len(self.hiddens) == 1, "Expected 1 `hiddens` param, found %d" % len(self.hiddens)
    self.hiddens = self.hiddens[0]
    # if hiddens is an int (hidden size parameter, not routing info)
    h_init = None
    if isinstance(self.hiddens, int):
        self.hidden_size = self.hiddens
    elif isinstance(self.hiddens, tuple):
        hidden_shape, h_init = self.hiddens
        if isinstance(hidden_shape, int):
            self.hidden_size = hidden_shape
        else:
            self.hidden_size = hidden_shape[-1]
    else:
        raise AssertionError("Hiddens need to be an int or tuple of (shape, theano_expression), found %s"
                             % type(self.hiddens))
    # output shape is going to be 3D with (timesteps, batch_size, hidden_size)
    self.output_size = (None, None, self.hidden_size)

    ##########################################################
    # parameters - make sure to deal with params dict input! #
    ##########################################################
    # all input-to-hidden weights
    W_c, W_i, W_f, W_o = [
        self.params.get(
            "W_%s" % sub,
            get_weights(weights_init=weights_init,
                        shape=(self.input_size[-1], self.hidden_size),
                        name="W_%s" % sub,
                        # if gaussian
                        mean=weights_mean,
                        std=weights_std,
                        # if uniform
                        interval=weights_interval))
        for sub in ['c', 'i', 'f', 'o']
    ]
    # all hidden-to-hidden weights
    U_c, U_i, U_f, U_o = [
        self.params.get(
            "U_%s" % sub,
            get_weights(weights_init=r_weights_init,
                        shape=(self.hidden_size, self.hidden_size),
                        name="U_%s" % sub,
                        # if gaussian
                        mean=r_weights_mean,
                        std=r_weights_std,
                        # if uniform
                        interval=r_weights_interval))
        for sub in ['c', 'i', 'f', 'o']
    ]
    # if bidirectional, make hidden-to-hidden weights again to go the opposite direction
    U_c_b, U_i_b, U_f_b, U_o_b = None, None, None, None
    if bidirectional:
        U_c_b, U_i_b, U_f_b, U_o_b = [
            self.params.get(
                "U_%s_b" % sub,
                get_weights(weights_init=r_weights_init,
                            shape=(self.hidden_size, self.hidden_size),
                            name="U_%s_b" % sub,
                            # if gaussian
                            mean=r_weights_mean,
                            std=r_weights_std,
                            # if uniform
                            interval=r_weights_interval))
            for sub in ['c', 'i', 'f', 'o']
        ]
    # biases
    b_c, b_i, b_f, b_o = [
        self.params.get(
            "b_%s" % sub,
            get_bias(shape=(self.hidden_size,),
                     name="b_%s" % sub,
                     init_values=r_bias_init))
        for sub in ['c', 'i', 'f', 'o']
    ]
    # clip gradients if we are doing that
    recurrent_params = [U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b]
    if clip_recurrent_grads:
        clip = abs(clip_recurrent_grads)
        U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b = [
            grad_clip(param, -clip, clip) if param is not None else None
            for param in recurrent_params
        ]

    # put all the parameters into our dictionary
    self.params = {
        "W_c": W_c, "W_i": W_i, "W_f": W_f, "W_o": W_o,
        "U_c": U_c, "U_i": U_i, "U_f": U_f, "U_o": U_o,
        "b_c": b_c, "b_i": b_i, "b_f": b_f, "b_o": b_o,
    }
    if bidirectional:
        self.params.update({
            "U_c_b": U_c_b, "U_i_b": U_i_b, "U_f_b": U_f_b, "U_o_b": U_o_b,
        })

    # make h_init the right sized tensor
    if h_init is None:
        h_init = zeros_like(dot(self.input[0], W_c))
    c_init = zeros_like(dot(self.input[0], W_c))

    ###############
    # computation #
    ###############
    # move some computation outside of scan to speed it up!
    x_c = dot(self.input, W_c) + b_c
    x_i = dot(self.input, W_i) + b_i
    x_f = dot(self.input, W_f) + b_f
    x_o = dot(self.input, W_o) + b_o

    # now do the recurrent stuff
    (self.hiddens, _), self.updates = scan(fn=self.recurrent_step,
                                           sequences=[x_c, x_i, x_f, x_o],
                                           outputs_info=[h_init, c_init],
                                           non_sequences=[U_c, U_i, U_f, U_o],
                                           go_backwards=backward,
                                           name="lstm_scan",
                                           strict=True)

    # if bidirectional, do the same in reverse!
    if bidirectional:
        (hiddens_b, _), updates_b = scan(fn=self.recurrent_step,
                                         sequences=[x_c, x_i, x_f, x_o],
                                         outputs_info=[h_init, c_init],
                                         non_sequences=[U_c_b, U_i_b, U_f_b, U_o_b],
                                         go_backwards=not backward,
                                         name="lstm_scan_back",
                                         strict=True)
        # flip the hiddens to be the right direction
        hiddens_b = hiddens_b[::-1]
        # update stuff
        self.updates.update(updates_b)
        self.hiddens += hiddens_b

    log.info("Initialized an LSTM!")

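# The scans above consume a `recurrent_step` method that is not shown in this
# listing. A hypothetical standalone LSTM cell with the matching argument
# order (sequences x_c, x_i, x_f, x_o; previous outputs h, c; non-sequences
# U_c, U_i, U_f, U_o) would look roughly like the sketch below; it is not the
# actual OpenDeep implementation.
import theano.tensor as T

def lstm_recurrent_step(x_c_t, x_i_t, x_f_t, x_o_t, h_tm1, c_tm1,
                        U_c, U_i, U_f, U_o):
    i = T.nnet.sigmoid(x_i_t + T.dot(h_tm1, U_i))           # input gate
    f = T.nnet.sigmoid(x_f_t + T.dot(h_tm1, U_f))           # forget gate
    o = T.nnet.sigmoid(x_o_t + T.dot(h_tm1, U_o))           # output gate
    c = f * c_tm1 + i * T.tanh(x_c_t + T.dot(h_tm1, U_c))   # new cell state
    h = o * T.tanh(c)                                       # new hidden state
    return h, c
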
def step(self, input, h_left, c_left, x_pos, h_buffer, c_buffer, width,
         mask=None, process_input=False):
    """
    One time step. This function can be used as LSTMCell by setting
    `process_input=True`.

    :param input: (B, input_dim)
    :param h_left: (B, hidden_dim)
    :param c_left: (B, hidden_dim)
    :param x_pos: int64 scalar, width dimension
    :param h_buffer: (W, B, hidden_dim)
    :param c_buffer: (W, B, hidden_dim)
    :param width: width for x_pos rounding
    :param mask: (B,)
    :param process_input: If possible, it is better to process the whole
        input sequence beforehand. But sometimes this is not suitable,
        for example at prediction time.
    :return: h, c, both (B, hidden_dim)
    """
    if process_input:
        input = self._precompute_input(input)  # (B, 4*hidden_dim)

    h_up = h_buffer[x_pos, :, :]  # (B, hidden_dim)
    c_up = c_buffer[x_pos, :, :]  # (B, hidden_dim)

    gates = (input + tensor.dot(h_left, self.W_h_left)
             + tensor.dot(h_up, self.W_h_up))  # (B, 4*hidden_dim)
    if self.grad_clipping > 0:
        gates = grad_clip(gates, -self.grad_clipping, self.grad_clipping)

    i_gate = gates[:, :self.hidden_dim]                          # input gate, (B, hidden_dim)
    f_gate = gates[:, self.hidden_dim:2 * self.hidden_dim]       # forget gate, (B, hidden_dim)
    c_input = gates[:, 2 * self.hidden_dim:3 * self.hidden_dim]  # cell input, (B, hidden_dim)
    o_gate = gates[:, 3 * self.hidden_dim:]                      # output gate, (B, hidden_dim)

    if self.peephole:
        i_gate += (c_left * self.w_cell_to_igate_left
                   + c_up * self.w_cell_to_igate_up)
        f_gate += (c_left * self.w_cell_to_fgate_left
                   + c_up * self.w_cell_to_fgate_up)
    i_gate = sigmoid(i_gate)
    f_gate = sigmoid(f_gate)
    c_input = tanh(c_input)

    # add 0.5 coefficient for numerical stability
    c = f_gate * (c_up + c_left) * 0.5 + i_gate * c_input

    if self.peephole:
        o_gate += c * self.w_cell_to_ogate
    o_gate = sigmoid(o_gate)
    h = o_gate * self.hidden_activation(c)

    if mask:
        h = tensor.switch(mask[:, None], h, h_left)
        c = tensor.switch(mask[:, None], c, c_left)

    h_buffer = tensor.set_subtensor(h_buffer[x_pos, :, :], h)
    c_buffer = tensor.set_subtensor(c_buffer[x_pos, :, :], c)
    x_pos = x_pos + 1
    x_pos = tensor.mod(x_pos, width)
    return h, c, x_pos, h_buffer, c_buffer

def __theano_build__(self):
    # Just making things more legible
    E, U, W, b, V, d = self.E, self.U, self.W, self.b, self.V, self.d

    x = T.ivector('x')  # Input sequence stored as theano variable x
    y = T.ivector('y')  # Target output value stored as theano variable y
    learnRate = T.scalar('learnRate')
    decayRate = T.scalar('decayRate')

    print("Loading forward_step")
    [out, s, C], updates = theano.scan(
        self.forward_step,
        sequences=x,
        truncate_gradient=4,
        outputs_info=[
            None,
            dict(initial=theano.shared(value=np.zeros(
                self.hidden_dim).astype(theano.config.floatX))),
            dict(initial=theano.shared(value=np.ones(
                self.hidden_dim).astype(theano.config.floatX)))
        ])

    pred = T.argmax(out, axis=1)
    # Predicts error of the output using categorical cross entropy
    pred_error = T.sum(T.nnet.categorical_crossentropy(out, y))

    print("Loading f_pred")
    self.f_pred = theano.function([x], out)
    # Returns the class
    self.f_pred_class = theano.function([x], pred)

    # Define function for calculating error
    print("Loading ce_error")
    self.ce_error = theano.function([x, y], pred_error,
                                    allow_input_downcast=True)

    print("Loading gradients")
    # Gradients
    dE = grad_clip(T.grad(pred_error, E))
    dW = grad_clip(T.grad(pred_error, W))
    dU = grad_clip(T.grad(pred_error, U))
    dV = grad_clip(T.grad(pred_error, V))
    db = grad_clip(T.grad(pred_error, b))
    dd = grad_clip(T.grad(pred_error, d))

    # Adam cache updates
    beta1 = .9
    beta2 = .999
    eps = 1e-8

    mE = grad_clip(beta1 * self.mE + (1 - beta1) * dE)
    mU = grad_clip(beta1 * self.mU + (1 - beta1) * dU)
    mW = grad_clip(beta1 * self.mW + (1 - beta1) * dW)
    mV = grad_clip(beta1 * self.mV + (1 - beta1) * dV)
    mb = grad_clip(beta1 * self.mb + (1 - beta1) * db)
    md = grad_clip(beta1 * self.md + (1 - beta1) * dd)

    vE = grad_clip(beta2 * self.vE + (1 - beta2) * (dE**2))
    vU = grad_clip(beta2 * self.vU + (1 - beta2) * (dU**2))
    vW = grad_clip(beta2 * self.vW + (1 - beta2) * (dW**2))
    vV = grad_clip(beta2 * self.vV + (1 - beta2) * (dV**2))
    vb = grad_clip(beta2 * self.vb + (1 - beta2) * (db**2))
    vd = grad_clip(beta2 * self.vd + (1 - beta2) * (dd**2))

    print("Loading adam_step")
    self.adam_step = theano.function(
        [x, y, learnRate], [],
        updates=[(E, E - learnRate * mE / (T.sqrt(vE) + eps)),
                 (U, U - learnRate * mU / (T.sqrt(vU) + eps)),
                 (W, W - learnRate * mW / (T.sqrt(vW) + eps)),
                 (V, V - learnRate * mV / (T.sqrt(vV) + eps)),
                 (b, b - learnRate * mb / (T.sqrt(vb) + eps)),
                 (d, d - learnRate * md / (T.sqrt(vd) + eps)),
                 (self.mE, mE), (self.mU, mU), (self.mW, mW),
                 (self.mV, mV), (self.mb, mb), (self.md, md),
                 (self.vE, vE), (self.vU, vU), (self.vW, vW),
                 (self.vV, vV), (self.vb, vb), (self.vd, vd)],
        allow_input_downcast=True)