def bi_lstm(self, name, x, x_mask, input_dim, hidden_dim, drop_x, drop_h, **kwargs):
  n = namer(name)
  fwd_h = self.lstm(n('fwd'), x, x_mask, input_dim, hidden_dim, drop_x, drop_h,
                    backward=False, **kwargs)
  bck_h = self.lstm(n('bck'), x, x_mask, input_dim, hidden_dim, drop_x, drop_h,
                    backward=True, **kwargs)
  bi_h = tt.concatenate([fwd_h, bck_h], axis=2)  # (timesteps, batch_size, 2*hidden_dim)
  return bi_h
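# A minimal numpy sketch (illustration only, not part of the original module;
# the helper name _example_bi_concat_shapes is hypothetical) of the shape
# bookkeeping in bi_lstm above: two recurrent passes of shape
# (timesteps, batch_size, hidden_dim) are concatenated along the feature axis,
# so the bi-directional output has 2*hidden_dim features.  tt.concatenate
# behaves like np.concatenate for this purpose.
def _example_bi_concat_shapes():
  import numpy as np
  timesteps, batch_size, hidden_dim = 5, 2, 3
  fwd_h = np.zeros((timesteps, batch_size, hidden_dim))
  bck_h = np.zeros((timesteps, batch_size, hidden_dim))
  bi_h = np.concatenate([fwd_h, bck_h], axis=2)
  assert bi_h.shape == (timesteps, batch_size, 2 * hidden_dim)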
def linear(self, name, x, input_dim, output_dim, with_bias=True, w_init='uniform', bias_init=0):
  # x: (..., input_dim)
  n = namer(name)
  W = self.make_param(n('W'), (input_dim, output_dim), w_init)
  y = tt.dot(x, W)  # (..., output_dim)
  if with_bias:
    b = self.make_param(n('b'), (output_dim,), bias_init)
    y += b
  return y
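# A small numpy sketch (not from the original code; _example_linear_broadcast
# is a hypothetical name) of why `linear` works on inputs with arbitrary
# leading dimensions: for a 3-D x and a 2-D W, np.dot (like tt.dot) contracts
# the last axis of x with the first axis of W, so a single weight matrix maps
# (timesteps, batch_size, input_dim) to (timesteps, batch_size, output_dim).
def _example_linear_broadcast():
  import numpy as np
  rng = np.random.RandomState(0)
  timesteps, batch_size, input_dim, output_dim = 4, 2, 5, 3
  x = rng.rand(timesteps, batch_size, input_dim)
  W = rng.rand(input_dim, output_dim)
  b = np.zeros(output_dim)
  y = np.dot(x, W) + b  # bias broadcasts over the leading axes
  assert y.shape == (timesteps, batch_size, output_dim)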
def stacked_bi_lstm(self, name, x, x_mask, num_layers, input_dim, hidden_dim, drop_x, drop_h, **kwargs):
  # Note: returning the last hidden state of a stacked bi-LSTM is not supported.
  if 'return_last' in kwargs:
    assert not kwargs['return_last']
  n = namer(name)
  h = x
  for l in range(1, num_layers + 1):
    h = self.bi_lstm(n('l%d' % l), h, x_mask,
                     input_dim if l == 1 else 2 * hidden_dim, hidden_dim,
                     drop_x, drop_h, **kwargs)
  return h  # (timesteps, batch_size, 2*hidden_dim)
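# A tiny sketch (illustration only; _example_stack_dims is hypothetical) of the
# dimension schedule used in stacked_bi_lstm above: the first layer consumes
# input_dim features, and every deeper layer consumes the 2*hidden_dim features
# produced by the bi-directional layer below it.
def _example_stack_dims(num_layers, input_dim, hidden_dim):
  dims = []
  for l in range(1, num_layers + 1):
    layer_in = input_dim if l == 1 else 2 * hidden_dim
    dims.append((layer_in, 2 * hidden_dim))
  return dims

# e.g. _example_stack_dims(3, 300, 100) == [(300, 200), (200, 200), (200, 200)]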
def ff(self, name, x, dims, activation, dropout_ps, **kwargs):
  assert len(dims) >= 2
  if dropout_ps:
    if isinstance(dropout_ps, numbers.Number):
      dropout_ps = [dropout_ps] * (len(dims) - 1)
    else:
      assert len(dropout_ps) == len(dims) - 1
  n = namer(name)
  h = x
  if activation == 'relu':
    f = tt.nnet.relu
  elif activation == 'sigmoid':
    f = tt.nnet.sigmoid
  elif activation == 'tanh':
    f = tt.tanh
  else:
    raise AssertionError('unrecognized activation function')
  for i, (input_dim, output_dim) in enumerate(zip(dims[:-1], dims[1:])):
    if dropout_ps:
      h = self.dropout(h, dropout_ps[i])
    h = f(self.linear(n('l%d' % (i + 1)), h, input_dim, output_dim, **kwargs))
  return h
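# A short sketch (not from the original module; _example_ff_layer_plan is a
# hypothetical helper) of how `ff` interprets its arguments: dims lists the
# layer widths, so dims = [300, 128, 1] yields two linear layers, 300 -> 128
# and 128 -> 1, and a scalar dropout_ps is broadcast to one rate per layer.
def _example_ff_layer_plan(dims, dropout_ps):
  import numbers
  if isinstance(dropout_ps, numbers.Number):
    dropout_ps = [dropout_ps] * (len(dims) - 1)
  return list(zip(dims[:-1], dims[1:], dropout_ps))

# e.g. _example_ff_layer_plan([300, 128, 1], 0.2)
#      == [(300, 128, 0.2), (128, 1, 0.2)]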
def lstm(self, name, x, x_mask, input_dim, hidden_dim, drop_x, drop_h,
         backward=False, return_last=False, couple_i_and_f=False,
         learn_initial_state=False, tie_x_dropout=True,
         sep_x_dropout=False, sep_h_dropout=False,
         w_init='uniform', u_init='orthogonal',
         forget_bias_init=1, other_bias_init=0):
  """Customizable uni-directional LSTM layer.

  Handles masks, can learn the initial state, input and forget gates can be
  coupled, with recurrent dropout, no peephole connections.

  Args:
    x: Theano tensor, shape (timesteps, batch_size, input_dim)
    x_mask: Theano tensor, shape (timesteps, batch_size)
    input_dim: int, dimension of input vectors
    hidden_dim: int, dimension of hidden state
    drop_x: float, dropout rate to apply to inputs
    drop_h: float, dropout rate to apply to hidden state
    backward: boolean, whether to recur over timesteps in reversed order
    return_last: boolean, whether to also return the last hidden state
    couple_i_and_f: boolean, whether to have input gate = 1 - forget gate
    learn_initial_state: boolean, whether to have initial cell state and
      initial hidden state as learnt parameters
    tie_x_dropout: boolean, whether to have the same dropout masks across
      timesteps for input
    sep_x_dropout: boolean, if True dropout is applied over weights of lin.
      trans. of input; otherwise it is applied over input activations
    sep_h_dropout: boolean, if True dropout is applied over weights of lin.
      trans. of hidden state; otherwise it is applied over hidden state
      activations
    w_init: string, initialization scheme for weights of lin. trans. of input
    u_init: string, initialization scheme for weights of lin. trans. of
      hidden state
    forget_bias_init: initialization scheme or constant for the forget
      gate's bias
    other_bias_init: initialization scheme or constant for the other biases

  Note:
    Proper variational dropout (Gal 2015) is:
      tie_x_dropout=True, sep_x_dropout=True, sep_h_dropout=True
    A faster alternative is:
      tie_x_dropout=True, sep_x_dropout=False, sep_h_dropout=False

  Returns:
    h: Theano variable, recurrent hidden states at each timestep,
      shape (timesteps, batch_size, hidden_dim)
    last_h: Theano variable, last hidden state, shape (batch_size, hidden_dim);
      returned in addition to h only when return_last is True
  """
  n = namer(name)
  timesteps, batch_size = x.shape[0], x.shape[1]

  num_non_lin = 3 if couple_i_and_f else 4
  num_gates = num_non_lin - 1

  W = self.make_concat_param(
      n('W'),  # (input_dim, [3|4]*hidden_dim)
      num_non_lin * [(input_dim, hidden_dim)],
      num_non_lin * [w_init],
      axis=1)
  b = self.make_concat_param(
      n('b'),  # ([3|4]*hidden_dim,)
      num_non_lin * [(hidden_dim,)],
      [forget_bias_init] + num_gates * [other_bias_init],
      axis=0)
  U = self.make_concat_param(
      n('U'),  # (hidden_dim, [3|4]*hidden_dim)
      num_non_lin * [(hidden_dim, hidden_dim)],
      num_non_lin * [u_init],
      axis=1)

  if not sep_x_dropout:
    if tie_x_dropout:
      x = self.apply_dropout_noise(
          x, self.get_dropout_noise((batch_size, input_dim), drop_x))
    else:
      x = self.dropout(x, drop_x)
    lin_x = tt.dot(x, W) + b  # (timesteps, batch_size, [3|4]*hidden_dim)
  else:
    if tie_x_dropout:
      x_for_f = self.apply_dropout_noise(
          x, self.get_dropout_noise((batch_size, input_dim), drop_x))
      x_for_o = self.apply_dropout_noise(
          x, self.get_dropout_noise((batch_size, input_dim), drop_x))
      if num_gates == 3:
        x_for_i = self.apply_dropout_noise(
            x, self.get_dropout_noise((batch_size, input_dim), drop_x))
      x_for_g = self.apply_dropout_noise(
          x, self.get_dropout_noise((batch_size, input_dim), drop_x))
    else:
      x_for_f = self.dropout(x, drop_x)
      x_for_o = self.dropout(x, drop_x)
      if num_gates == 3:
        x_for_i = self.dropout(x, drop_x)
      x_for_g = self.dropout(x, drop_x)
    lin_x_tensors = [
        tt.dot(x_for_f, W[:, :hidden_dim]),
        tt.dot(x_for_o, W[:, hidden_dim:2 * hidden_dim])
    ]
    if num_gates == 3:
      lin_x_tensors.append(
          tt.dot(x_for_i, W[:, 2 * hidden_dim:3 * hidden_dim]))
    lin_x_tensors.append(tt.dot(x_for_g, W[:, num_gates * hidden_dim:]))
    lin_x = tt.concatenate(
        lin_x_tensors, axis=2) + b  # (timesteps, batch_size, [3|4]*hidden_dim)

  def step_fn(lin_x_t, x_mask_t, h_tm1, c_tm1, h_noise, U):
    # lin_x_t   (batch_size, [3|4]*hidden_dim)
    # x_mask_t  (batch_size, 1)
    # h_tm1     (batch_size, hidden_dim)
    # c_tm1     (batch_size, hidden_dim)
    # h_noise   (batch_size, [1|3|4]*hidden_dim)
    #           1 if not sep_h_dropout, otherwise 3 or 4 depending on num_non_lin
    # U         (hidden_dim, [3|4]*hidden_dim)
    if not sep_h_dropout:
      h_tm1 = self.apply_dropout_noise(h_tm1, h_noise)
      lin_h_tm1 = tt.dot(h_tm1, U)  # (batch_size, [3|4]*hidden_dim)
    else:
      h_tm1_for_f = self.apply_dropout_noise(h_tm1, h_noise[:, :hidden_dim])
      h_tm1_for_o = self.apply_dropout_noise(
          h_tm1, h_noise[:, hidden_dim:2 * hidden_dim])
      if num_gates == 3:
        h_tm1_for_i = self.apply_dropout_noise(
            h_tm1, h_noise[:, 2 * hidden_dim:3 * hidden_dim])
      h_tm1_for_g = self.apply_dropout_noise(
          h_tm1, h_noise[:, num_gates * hidden_dim:])
      lin_h_tm1_tensors = [
          tt.dot(h_tm1_for_f, U[:, :hidden_dim]),
          tt.dot(h_tm1_for_o, U[:, hidden_dim:2 * hidden_dim])
      ]
      if num_gates == 3:
        lin_h_tm1_tensors.append(
            tt.dot(h_tm1_for_i, U[:, 2 * hidden_dim:3 * hidden_dim]))
      lin_h_tm1_tensors.append(
          tt.dot(h_tm1_for_g, U[:, num_gates * hidden_dim:]))
      lin_h_tm1 = tt.concatenate(
          lin_h_tm1_tensors, axis=1)  # (batch_size, [3|4]*hidden_dim)

    lin = lin_x_t + lin_h_tm1  # (batch_size, [3|4]*hidden_dim)

    gates = tt.nnet.sigmoid(
        lin[:, :num_gates * hidden_dim])  # (batch_size, [2|3]*hidden_dim)
    f_gate = gates[:, :hidden_dim]  # (batch_size, hidden_dim)
    o_gate = gates[:, hidden_dim:2 * hidden_dim]  # (batch_size, hidden_dim)
    i_gate = gates[:, 2 * hidden_dim:] if num_gates == 3 else 1 - f_gate  # (batch_size, hidden_dim)

    g = tt.tanh(lin[:, num_gates * hidden_dim:])  # (batch_size, hidden_dim)

    c_t = f_gate * c_tm1 + i_gate * g
    h_t = o_gate * tt.tanh(c_t)

    h_t = tt.switch(x_mask_t, h_t, h_tm1)
    c_t = tt.switch(x_mask_t, c_t, c_tm1)
    return h_t, c_t
  # end of step_fn

  if learn_initial_state:
    h0 = self.make_param(n('h0'), (hidden_dim,), 0)
    c0 = self.make_param(n('c0'), (hidden_dim,), 0)
    batch_h0 = tt.extra_ops.repeat(h0[None, :], batch_size, axis=0)
    batch_c0 = tt.extra_ops.repeat(c0[None, :], batch_size, axis=0)
  else:
    batch_h0 = batch_c0 = tt.zeros((batch_size, hidden_dim))

  x_mask = tt.shape_padright(x_mask)  # (timesteps, batch_size, 1)
  original_x_mask = x_mask
  if backward:
    lin_x = lin_x[::-1]
    x_mask = x_mask[::-1]

  h_noise = self.get_dropout_noise(
      (batch_size,
       hidden_dim if not sep_h_dropout else num_non_lin * hidden_dim),
      drop_h)

  results, _ = theano.scan(step_fn,
                           sequences=[lin_x, x_mask],
                           outputs_info=[batch_h0, batch_c0],
                           non_sequences=[h_noise, U],
                           name=n('scan'))
  h = results[0]  # (timesteps, batch_size, hidden_dim)
  last_h = h[-1]  # (batch_size, hidden_dim)
  if backward:
    h = h[::-1]
  h *= original_x_mask

  if return_last:
    return h, last_h
  return h
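# A numpy sketch (illustration only; these helpers are hypothetical and not
# the module's apply_dropout_noise / get_dropout_noise) of the tied dropout
# used above: a mask of shape (batch_size, dim) is sampled once per sequence
# and reused at every timestep, which is what tie_x_dropout and the single
# h_noise non-sequence passed to the scan achieve (variational dropout in the
# sense of Gal 2015), as opposed to sampling a fresh mask at each step.
def _example_tied_dropout(x, drop_p, rng):
  # x: (timesteps, batch_size, dim); rng: a numpy RandomState
  keep_p = 1.0 - drop_p
  batch_size, dim = x.shape[1], x.shape[2]
  # one mask shared by all timesteps, rescaled so expected activations match
  noise = rng.binomial(1, keep_p, size=(batch_size, dim)) / keep_p
  return x * noise[None, :, :]

# Similarly, the tt.switch masking in step_fn can be pictured in numpy: at a
# padded timestep (mask 0) the recurrence keeps the previous state, so trailing
# padding never corrupts h or c.
def _example_masked_step(h_t, h_tm1, mask_t):
  # mask_t: (batch_size, 1) of 0/1; broadcasting picks h_t where the mask is 1
  return mask_t * h_t + (1 - mask_t) * h_tm1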