def sg_rnn(tensor, opt):
    r"""Applies a simple rnn.

    Args:
      tensor: A 3-D `Tensor`.
      in_dim: A positive `integer`. The size of input dimension.
      dim: A positive `integer`. The size of output dimension.
      bias: Boolean. If True, biases are added.
      ln: Boolean. If True, layer normalization is applied.
      init_state: A 2-D `Tensor`. If None, the initial state is set to zeros.
      last_only: Boolean. If True, the outputs in the last time step are returned.

    Returns:
      A `Tensor`. If last_only is False, the output tensor has shape
      [batch size, time steps, dim]. If last_only is True, the shape will be
      [batch size, dim].
    """
    # layer normalization
    ln = lambda v: _ln_rnn(v, gamma, beta) if opt.ln else v

    # step function
    def step(h, x):
        # simple rnn
        y = ln(tf.matmul(x, w) + tf.matmul(h, u) + (b if opt.bias else 0))
        return y

    # parameter initialize
    w = init.orthogonal('W', (opt.in_dim, opt.dim))
    u = init.identity('U', opt.dim)
    if opt.bias:
        b = init.constant('b', opt.dim)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = init.constant('beta', opt.dim)
        gamma = init.constant('gamma', opt.dim, value=1)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # do rnn loop
    h, out = init_h, []
    for i in range(tensor.get_shape().as_list()[1]):
        # apply step func
        h = step(h, tensor[:, i, :])
        # save result
        out.append(h.sg_expand_dims(dim=1))

    # merge tensor
    if opt.last_only:
        out = out[-1].sg_squeeze(dim=1)
    else:
        out = tf.concat(1, out)

    return out
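# The loop-based layers above and below delegate layer normalization to a helper
# _ln_rnn(x, gamma, beta) that is not shown in this section. The sketch below is
# an assumption, not the library's exact implementation: it simply mirrors the
# inline layer-normalization code of the scan-based variant that follows
# (normalize over the last axis, then apply the gamma/beta scale and offset).
def _ln_rnn(x, gamma, beta):
    # calc layer mean, variance for final axis
    mean, variance = tf.nn.moments(x, axes=[len(x.get_shape()) - 1])
    # apply layer normalization ( explicit broadcasting needed )
    broadcast_shape = [-1] + [1] * (len(x.get_shape()) - 1)
    x = (x - tf.reshape(mean, broadcast_shape)) \
        / tf.reshape(tf.sqrt(variance + tf.sg_eps), broadcast_shape)
    # apply offset, scale parameter
    return gamma * x + beta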
def sg_rnn(tensor, opt):
    # parameter initialize
    w = init.orthogonal('W', (opt.in_dim, opt.dim))
    u = init.identity('U', opt.dim)
    if opt.bias:
        b = init.constant('b', opt.dim)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = init.constant('beta', opt.dim)
        gamma = init.constant('gamma', opt.dim, value=1)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # permute dimension for scan loop
    xx = tf.transpose(tensor, [1, 0, 2])

    # step func
    def step(h, x):

        # layer normalization
        def ln(xx, opt):
            if opt.ln:
                # calc layer mean, variance for final axis
                mean, variance = tf.nn.moments(xx, axes=[len(xx.get_shape()) - 1])
                # apply layer normalization ( explicit broadcasting needed )
                broadcast_shape = [-1] + [1] * (len(xx.get_shape()) - 1)
                xx = (xx - tf.reshape(mean, broadcast_shape)) \
                    / tf.reshape(tf.sqrt(variance + tf.sg_eps), broadcast_shape)
                # apply parameter
                return gamma * xx + beta
            else:
                return xx

        # apply transform
        y = ln(tf.matmul(x, w) + tf.matmul(h, u) + (b if opt.bias else 0), opt)

        return y

    # loop by scan
    out = tf.scan(step, xx, init_h)

    # recover dimension
    out = tf.transpose(out, [1, 0, 2])

    # last sequence only
    if opt.last_only:
        out = out[:, tensor.get_shape().as_list()[1] - 1, :]

    return out
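# For reference, a tiny self-contained illustration of the tf.scan call used
# above: scan threads the carried value (standing in for the hidden state)
# across the leading time axis and emits one output per step, which is why the
# input is transposed to time-major form first. This is only a usage sketch of
# tf.scan itself; the helper name _scan_demo is illustrative and not part of
# the library.
def _scan_demo():
    elems = tf.constant([[1.], [2.], [3.]])   # 3 time steps, 1 feature
    init = tf.constant([0.])                  # initial carried state
    # running sum: h_t = h_{t-1} + x_t, one output row per time step
    out = tf.scan(lambda h, x: h + x, elems, initializer=init)
    return out  # evaluates to [[1.], [3.], [6.]]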
def sg_rnn(tensor, opt):
    # layer normalization
    ln = lambda v: _ln_rnn(v, gamma, beta) if opt.ln else v

    # step function
    def step(h, x):
        # simple rnn
        y = ln(tf.matmul(x, w) + tf.matmul(h, u) + (b if opt.bias else 0))
        return y

    # parameter initialize
    w = init.orthogonal('W', (opt.in_dim, opt.dim))
    u = init.identity('U', opt.dim)
    if opt.bias:
        b = init.constant('b', opt.dim)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = init.constant('beta', opt.dim)
        gamma = init.constant('gamma', opt.dim, value=1)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # do rnn loop
    h, out = init_h, []
    for i in range(tensor.get_shape().as_list()[1]):
        # apply step func
        h = step(h, tensor[:, i, :])
        # save result
        out.append(h.sg_expand_dims(dim=1))

    # merge tensor
    if opt.last_only:
        out = out[-1].sg_squeeze(dim=1)
    else:
        out = tf.concat(1, out)

    return out
def sg_lstm(tensor, opt):
    r"""Applies an LSTM.

    Args:
      tensor: A 3-D `Tensor`.
      in_dim: A positive `integer`. The size of input dimension.
      dim: A positive `integer`. The size of output dimension.
      bias: Boolean. If True, biases are added.
      ln: Boolean. If True, layer normalization is applied.
      init_state: A 2-D `Tensor`. If None, the initial state is set to zeros.
      last_only: Boolean. If True, the outputs in the last time step are returned.

    Returns:
      A `Tensor`. If last_only is False, the output tensor has shape
      [batch size, time steps, dim]. If last_only is True, the shape will be
      [batch size, dim].
    """
    # layer normalization
    ln = lambda v: _ln_rnn(v, gamma, beta) if opt.ln else v

    # step func
    def step(h, c, x):
        # forget gate
        f = tf.sigmoid(ln(tf.matmul(x, w_f) + tf.matmul(h, u_f) +
                          (b_f if opt.bias else 0)))
        # input gate
        i = tf.sigmoid(ln(tf.matmul(x, w_i) + tf.matmul(h, u_i) +
                          (b_i if opt.bias else 0)))
        # new cell value
        cc = tf.tanh(ln(tf.matmul(x, w_c) + tf.matmul(h, u_c) +
                        (b_c if opt.bias else 0)))
        # out gate
        o = tf.sigmoid(ln(tf.matmul(x, w_o) + tf.matmul(h, u_o) +
                          (b_o if opt.bias else 0)))
        # cell update
        cell = f * c + i * cc
        # final output
        y = o * tf.tanh(cell)
        return y, cell

    # parameter initialize
    w_i = init.orthogonal('W_i', (opt.in_dim, opt.dim))
    u_i = init.identity('U_i', opt.dim)
    w_f = init.orthogonal('W_f', (opt.in_dim, opt.dim))
    u_f = init.identity('U_f', opt.dim)
    w_o = init.orthogonal('W_o', (opt.in_dim, opt.dim))
    u_o = init.identity('U_o', opt.dim)
    w_c = init.orthogonal('W_c', (opt.in_dim, opt.dim))
    u_c = init.identity('U_c', opt.dim)
    if opt.bias:
        b_i = init.constant('b_i', opt.dim)
        b_f = init.constant('b_f', opt.dim)
        b_o = init.constant('b_o', opt.dim, value=1)
        b_c = init.constant('b_c', opt.dim)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = init.constant('beta', opt.dim)
        gamma = init.constant('gamma', opt.dim, value=1)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # do rnn loop
    h, c, out = init_h, init_h, []
    for i in range(tensor.get_shape().as_list()[1]):
        # apply step function
        h, c = step(h, c, tensor[:, i, :])
        # save result
        out.append(h.sg_expand_dims(dim=1))

    # merge tensor
    if opt.last_only:
        out = out[-1].sg_squeeze(dim=1)
    else:
        out = tf.concat(1, out)

    return out
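# A hypothetical call sketch for sg_rnn / sg_lstm above, showing the shape
# contract described in the docstrings. opt here is any attribute container
# exposing the documented options; argparse.Namespace and the helper name
# _rnn_usage_sketch are used purely for illustration. In the library these
# layers are normally reached through sugartensor's decorated layer API, so
# treat this as a sketch rather than the official entry point.
def _rnn_usage_sketch():
    from argparse import Namespace
    x = tf.zeros((16, 10, 28), dtype=tf.sg_floatx)        # [batch, time, in_dim]
    seq_opt = Namespace(in_dim=28, dim=64, bias=True, ln=False,
                        init_state=None, last_only=False)
    last_opt = Namespace(in_dim=28, dim=64, bias=True, ln=False,
                         init_state=None, last_only=True)
    full = sg_rnn(x, seq_opt)     # [16, 10, 64] -- every time step
    last = sg_lstm(x, last_opt)   # [16, 64]     -- last time step only
    return full, last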
def sg_gru(tensor, opt):
    r"""Applies a GRU.

    Args:
      tensor: A 3-D `Tensor`.
      in_dim: A positive `integer`. The size of input dimension.
      dim: A positive `integer`. The size of output dimension.
      bias: Boolean. If True, biases are added.
      ln: Boolean. If True, layer normalization is applied.
      init_state: A 2-D `Tensor`. If None, the initial state is set to zeros.
      last_only: Boolean. If True, the outputs in the last time step are returned.

    Returns:
      A `Tensor`. If last_only is False, the output tensor has shape
      [batch size, time steps, dim]. If last_only is True, the shape will be
      [batch size, dim].
    """
    # layer normalization
    ln = lambda v: _ln_rnn(v, gamma, beta) if opt.ln else v

    # step func
    def step(h, x):
        # update gate
        z = tf.sigmoid(ln(tf.matmul(x, w_z) + tf.matmul(h, u_z) +
                          (b_z if opt.bias else 0)))
        # reset gate
        r = tf.sigmoid(ln(tf.matmul(x, w_r) + tf.matmul(h, u_r) +
                          (b_r if opt.bias else 0)))
        # h_hat
        hh = tf.tanh(ln(tf.matmul(x, w_h) + tf.matmul(r * h, u_h) +
                        (b_h if opt.bias else 0)))
        # final output
        y = (1. - z) * h + z * hh
        return y

    # parameter initialize
    w_z = init.orthogonal('W_z', (opt.in_dim, opt.dim))
    u_z = init.identity('U_z', opt.dim)
    w_r = init.orthogonal('W_r', (opt.in_dim, opt.dim))
    u_r = init.identity('U_r', opt.dim)
    w_h = init.orthogonal('W_h', (opt.in_dim, opt.dim))
    u_h = init.identity('U_h', opt.dim)
    if opt.bias:
        b_z = init.constant('b_z', opt.dim)
        b_r = init.constant('b_r', opt.dim)
        b_h = init.constant('b_h', opt.dim)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = init.constant('beta', opt.dim)
        gamma = init.constant('gamma', opt.dim, value=1)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # do rnn loop
    h, out = init_h, []
    for i in range(tensor.get_shape().as_list()[1]):
        # apply step function
        h = step(h, tensor[:, i, :])
        # save result
        out.append(h.sg_expand_dims(dim=1))

    # merge tensor
    if opt.last_only:
        out = out[-1].sg_squeeze(dim=1)
    else:
        out = tf.concat(1, out)

    return out
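# A hypothetical two-layer stack built from sg_gru above: the full
# [batch, time, dim] sequence from the first GRU feeds a second GRU whose
# in_dim equals the first layer's dim, and the second layer keeps only the
# last time step. Option containers and the helper name _stacked_gru_sketch
# are illustrative only; the sketch is meant to show the shape contract, not
# the library's recommended way of stacking layers.
def _stacked_gru_sketch():
    from argparse import Namespace
    x = tf.zeros((16, 10, 28), dtype=tf.sg_floatx)       # [batch, time, in_dim]
    opt1 = Namespace(in_dim=28, dim=64, bias=True, ln=True,
                     init_state=None, last_only=False)
    opt2 = Namespace(in_dim=64, dim=32, bias=True, ln=True,
                     init_state=None, last_only=True)
    h1 = sg_gru(x, opt1)      # [16, 10, 64] -- full sequence
    h2 = sg_gru(h1, opt2)     # [16, 32]     -- last step only
    return h2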
def sg_gru(tensor, opt):
    # parameter initialize
    w_z = init.orthogonal('W_z', (opt.in_dim, opt.dim))
    u_z = init.identity('U_z', opt.dim)
    w_r = init.orthogonal('W_r', (opt.in_dim, opt.dim))
    u_r = init.identity('U_r', opt.dim)
    w_h = init.orthogonal('W_h', (opt.in_dim, opt.dim))
    u_h = init.identity('U_h', opt.dim)
    if opt.bias:
        b_z = init.constant('b_z', opt.dim)
        b_r = init.constant('b_r', opt.dim)
        b_h = init.constant('b_h', opt.dim)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = init.constant('beta', opt.dim)
        gamma = init.constant('gamma', opt.dim, value=1)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # permute dimension for scan loop
    xx = tf.transpose(tensor, [1, 0, 2])

    # step func
    def step(h, x):

        # layer normalization
        def ln(xx, opt):
            if opt.ln:
                # calc layer mean, variance for final axis
                mean, variance = tf.nn.moments(xx, axes=[len(xx.get_shape()) - 1])
                # apply layer normalization ( explicit broadcasting needed )
                broadcast_shape = [-1] + [1] * (len(xx.get_shape()) - 1)
                xx = (xx - tf.reshape(mean, broadcast_shape)) \
                    / tf.reshape(tf.sqrt(variance + tf.sg_eps), broadcast_shape)
                # apply parameter
                return gamma * xx + beta
            else:
                return xx

        # update gate
        z = tf.sigmoid(ln(tf.matmul(x, w_z) + tf.matmul(h, u_z) +
                          (b_z if opt.bias else 0), opt))
        # reset gate
        r = tf.sigmoid(ln(tf.matmul(x, w_r) + tf.matmul(h, u_r) +
                          (b_r if opt.bias else 0), opt))
        # h_hat
        hh = tf.tanh(ln(tf.matmul(x, w_h) + tf.matmul(r * h, u_h) +
                        (b_h if opt.bias else 0), opt))
        # final output
        y = (1. - z) * h + z * hh

        return y

    # loop by scan
    out = tf.scan(step, xx, init_h)

    # recover dimension
    out = tf.transpose(out, [1, 0, 2])

    # last sequence only
    if opt.last_only:
        out = out[:, tensor.get_shape().as_list()[1] - 1, :]

    return out
def sg_lstm(tensor, opt):
    # layer normalization
    ln = lambda v: _ln_rnn(v, gamma, beta) if opt.ln else v

    # step func
    def step(h, c, x):
        # forget gate
        f = tf.sigmoid(ln(tf.matmul(x, w_f) + tf.matmul(h, u_f) +
                          (b_f if opt.bias else 0)))
        # input gate
        i = tf.sigmoid(ln(tf.matmul(x, w_i) + tf.matmul(h, u_i) +
                          (b_i if opt.bias else 0)))
        # new cell value
        cc = tf.tanh(ln(tf.matmul(x, w_c) + tf.matmul(h, u_c) +
                        (b_c if opt.bias else 0)))
        # out gate
        o = tf.sigmoid(ln(tf.matmul(x, w_o) + tf.matmul(h, u_o) +
                          (b_o if opt.bias else 0)))
        # cell update
        cell = f * c + i * cc
        # final output
        y = o * tf.tanh(cell)
        return y, cell

    # parameter initialize
    w_i = init.orthogonal('W_i', (opt.in_dim, opt.dim))
    u_i = init.identity('U_i', opt.dim)
    w_f = init.orthogonal('W_f', (opt.in_dim, opt.dim))
    u_f = init.identity('U_f', opt.dim)
    w_o = init.orthogonal('W_o', (opt.in_dim, opt.dim))
    u_o = init.identity('U_o', opt.dim)
    w_c = init.orthogonal('W_c', (opt.in_dim, opt.dim))
    u_c = init.identity('U_c', opt.dim)
    if opt.bias:
        b_i = init.constant('b_i', opt.dim)
        b_f = init.constant('b_f', opt.dim)
        b_o = init.constant('b_o', opt.dim, value=1)
        b_c = init.constant('b_c', opt.dim)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = init.constant('beta', opt.dim)
        gamma = init.constant('gamma', opt.dim, value=1)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # do rnn loop
    h, c, out = init_h, init_h, []
    for i in range(tensor.get_shape().as_list()[1]):
        # apply step function
        h, c = step(h, c, tensor[:, i, :])
        # save result
        out.append(h.sg_expand_dims(dim=1))

    # merge tensor
    if opt.last_only:
        out = out[-1].sg_squeeze(dim=1)
    else:
        out = tf.concat(1, out)

    return out
def sg_gru(tensor, opt):
    # layer normalization
    ln = lambda v: _ln_rnn(v, gamma, beta) if opt.ln else v

    # step func
    def step(h, x):
        # update gate
        z = tf.sigmoid(ln(tf.matmul(x, w_z) + tf.matmul(h, u_z) +
                          (b_z if opt.bias else 0)))
        # reset gate
        r = tf.sigmoid(ln(tf.matmul(x, w_r) + tf.matmul(h, u_r) +
                          (b_r if opt.bias else 0)))
        # h_hat
        hh = tf.tanh(ln(tf.matmul(x, w_h) + tf.matmul(r * h, u_h) +
                        (b_h if opt.bias else 0)))
        # final output
        y = (1. - z) * h + z * hh
        return y

    # parameter initialize
    w_z = init.orthogonal('W_z', (opt.in_dim, opt.dim))
    u_z = init.identity('U_z', opt.dim)
    w_r = init.orthogonal('W_r', (opt.in_dim, opt.dim))
    u_r = init.identity('U_r', opt.dim)
    w_h = init.orthogonal('W_h', (opt.in_dim, opt.dim))
    u_h = init.identity('U_h', opt.dim)
    if opt.bias:
        b_z = init.constant('b_z', opt.dim)
        b_r = init.constant('b_r', opt.dim)
        b_h = init.constant('b_h', opt.dim)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = init.constant('beta', opt.dim)
        gamma = init.constant('gamma', opt.dim, value=1)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # do rnn loop
    h, out = init_h, []
    for i in range(tensor.get_shape().as_list()[1]):
        # apply step function
        h = step(h, tensor[:, i, :])
        # save result
        out.append(h.sg_expand_dims(dim=1))

    # merge tensor
    if opt.last_only:
        out = out[-1].sg_squeeze(dim=1)
    else:
        out = tf.concat(1, out)

    return out