def __init__(self,
             in_dim,
             dim,
             forget_bias=1.0,
             activation=tf.tanh,
             ln=True,
             bias=True,
             dtype=tf.float32,
             dev='/cpu:0',
             batch_size=3):
    """Store the cell's hyper-parameters and create its variables.

    Variables are created under the ``"lstm"`` variable scope (relative to
    the scope active at construction time) and fetched again by name in
    ``_linear``.

    Args:
        in_dim: Number of rows of the input-to-hidden weight matrix.
        dim: State width; the weight matrices have ``4 * dim`` columns.
        forget_bias: Stored on the instance only.
        activation: Stored as the cell activation function.
        ln: Accepted but ignored: ``self._ln`` is hard-wired to ``False``,
            so the layer-normalisation variables below are never created.
        bias: When it compares equal to ``True`` a ``w_b`` variable of shape
            ``(1, 4 * dim)`` is created.
        dtype: Stored on the instance; the variables use fixed dtypes.
        dev: Device string passed to ``tf.device``.
        batch_size: Leading dimension of the zero initial states.
    """
    self._in_dim = in_dim
    self._dim = dim
    self._forget_bias = forget_bias
    self._activation = activation
    # NOTE(review): the `ln` argument is deliberately not consulted here.
    self._ln = False
    self._bias = bias
    self._dev = dev
    self._size = self._in_dim * self._dim
    self._initializer = tf.contrib.layers.xavier_initializer()
    self._dtype = dtype

    def _xavier_variable(name, shape):
        # Trainable float32 variable with a fresh Xavier initializer.
        return tf.get_variable(
            name,
            shape,
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(),
            trainable=True)

    def _constant_variable(name, value):
        # Trainable per-unit vector initialised to a constant.
        return tf.get_variable(
            name,
            self._dim,
            dtype=tf.sg_floatx,
            initializer=tf.constant_initializer(value),
            trainable=True)

    with tf.device(self._dev):
        with tf.variable_scope("lstm"):
            # Zero initial cell state and hidden state.
            state_shape = (batch_size, self._dim)
            self.rnn_state = tf.zeros(state_shape, dtype=tf.sg_floatx)
            self.rnn_h = tf.zeros(state_shape, dtype=tf.sg_floatx)

            gate_width = 4 * self._dim
            _xavier_variable('w_i2h', (self._in_dim, gate_width))
            _xavier_variable('w_h2h', (self._dim, gate_width))
            # Explicit comparison kept: only a value equal to True enables
            # the bias, matching the test used when it is read back.
            if self._bias == True:
                _xavier_variable('w_b', (1, gate_width))

            if self._ln:
                with tf.variable_scope("ln_rnn"):
                    _constant_variable('beta', 0.0)
                    _constant_variable('gamma', 1.0)
def sg_reuse(tensor, **opt):
    """Rebuild the layer chain that produced `tensor` on top of a new input.

    The chain is recovered by following ``_sugar.prev`` links backwards from
    `tensor`; every node after the head is then re-applied to ``opt.input``
    with ``reuse=True`` so the existing variables are shared.

    Args:
        tensor: A tensor carrying a ``_sugar`` record.
        **opt:
            input: The tensor that replaces the original chain input.

    Returns:
        The output of the rebuilt chain.
    """
    opt = tf.sg_opt(opt)
    assert hasattr(tensor, '_sugar'), 'cannot reuse this node.'
    assert opt.input is not None, 'input is mandatory.'

    # Collect tail -> head, then flip so the head comes first.
    chain = [tensor]
    cursor = tensor._sugar.prev
    while cursor is not None:
        chain.append(cursor)
        cursor = cursor._sugar.prev if hasattr(cursor, '_sugar') else None
    chain.reverse()

    out = opt.input
    for node in chain[1:]:  # the head node is the input being replaced
        sugar = node._sugar
        if not sugar.is_layer:
            out = sugar.func(out, sugar.arg)
            continue

        fn = tf.sg_layer_func(sugar.func)

        def _rebuild(current):
            # Same layer, same name, variables shared.
            return fn(current,
                      **(sugar.arg + tf.sg_opt(name=sugar.name, reuse=True)))

        if sugar.arg.context_name:
            with tf.variable_scope(sugar.arg.context_name):
                out = _rebuild(out)
        else:
            out = _rebuild(out)

    return out
def linear(input_, output_size, scope=None):
    '''
    Affine map of a 2D tensor: ``input_ @ transpose(Matrix) + Bias``.

    Args:
        input_: a 2D tensor whose static shape has a known second dimension.
        output_size: int, number of output columns.
        scope: variable scope for the created variables; defaults to
            "SimpleLinear".

    Returns:
        A 2D tensor with ``output_size`` columns.

    Raises:
        ValueError: if `input_` is not 2D or its second dimension is
            unknown/zero.
    '''
    dims = input_.get_shape().as_list()
    if len(dims) != 2:
        raise ValueError("Linear is expecting 2D arguments: %s" % str(dims))

    input_size = dims[1]
    if not input_size:
        raise ValueError("Linear expects shape[1] of arguments: %s" %
                         str(dims))

    # "Matrix" is stored as [output_size, input_size], hence the transpose.
    with tf.variable_scope(scope or "SimpleLinear"):
        weights = tf.get_variable("Matrix", [output_size, input_size],
                                  dtype=input_.dtype)
        offset = tf.get_variable("Bias", [output_size], dtype=input_.dtype)

    projected = tf.matmul(input_, tf.transpose(weights))
    return projected + offset
def symbols_to_logits_fn(ids, dec_state):
    """Advance the decoder one step for every beam and return the logits.

    Relies on names from the enclosing scope: ``Hp``, ``beam_size``,
    ``reuse_vars``, ``emb_word``, ``dec_cell`` and ``dev``.

    Args:
        ids: Decoded ids so far; reshaped to (batch, beam_size, -1).
        dec_state: State tuple with ``c`` and ``h`` members indexed as
            ``[:, beam, :]``.

    Returns:
        A pair ``(logits, state)`` where both are stacked along axis 1
        (one slice per beam) and ``state`` is an ``LSTMStateTuple``.
    """
    # (batch x beam_size x decoded_seq)
    ids = tf.reshape(ids, [Hp.batch_size, beam_size, -1])
    print("dec_state ", dec_state[0].get_shape().as_list())

    beam_logits, beam_c, beam_h = [], [], []
    for beam in range(beam_size):
        # Every beam after the first shares the variables of the first one.
        share = beam > 0 or reuse_vars

        with tf.variable_scope('dec_lstm', reuse=share):
            last_word = ids[:, beam, -1].sg_lookup(emb=emb_word)
            beam_state = tf.contrib.rnn.LSTMStateTuple(
                c=dec_state.c[:, beam, :], h=dec_state.h[:, beam, :])
            cell_out, next_state = dec_cell(last_word, beam_state)

        # NOTE(review): indentation was lost in the source; the output
        # projection is kept outside the 'dec_lstm' scope — confirm.
        cell_out = tf.expand_dims(cell_out, 1)
        logits = cell_out.sg_conv1d_gpus(size=1,
                                         dim=Hp.word_vs,
                                         name="out_conv",
                                         act="linear",
                                         dev=dev,
                                         reuse=share)

        beam_logits.append(tf.squeeze(logits, 1))
        beam_c.append(next_state[0])
        beam_h.append(next_state[1])

    stacked_state = tf.contrib.rnn.LSTMStateTuple(tf.stack(beam_c, 1),
                                                  tf.stack(beam_h, 1))
    return tf.stack(beam_logits, 1), stacked_state
def __call__(self, x_t, state, size, scope=None, reuse_vars=False):
    """Run one step of the convolutional LSTM.

    The three element-wise weights ``w_ic``, ``w_fc`` and ``w_oc`` are
    fetched (never created) from `scope`, so they must already exist.

    Args:
        x_t: Input for this step.
        state: Pair ``(prev_c, prev_h)``.
        size: Kernel size forwarded to every convolution.
        scope: Variable scope holding the weights; defaults to the current
            scope.
        reuse_vars: Forwarded as ``reuse`` to the sugartensor context that
            wraps the convolutions.

    Returns:
        The pair ``(new_c, new_h)``.
    """
    prev_c, prev_h = state
    scope = scope or tf.get_variable_scope()
    print("____reuse_______", reuse_vars)

    with tf.variable_scope(scope, reuse=True):
        cell_to_input = tf.get_variable("w_ic")
        cell_to_forget = tf.get_variable("w_fc")
        cell_to_output = tf.get_variable("w_oc")

    with tf.sg_context(dev=self._dev, reuse=reuse_vars):
        input_pre = (x_t.sg_conv1d_gpus(name="ix_", size=size) +
                     prev_h.sg_conv1d_gpus(name="ih_", size=size) +
                     prev_c * cell_to_input)
        # The forget gate is the only one built from sg_aconv1d_gpus.
        forget_pre = (x_t.sg_aconv1d_gpus(name="fx_", size=size) +
                      prev_h.sg_aconv1d_gpus(name="fh_", size=size) +
                      prev_c * cell_to_forget)
        candidate_pre = (x_t.sg_conv1d_gpus(name="cx_", size=size) +
                         prev_h.sg_conv1d_gpus(name="ch_", size=size))
        output_pre = (x_t.sg_conv1d_gpus(name="ox_", size=size) +
                      prev_h.sg_conv1d_gpus(name="oh_", size=size) +
                      prev_c * cell_to_output)

    retained = prev_c * tf.sigmoid(forget_pre)
    written = tf.sigmoid(input_pre) * self._activation(candidate_pre)
    new_c = retained + written
    new_h = self._activation(new_c) * tf.sigmoid(output_pre)
    return (new_c, new_h)
def get_loss(opt):
    """Combine the clean CTC loss, the noisy CTC loss and a penalty term.

    Both logits are built under the ``"model"`` variable scope; the second
    pass opens it with ``reuse=True`` so the two branches share variables.

    Args:
        opt: Options object providing ``input``, ``input_noise``,
            ``target`` and ``seq_len`` (each indexed by ``gpu_index``) plus
            ``alpha``, ``gamma`` and ``lambd``.

    Returns:
        ``loss_clean + alpha * loss_noise + penalty``.
    """
    gpu = opt.gpu_index

    # Clean branch: creates the model variables.
    with tf.variable_scope("model"):
        logit_clean = get_logit(opt.input[gpu], voca_size=voca_size)
        loss_clean = logit_clean.sg_ctc(target=opt.target[gpu],
                                        seq_len=opt.seq_len[gpu])

    # Noisy branch: same variables, different input.
    with tf.variable_scope("model", reuse=True):
        logit_noise = get_logit(opt.input_noise[gpu], voca_size=voca_size)
        loss_noise = logit_noise.sg_ctc(target=opt.target[gpu],
                                        seq_len=opt.seq_len[gpu])

    # Penalty computed from the two logits.
    loss_penalize = penalize_loss(opt.gamma, opt.lambd, logit_clean,
                                  logit_noise)

    weighted_noise = opt.alpha * loss_noise
    return loss_clean + weighted_noise + loss_penalize
def get_logit(x, voca_size):
    """Build the stack of gated dilated convolutions and return its logits.

    Uses the module-level ``num_dim`` and ``num_blocks``. All variables
    live under the ``'wavenet'`` scope, opened with ``tf.AUTO_REUSE``.

    Args:
        x: Input tensor.
        voca_size: Output dimension of the final 1x1 convolution.

    Returns:
        The logit tensor produced by the ``'logit'`` context.
    """
    dilation_rates = [1, 2, 4, 8, 16]

    with tf.variable_scope('wavenet', reuse=tf.AUTO_REUSE):

        def res_block(tensor, size, rate, block, dim=num_dim):
            """Gated block; returns (residual output, skip output)."""
            with tf.sg_context(name='block_%d_%d' % (block, rate)):
                filtered = tensor.sg_aconv1d(size=size,
                                             rate=rate,
                                             act='tanh',
                                             bn=True,
                                             name='conv_filter')
                gate = tensor.sg_aconv1d(size=size,
                                         rate=rate,
                                         act='sigmoid',
                                         bn=True,
                                         name='conv_gate')
                gated = filtered * gate
                skip_out = gated.sg_conv1d(size=1,
                                           dim=dim,
                                           act='tanh',
                                           bn=True,
                                           name='conv_out')
                return skip_out + tensor, skip_out

        # Project the input to num_dim channels.
        with tf.sg_context(name='front'):
            z = x.sg_conv1d(size=1,
                            dim=num_dim,
                            act='tanh',
                            bn=True,
                            name='conv_in')

        # Accumulate the skip output of every block.
        skip = 0
        for block in range(num_blocks):
            for rate in dilation_rates:
                z, block_skip = res_block(z, size=7, rate=rate, block=block)
                skip += block_skip

        # Two 1x1 convolutions turn the summed skips into logits.
        with tf.sg_context(name='logit'):
            hidden = skip.sg_conv1d(size=1,
                                    act='tanh',
                                    bn=True,
                                    name='conv_1')
            logit = hidden.sg_conv1d(size=1, dim=voca_size, name='conv_2')

        return logit
def _linear(self, arys):
    """Return ``arys[0] @ w_i2h + arys[1] @ w_h2h (+ w_b)``.

    The weights are looked up by name in the current variable scope with
    ``reuse=True``; they must have been created beforehand.

    Args:
        arys: Sequence whose first two items are the input and the
            previous hidden state.

    Returns:
        The summed pre-activation tensor.
    """
    current_scope = tf.get_variable_scope()
    with tf.variable_scope(current_scope, reuse=True):
        input_weights = tf.get_variable("w_i2h")
        hidden_weights = tf.get_variable("w_h2h")
        # Only a value equal to True enables the bias; otherwise add 0.
        if self._bias == True:
            bias_term = tf.get_variable("w_b")
        else:
            bias_term = 0

    from_input = tf.matmul(arys[0], input_weights)
    from_hidden = tf.matmul(arys[1], hidden_weights)
    return from_input + from_hidden + bias_term
def embed(inputs, vocab_size, embed_size, variable_scope):
    '''
    Look up `inputs` in a float32 table of shape (vocab_size, embed_size).

    The table is the variable ``lookup_table`` inside `variable_scope`,
    initialised with ``tf.truncated_normal_initializer()``.

    Example:
        inputs = tf.expand_dims(tf.range(5), 0) => (1, 5)
        embed(inputs, 5, 10, 'emb') => (1, 5, 10)
    '''
    table_shape = [vocab_size, embed_size]
    with tf.variable_scope(variable_scope):
        table = tf.get_variable('lookup_table',
                                dtype=tf.float32,
                                shape=table_shape,
                                initializer=tf.truncated_normal_initializer())
        embedded = tf.nn.embedding_lookup(table, inputs)
    return embedded
def sg_context(**kwargs):
    """Install `kwargs` as the global layer options for the enclosed block.

    This is a generator with a single ``yield``; presumably it is wrapped
    with ``contextlib.contextmanager`` outside this view.

    If a ``name`` option is given it is moved to ``context_name`` and the
    block runs inside ``tf.variable_scope(context_name)``.

    The global options are reset to an empty ``tf.sg_opt()`` on exit,
    including when the block raises. Previously an exception left the
    options installed for all later layers. Contexts do not nest: entering
    replaces the current options and leaving clears them.

    Args:
        **kwargs: Options applied to every layer built inside the block.

    Yields:
        None.
    """
    global _context

    # set options when enter
    _context = tf.sg_opt(kwargs)
    try:
        if _context.name:
            _context.context_name = _context.name
            _context.name = None
            with tf.variable_scope(_context.context_name):
                yield
        else:
            yield
    finally:
        # clear options when exit, even on error
        _context = tf.sg_opt()
def __init__(self, seqlen, in_dim, dim, forget_bias=1.0, activation=tf.tanh, ln=True, bias=True, dtype=tf.float32, dev='/cpu:0', batch_size=3):
    """Store the cell's hyper-parameters and create its element-wise weights.

    Three trainable float32 variables of shape ``(seqlen, dim)`` named
    ``w_ic``, ``w_fc`` and ``w_oc`` are created under the ``"clstm"``
    variable scope on device `dev`; afterwards ``self.make_states`` is
    called with `batch_size`.

    Args:
        seqlen: First dimension of the three weight variables.
        in_dim: Stored; also used to compute ``self._size``.
        dim: Second dimension of the three weight variables.
        forget_bias: Stored on the instance only.
        activation: Stored as the cell activation function.
        ln: Stored on the instance only.
        bias: Stored on the instance only.
        dtype: Stored on the instance; the variables are created as
            ``tf.float32`` regardless.
        dev: Device string passed to ``tf.device``.
        batch_size: Forwarded to ``self.make_states``.
    """
    self._in_dim = in_dim
    self._dim = dim
    self._forget_bias = forget_bias
    self._activation = activation
    self._ln = ln
    self._dev = dev
    self._seqlen = seqlen
    self._bias = bias
    self._size = int(self._in_dim * self._dim)
    self._initializer = tf.contrib.layers.xavier_initializer(
    )  #tf.random_normal_initializer()
    self._dtype = dtype
    with tf.device(self._dev):
        with tf.variable_scope("clstm") as scp:
            #self.crnn_state = tf.get_variable("crnn_c",(batch_size, seqlen, self._dim), dtype=tf.sg_floatx,initializer=tf.constant_initializer(0.0),trainable=False)
            #self.crnn_h = tf.get_variable("crnn_h",(batch_size, seqlen, self._dim), dtype=tf.sg_floatx,initializer=tf.constant_initializer(0.0),trainable=False)
            # The Python names below are not used again in this method; the
            # variables are registered in the graph under their string names.
            w_ic = tf.get_variable(
                'w_ic', (self._seqlen, self._dim),
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(),
                trainable=True)
            w_fc = tf.get_variable(
                'w_fc', (self._seqlen, self._dim),
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(),
                trainable=True)
            w_oc = tf.get_variable(
                'w_oc', (self._seqlen, self._dim),
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(),
                trainable=True)
            # NOTE(review): the original indentation was lost; this call is
            # assumed to sit inside the device/variable scope — confirm.
            self.make_states(batch_size)
def sg_context(**kwargs):
    r"""Context helper for computational graph building.

    Makes all elements within the with Block share the parameters.

    For example, in the following example, the default value of parameter
    `bn` will be set to True in the all layers within the with block.

    ```
    with tf.sg_context(bn=True):
        ...
        ...
    ```

    The options are pushed onto the global context stack on entry and
    removed again on exit. The removal now also happens when the block
    raises; previously an exception left a stale entry on the stack.

    Args:
      **kwargs:
        in_dim: An integer. The size of input dimension, which is set to the last one by default.
        dim: An integer. The size of output dimension. Has the same value as in_dim by default.
        bn: Boolean. If True, batch normalization is applied.
        ln: Boolean. If True, layer normalization is applied.
        dout: A float of range [0, 100). A dropout rate. Default is 0..
        bias: Boolean. If True (Default), biases are added.
        name: A name for the layer. By default, the function name is assigned.
        act: A name of activation function. e.g., `sigmoid`, `tanh`, etc.
        reuse: `True` or `None`; if `True`, we go into reuse mode for this `layer` scope
          as well as all sub-scopes; if `None`, we just inherit the parent scope reuse.

    Returns:
      None
    """
    global _context

    # set options when enter
    context_now = tf.sg_opt(kwargs)
    _context.append(context_now)

    try:
        # if named context
        if context_now.name:
            context_now.scope_name = context_now.name
            context_now.name = None
            with tf.variable_scope(context_now.scope_name):
                yield
        else:
            yield
    finally:
        # clear options when exit, even if the block raised
        del _context[-1]
def __call__(self, tensor, state, scope=None):
    """Run one LSTM step and return the new ``(cell, hidden)`` pair.

    Args:
        tensor: Input for this step.
        state: Pair ``(prev_c, prev_h)``.
        scope: Unused.

    Returns:
        The pair ``(new_c, new_h)``.
    """
    prev_c, prev_h = state

    # One matmul produces all four gate pre-activations side by side:
    # input gate, candidate cell value, forget gate, output gate.
    gates = self._linear([tensor, prev_h])
    in_gate, candidate, forget_gate, out_gate = tf.split(
        value=gates, num_or_size_splits=4, axis=1)

    if self._ln:
        with tf.variable_scope("ln_rnn", reuse=True):
            beta = tf.get_variable('beta')
            gamma = tf.get_variable('gamma')

        def normalize(value):
            return _ln_rnn(value, gamma, beta)
    else:

        def normalize(value):
            return value

    retained = prev_c * tf.sigmoid(normalize(forget_gate))
    written = tf.sigmoid(normalize(in_gate)) * self._activation(
        normalize(candidate))
    new_c = retained + written
    new_h = self._activation(new_c) * tf.sigmoid(normalize(out_gate))
    return (new_c, new_h)
def sg_reuse(tensor, **opt):
    r"""Re-apply the layers behind `tensor` to a different input.

    Walks the ``_sugar.prev`` links from `tensor` back to the head of the
    chain, then replays every node after the head on ``opt.input``. Layer
    nodes are rebuilt with their stored arguments plus ``reuse=True`` so
    their parameters are shared; other nodes are called directly.

    Args:
      tensor: A `Tensor` (automatically given by chaining).
      **opt:
        input: A `Tensor` that will replace the original input tensor.

    Returns:
      Reconstructed tensor nodes.
    """
    opt = tf.sg_opt(opt)
    assert hasattr(tensor, '_sugar'), 'cannot reuse this node.'
    assert opt.input is not None, 'input is mandatory.'

    # Gather the chain tail-first, then reverse it into build order.
    lineage = [tensor]
    ancestor = tensor._sugar.prev
    while ancestor is not None:
        lineage.append(ancestor)
        ancestor = ancestor._sugar.prev if hasattr(ancestor,
                                                   '_sugar') else None
    lineage.reverse()

    out = opt.input
    for node in lineage[1:]:  # skip the head: it is what gets replaced
        record = node._sugar
        if not record.is_layer:
            out = record.func(out, record.arg)
            continue

        layer_fn = tf.sg_layer_func(record.func)

        def _replay(current):
            # Rebuild under the stored name so the variables resolve.
            return layer_fn(
                current,
                **(record.arg + tf.sg_opt(name=record.name, reuse=True)))

        if record.arg.context_name:
            with tf.variable_scope(record.arg.context_name):
                out = _replay(out)
        else:
            out = _replay(out)

    return out
def highway(input_, size, num_layers=1, bias=-2.0, f=tf.nn.relu, scope='Highway'):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).

    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y
    where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.

    Args:
        input_: Tensor fed to the first layer.
        size: Output size passed to each ``linear`` call.
        num_layers: Number of stacked highway layers. With 0 the input is
            returned unchanged (this used to raise ``UnboundLocalError``).
        bias: Constant added to the gate pre-activation.
        f: Non-linearity ``g`` applied to the transform branch.
        scope: Variable scope wrapping all layers.

    Returns:
        The output of the last highway layer.
    """
    with tf.variable_scope(scope):
        # Start from the input so the zero-layer case is well defined.
        output = input_
        for idx in range(num_layers):
            g = f(linear(output, size, scope='highway_lin_%d' % idx))
            t = tf.sigmoid(
                linear(output, size, scope='highway_gate_%d' % idx) + bias)
            output = t * g + (1. - t) * output

    return output
def wrapper(tensor, **kwargs):
    """Resolve layer options, call the wrapped layer and tag its output.

    ``func``, ``_context``, ``_phase`` and ``sg_reuse`` come from the
    enclosing scope.

    Args:
        tensor: Input tensor of the layer.
        **kwargs: Layer options; merged with the active context and with
            shape-derived defaults.

    Returns:
        The layer output, carrying a ``_sugar`` record and an ``sg_reuse``
        method.
    """
    # Explicit arguments first, then the active context fills the gaps.
    opt = tf.sg_opt(kwargs) + _context

    # Shape-derived defaults; dropout is off unless requested.
    shape = tensor.get_shape().as_list()
    opt += tf.sg_opt(shape=shape, in_dim=shape[-1], dim=shape[-1], dout=0)
    # No bias when layer normalization is on.
    opt += tf.sg_opt(bias=not opt.ln)

    # Automatic naming: "<function name>_<next free index>".
    if opt.name is None:
        opt.name = func.__name__.replace('sg_', '')

        scope_name = tf.get_variable_scope().name
        prefix = scope_name + '/' if len(scope_name) > 0 else ''
        needle = prefix + 'layers/' + opt.name

        taken = set()
        for variable in tf.get_collection(tf.GraphKeys.VARIABLES):
            position = variable.name.rfind(needle)
            if position >= 0:
                taken.add(variable.name[position:].split('/')[-2])

        if taken:
            highest = max(int(label.split('_')[-1]) for label in taken)
            opt.name += '_%d' % (highest + 1)
        else:
            opt.name += '_1'

    # All layer variables live under the 'layers/' prefix.
    with tf.variable_scope('layers', reuse=opt.reuse):
        with tf.variable_scope(opt.name):
            out = func(tensor, opt)

            if opt.dout:
                out = tf.cond(_phase,
                              lambda: tf.nn.dropout(out, 1 - opt.dout),
                              lambda: out)

            out = tf.identity(out, 'out')

            # Summaries only for layers that are not reusing variables.
            if not opt.reuse:
                tf.sg_summary_activation(out)

            # Remember how this node was built so sg_reuse can replay it.
            out._sugar = tf.sg_opt(func=func,
                                   arg=tf.sg_opt(kwargs) + _context,
                                   prev=tensor,
                                   is_layer=True,
                                   name=opt.name)
            out.sg_reuse = types.MethodType(sg_reuse, out)

    return out
def wrapper(tensor, **kwargs):
    r"""Manages arguments of `tf.sg_opt`.

    ``func``, ``_context``, ``_phase`` and ``sg_reuse`` come from the
    enclosing scope.

    The output summary is added only when the layer scope is NOT in reuse
    mode, in line with the other layer wrappers; the test used to be
    inverted.

    Args:
      tensor: automatically passed by decorator
      kwargs:
          in_dim: An integer. The size of input dimension, which is set to the last one by default.
          dim: An integer. The size of output dimension. Has the same value as in_dim by default.
          ln: Boolean. If True, layer normalization is applied.
          bias: Boolean. If True, biases are added. As a default, it is set to True
          name: A name for the layer. As a default, the function name is assigned.
          reuse: `True` or `None`; if `True`, we go into reuse mode for this `layer` scope
            as well as all sub-scopes; if `None`, we just inherit the parent scope reuse.

    Returns:
      The layer output, carrying a `_sugar` record and an `sg_reuse` method.
    """
    # kwargs parsing
    opt = tf.sg_opt(kwargs) + _context

    # set default argument (dropout off)
    shape = tensor.get_shape().as_list()
    opt += tf.sg_opt(shape=shape, in_dim=shape[-1], dim=shape[-1], dout=0)
    # disable bias when normalization on
    opt += tf.sg_opt(bias=not opt.ln)

    # automatic layer naming
    if opt.name is None:
        # layer function name will be used as layer name
        opt.name = func.__name__.replace('sg_', 'lyr-')

        # find existing layer names (the prefix does not change per variable)
        scope_name = tf.get_variable_scope().name
        prefix = scope_name + '/' if len(scope_name) > 0 else ''
        needle = prefix + opt.name

        exist_layers = set()
        for t in tf.global_variables():
            i = t.name.rfind(needle)
            if i >= 0:
                exist_layers.add(t.name[i:].split('/')[-2])

        # layer name numbering
        if exist_layers:
            opt.name += '_%d' % (
                max(int(n.split('_')[-1]) for n in exist_layers) + 1)
        else:
            opt.name += '_1'

    # all layer variables start with 'lyr-' prefix
    with tf.variable_scope(opt.name, reuse=opt.reuse) as scope:

        # call layer function
        out = func(tensor, opt)

        # apply dropout
        if opt.dout:
            out = tf.cond(_phase,
                          lambda: tf.nn.dropout(out, 1 - opt.dout),
                          lambda: out)

        # rename tensor
        out = tf.identity(out, 'out')

        # add final output summary (skipped for reused layers)
        if not scope.reuse:
            tf.sg_summary_activation(out)

        # save node info for reuse
        out._sugar = tf.sg_opt(func=func,
                               arg=tf.sg_opt(kwargs) + _context,
                               prev=tensor,
                               is_layer=True,
                               name=opt.name)
        # inject reuse function
        out.sg_reuse = types.MethodType(sg_reuse, out)

    return out
def wrapper(tensor, **kwargs):
    r"""Manages arguments of `tf.sg_opt`.

    ``func``, ``_context``, ``_phase`` and ``sg_reuse`` come from the
    enclosing scope.

    Args:
      tensor: A `tensor` (automatically passed by decorator).
      kwargs:
        shape:  A list of integers. The shape of `tensor`. Inferred if not specified.
        in_dim: An integer. The size of input dimension, which is set to the last one by default.
        dim: An integer. The size of output dimension. Has the same value as in_dim by default.
        bn: Boolean. If True, batch normalization is applied.
        ln: Boolean. If True, layer normalization is applied.
        dout: A float of range [0, 1). A dropout rate (`1 - dout` is passed to
          `tf.nn.dropout`). Set to 0 by default.
        bias: Boolean. If True, biases are added. As a default, it is set to True
        name: A name for the layer. As a default, the function name is assigned.
        act: A name of activation function. e.g., `sigmoid`, `tanh`, etc.
        reuse: `True` or `None`; if `True`, we go into reuse mode for this `layer` scope
          as well as all sub-scopes; if `None`, we just inherit the parent scope reuse.

    Returns:
      The layer output, carrying a `_sugar` record and an `sg_reuse` method.
    """
    from . import sg_initializer as init
    from . import sg_activation

    # kwargs parsing
    opt = tf.sg_opt(kwargs) + _context

    # set default argument:
    # batch normalization off, layer normalization off, dropout off
    shape = tensor.get_shape().as_list()
    opt += tf.sg_opt(shape=shape,
                     in_dim=shape[-1],
                     dim=shape[-1],
                     bn=False,
                     ln=False,
                     dout=0)
    assert not (
        opt.bn and opt.ln
    ), 'one of batch normalization and layer normalization is available.'
    # disable bias when normalization on
    opt += tf.sg_opt(bias=not (opt.bn or opt.ln))

    # automatic layer naming
    if opt.name is None:
        # layer function name will be used as layer name
        opt.name = func.__name__.replace('sg_', '')

        # find existing layer names (the prefix does not change per variable)
        scope_name = tf.get_variable_scope().name
        prefix = scope_name + '/' if len(scope_name) > 0 else ''
        needle = prefix + opt.name

        exist_layers = set()
        for t in tf.global_variables():
            i = t.name.rfind(needle)
            if i >= 0:
                exist_layers.add(t.name[i:].split('/')[-2])

        # layer name numbering
        if exist_layers:
            opt.name += '_%d' % (
                max(int(n.split('_')[-1]) for n in exist_layers) + 1)
        else:
            opt.name += '_1'

    # all layer variables start with 'lyr-' prefix
    with tf.variable_scope(opt.name, reuse=opt.reuse) as scope:

        # call layer function
        out = func(tensor, opt)

        # apply batch normalization
        if opt.bn:
            # offset, scale parameter
            beta = init.constant('beta', opt.dim, summary=False)
            gamma = init.constant('gamma', opt.dim, value=1, summary=False)

            # running statistics
            mean_running = init.constant('mean', opt.dim, summary=False)
            variance_running = init.constant('variance',
                                             opt.dim,
                                             value=1,
                                             summary=False)

            # calc batch mean, variance over every axis but the last.
            # `axes` must be a real list: a bare `range` object is rejected
            # by some TensorFlow 1.x releases under Python 3.
            mean, variance = tf.nn.moments(
                out, axes=list(range(len(out.get_shape()) - 1)))

            # update running mean, variance
            def update_running_stat():
                decay = 0.99
                update_op = [
                    mean_running.assign(mean_running * decay + mean *
                                        (1 - decay)),
                    variance_running.assign(variance_running * decay +
                                            variance * (1 - decay))
                ]
                with tf.control_dependencies(update_op):
                    return tf.identity(mean), tf.identity(variance)

            # select mean, variance by training phase
            m, v = tf.cond(
                _phase,
                update_running_stat,  # updated running stat and batch mean, variance
                lambda: (mean_running, variance_running))  # saved mean, variance

            # apply batch normalization
            out = tf.nn.batch_normalization(out, m, v, beta, gamma,
                                            tf.sg_eps)

        # apply layer normalization
        if opt.ln:
            # offset, scale parameter
            beta = init.constant('beta', opt.dim, summary=False)
            gamma = init.constant('gamma', opt.dim, value=1, summary=False)

            # calc layer mean, variance for final axis
            mean, variance = tf.nn.moments(out,
                                           axes=[len(out.get_shape()) - 1],
                                           keep_dims=True)

            # apply normalization
            out = (out - mean) / tf.sqrt(variance + tf.sg_eps)
            # apply parameter
            out = gamma * out + beta

        # apply activation
        if opt.act:
            out = getattr(sg_activation, 'sg_' + opt.act.lower())(out)

        # apply dropout
        if opt.dout:
            out = tf.cond(_phase,
                          lambda: tf.nn.dropout(out, 1 - opt.dout),
                          lambda: out)

        # rename tensor
        out = tf.identity(out, 'out')

        # add final output summary
        if not scope.reuse:
            tf.sg_summary_activation(out)

        # save node info for reuse
        out._sugar = tf.sg_opt(func=func,
                               arg=tf.sg_opt(kwargs) + _context,
                               prev=tensor,
                               is_layer=True,
                               name=opt.name)
        # inject reuse function
        out.sg_reuse = types.MethodType(sg_reuse, out)

    return out
def generate():
    """Restore the latest checkpoint, decode batches and log them to CSV.

    Builds the encoder and decoder inference graphs on the CPU, restores the
    trainable variables from ``<restore_dir>/last_chpt`` and, for each of
    ``num_epochs`` batches, runs the decoder three times, feeding each
    decoded sentence back in as the next character input. Inputs and
    results are printed and appended to a time-stamped CSV file inside the
    restore directory.

    The CSV file is managed by a ``with`` block so the handle is closed even
    when restoring or decoding raises.

    NOTE(review): the source had lost its indentation; the nesting below was
    reconstructed from the statement order — confirm against the original.
    """
    dev = '/cpu:0'
    with tf.device(dev):
        mydir = 'tfrc150char_wrd0704'
        files = [f for f in listdir(mydir) if isfile(join(mydir, f))]
        tfrecords_filename = [join(mydir, 'short_infer3.tfrecords')]
        tfrecords_filename_inf = [join(mydir, '11_3.tfrecords')]
        print(tfrecords_filename)

        filename_queue = tf.train.string_input_producer(tfrecords_filename,
                                                        num_epochs=num_epochs,
                                                        shuffle=True,
                                                        capacity=1)
        infer_queue = tf.train.string_input_producer(tfrecords_filename_inf,
                                                     num_epochs=num_epochs,
                                                     shuffle=True,
                                                     capacity=1)

        optim = tf.train.AdamOptimizer(learning_rate=0.0001,
                                       beta1=0.9,
                                       beta2=0.99)

        with tf.variable_scope("dec_lstm") as scp:
            dec_cell = BasicLSTMCell2(Hp.w_emb_size,
                                      Hp.rnn_hd,
                                      state_is_tuple=True)

        with tf.variable_scope("contx_lstm") as scp:
            cell = BasicLSTMCell2(Hp.hd, Hp.rnn_hd, state_is_tuple=True)
            rnn_cell = tf.contrib.rnn.DropoutWrapper(
                cell,
                input_keep_prob=Hp.keep_prob,
                output_keep_prob=Hp.keep_prob)

        (words, chars) = read_and_decode(filename_queue,
                                         Hp.batch_size * Hp.num_gpus)

        words_splits = tf.split(axis=0,
                                num_or_size_splits=Hp.num_gpus,
                                value=words)
        chars_splits = tf.split(axis=0,
                                num_or_size_splits=Hp.num_gpus,
                                value=chars)

        word_emb = np.loadtxt("glove300d_0704.txt")
        Hp.word_vs = word_emb.shape[0]

        # ----------------------------------------------------------------
        # Inference graph: encoder on queue data, decoder on placeholders.
        with tf.name_scope('%s_%d' % ("tower", 0)) as scope:
            rnn_state = tower_infer_enc(chars_splits[0],
                                        scope,
                                        rnn_cell,
                                        dec_cell,
                                        word_emb,
                                        out_reuse_vars=False,
                                        dev='/cpu:0')

            chars_pl = tf.placeholder(tf.int32, shape=(None, Hp.c_maxlen))
            rnn_state_pl1 = [
                tf.placeholder(tf.float32, shape=(None, Hp.rnn_hd)),
                tf.placeholder(tf.float32, shape=(None, Hp.rnn_hd))
            ]
            rnn_state_pl = tf.contrib.rnn.LSTMStateTuple(
                rnn_state_pl1[0], rnn_state_pl1[1])

            final_ids, rnn_state_dec = tower_infer_dec(chars_pl,
                                                       scope,
                                                       rnn_cell,
                                                       dec_cell,
                                                       word_emb,
                                                       rnn_state_pl,
                                                       out_reuse_vars=False,
                                                       dev='/cpu:0')
        # ----------------------------------------------------------------

        saver = tf.train.Saver(tf.trainable_variables())
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.per_process_gpu_memory_fraction = 0.94
        session_config.gpu_options.allow_growth = False

        restore_dir = 'tnsrbrd/hin17d08m_1313g2'
        csv_file = join(restore_dir, time.strftime("hin%dd%mm_%H%M.csv"))

        # `with` guarantees the CSV handle is closed on every exit path.
        with open(csv_file, 'a') as csv_f:
            csv_writer = csv.writer(csv_f)

            with tf.Session(config=session_config) as sess:
                sess.run(
                    tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer()))

                # NOTE(review): queue runners are started twice (before and
                # after the restore) and the coordinator is never asked to
                # stop — confirm whether both calls are needed.
                tf.train.start_queue_runners(sess=sess)
                saver.restore(
                    sess,
                    tf.train.latest_checkpoint(join(restore_dir,
                                                    'last_chpt')))

                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord)

                for ep in range(num_epochs):
                    tf.sg_set_infer(sess)

                    rnn_state_val, w_txt, ch_txt = sess.run(
                        [rnn_state, words_splits[0], chars_splits[0]],
                        feed_dict={Hp.keep_prob: 1.0})

                    predictions = []
                    for idx in range(3):
                        # First pass is seeded from sentence index 2 of the
                        # batch; later passes feed back the previous output.
                        char_inpt = word2char_ids(
                            ids_val) if idx != 0 else ch_txt[:, 2, :]
                        ids_val, rnn_state_val = sess.run(
                            [final_ids, rnn_state_dec],
                            feed_dict={
                                Hp.keep_prob: 1.0,
                                rnn_state_pl1[0]: rnn_state_val[0],
                                rnn_state_pl1[1]: rnn_state_val[1],
                                chars_pl: char_inpt
                            })

                        temp = np.zeros((Hp.batch_size, Hp.w_maxlen))
                        for b in range(Hp.batch_size):
                            # Zero everything after the first id equal to 2.
                            stop_ind = np.where(ids_val[b] == 2)[0]
                            if stop_ind.size > 0:
                                stop_ind = stop_ind[0]
                                ids_val[b, stop_ind +
                                        1:] = ids_val[b, stop_ind + 1:] * 0
                        temp[:, :ids_val.shape[1]] = ids_val
                        predictions.append(temp)

                    # predictions are decode_sent x b x w_maxlen
                    predictions = np.array(predictions)
                    in_batches = [
                        w_txt[b, :, :] for b in range(Hp.batch_size)
                    ]
                    res_batches = [
                        predictions[:, b, :] for b in range(Hp.batch_size)
                    ]

                    for b in range(Hp.batch_size):
                        in_paragraph = idxword2txt(in_batches[b])
                        print("\n INPUT SAMPLE \n")
                        print(in_paragraph)

                        res_paragraph = idxword2txt(res_batches[b])
                        print("\n RESULTS \n")
                        print(res_paragraph)

                        csv_writer.writerow([
                            " ".join(in_paragraph[:3]),
                            " ".join(in_paragraph[3:]),
                            " ".join(res_paragraph)
                        ])
def tower_loss_manyparams(xx, scope, reu_vars=False):
    """Set up embeddings, recurrent cells and the per-sentence loop helpers.

    As visible here the function creates the embedding tables and the two
    recurrent cells, then defines ``rnn_cond`` and ``rnn_body``; it neither
    calls them nor returns a value.

    NOTE(review): the source had lost its indentation; the nesting below was
    reconstructed from the statement order — confirm against the original.
    NOTE(review): `self` and `h` are used but are not defined in this
    function — confirm where they are expected to come from.

    Args:
        xx: Input ids; cast to int32 and transposed with perm [1, 0, 2].
        scope: Not used in the visible body.
        reu_vars: Reuse flag for the 'embatch_size' and "mem" scopes.
    """
    # make embedding matrix for source and target
    reu_vars = reu_vars
    with tf.variable_scope('embatch_size', reuse=reu_vars):
        # (vocab_size, latent_dim)
        emb_x = tf.sg_emb(name='emb_x', voca_size=Hp.vs, dim=Hp.hd, dev=self._dev)
        emb_y = tf.sg_emb(name='emb_y', voca_size=Hp.vs, dim=Hp.hd, dev=self._dev)
    xx = tf.cast(xx, tf.int32)
    time = tf.constant(0)
    losses_int = tf.constant(0.0)
    inputs = tf.transpose(xx, perm=[1, 0, 2])
    input_ta = tensor_array_ops.TensorArray(tf.int32, size=1, dynamic_size=True, clear_after_read=False)
    x_sent = input_ta.unstack(inputs)  #each element is (batch, sentlen)
    n_steps = tf.shape(xx)[1]  # number of sentences in paragraph
    # generate first an unconditioned sentence
    n_input = Hp.hd
    subrec1_init = subrec_zero_state(Hp.batch_size, Hp.hd)
    subrec2_init = subrec_zero_state(Hp.batch_size, Hp.hd)
    with tf.variable_scope("mem", reuse=reu_vars) as scp:
        rnn_cell = LSTMCell(in_dim=h, dim=Hp.hd)
        crnn_cell = ConvLSTMCell(seqlen=Hp.maxlen, in_dim=n_input // 2, dim=Hp.hd // 2)
    (rnn_state_init, rnn_h_init) = rnn_cell.zero_state(Hp.batch_size)
    # (batch, sentlen, latentdim/2)
    (crnn_state_init, crnn_h_init) = crnn_cell.zero_state(Hp.batch_size)

    def rnn_cond(time, subrec1, subrec2, rnn_state, rnn_h, crnn_state, crnn_h, losses):
        """Loop condition: true while `time` is below ``n_steps - 1``."""
        return tf.less(time, n_steps - 1)

    def rnn_body(time, subrec1, subrec2, rnn_state, rnn_h, crnn_state, crnn_h, losses):
        """Loop body: encode sentence `time`, decode sentence `time + 1`.

        Returns the loop variables in the order they were received, with
        `time` incremented and the mean cross-entropy added to `losses`.
        """
        x = x_sent.read(time)
        y = x_sent.read(time + 1)  # (batch, sentlen) = (16, 200)
        # shift target by one step for training source
        y_src = tf.concat([tf.zeros((Hp.batch_size, 1), tf.int32), y[:, :-1]], 1)
        # NOTE(review): Python `or` applied to the result of comparing a
        # tensor — confirm this yields the intended reuse flag.
        reuse_vars = time == tf.constant(0) or reu_vars
        # -------------------------- BYTENET ENCODER --------------------------
        # embed table lookup
        enc = x.sg_lookup(emb=emb_x)  #(batch, sentlen, latentdim)
        # loop dilated conv block
        for i in range(num_blocks):
            enc = (enc.sg_res_block(
                size=5, rate=1, name="enc1_%d" % (i),
                reuse_vars=reuse_vars).sg_res_block(
                    size=5, rate=2, name="enc2_%d" % (i),
                    reuse_vars=reuse_vars).sg_res_block(
                        size=5, rate=4, name="enc4_%d" % (i),
                        reuse_vars=reuse_vars).sg_res_block(
                            size=5, rate=8, name="enc8_%d" % (i),
                            reuse_vars=reuse_vars).sg_res_block(
                                size=5, rate=16, name="enc16_%d" % (i),
                                reuse_vars=reuse_vars))
        # -------------------------- QCNN + QPOOL ENCODER with attention #1 --------------------------
        #quasi cnn layer ZFO [batch * 3, t, dim2 ]
        conv = enc.sg_quasi_conv1d(is_enc=True, size=3, name="qconv_1", reuse_vars=reuse_vars)
        #attention layer
        # recurrent layer # 1 + final encoder hidden state
        subrec1 = tf.tile((subrec1.sg_expand_dims(axis=1)), [1, Hp.maxlen, 1])
        concat = conv.sg_concat(target=subrec1, axis=0)  # (batch*4, sentlen, latentdim)
        pool = concat.sg_quasi_rnn(is_enc=True, att=True, name="qrnn_1", reuse_vars=reuse_vars)
        subrec1 = pool[:Hp.batch_size, -1, :]  # last character in sequence
        # -------------------------- QCNN + QPOOL ENCODER with attention #2 --------------------------
        # quazi cnn ZFO (batch*3, sentlen, latentdim)
        conv = pool.sg_quasi_conv1d(is_enc=True, size=2, name="qconv_2", reuse_vars=reuse_vars)
        # (batch, sentlen-duplicated, latentdim)
        subrec2 = tf.tile((subrec2.sg_expand_dims(axis=1)), [1, Hp.maxlen, 1])
        # (batch*4, sentlen, latentdim)
        concat = conv.sg_concat(target=subrec2, axis=0)
        pool = concat.sg_quasi_rnn(is_enc=True, att=True, name="qrnn_2", reuse_vars=reuse_vars)
        subrec2 = pool[:Hp.batch_size, -1, :]  # last character in sequence
        # -------------------------- ConvLSTM with RESIDUAL connection and MULTIPLICATIVE block --------------------------
        #residual block
        causal = False  # for encoder
        # NOTE(review): `Hp.hd / 2` is true division under Python 3 and
        # would pass a float as `dim` — confirm the intended interpreter.
        crnn_input = (pool[:Hp.batch_size, :, :].sg_bypass_gpus(
            name='relu_0', act='relu', bn=(not causal),
            ln=causal).sg_conv1d_gpus(name="dimred_0", size=1, dev="/cpu:0", reuse=reuse_vars, dim=Hp.hd / 2, act='relu', bn=(not causal), ln=causal))
        # conv LSTM
        with tf.variable_scope("mem/clstm") as scp:
            (crnn_state, crnn_h) = crnn_cell(crnn_input, (crnn_state, crnn_h), size=5, reuse_vars=reuse_vars)
        # dimension recover and residual connection
        rnn_input0 = pool[:Hp.batch_size,:,:] + crnn_h\
            .sg_conv1d_gpus(name = "diminc_0",size=1,dev="/cpu:0", dim=Hp.hd,reuse=reuse_vars, act='relu', bn=(not causal), ln=causal)
        # -------------------------- QCNN + QPOOL ENCODER with attention #3 --------------------------
        # pooling for lstm input
        # quazi cnn ZFO (batch*3, sentlen, latentdim)
        conv = rnn_input0.sg_quasi_conv1d(is_enc=True, size=2, name="qconv_3", reuse_vars=reuse_vars)
        pool = conv.sg_quasi_rnn(is_enc=True, att=False, name="qrnn_3", reuse_vars=reuse_vars)
        rnn_input = pool[:Hp.batch_size, -1, :]  # last character in sequence
        # -------------------------- LSTM with RESIDUAL connection and MULTIPLICATIVE block --------------------------
        # recurrent block
        with tf.variable_scope("mem/lstm") as scp:
            (rnn_state, rnn_h) = rnn_cell(rnn_input, (rnn_state, rnn_h))
        rnn_h2 = tf.tile(((rnn_h + rnn_input).sg_expand_dims(axis=1)), [1, Hp.maxlen, 1])
        # -------------------------- BYTENET DECODER --------------------------
        # CNN decoder
        dec = y_src.sg_lookup(emb=emb_y).sg_concat(target=rnn_h2, name="dec")
        for i in range(num_blocks):
            dec = (dec.sg_res_block(
                size=3, rate=1, causal=True, name="dec1_%d" % (i),
                reuse_vars=reuse_vars).sg_res_block(
                    size=3, rate=2, causal=True, name="dec2_%d" % (i),
                    reuse_vars=reuse_vars).sg_res_block(
                        size=3, rate=4, causal=True, name="dec4_%d" % (i),
                        reuse_vars=reuse_vars).sg_res_block(
                            size=3, rate=8, causal=True, name="dec8_%d" % (i),
                            reuse_vars=reuse_vars).sg_res_block(
                                size=3, rate=16, causal=True, name="dec16_%d" % (i),
                                reuse_vars=reuse_vars))
        # final fully convolution layer for softmax
        dec = dec.sg_conv1d_gpus(size=1, dim=Hp.vs, name="out", summary=False, dev=self._dev, reuse=reuse_vars)
        ce_array = dec.sg_ce(target=y, mask=True, name="cross_ent_example")
        cross_entropy_mean = tf.reduce_mean(ce_array, name='cross_entropy')
        # Accumulate this step's mean cross-entropy into the running loss.
        losses = tf.add_n([losses, cross_entropy_mean], name='total_loss')
        return (time + 1, subrec1, subrec2, rnn_state, rnn_h, crnn_state, crnn_h, losses)
def tower_infer_dec(chars,
                    scope,
                    rnn_cell,
                    dec_cell,
                    word_emb,
                    rnn_state,
                    out_reuse_vars=False,
                    dev='/cpu:0'):
    """Build the inference graph that encodes `chars` and decodes word ids.

    The characters are encoded with dilated residual conv blocks, a quasi-RNN
    layer, a max-pool over axis 2 and a highway layer; the result is fed
    through `rnn_cell` once, and word ids are then decoded from the new state
    with `dec_cell` (beam search; the greedy branch is disabled by the
    hard-coded `greedy = False`).

    NOTE(review): the source was whitespace-collapsed; the nesting of the
    `with` blocks below was reconstructed and should be confirmed.

    Args:
        chars: character-id tensor; presumably (batch, Hp.c_maxlen) - TODO
            confirm against the caller.
        scope: not used inside this function.
        rnn_cell: callable `(input, state) -> (output, state)` run under the
            'contx_lstm' variable scope.
        dec_cell: callable `(input, state) -> (output, state)` run under the
            'dec_lstm' variable scope.
        word_emb: forwarded as `emb=` to `tf.sg_emb` for the word table.
        rnn_state: incoming context state; indexed with [0] and [1] below.
        out_reuse_vars: reuse flag applied to the decoder scopes/layers.
        dev: device string forwarded to the layers.

    Returns:
        Tuple `(ids, rnn_state)`: `final_ids[:, 0, :]` from the beam search
        (presumably the top-scoring beam - confirm) and the updated state.
    """
    with tf.device(dev):
        # embedding scope is always opened in reuse mode here
        with tf.variable_scope('embatch_size', reuse=True):
            # (vocab_size, latent_dim)
            emb_char = tf.sg_emb(name='emb_char',
                                 voca_size=Hp.char_vs,
                                 dim=Hp.hd,
                                 dev=dev)
            emb_word = tf.sg_emb(name='emb_word',
                                 emb=word_emb,
                                 voca_size=Hp.word_vs,
                                 dim=300,
                                 dev=dev)

    print(chars)  # debug output of the input tensor
    ch = chars
    # reverse every sequence along axis 1 over its full length Hp.c_maxlen
    ch = tf.reverse_sequence(input=ch,
                             seq_lengths=[Hp.c_maxlen] * Hp.batch_size,
                             seq_dim=1)
    # encoder layers always reuse; `reuse_vars_enc` is never read afterwards
    reuse_vars = reuse_vars_enc = True

    # -------------------------- BYTENET ENCODER --------------------------
    with tf.variable_scope('encoder'):
        # embed table lookup
        enc = ch.sg_lookup(emb=emb_char)  # (batch, sentlen, latentdim)
        # loop dilated conv block (rates 1, 2, 4, 8, 16 per block)
        for i in range(Hp.num_blocks):
            enc = (enc.sg_res_block(size=5,
                                    rate=1,
                                    name="enc1_%d" % (i),
                                    is_first=True,
                                    reuse_vars=reuse_vars,
                                    dev=dev).sg_res_block(
                                        size=5,
                                        rate=2,
                                        name="enc2_%d" % (i),
                                        reuse_vars=reuse_vars,
                                        dev=dev).sg_res_block(
                                            size=5,
                                            rate=4,
                                            name="enc4_%d" % (i),
                                            reuse_vars=reuse_vars,
                                            dev=dev).sg_res_block(
                                                size=5,
                                                rate=8,
                                                name="enc8_%d" % (i),
                                                reuse_vars=reuse_vars,
                                                dev=dev).sg_res_block(
                                                    size=5,
                                                    rate=16,
                                                    name="enc16_%d" % (i),
                                                    reuse_vars=reuse_vars,
                                                    dev=dev))
    byte_enc = enc

    # -------------------------- QCNN + QPOOL ENCODER #1 --------------------------
    with tf.variable_scope('quazi'):
        # quasi cnn layer ZFO [batch * 3, seqlen, dim2 ]
        conv = byte_enc.sg_quasi_conv1d(is_enc=True,
                                        size=4,
                                        name="qconv_1",
                                        dev=dev,
                                        reuse_vars=reuse_vars)
        # c = f * c + (1 - f) * z, h = o*c [batch * 4, seqlen, hd]
        pool0 = conv.sg_quasi_rnn(is_enc=False,
                                  att=False,
                                  name="qrnn_1",
                                  reuse_vars=reuse_vars,
                                  dev=dev)
        # keep only the last position along axis 1
        qpool_last = pool0[:, -1, :]

    # -------------------------- MAXPOOL along time dimension --------------------------
    inpt_maxpl = tf.expand_dims(byte_enc, 1)  # [batch, 1, seqlen, channels]
    # window spans Hp.c_maxlen on axis 2 with 'VALID' padding
    maxpool = tf.nn.max_pool(inpt_maxpl, [1, 1, Hp.c_maxlen, 1], [1, 1, 1, 1],
                             'VALID')
    maxpool = tf.squeeze(maxpool, [1, 2])

    # -------------------------- HIGHWAY --------------------------
    # despite the name this is an element-wise sum, not a concatenation
    concat = qpool_last + maxpool
    with tf.variable_scope('highway', reuse=reuse_vars):
        input_lstm = highway(concat, concat.get_shape()[-1], num_layers=1)

    # -------------------------- CONTEXT LSTM --------------------------
    # NOTE(review): dropout is applied on the inference path as well; this is
    # only a no-op if Hp.keep_prob is 1 at inference - confirm.
    input_lstm = tf.nn.dropout(input_lstm, Hp.keep_prob)
    with tf.variable_scope('contx_lstm', reuse=reuse_vars):
        output, rnn_state = rnn_cell(input_lstm, rnn_state)

    beam_size = 8
    # from here on the caller-supplied reuse flag governs the decoder layers
    reuse_vars = out_reuse_vars
    greedy = False  # hard-coded: the greedy branch below never executes
    if greedy:
        dec_state = rnn_state
        dec_out = []
        # every sequence starts from id 1 (presumably the start token - confirm)
        d_out = tf.constant([1] * Hp.batch_size)
        for idx in range(Hp.w_maxlen):
            w_input = d_out.sg_lookup(emb=emb_word)
            dec_state = tf.contrib.rnn.LSTMStateTuple(c=dec_state.c,
                                                      h=dec_state.h)
            # variables are created on the first step and reused afterwards
            with tf.variable_scope('dec_lstm', reuse=idx > 0 or reuse_vars):
                d_out, dec_state = dec_cell(w_input, dec_state)
            dec_out.append(d_out)
            d_out = tf.expand_dims(d_out, 1).sg_conv1d_gpus(
                size=1,
                dim=Hp.word_vs,
                name="out_conv",
                act="linear",
                dev=dev,
                reuse=idx > 0 or reuse_vars)
            # argmax id becomes the next step's input
            d_out = tf.squeeze(d_out).sg_argmax()
        dec_out = tf.stack(dec_out, 1)
        dec = dec_out.sg_conv1d_gpus(size=1,
                                     dim=Hp.word_vs,
                                     name="out_conv",
                                     act="linear",
                                     dev=dev,
                                     reuse=True)
        return dec.sg_argmax(), rnn_state
    else:
        # ------------------ BEAM SEARCH --------------------
        # replicate both state components beam_size times along a new axis 1
        dec_state = tf.contrib.rnn.LSTMStateTuple(
            tf.tile(tf.expand_dims(rnn_state[0], 1), [1, beam_size, 1]),
            tf.tile(tf.expand_dims(rnn_state[1], 1), [1, beam_size, 1]))
        initial_ids = tf.constant([1] * Hp.batch_size)

        def symbols_to_logits_fn(ids, dec_state):
            """Run one decoder step per beam; return stacked logits and states."""
            dec = []
            dec_c, dec_h = [], []
            # (batch x beam_size x decoded_seq)
            ids = tf.reshape(ids, [Hp.batch_size, beam_size, -1])
            print("dec_state ", dec_state[0].get_shape().as_list())
            for ind in range(beam_size):
                with tf.variable_scope('dec_lstm',
                                       reuse=ind > 0 or reuse_vars):
                    # feed the most recently decoded id of this beam
                    w_input = ids[:, ind, -1].sg_lookup(emb=emb_word)
                    dec_state0 = tf.contrib.rnn.LSTMStateTuple(
                        c=dec_state.c[:, ind, :], h=dec_state.h[:, ind, :])
                    dec_out, dec_state_i = dec_cell(w_input, dec_state0)
                    dec_out = tf.expand_dims(dec_out, 1)
                # NOTE(review): placed outside 'dec_lstm' to mirror the greedy
                # branch; the original nesting is not recoverable - confirm.
                dec_i = dec_out.sg_conv1d_gpus(size=1,
                                               dim=Hp.word_vs,
                                               name="out_conv",
                                               act="linear",
                                               dev=dev,
                                               reuse=ind > 0 or reuse_vars)
                dec.append(tf.squeeze(dec_i, 1))
                dec_c.append(dec_state_i[0])
                dec_h.append(dec_state_i[1])
            return tf.stack(dec, 1), tf.contrib.rnn.LSTMStateTuple(
                tf.stack(dec_c, 1), tf.stack(dec_h, 1))

        # 3.5 is passed positionally; presumably the length-penalty alpha of
        # the project's beam_search - confirm against its signature.
        final_ids, final_probs = beam_search.beam_search(symbols_to_logits_fn,
                                                         dec_state,
                                                         initial_ids,
                                                         beam_size,
                                                         Hp.w_maxlen - 1,
                                                         Hp.word_vs,
                                                         3.5,
                                                         eos_id=2)
        return final_ids[:, 0, :], rnn_state
def rnn_body_stat(time, rnn_state):
    """Loop body: encode the sentence read at `time` and step the context cell.

    Relies on names from the enclosing scope (`chars_sent`, `emb_char`,
    `out_reuse_vars`, `dev`, `rnn_cell`, `Hp`, `highway`).

    Args:
        time: index passed to `chars_sent.read`.
        rnn_state: state handed to `rnn_cell`.

    Returns:
        Tuple `(time + 1, new_rnn_state)`.
    """
    # read the sentence and reverse each sequence along axis 1
    sent_chars = tf.reverse_sequence(input=chars_sent.read(time),
                                     seq_lengths=[Hp.c_maxlen] * Hp.batch_size,
                                     seq_dim=1)
    reuse_vars = out_reuse_vars

    # ---- dilated residual conv encoder ----
    with tf.variable_scope('encoder'):
        enc = sent_chars.sg_lookup(emb=emb_char)  # (batch, sentlen, latentdim)
        for blk in range(Hp.num_blocks):
            for rate in (1, 2, 4, 8, 16):
                block_args = dict(size=5,
                                  rate=rate,
                                  name="enc%d_%d" % (rate, blk),
                                  reuse_vars=reuse_vars,
                                  dev=dev)
                if rate == 1:
                    # only the rate-1 block of each group is flagged as first
                    block_args['is_first'] = True
                enc = enc.sg_res_block(**block_args)
    byte_enc = enc

    # ---- quasi-RNN layer, keep the last position along axis 1 ----
    with tf.variable_scope('quazi'):
        gates = byte_enc.sg_quasi_conv1d(is_enc=True,
                                         size=4,
                                         name="qconv_1",
                                         dev=dev,
                                         reuse_vars=reuse_vars)
        pooled = gates.sg_quasi_rnn(is_enc=False,
                                    att=False,
                                    name="qrnn_1",
                                    reuse_vars=reuse_vars,
                                    dev=dev)
        qpool_last = pooled[:, -1, :]

    # ---- max-pool over axis 2 with a window of Hp.c_maxlen ----
    pooled_max = tf.nn.max_pool(tf.expand_dims(byte_enc, 1),
                                [1, 1, Hp.c_maxlen, 1], [1, 1, 1, 1], 'VALID')
    pooled_max = tf.squeeze(pooled_max, [1, 2])

    # ---- highway layer on the element-wise sum of both summaries ----
    summed = qpool_last + pooled_max
    with tf.variable_scope('highway', reuse=reuse_vars):
        lstm_in = highway(summed, summed.get_shape()[-1], num_layers=1)

    # ---- context cell ----
    lstm_in = tf.nn.dropout(lstm_in, Hp.keep_prob)
    with tf.variable_scope('contx_lstm', reuse=reuse_vars):
        _, rnn_state = rnn_cell(lstm_in, rnn_state)

    return (time + 1, rnn_state)
def tower_infer_enc(chars,
                    scope,
                    rnn_cell,
                    dec_cell,
                    word_emb,
                    out_reuse_vars=False,
                    dev='/cpu:0'):
    """Build the graph that folds a run of sentences into the context state.

    `chars` is transposed with perm [1, 0, 2] and unstacked into a
    TensorArray, so axis 1 indexes sentences. A `tf.while_loop` then encodes
    sentences `0 .. (n_sentences // 2) - 2` one at a time and feeds each
    summary through `rnn_cell`.

    Args:
        chars: rank-3 tensor, cast to int32; presumably
            (batch, n_sentences, Hp.c_maxlen) - TODO confirm.
        scope: not used inside this function.
        rnn_cell: provides `zero_state(batch, dtype)` and is called as
            `(input, state) -> (output, state)`.
        dec_cell: not used inside this function.
        word_emb: forwarded as `emb=` to `tf.sg_emb` for the word table.
        out_reuse_vars: reuse flag for the embedding scope and all layers.
        dev: device string forwarded to the layers.

    Returns:
        The context state after the loop.
    """
    out_rvars = out_reuse_vars

    # embedding tables for characters and words
    with tf.device(dev):
        with tf.variable_scope('embatch_size', reuse=out_reuse_vars):
            # (vocab_size, latent_dim)
            emb_char = tf.sg_emb(name='emb_char',
                                 voca_size=Hp.char_vs,
                                 dim=Hp.hd,
                                 dev=dev)
            emb_word = tf.sg_emb(name='emb_word',
                                 emb=word_emb,
                                 voca_size=Hp.word_vs,
                                 dim=300,
                                 dev=dev)

    chars = tf.cast(chars, tf.int32)
    time = tf.constant(0)

    # sentence-major layout so that each TensorArray element is one sentence
    sent_major = tf.transpose(chars, perm=[1, 0, 2])
    sent_array = tensor_array_ops.TensorArray(tf.int32,
                                              size=tf.shape(chars)[1],
                                              dynamic_size=True,
                                              clear_after_read=True)
    chars_sent = sent_array.unstack(sent_major)

    resp_steps = tf.shape(chars)[1]  # number of sentences in paragraph
    statm_steps = resp_steps // 2
    rnn_state = rnn_cell.zero_state(Hp.batch_size, tf.float32)
    maxdecode = 3

    # ---------------------- STATEMENT ENCODING ----------------------
    def rnn_cond_stat(time, rnn_state):
        """Continue while `time` is below `statm_steps - 1`."""
        return tf.less(time, statm_steps - 1)

    def rnn_body_stat(time, rnn_state):
        """Encode the sentence at `time` and advance the context state."""
        sent_chars = tf.reverse_sequence(
            input=chars_sent.read(time),
            seq_lengths=[Hp.c_maxlen] * Hp.batch_size,
            seq_dim=1)
        reuse_vars = out_reuse_vars

        # dilated residual conv encoder
        with tf.variable_scope('encoder'):
            enc = sent_chars.sg_lookup(emb=emb_char)
            for blk in range(Hp.num_blocks):
                for rate in (1, 2, 4, 8, 16):
                    block_args = dict(size=5,
                                      rate=rate,
                                      name="enc%d_%d" % (rate, blk),
                                      reuse_vars=reuse_vars,
                                      dev=dev)
                    if rate == 1:
                        block_args['is_first'] = True
                    enc = enc.sg_res_block(**block_args)
        byte_enc = enc

        # quasi-RNN layer, keep the last position along axis 1
        with tf.variable_scope('quazi'):
            gates = byte_enc.sg_quasi_conv1d(is_enc=True,
                                             size=4,
                                             name="qconv_1",
                                             dev=dev,
                                             reuse_vars=reuse_vars)
            pooled = gates.sg_quasi_rnn(is_enc=False,
                                        att=False,
                                        name="qrnn_1",
                                        reuse_vars=reuse_vars,
                                        dev=dev)
            qpool_last = pooled[:, -1, :]

        # max-pool over axis 2 with a window of Hp.c_maxlen
        pooled_max = tf.nn.max_pool(tf.expand_dims(byte_enc, 1),
                                    [1, 1, Hp.c_maxlen, 1], [1, 1, 1, 1],
                                    'VALID')
        pooled_max = tf.squeeze(pooled_max, [1, 2])

        # highway layer on the element-wise sum of both summaries
        summed = qpool_last + pooled_max
        with tf.variable_scope('highway', reuse=reuse_vars):
            lstm_in = highway(summed, summed.get_shape()[-1], num_layers=1)

        # context cell
        lstm_in = tf.nn.dropout(lstm_in, Hp.keep_prob)
        with tf.variable_scope('contx_lstm', reuse=reuse_vars):
            _, rnn_state = rnn_cell(lstm_in, rnn_state)

        return (time + 1, rnn_state)

    loop_vars_stat = [time, rnn_state]
    time, rnn_state = tf.while_loop(rnn_cond_stat,
                                    rnn_body_stat,
                                    loop_vars_stat,
                                    swap_memory=False)
    return rnn_state
def rnn_body(time, subrec1, subrec2, rnn_state, rnn_h, crnn_state, crnn_h,
             losses):
    """Loop body: predict sentence `time + 1` from sentence `time`.

    Reads the source and target sentences from `x_sent`, runs the encoder
    stack (dilated conv blocks, two attention quasi-RNN layers, a ConvLSTM
    with residual connection, a third quasi-RNN layer and an LSTM), decodes
    with causal conv blocks and adds the mean cross-entropy to `losses`.

    Relies on names from the enclosing scope (`x_sent`, `emb_x`, `emb_y`,
    `num_blocks`, `reu_vars`, `crnn_cell`, `rnn_cell`, `self`, `Hp`).

    Args:
        time: index of the source sentence in `x_sent`.
        subrec1, subrec2: carried summaries for the first and second
            attention quasi-RNN layers.
        rnn_state, rnn_h: carried LSTM state pair.
        crnn_state, crnn_h: carried ConvLSTM state pair.
        losses: running loss the new mean cross-entropy is added to.

    Returns:
        Tuple `(time + 1, subrec1, subrec2, rnn_state, rnn_h, crnn_state,
        crnn_h, losses)` with every carried value updated.
    """
    x = x_sent.read(time)
    y = x_sent.read(time + 1)  # (batch, sentlen) = (16, 200)

    # shift target by one step for training source
    y_src = tf.concat([tf.zeros((Hp.batch_size, 1), tf.int32), y[:, :-1]], 1)

    # NOTE(review): parsed as `(time == tf.constant(0)) or reu_vars`. With
    # TF1's identity-based Tensor equality the comparison is presumably always
    # False, making this equal to `reu_vars` - confirm the intent.
    reuse_vars = time == tf.constant(0) or reu_vars

    # -------------------------- BYTENET ENCODER --------------------------
    # embed table lookup
    enc = x.sg_lookup(emb=emb_x)  # (batch, sentlen, latentdim)
    # loop dilated conv block
    for i in range(num_blocks):
        for rate in (1, 2, 4, 8, 16):
            enc = enc.sg_res_block(size=5,
                                   rate=rate,
                                   name="enc%d_%d" % (rate, i),
                                   reuse_vars=reuse_vars)

    # ---------------- QCNN + QPOOL ENCODER with attention #1 ----------------
    # quasi cnn layer ZFO [batch * 3, t, dim2 ]
    conv = enc.sg_quasi_conv1d(is_enc=True,
                               size=3,
                               name="qconv_1",
                               reuse_vars=reuse_vars)
    # repeat the carried summary Hp.maxlen times along a new axis 1
    subrec1 = tf.tile((subrec1.sg_expand_dims(axis=1)), [1, Hp.maxlen, 1])
    # (batch*4, sentlen, latentdim)
    concat = conv.sg_concat(target=subrec1, axis=0)
    pool = concat.sg_quasi_rnn(is_enc=True,
                               att=True,
                               name="qrnn_1",
                               reuse_vars=reuse_vars)
    subrec1 = pool[:Hp.batch_size, -1, :]  # last character in sequence

    # ---------------- QCNN + QPOOL ENCODER with attention #2 ----------------
    # quazi cnn ZFO (batch*3, sentlen, latentdim)
    conv = pool.sg_quasi_conv1d(is_enc=True,
                                size=2,
                                name="qconv_2",
                                reuse_vars=reuse_vars)
    # (batch, sentlen-duplicated, latentdim)
    subrec2 = tf.tile((subrec2.sg_expand_dims(axis=1)), [1, Hp.maxlen, 1])
    # (batch*4, sentlen, latentdim)
    concat = conv.sg_concat(target=subrec2, axis=0)
    pool = concat.sg_quasi_rnn(is_enc=True,
                               att=True,
                               name="qrnn_2",
                               reuse_vars=reuse_vars)
    subrec2 = pool[:Hp.batch_size, -1, :]  # last character in sequence

    # ------------- ConvLSTM with RESIDUAL connection -------------
    causal = False  # for encoder
    # Floor division keeps `dim` an integer even under true division
    # (Python 3 or `from __future__ import division`).
    crnn_input = (pool[:Hp.batch_size, :, :].sg_bypass_gpus(
        name='relu_0', act='relu', bn=(not causal),
        ln=causal).sg_conv1d_gpus(name="dimred_0",
                                  size=1,
                                  dev="/cpu:0",
                                  reuse=reuse_vars,
                                  dim=Hp.hd // 2,
                                  act='relu',
                                  bn=(not causal),
                                  ln=causal))

    # conv LSTM
    with tf.variable_scope("mem/clstm") as scp:
        (crnn_state, crnn_h) = crnn_cell(crnn_input, (crnn_state, crnn_h),
                                         size=5,
                                         reuse_vars=reuse_vars)

    # dimension recover and residual connection
    rnn_input0 = pool[:Hp.batch_size, :, :] + crnn_h.sg_conv1d_gpus(
        name="diminc_0",
        size=1,
        dev="/cpu:0",
        dim=Hp.hd,
        reuse=reuse_vars,
        act='relu',
        bn=(not causal),
        ln=causal)

    # ---------------- QCNN + QPOOL ENCODER #3 (pooling for lstm input) ------
    # quazi cnn ZFO (batch*3, sentlen, latentdim)
    conv = rnn_input0.sg_quasi_conv1d(is_enc=True,
                                      size=2,
                                      name="qconv_3",
                                      reuse_vars=reuse_vars)
    pool = conv.sg_quasi_rnn(is_enc=True,
                             att=False,
                             name="qrnn_3",
                             reuse_vars=reuse_vars)
    rnn_input = pool[:Hp.batch_size, -1, :]  # last character in sequence

    # ------------- LSTM with RESIDUAL connection -------------
    # recurrent block
    with tf.variable_scope("mem/lstm") as scp:
        (rnn_state, rnn_h) = rnn_cell(rnn_input, (rnn_state, rnn_h))
    # residual sum, repeated Hp.maxlen times along a new axis 1
    rnn_h2 = tf.tile(((rnn_h + rnn_input).sg_expand_dims(axis=1)),
                     [1, Hp.maxlen, 1])

    # -------------------------- BYTENET DECODER --------------------------
    # CNN decoder
    dec = y_src.sg_lookup(emb=emb_y).sg_concat(target=rnn_h2, name="dec")
    for i in range(num_blocks):
        for rate in (1, 2, 4, 8, 16):
            dec = dec.sg_res_block(size=3,
                                   rate=rate,
                                   causal=True,
                                   name="dec%d_%d" % (rate, i),
                                   reuse_vars=reuse_vars)

    # final fully convolution layer for softmax
    dec = dec.sg_conv1d_gpus(size=1,
                             dim=Hp.vs,
                             name="out",
                             summary=False,
                             dev=self._dev,
                             reuse=reuse_vars)

    ce_array = dec.sg_ce(target=y, mask=True, name="cross_ent_example")
    cross_entropy_mean = tf.reduce_mean(ce_array, name='cross_entropy')
    losses = tf.add_n([losses, cross_entropy_mean], name='total_loss')

    return (time + 1, subrec1, subrec2, rnn_state, rnn_h, crnn_state, crnn_h,
            losses)
def tower_loss2_old(xx, scope, reuse_vars=False):
    """Legacy unrolled tower loss: predict each sentence from the previous one.

    `xx` is unstacked along axis 1 and the network is applied to every
    consecutive (source, target) sentence pair; each pair's mean
    cross-entropy is added to the 'losses' collection, and the sum of that
    collection (filtered by `scope`) is returned.

    Relies on names from the enclosing scope (`self`, `num_blocks`,
    `subrec_zero_state`, `LSTMCell`, `ConvLSTMCell`, `Hp`, and `pool`).

    Args:
        xx: input tensor; axis 1 indexes sentences.
        scope: scope filter passed to `tf.get_collection('losses', scope)`.
        reuse_vars: reuse flag forwarded to the embedding scope and layers.

    Returns:
        The tensor `total_loss`, the sum of the collected losses.
    """
    # make embedding matrix for source and target
    with tf.variable_scope('embs', reuse=reuse_vars):
        emb_x = tf.sg_emb(name='emb_x',
                          voca_size=Hp.vs,
                          dim=Hp.hd,
                          dev=self._dev)
        emb_y = tf.sg_emb(name='emb_y',
                          voca_size=Hp.vs,
                          dim=Hp.hd,
                          dev=self._dev)

    x_sents = tf.unstack(xx, axis=1)  # each element is (batch, sentlen)

    # generate first an unconditioned sentence
    n_input = Hp.hd

    subrec1 = subrec_zero_state(Hp.bs, Hp.hd)
    subrec2 = subrec_zero_state(Hp.bs, Hp.hd)

    rnn_cell = LSTMCell(in_dim=n_input, dim=Hp.hd)
    (rnn_state, rnn_h) = rnn_cell.zero_state(Hp.bs)

    crnn_cell = ConvLSTMCell(in_dim=n_input, dim=Hp.hd)
    (crnn_state, crnn_h) = crnn_cell.zero_state(n_input)

    for sent in range(len(x_sents) - 1):
        # Index with the sentence counter. The previous code used `i`, the
        # inner block-loop variable, which is unbound on the first pass and
        # stale afterwards.
        y = x_sents[sent + 1]
        x = x_sents[sent]  # (batch, sentlen) = (16, 200)

        # shift target by one step for training source
        y_src = tf.concat([tf.zeros((Hp.bs, 1), tf.sg_intx), y[:, :-1]], 1)

        # embed table lookup
        enc = x.sg_lookup(emb=emb_x)  # (batch, sentlen, dim1)

        # loop dilated conv block
        for i in range(num_blocks):
            for rate in (1, 2, 4, 8, 16):
                enc = enc.sg_res_block(size=5,
                                       rate=rate,
                                       name="enc%d_%d" % (rate, i),
                                       reuse_vars=reuse_vars)

        # quasi rnn layer [batch * 3, t, dim2 ]
        conv = enc.sg_quasi_conv1d(is_enc=True,
                                   size=2,
                                   name="conv1",
                                   reuse_vars=reuse_vars)
        # attention layer
        # recurrent layer # 1 + final encoder hidden state
        # NOTE(review): `concat` is computed but never consumed; the quasi-RNN
        # below runs on `conv` instead - confirm which input was intended.
        concat = subrec1.sg_concat(target=conv, dim=0)
        subrec1 = conv.sg_quasi_rnn(is_enc=True, att=True)

        # NOTE(review): `pool` is never assigned in this function, so it must
        # come from an enclosing scope or this line raises NameError - confirm.
        conv = pool.sg_quasi_conv1d(is_enc=True,
                                    size=2,
                                    name="conv2",
                                    reuse_vars=reuse_vars)
        concat = subrec2.sg_concat(target=conv, dim=0)
        subrec2 = conv.sg_quasi_rnn(is_enc=True, att=True)

        # conv LSTM
        (crnn_state, crnn_h) = crnn_cell(subrec2, (crnn_state, crnn_h), 5)

        # recurrent block
        (rnn_state, rnn_h) = rnn_cell(crnn_h, (rnn_state, rnn_h))

        # CNN decoder
        dec = crnn_h.sg_concat(target=y_src.sg_lookup(emb=emb_y), name="dec")
        for i in range(num_blocks):
            for rate in (1, 2, 4, 8, 16):
                dec = dec.sg_res_block(size=3,
                                       rate=rate,
                                       causal=True,
                                       name="dec%d_%d" % (rate, i),
                                       reuse_vars=reuse_vars)

        # final fully convolution layer for softmax
        dec = dec.sg_conv1d_gpus(size=1,
                                 dim=Hp.vs,
                                 name="out",
                                 summary=False,
                                 dev=self._dev,
                                 reuse=reuse_vars)

        ce_array = dec.sg_ce(target=y, mask=True, name="cross_ent_example")
        cross_entropy_mean = tf.reduce_mean(ce_array, name='cross_entropy')
        tf.add_to_collection('losses', cross_entropy_mean)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    return total_loss
def __init__(self,
             x,
             y,
             num_batch,
             vocab_size,
             emb_dim,
             hidden_dim,
             max_ep=240,
             infer_shape=(1, 1),
             mode="train"):
    """Build the LSTM generator graph in training or inference mode.

    Args:
        x: input id tensor, used as `self.x` when `mode == "train"`.
        y: target id tensor, used as `self.y` when `mode == "train"`.
        num_batch: stored as `self.num_batch`.
        vocab_size: rows of both embedding tables and the LSTM output size.
        emb_dim: embedding width and LSTM input size.
        hidden_dim: stored as `self.hidden_dim`; not otherwise used here.
        max_ep: stored as `self.max_ep`.
        infer_shape: shape of the int32 placeholders created in "infer" mode.
        mode: "train" builds the loss; "infer" builds sampling/argmax outputs
            and opens every scope with `reuse=True`. Any other value leaves
            `self.x` / `self.y` unset.
    """
    self.num_batch = num_batch
    self.emb_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.vocab_size = vocab_size
    self.max_len_infer = 512
    self.max_ep = max_ep

    # inference reuses the variables created by the training graph
    reuse = (mode == 'infer')

    if mode == "train":
        self.x = x
        self.y = y
    elif mode == "infer":
        self.x = tf.placeholder(tf.int32, shape=infer_shape)
        self.y = tf.placeholder(tf.int32, shape=infer_shape)

    with tf.variable_scope("gen_embs", reuse=reuse):
        self.emb_x = tf.get_variable("emb_x",
                                     [self.vocab_size, self.emb_dim])
        self.emb_y = tf.get_variable("emb_y",
                                     [self.vocab_size, self.emb_dim])
        self.X = tf.nn.embedding_lookup(self.emb_x, self.x)
        self.Y = tf.nn.embedding_lookup(self.emb_y, self.y)

    with tf.sg_context(name='gen', reuse=reuse):
        if mode == "train":
            self.lstm_layer = self.X.sg_lstm(in_dim=self.emb_dim,
                                             dim=self.vocab_size,
                                             name="lstm")  # e.g. (8, 63, 68)
            self.test = self.lstm_layer.sg_softmax(name="testtt")
            # Call-form print emits the same text on Python 2 and also parses
            # on Python 3, unlike the former print statements.
            print("mazum??")
            print(self.test)
        elif mode == "infer":
            self.lstm_layer = self.X.sg_lstm(in_dim=self.emb_dim,
                                             dim=self.vocab_size,
                                             last_only=True,
                                             name="lstm")
            # NOTE(review): the log is taken of the raw LSTM output; this
            # assumes the output is strictly positive - confirm.
            self.log_prob = tf.log(self.lstm_layer)

            # next_token: select by distribution probability,
            # preds: select by argmax
            # `multinormed` and `next_token` come from two separate
            # tf.multinomial ops, i.e. two independent draws.
            self.multinormed = tf.multinomial(self.log_prob, 1)
            self.next_token = tf.cast(
                tf.reshape(tf.multinomial(self.log_prob, 1),
                           [1, infer_shape[0]]), tf.int32)
            self.preds = self.lstm_layer.sg_argmax()

    if mode == "train":
        self.loss = self.lstm_layer.sg_ce(target=self.y)
        # positions whose target id is 0 are excluded from the normaliser
        self.istarget = tf.not_equal(self.y, 0).sg_float()
        # small constant guards against division by zero
        self.reduced_loss = (self.loss.sg_sum()) / (self.istarget.sg_sum() +
                                                    0.0000001)
        tf.sg_summary_loss(self.reduced_loss, "reduced_loss")
def wrapper(tensor, **kwargs):
    r"""Call the wrapped layer function and post-process its output.

    Merges `kwargs` with the current context options, fills in defaults
    derived from the shape of `tensor`, picks a layer name, then runs the
    layer function inside a variable scope and applies, in order, batch
    normalization, layer normalization, activation and dropout.

    Args:
      tensor: A `tensor` (automatically passed by decorator).
      kwargs:
        shape: A list of integers. Defaults to the static shape of `tensor`.
        in_dim: An integer. Input size; defaults to the last dimension.
        dim: An integer. Output size; defaults to the last dimension.
        bn: Boolean. If True, batch normalization is applied.
        ln: Boolean. If True, layer normalization is applied. `bn` and `ln`
          must not both be set.
        scale: If True (default), a trainable gamma multiplies the
          normalized output.
        dout: A float dropout rate; the keep probability is `1 - dout`.
          Defaults to 0.
        bias: Boolean. Defaults to True unless `bn` or `ln` is set.
        name: Layer name. When omitted, the function name without `sg_` plus
          a running number is used.
        act: Name of an activation, e.g. `sigmoid`, `tanh`.
        reuse: Forwarded to `tf.variable_scope` for this layer.
        regularizer: 'l1', 'l2' or anything else (treated as None).
        summary: If True (default), an activation summary is added.

    Returns:
      The output tensor, named 'out', carrying `_sugar` bookkeeping and an
      `sg_reuse` method.
    """
    from . import sg_initializer as init
    from . import sg_activation

    # explicit arguments combined with the active context
    opt = tf.sg_opt(kwargs) + sg_get_context()

    # defaults derived from the input: normalization and dropout are off
    in_shape = tensor.get_shape().as_list()
    opt += tf.sg_opt(shape=in_shape,
                     in_dim=in_shape[-1],
                     dim=in_shape[-1],
                     bn=False,
                     ln=False,
                     dout=0,
                     summary=True,
                     scale=True)

    # turn the regularizer keyword into a callable (or None)
    reg_kind = opt.regularizer
    opt.regularizer = None
    if reg_kind == 'l1':
        opt.regularizer = lambda x: tf.reduce_mean(tf.abs(x))
    elif reg_kind == 'l2':
        opt.regularizer = lambda x: tf.square(tf.reduce_mean(tf.square(x)))

    assert not (opt.bn and opt.ln), \
        'one of batch normalization and layer normalization is available.'

    # a bias is redundant when either normalization is active
    opt += tf.sg_opt(bias=not (opt.bn or opt.ln))

    # automatic layer naming: <function name>_<next free number>
    if opt.name is None:
        opt.name = func.__name__.replace('sg_', '')

        scope_name = tf.get_variable_scope().name
        prefix = scope_name + '/' if scope_name else ''
        needle = prefix + opt.name

        # collect the names of layers that already own variables
        taken = set()
        for var in tf.global_variables():
            pos = var.name.rfind(needle)
            if pos >= 0:
                taken.add(var.name[pos:].split('/')[-2])

        if taken:
            last_no = max(int(n.split('_')[-1]) for n in taken)
            opt.name += '_%d' % (last_no + 1)
        else:
            opt.name += '_1'

    with tf.variable_scope(opt.name, reuse=opt.reuse) as scope:

        # run the actual layer
        out = func(tensor, opt)
        out_shape = out.get_shape()

        # ---- batch normalization ----
        if opt.bn:
            beta = init.constant('beta', opt.dim, summary=opt.summary)
            gamma = init.constant('gamma',
                                  opt.dim,
                                  value=1,
                                  summary=opt.summary,
                                  trainable=opt.scale)

            # running statistics used at inference time
            mean_running = init.constant('mean',
                                         opt.dim,
                                         trainable=False,
                                         summary=opt.summary)
            variance_running = init.constant('variance',
                                             opt.dim,
                                             value=1,
                                             trainable=False,
                                             summary=opt.summary)

            rank = out_shape.ndims
            if rank in [2, 3, 4]:
                # fused batch norm needs NHWC: pad the missing H/W axes
                if rank == 2:
                    out = tf.expand_dims(out, axis=1)
                    out = tf.expand_dims(out, axis=2)
                elif rank == 3:
                    out = tf.expand_dims(out, axis=2)

                fused_eps = max(tf.sg_eps, 1e-5)

                def _fused_train():
                    return tf.nn.fused_batch_norm(out,
                                                  gamma,
                                                  beta,
                                                  epsilon=fused_eps)

                def _fused_infer():
                    return tf.nn.fused_batch_norm(out,
                                                  gamma,
                                                  beta,
                                                  mean=mean_running,
                                                  variance=variance_running,
                                                  epsilon=fused_eps,
                                                  is_training=False)

                out, mean, variance = tf.cond(_phase, _fused_train,
                                              _fused_infer)

                # drop the axes added above
                if rank == 2:
                    out = tf.squeeze(out, axis=[1, 2])
                elif rank == 3:
                    out = tf.squeeze(out, axis=2)
            else:
                # plain batch norm for any other rank
                mean, variance = tf.nn.moments(
                    out, axes=list(range(len(out.get_shape()) - 1)))

                def _plain_train():
                    return tf.nn.batch_normalization(out, mean, variance, beta,
                                                     gamma, tf.sg_eps)

                def _plain_infer():
                    return tf.nn.batch_normalization(out, mean_running,
                                                     variance_running, beta,
                                                     gamma, tf.sg_eps)

                out = tf.cond(_phase, _plain_train, _plain_infer)

            # exponential moving average of the batch statistics
            decay = 0.99
            mean_update = mean_running.assign(mean_running * decay + mean *
                                              (1 - decay))
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
            variance_update = variance_running.assign(variance_running *
                                                      decay + variance *
                                                      (1 - decay))
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)

        # ---- layer normalization ----
        if opt.ln:
            beta = init.constant('beta', opt.dim, summary=opt.summary)
            if opt.scale:
                gamma = init.constant('gamma',
                                      opt.dim,
                                      value=1,
                                      summary=opt.summary)

            # statistics over the final axis only
            mean, variance = tf.nn.moments(out,
                                           axes=[len(out.get_shape()) - 1],
                                           keep_dims=True)
            out = (out - mean) / tf.sqrt(variance + tf.sg_eps)

            if opt.scale:
                out = gamma * out + beta
            else:
                out = out + beta

        # ---- activation ----
        if opt.act:
            act_fn = getattr(sg_activation, 'sg_' + opt.act.lower())
            out = act_fn(out)

        # ---- dropout (training phase only) ----
        if opt.dout:
            pre_drop = out
            out = tf.cond(_phase,
                          lambda: tf.nn.dropout(pre_drop, 1 - opt.dout),
                          lambda: pre_drop)

        # give the result a stable name
        out = tf.identity(out, 'out')

        if opt.summary:
            tf.sg_summary_activation(out)

        # bookkeeping that lets sg_reuse rebuild this layer on a new input
        out._sugar = tf.sg_opt(func=func,
                               arg=tf.sg_opt(kwargs) + sg_get_context(),
                               prev=tensor,
                               is_layer=True,
                               name=opt.name)
        out.sg_reuse = types.MethodType(sg_reuse, out)

    return out
def wrapper(tensor, **kwargs):
    """Call the wrapped layer function and post-process its output.

    Merges `kwargs` with the module-level `_context`, fills in defaults
    derived from the shape of `tensor`, picks a layer name, then runs the
    layer function under the 'layers/<name>' variable scope and applies, in
    order, batch normalization, layer normalization, activation and dropout.

    Args:
        tensor: input tensor handed to the layer function.
        **kwargs: layer options. Those read here are `bn`, `ln`, `dout`,
            `name`, `act`, `reuse` and `dim`; `shape`, `in_dim`, `dim` and
            `bias` receive defaults when absent.

    Returns:
        The output tensor, named 'out', carrying `_sugar` bookkeeping and an
        `sg_reuse` method.
    """
    # NOTE(review): implicit relative imports - these only resolve on
    # Python 2 or when the package directory is on sys.path.
    import sg_initializer as init
    import sg_activation

    # explicit arguments combined with the active context
    opt = tf.sg_opt(kwargs) + _context

    # defaults derived from the input: normalization and dropout are off
    shape = tensor.get_shape().as_list()
    opt += tf.sg_opt(shape=shape,
                     in_dim=shape[-1],
                     dim=shape[-1],
                     bn=False,
                     ln=False,
                     dout=0)
    assert not (opt.bn and opt.ln), \
        'one of batch normalization and layer normalization is available.'
    # disable bias when normalization on
    opt += tf.sg_opt(bias=not (opt.bn or opt.ln))

    # automatic layer naming: <function name>_<next free number>
    if opt.name is None:
        # layer function name will be used as layer name
        opt.name = func.__name__.replace('sg_', '')

        # the search pattern does not change while scanning, so build it once
        scope_name = tf.get_variable_scope().name
        prefix = scope_name + '/' if scope_name else ''
        needle = prefix + 'layers/' + opt.name

        # find existing layer names
        exist_layers = set()
        for t in tf.get_collection(tf.GraphKeys.VARIABLES):
            i = t.name.rfind(needle)
            if i >= 0:
                exist_layers.add(t.name[i:].split('/')[-2])

        # layer name numbering
        if exist_layers:
            opt.name += '_%d' % (
                max(int(n.split('_')[-1]) for n in exist_layers) + 1)
        else:
            opt.name += '_1'

    # all layer variables start with 'layers/' prefix
    with tf.variable_scope('layers', reuse=opt.reuse):
        with tf.variable_scope(opt.name):

            # call layer function
            out = func(tensor, opt)

            # apply batch normalization
            if opt.bn:
                # offset, scale parameter
                beta = init.constant('beta', opt.dim)
                gamma = init.constant('gamma', opt.dim, value=1)

                # running statistics used outside the training phase
                mean_running = init.constant('mean', opt.dim)
                variance_running = init.constant('variance',
                                                 opt.dim,
                                                 value=1)

                # Batch statistics over all but the last axis. `axes` is an
                # explicit list because a Python 3 `range` object is not
                # accepted as reduction axes by older TensorFlow releases.
                mean, variance = tf.nn.moments(
                    out, axes=list(range(len(out.get_shape()) - 1)))

                def update_running_stat():
                    """Update the running stats, then return batch stats."""
                    decay = 0.99
                    update_op = [
                        mean_running.assign(mean_running * decay + mean *
                                            (1 - decay)),
                        variance_running.assign(variance_running * decay +
                                                variance * (1 - decay))
                    ]
                    # identity ops force the assignments to run first
                    with tf.control_dependencies(update_op):
                        return tf.identity(mean), tf.identity(variance)

                # training phase: batch stats (and update the running ones);
                # otherwise: saved running stats
                m, v = tf.cond(_phase, update_running_stat, lambda:
                               (mean_running, variance_running))

                out = tf.nn.batch_normalization(out, m, v, beta, gamma,
                                                tf.sg_eps)

            # apply layer normalization
            if opt.ln:
                # offset, scale parameter
                beta = init.constant('beta', opt.dim)
                gamma = init.constant('gamma', opt.dim, value=1)

                # calc layer mean, variance for final axis
                mean, variance = tf.nn.moments(
                    out, axes=[len(out.get_shape()) - 1], keep_dims=True)

                out = (out - mean) / tf.sqrt(variance + tf.sg_eps)
                out = gamma * out + beta

            # apply activation
            if opt.act:
                out = getattr(sg_activation, 'sg_' + opt.act.lower())(out)

            # apply dropout (training phase only)
            if opt.dout:
                out = tf.cond(_phase, lambda: tf.nn.dropout(
                    out, 1 - opt.dout), lambda: out)

            # rename tensor
            out = tf.identity(out, 'out')

            # add the output summary only when not reusing (None counts as
            # not reusing)
            if not opt.reuse:
                tf.sg_summary_activation(out)

            # save node info for reuse
            out._sugar = tf.sg_opt(func=func,
                                   arg=tf.sg_opt(kwargs) + _context,
                                   prev=tensor,
                                   is_layer=True,
                                   name=opt.name)
            # inject reuse function
            out.sg_reuse = types.MethodType(sg_reuse, out)

    return out