def __call__(self, inputs, state, scope=None):
    """Run one step of the quantized GRU cell.

    Variables whose op name ends in ``'Matrix'`` are read through
    ``tanh`` followed by ``bit_utils.quantize_w`` at ``self._w_bit`` bits;
    the gated state, the candidate and the new hidden state are passed
    through ``bit_utils.round_bit`` at ``self._f_bit`` bits.

    Args:
        inputs: Input tensor for this time step.
        state: Previous hidden state tensor.
        scope: Optional variable scope name; defaults to the class name.

    Returns:
        A ``(new_h, new_h)`` pair: the output and the next state are the
        same tensor.
    """
    def replace_w(x):
        # Everything except the weight matrices is passed through as-is.
        if not x.op.name.endswith('Matrix'):
            return x
        return bit_utils.quantize_w(tf.tanh(x), bit=self._w_bit)

    cell_scope = scope or type(self).__name__
    with bit_utils.replace_variable(replace_w), \
            tf.variable_scope(cell_scope):
        with tf.variable_scope("Gates"):
            # One linear map produces both gates; it is split in two below.
            # NOTE(review): the trailing 1.0 is presumably the bias start
            # value of the legacy `_linear` helper -- confirm.
            gate_logits = tf.nn.rnn_cell._linear(
                [inputs, state], 2 * self._num_units, True, 1.0)
            reset_logit, update_logit = tf.split(1, 2, gate_logits)
            reset_gate = tf.sigmoid(reset_logit)
            update_gate = tf.sigmoid(update_logit)
        with tf.variable_scope("Candidate"):
            gated_state = bit_utils.round_bit(
                reset_gate * state, bit=self._f_bit)
            candidate = self._activation(
                tf.nn.rnn_cell._linear(
                    [inputs, gated_state], self._num_units, True))
            candidate = bit_utils.round_bit(candidate, bit=self._f_bit)
        blended = update_gate * state + (1 - update_gate) * candidate
        new_h = bit_utils.round_bit(blended, bit=self._f_bit)
    return new_h, new_h
def call(self, inputs, state):
    """Run one step of the quantized LSTM cell.

    Variables whose op name ends in ``'kernel'`` are read through ``tanh``
    followed by ``bit_utils.quantize_w`` at ``self._w_bit`` bits. The new
    hidden state is passed through ``bit_utils.round_bit`` at
    ``self._f_bit`` bits; the new cell state is not rounded.

    Args:
        inputs: Input tensor for this time step.
        state: A ``(c, h)`` pair when ``self._state_is_tuple`` is set,
            otherwise a single tensor that is split in two along axis 1.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` is an ``LSTMStateTuple``
        or an axis-1 concatenation of ``new_c`` and ``new_h``, matching the
        layout of the incoming state.
    """
    def replace_w(x):
        # Only kernels are quantized; other variables pass through.
        if not x.op.name.endswith('kernel'):
            return x
        return bit_utils.quantize_w(tf.tanh(x), bit=self._w_bit)

    with bit_utils.replace_variable(replace_w):
        if self._state_is_tuple:
            prev_c, prev_h = state
        else:
            prev_c, prev_h = tf.split(
                value=state, num_or_size_splits=2, axis=1)

        # The four gate projections share one linear layer, created lazily
        # on the first call and reused afterwards.
        if self._linear is None:
            self._linear = core_rnn_cell._Linear(
                [inputs, prev_h], 4 * self._num_units, True)
        gate_logits = self._linear([inputs, prev_h])

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = tf.split(
            value=gate_logits, num_or_size_splits=4, axis=1)

        forget_gate = tf.sigmoid(f + self._forget_bias)
        retained = prev_c * forget_gate
        input_gate = tf.sigmoid(i)
        candidate = self._activation(j)
        new_c = retained + input_gate * candidate

        cell_output = self._activation(new_c)
        output_gate = tf.sigmoid(o)
        new_h = bit_utils.round_bit(
            cell_output * output_gate, bit=self._f_bit)

        if self._state_is_tuple:
            new_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
        else:
            new_state = tf.concat([new_c, new_h], 1)
    return new_h, new_state
def __call__(self, inputs, state, scope=None):
    """Run one step of the quantized LSTM cell (legacy TensorFlow API).

    Variables whose op name ends in ``'Matrix'`` are read through ``tanh``
    followed by ``bit_utils.quantize_w`` at ``self._w_bit`` bits. The new
    hidden state is passed through ``bit_utils.round_bit`` at
    ``self._f_bit`` bits; the new cell state is not rounded.

    Args:
        inputs: Input tensor for this time step.
        state: A ``(c, h)`` pair when ``self._state_is_tuple`` is set,
            otherwise a single tensor that is split in two along axis 1.
        scope: Optional variable scope name; defaults to the class name.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` mirrors the layout of
        the incoming state (tuple or concatenated tensor).
    """
    def replace_w(x):
        # Only the weight matrices are quantized; the rest pass through.
        if not x.op.name.endswith('Matrix'):
            return x
        return bit_utils.quantize_w(tf.tanh(x), bit=self._w_bit)

    cell_scope = scope or type(self).__name__
    with bit_utils.replace_variable(replace_w), \
            tf.variable_scope(cell_scope):
        if self._state_is_tuple:
            prev_c, prev_h = state
        else:
            prev_c, prev_h = tf.split(1, 2, state)

        # A single linear map yields all four gate pre-activations.
        gate_logits = tf.nn.rnn_cell._linear(
            [inputs, prev_h], 4 * self._num_units, True)
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = tf.split(1, 4, gate_logits)

        forget_gate = tf.sigmoid(f + self._forget_bias)
        retained = prev_c * forget_gate
        input_gate = tf.sigmoid(i)
        candidate = self._activation(j)
        new_c = retained + input_gate * candidate

        cell_output = self._activation(new_c)
        output_gate = tf.sigmoid(o)
        new_h = bit_utils.round_bit(
            cell_output * output_gate, bit=self._f_bit)

        if self._state_is_tuple:
            new_state = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)
        else:
            new_state = tf.concat(1, [new_c, new_h])
    return new_h, new_state
def call(self, inputs, state):
    """Run one step of the LSTM cell with a `BH_dense` gate projection.

    Args:
      inputs: `2-D` tensor with shape `[batch_size, input_size]`.
      state: An `LSTMStateTuple` of state tensors, each shaped
        `[batch_size, self.state_size]`, if `state_is_tuple` has been set
        to `True`. Otherwise, a `Tensor` shaped
        `[batch_size, 2 * self.state_size]`.

    Returns:
      A pair containing the new hidden state, and the new state (either a
        `LSTMStateTuple` or a concatenated state, depending on
        `state_is_tuple`). The hidden state is passed through
        `bit_utils.round_bit` with `self._f_bit`.
    """
    block = self._block_size
    sigmoid = math_ops.sigmoid
    # The original author notes that `add`/`multiply` are used instead of
    # `+`/`*` for a performance improvement, at the cost of readability.
    add = math_ops.add
    multiply = math_ops.multiply
    split_axis = constant_op.constant(1, dtype=dtypes.int32)

    if self._state_is_tuple:
        c, h = state
    else:
        c, h = array_ops.split(
            value=state, num_or_size_splits=2, axis=split_axis)

    # NOTE(review): `h` is unpacked above but never used; only `inputs` is
    # fed to BH_dense, whereas the matmul this call replaced operated on
    # concat([inputs, h], 1). Confirm whether the recurrent input is meant
    # to be dropped here.
    projected = BH_dense(inputs, 4 * self._num_units, block,
                         self.transform, kernel_weights=self._kernel)
    gate_inputs = nn_ops.bias_add(projected, self._bias)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(
        value=gate_inputs, num_or_size_splits=4, axis=split_axis)

    forget_bias_tensor = constant_op.constant(
        self._forget_bias, dtype=f.dtype)
    forget_gate = sigmoid(add(f, forget_bias_tensor))
    retained = multiply(c, forget_gate)
    input_gate = sigmoid(i)
    candidate = self._activation(j)
    written = multiply(input_gate, candidate)
    new_c = add(retained, written)

    cell_output = self._activation(new_c)
    output_gate = sigmoid(o)
    new_h = bit_utils.round_bit(
        multiply(cell_output, output_gate), self._f_bit)

    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
    else:
        new_state = array_ops.concat([new_c, new_h], 1)
    return new_h, new_state
def reset_lstm_state(self):
    """Build an op that resets every layer's recurrent state variables.

    Each layer's ``c`` and ``h`` variable in ``self.state`` is assigned the
    same reset tensor. Despite the log message, that tensor is not literal
    zero: it is ``bit_utils.round_bit(sigmoid(0), bit=conf.f_bit)``, with
    the shape of the first layer's ``c``.

    All layers present in ``self.state`` are reset, not only the first
    two; for a two-layer state the resulting ops are the same as before.

    Returns:
        A grouped op named ``'reset_lstm_state'``.
    """
    conf = Config()
    layers = self.state
    reset_value = tf.zeros_like(layers[0].c)
    print("\n==> Zeroing state\n")
    reset_value = bit_utils.round_bit(tf.sigmoid(reset_value),
                                      bit=conf.f_bit)
    assign_ops = []
    for layer in layers:
        # Keep the c-then-h order per layer.
        assign_ops.append(layer.c.assign(reset_value))
        assign_ops.append(layer.h.assign(reset_value))
    return tf.group(*assign_ops, name='reset_lstm_state')
def call(self, inputs, state):
    """Run one step of the quantized GRU cell.

    Variables whose op name ends in ``'kernel'`` are read through ``tanh``
    followed by ``bit_utils.quantize_w`` at ``self._w_bit`` bits. The
    reset-gated state, the candidate and the new hidden state are passed
    through ``bit_utils.round_bit`` at ``self._f_bit`` bits.

    Args:
        inputs: Input tensor for this time step.
        state: Previous hidden state tensor.

    Returns:
        A ``(new_h, new_h)`` pair: the output and the next state are the
        same tensor.
    """
    def replace_w(x):
        # Only kernels are quantized; other variables pass through.
        if not x.op.name.endswith('kernel'):
            return x
        return bit_utils.quantize_w(tf.tanh(x), bit=self._w_bit)

    with bit_utils.replace_variable(replace_w):
        # Both linear layers are created lazily on the first call and
        # cached on the instance.
        if self._gate_linear is None:
            # Without a configured bias initializer, the gate biases start
            # at 1.0.
            if self._bias_initializer is not None:
                gate_bias_init = self._bias_initializer
            else:
                gate_bias_init = tf.constant_initializer(
                    1.0, dtype=inputs.dtype)
            with tf.variable_scope("gates"):
                # Reset gate and update gate share one projection.
                self._gate_linear = core_rnn_cell._Linear(
                    [inputs, state],
                    2 * self._num_units,
                    True,
                    bias_initializer=gate_bias_init,
                    kernel_initializer=self._kernel_initializer)

        gates = tf.sigmoid(self._gate_linear([inputs, state]))
        reset_gate, update_gate = tf.split(
            value=gates, num_or_size_splits=2, axis=1)

        gated_state = bit_utils.round_bit(
            reset_gate * state, bit=self._f_bit)

        if self._candidate_linear is None:
            with tf.variable_scope("candidate"):
                self._candidate_linear = core_rnn_cell._Linear(
                    [inputs, gated_state],
                    self._num_units,
                    True,
                    bias_initializer=self._bias_initializer,
                    kernel_initializer=self._kernel_initializer)

        candidate = self._activation(
            self._candidate_linear([inputs, gated_state]))
        candidate = bit_utils.round_bit(candidate, bit=self._f_bit)

        blended = update_gate * state + (1 - update_gate) * candidate
        new_h = bit_utils.round_bit(blended, bit=self._f_bit)
    return new_h, new_h
def _build_graph(self, inputs):
    """Build the quantized multi-layer LSTM language-model graph.

    The graph embeds the input tokens, applies ReLU and rounds the result
    to ``conf.f_bit`` bits, unrolls a stack of ``conf.num_layers``
    ``BitLSTMCell`` cells for ``conf.num_steps`` steps, and projects the
    outputs to ``conf.vocab_size`` logits through a ``FullyConnected``
    layer whose ``W`` variable is quantized.

    The recurrent state lives in non-trainable variables (stored on
    ``self.state``) that are overwritten with the final state whenever the
    cost is evaluated. One ``(c, h)`` variable pair is created per layer,
    so the state always matches ``conf.num_layers``; for two layers the
    variables are ``c0, h0, c1, h1`` exactly as before.

    Args:
        inputs: A pair ``(input, nextinput)`` of token-id tensors: the
            current tokens and the targets.

    Side effects:
        Sets ``self.state`` and ``self.cost``, and registers moving
        summaries for the cost and the perplexity.
    """
    conf = Config()
    is_training = get_current_tower_context().is_training
    # Named `token_ids` rather than `input` to avoid shadowing the builtin.
    token_ids, nextinput = inputs
    initializer = tf.uniform_unit_scaling_initializer()
    # initializer = tf.random_uniform_initializer(-0.05, 0.05)

    def get_basic_cell():
        # One quantized LSTM layer, with output dropout during training.
        cell = bit_rnn.BitLSTMCell(
            num_units=conf.hidden_size,
            w_bit=conf.w_bit,
            f_bit=conf.f_bit,
            forget_bias=0.0,
            reuse=tf.get_variable_scope().reuse)
        if is_training and conf.keep_prob < 1:
            cell = bit_rnn.DropoutWrapper(
                cell, output_keep_prob=conf.keep_prob)
        return cell

    cell = rnn.MultiRNNCell(
        [get_basic_cell() for _ in range(conf.num_layers)])

    def get_v(n):
        # Non-trainable variable holding one state tensor across runs.
        return tf.get_variable(
            n, [conf.batch_size, conf.hidden_size],
            trainable=False,
            initializer=tf.constant_initializer())

    # One (c, h) pair per layer, created in c-then-h order for each layer.
    self.state = state_var = tuple(
        rnn.LSTMStateTuple(get_v('c%d' % k), get_v('h%d' % k))
        for k in range(conf.num_layers))

    embeddingW = tf.get_variable(
        'embedding', [conf.vocab_size, conf.hidden_size],
        initializer=initializer)
    input_feature = tf.nn.embedding_lookup(
        embeddingW, token_ids)  # B x seqlen x hiddensize
    print("\n-> Input Rounding")
    input_feature = bit_utils.round_bit(tf.nn.relu(input_feature),
                                        bit=conf.f_bit)
    if is_training and conf.keep_prob < 1:
        input_feature = Dropout(input_feature, conf.keep_prob)

    with tf.variable_scope('LSTM', initializer=initializer):
        input_list = tf.unstack(input_feature, num=conf.num_steps,
                                axis=1)  # seqlen x (Bxhidden)
        outputs, last_state = rnn.static_rnn(cell, input_list, state_var,
                                             scope='rnn')

    # Copy the final state of this unroll into the state variables.
    update_state_ops = []
    for k in range(conf.num_layers):
        update_state_ops.extend([
            tf.assign(state_var[k].c, last_state[k].c),
            tf.assign(state_var[k].h, last_state[k].h)
        ])

    def replace_w(x):
        # Applied to every variable fetched while building the output
        # layer; dispatches on the suffix of the variable's op name.
        if x.op.name.endswith('W'):
            print("\nKERNEL Before quantize name: " + x.op.name)
            return bit_utils.quantize_w(tf.tanh(x), bit=conf.w_bit)
        elif x.op.name.endswith('b'):
            print("\nbias Before round name: " + x.op.name)
            # Biases are returned unmodified. An unreachable
            # `return bit_utils.round_bit_whist(x, bit=conf.w_bit)` used to
            # follow this return; return that expression here instead if
            # biases should be rounded.
            return x
        else:
            print("\nNOT Quantizing:" + x.op.name)
            tf.summary.histogram(x.name, x)
            return x

    # seqlen x (Bxrnnsize)
    output = tf.reshape(tf.concat(outputs, 1),
                        [-1, conf.hidden_size])  # (Bxseqlen) x hidden
    with bit_utils.replace_variable(replace_w):
        logits = FullyConnected('fc', output, conf.vocab_size,
                                nl=tf.identity, W_init=initializer,
                                b_init=initializer)
    xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=tf.reshape(nextinput, [-1]))

    # Evaluating the cost also persists the final recurrent state.
    with tf.control_dependencies(update_state_ops):
        self.cost = tf.truediv(tf.reduce_sum(xent_loss),
                               tf.cast(conf.batch_size, tf.float32),
                               name='cost')  # log-perplexity

    perpl = tf.exp(self.cost / conf.num_steps, name='perplexity')
    summary.add_moving_summary(perpl, self.cost)
def __init__(self, is_training, config):
    """Build the quantized RNN language model (legacy TensorFlow API).

    Creates the input and target placeholders, a multi-layer quantized
    GRU or LSTM, a quantized softmax output layer and the sequence loss.
    When ``is_training`` is true it also creates the learning-rate
    variable and an Adam training op with global-norm gradient clipping.

    Args:
        is_training: Whether to add dropout and the training ops.
        config: Object providing ``batch_size``, ``num_steps``,
            ``hidden_size``, ``vocab_size``, ``w_bit``, ``f_bit``,
            ``keep_prob``, ``num_layers`` and, when training,
            ``max_grad_norm``. An optional ``cell_type`` attribute selects
            ``'gru'`` (the default when absent) or ``'lstm'``.

    Raises:
        ValueError: If ``config.cell_type`` is present but is neither
            ``'gru'`` nor ``'lstm'``.
    """
    # Validate before any graph construction: an unknown cell type used to
    # leave `cell` unbound and fail later with an UnboundLocalError.
    cell_type = getattr(config, 'cell_type', 'gru')
    if cell_type not in ('gru', 'lstm'):
        raise ValueError(
            "Unsupported cell_type %r; expected 'gru' or 'lstm'"
            % (cell_type,))

    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    if cell_type == 'gru':
        cell = BitGRUCell(size, w_bit=config.w_bit, f_bit=config.f_bit)
    else:
        cell = BitLSTMCell(size, w_bit=config.w_bit, f_bit=config.f_bit)
    if is_training and config.keep_prob < 1:
        cell = tf.nn.rnn_cell.DropoutWrapper(
            cell, output_keep_prob=config.keep_prob)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * config.num_layers)

    # The initial state is the rounded sigmoid of the zero state rather
    # than the zero state itself.
    self._initial_state = cell.zero_state(batch_size, tf.float32)
    self._initial_state = bit_utils.round_bit(
        tf.sigmoid(self._initial_state), bit=config.f_bit)

    embedding = tf.get_variable(
        "embedding", [vocab_size, size],
        initializer=tf.random_uniform_initializer())
    inputs = tf.nn.embedding_lookup(embedding, self._input_data)
    inputs = bit_utils.round_bit(tf.nn.relu(inputs), bit=config.f_bit)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Split along the time axis into a list of num_steps per-step tensors.
    inputs = [
        tf.squeeze(input_, [1])
        for input_ in tf.split(1, num_steps, inputs)
    ]
    outputs, state = tf.nn.rnn(cell, inputs,
                               initial_state=self._initial_state)

    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    # Every variable fetched inside this block (softmax weights and bias)
    # is read through tanh + quantize_w.
    with bit_utils.replace_variable(
            lambda x: bit_utils.quantize_w(tf.tanh(x), bit=config.w_bit)):
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.nn.seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])])
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    # NOTE(review): `self.lr` is not defined in this block; presumably a
    # property on the class that returns `self._lr` -- confirm.
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))