def build(self, inputs_shape):
    B = self._block_size
    if inputs_shape[1].value is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)
    input_depth = inputs_shape[1].value
    h_depth = self._num_units
    # One fused weight matrix and bias for the four LSTM gates (i, j, f, o).
    self._kernel = self.add_variable(
        _WEIGHTS_VARIABLE_NAME,
        shape=[input_depth + h_depth, 4 * self._num_units])
    self._bias = self.add_variable(
        _BIAS_VARIABLE_NAME,
        shape=[4 * self._num_units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))
    # Quantize the parameters according to the configured scheme.
    if self.quant == "binary":
        self._kernel = binarize(self._kernel)
        self._bias = binarize(self._bias)
    elif self.quant == "ternary":
        self._kernel = ternarize(self._kernel)
        self._bias = ternarize(self._bias)
    elif self.quant == "bit":
        self._kernel = bit_utils.quantize_w(self._kernel, self._w_bit)
    self.built = True
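# The binarize()/ternarize() helpers used above are not shown in this file.
# Below is a minimal sketch of what they plausibly do, assuming the usual
# sign-based quantization with a straight-through gradient estimator; the
# exact scheme and the `threshold` hyperparameter are assumptions, not the
# project's confirmed implementation.

def binarize(x):
    """Binarize x to {-1, +1}; the Sign gradient is overridden to identity."""
    g = tf.get_default_graph()
    with g.gradient_override_map({"Sign": "Identity"}):
        return tf.sign(x)

def ternarize(x, threshold=0.05):
    """Ternarize x to {-1, 0, +1}; values below `threshold` map to 0."""
    mask = tf.cast(tf.abs(x) > threshold, x.dtype)
    quantized = tf.sign(x) * mask
    # Straight-through estimator: forward pass uses `quantized`,
    # backward pass treats the op as identity in x.
    return tf.stop_gradient(quantized - x) + x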
def replace_w(x):
    # if x.op.name.endswith('Matrix'):
    if x.op.name.endswith('W'):
        print("\nKERNEL Before quantize name: " + x.op.name)
        return bit_utils.quantize_w(tf.tanh(x), bit=conf.w_bit)
    elif x.op.name.endswith('b'):
        print("\nbias Before round name: " + x.op.name)
        # tf.summary.histogram(x.name, x)
        # Biases are left unquantized; the k-bit rounding is intentionally
        # disabled (it previously sat unreachable after the return).
        # return bit_utils.round_bit_whist(x, bit=conf.w_bit)
        return x
    else:
        print("\nNOT Quantizing: " + x.op.name)
        tf.summary.histogram(x.name, x)
        return x
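# Hypothetical wiring for replace_w above, modeled on the constructor below,
# where bit_utils.replace_variable(fn) is used as a context manager that
# routes every variable fetched inside it through fn. The model class name
# and config object here are placeholders, not names from the project:
with bit_utils.replace_variable(replace_w):
    model = PTBModel(is_training=True, config=conf)  # assumed class name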
def __init__(self, is_training, config):
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Pick the quantized recurrent cell; default to GRU when the config
    # does not specify a cell type.
    if 'cell_type' not in dir(config) or config.cell_type == 'gru':
        cell = BitGRUCell(size, w_bit=config.w_bit, f_bit=config.f_bit)
    elif config.cell_type == 'lstm':
        cell = BitLSTMCell(size, w_bit=config.w_bit, f_bit=config.f_bit)

    if is_training and config.keep_prob < 1:
        cell = tf.nn.rnn_cell.DropoutWrapper(
            cell, output_keep_prob=config.keep_prob)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * config.num_layers)

    self._initial_state = cell.zero_state(batch_size, tf.float32)
    # Squash the initial state into (0, 1) and quantize it to f_bit bits.
    self._initial_state = bit_utils.round_bit(
        tf.sigmoid(self._initial_state), bit=config.f_bit)

    embedding = tf.get_variable(
        "embedding", [vocab_size, size],
        initializer=tf.random_uniform_initializer())
    inputs = tf.nn.embedding_lookup(embedding, self._input_data)
    # Activations are kept non-negative and quantized to f_bit bits.
    inputs = bit_utils.round_bit(tf.nn.relu(inputs), bit=config.f_bit)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Note: tf.split/tf.concat take the axis first here (pre-1.0 TF API).
    inputs = [
        tf.squeeze(input_, [1])
        for input_ in tf.split(1, num_steps, inputs)
    ]
    outputs, state = tf.nn.rnn(cell, inputs,
                               initial_state=self._initial_state)
    output = tf.reshape(tf.concat(1, outputs), [-1, size])

    # Quantize the softmax parameters the same way as the recurrent kernels.
    with bit_utils.replace_variable(
            lambda x: bit_utils.quantize_w(tf.tanh(x), bit=config.w_bit)):
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b

    loss = tf.nn.seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])])
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
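# bit_utils.round_bit and bit_utils.quantize_w are project helpers whose
# bodies are not shown here. A plausible minimal sketch, assuming the
# DoReFa-Net style k-bit uniform quantizer with a straight-through gradient
# (the project's real implementation may differ); note the callers above
# already apply tf.tanh before quantize_w:

def round_bit(x, bit):
    """Quantize x in [0, 1] to 2**bit uniform levels; identity gradient."""
    if bit == 32:
        return x
    scale = float(2 ** bit - 1)
    quantized = tf.round(x * scale) / scale
    return tf.stop_gradient(quantized - x) + x  # straight-through estimator

def quantize_w(x, bit):
    """Rescale a weight tensor into [0, 1], quantize, map back to [-1, 1]."""
    limit = tf.reduce_max(tf.abs(x))
    normalized = x / (2.0 * limit) + 0.5
    return 2.0 * round_bit(normalized, bit) - 1.0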
def replace_w(x):
    if x.op.name.endswith('kernel'):
        return bit_utils.quantize_w(tf.tanh(x), bit=self._w_bit)
    else:
        return x
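# replace_w above references self._w_bit, so it is presumably a method of
# the quantized cell. A self-contained sketch of how such a method could be
# wired in with a TF-1.x custom getter so that every 'kernel' variable
# created in the scope is quantized; the class and names here are
# illustrative only, not the project's actual hook:

class QuantizedScope(object):
    def __init__(self, w_bit):
        self._w_bit = w_bit

    def replace_w(self, x):
        if x.op.name.endswith('kernel'):
            return bit_utils.quantize_w(tf.tanh(x), bit=self._w_bit)
        return x

    def getter(self, getter, *args, **kwargs):
        # Post-process every variable returned by the default getter.
        return self.replace_w(getter(*args, **kwargs))

q = QuantizedScope(w_bit=2)  # example bit width
with tf.variable_scope("rnn", custom_getter=q.getter):
    kernel = tf.get_variable("kernel", [256, 1024])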