def call(self, inputs, state):
    """Run one step of the LSTM cell with an attention context.

    Args:
        inputs: input tensor for the current step.
        state: previous state; a `(c, h)` pair when `self._state_is_tuple`
            is set, otherwise one tensor that is split in half along axis 1
            into c and h.

    Returns:
        A pair `(new_h, new_state)`. `new_state` is an `LSTMStateTuple`
        when `self._state_is_tuple` is set, otherwise the axis-1
        concatenation of new_c and new_h.
    """
    if self._state_is_tuple:
        prev_c, prev_h = state
    else:
        prev_c, prev_h = array_ops.split(
            value=state, num_or_size_splits=2, axis=1)

    # Get context from encoder outputs, conditioned on the previous h.
    context = self._simple_attention(
        self._encoder_vector, self._encoder_proj, prev_h)

    # Parameters of gates are concatenated into one multiply for efficiency.
    # The linear layer is created lazily on the first call.
    gate_inputs = [inputs, context, prev_h]
    if self._linear is None:
        self._linear = _Linear(gate_inputs, 4 * self._num_units, True)
    projected = self._linear(gate_inputs)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(
        value=projected, num_or_size_splits=4, axis=1)

    gate = math_ops.sigmoid
    kept = prev_c * gate(f + self._forget_bias)
    written = gate(i) * self._activation(j)
    new_c = kept + written
    new_h = self._activation(new_c) * gate(o)

    if self._state_is_tuple:
        return new_h, LSTMStateTuple(new_c, new_h)
    return new_h, array_ops.concat([new_c, new_h], 1)
def get_model_params(variable_prefix, split_lstm_matrices=True):
    """Evaluate the model's global variables into a name -> value dict.

    Variables named "Variable" and "Variable_1" (below `variable_prefix`
    when one is given) are skipped, and every "/" in a variable name is
    replaced by "-" in the returned keys.

    Args:
        variable_prefix: when truthy, only variables whose op name starts
            with this prefix are collected; otherwise all global variables
            are collected.
        split_lstm_matrices: when true, each entry whose name contains
            "LSTMCell" is replaced by four per-gate entries (names with
            "LSTMCell-i", "LSTMCell-j", "LSTMCell-f", "LSTMCell-o"),
            "AttnV" entries are reshaped to a single column and "AttnW"
            entries are squeezed.

    Returns:
        dict mapping the renamed variable names to their evaluated values.

    Note:
        Calls `exit(1)` after logging an error when an "LSTMCell" entry is
        neither a "Matrix" nor a "Bias".
    """
    if variable_prefix:
        exclude = [
            variable_prefix + "/Variable",
            variable_prefix + "/Variable_1",
        ]
        tmp = {
            v.op.name: v.eval()
            for v in tf.global_variables()
            if (v.op.name.startswith(variable_prefix) and
                v.op.name not in exclude)
        }
    else:
        exclude = ["Variable", "Variable_1"]
        tmp = {
            v.op.name: v.eval()
            for v in tf.global_variables()
            if v.op.name not in exclude
        }

    # Rename keys
    params = {name.replace("/", "-"): param for name, param in tmp.items()}

    if split_lstm_matrices:
        # Iterate over a snapshot of the keys: the loop body inserts and
        # deletes entries, and mutating a dict while iterating its live
        # key view raises RuntimeError on Python 3.
        for name in list(params):
            if "LSTMCell" in name:
                # i = input_gate, j = new_input, f = forget_gate, o = output_gate
                if "Matrix" in name:
                    i, j, f, o = array_ops.split(1, 4, params[name])
                elif "Bias" in name:
                    i, j, f, o = array_ops.split(0, 4, params[name])
                else:
                    logging.error("Unknown tensor type..")
                    exit(1)
                name_i = name.replace("LSTMCell", "LSTMCell-i")
                name_j = name.replace("LSTMCell", "LSTMCell-j")
                name_f = name.replace("LSTMCell", "LSTMCell-f")
                name_o = name.replace("LSTMCell", "LSTMCell-o")
                params[name_i] = i.eval()
                params[name_j] = j.eval()
                params[name_f] = f.eval()
                params[name_o] = o.eval()
                del params[name]
            elif "AttnV" in name:
                params[name] = array_ops.reshape(
                    params[name], [params[name].shape[0], 1]).eval()
            elif "AttnW" in name:
                # remove dims of size 1
                params[name] = tf.squeeze(params[name]).eval()
    return params
def _ragged_split(tensor, pieces):
    """Like split for 1D tensors but allows case where len % pieces != 0.

    Args:
      tensor: T `tf.Tensor` that must be 1D.
      pieces: a positive integer specifying the number of pieces into which
        tensor should be split.

    Returns:
      list of T `tf.Tensor` of length pieces, which hold the values of
      the input tensor, in order. The first `pieces - 1` tensors all have
      length `len // pieces`; the final tensor additionally holds the
      remainder when len is not a multiple of pieces.

    Raises:
      ValueError: input tensor must be 1D.
    """
    shape = tensor.shape
    if len(shape) != 1:
        raise ValueError("input tensor must be 1D")
    tensor_len = shape.dims[0].value
    chunk_size, remainder = divmod(tensor_len, pieces)
    with ops.colocate_with(tensor):
        if remainder == 0:
            return array_ops.split(tensor, pieces)
        # Uneven case: the last piece gets a different size than the rest.
        assert pieces > 1
        last_chunk_size = tensor_len - ((pieces - 1) * chunk_size)
        assert last_chunk_size > 0
        piece_lens = [chunk_size] * (pieces - 1) + [last_chunk_size]
        return array_ops.split(tensor, piece_lens)
def _split_batch(features, labels, number_of_shards, device):
    """Split input features and labels into batches.

    Args:
        features: a tensor, or a dict of name -> tensor / SparseTensor.
        labels: None, a tensor, or a dict of name -> tensor / SparseTensor.
        number_of_shards: number of pieces each input is split into.
        device: device under which the split ops are created.

    Returns:
        `(feature_shards, label_shards)`; each is a list with one entry per
        shard (a dict per shard for dict inputs), and `label_shards` is
        None when `labels` is None.
    """

    def split_dictionary(dictionary):
        """Split a dictionary into shards."""
        shards = [{} for _ in range(number_of_shards)]
        for name, tensor in six.iteritems(dictionary):
            if isinstance(tensor, sparse_tensor.SparseTensor):
                pieces = sparse_ops.sparse_split(
                    sp_input=tensor, num_split=number_of_shards, axis=0)
            else:
                pieces = array_ops.split(tensor, number_of_shards)
            for index, piece in enumerate(pieces):
                shards[index][name] = piece
        return shards

    with ops_lib.name_scope('split_inputs'), ops_lib.device(device):
        if isinstance(features, dict):
            feature_shards = split_dictionary(features)
        else:
            feature_shards = array_ops.split(features, number_of_shards)

        label_shards = None
        if labels is not None:
            if isinstance(labels, dict):
                label_shards = split_dictionary(labels)
            else:
                label_shards = array_ops.split(labels, number_of_shards)
    return feature_shards, label_shards
def testZerosCacheDoesntLeakAcrossModes(self):
    """Builds the same split/concat gradient twice.

    The gradient is first built and run inside an explicit graph, then
    built again after that graph context has been left.
    """

    def build_gradient():
        # Only half of x reaches y, so the gradient for the other half has
        # to be filled in by the tape.
        t = random_ops.random_normal(shape=[100, 2])
        x = random_ops.random_normal(shape=[100, 4])
        dy = random_ops.random_normal(shape=[100, 4])
        with backprop.GradientTape() as gradient_tape:
            gradient_tape.watch(x)
            x1, _ = array_ops.split(x, num_or_size_splits=2, axis=1)
            y1 = x1 ** 2.
            y = array_ops.concat([y1, t], axis=1)
        return gradient_tape.gradient(y, x, output_gradients=dy)

    with ops.Graph().as_default():
        dx = build_gradient()
        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            sess.run(dx)

    dx = build_gradient()
def testSplit(self):
    """Checks array_ops.split along positive and negative axes per dtype."""
    data = [[[1], [2]], [[3], [4]], [[5], [6]]]
    # (axes to test, number of splits, expected pieces)
    cases = (
        ((0, -3), 3, [
            [[[1], [2]]],
            [[[3], [4]]],
            [[[5], [6]]],
        ]),
        ((1, -2), 2, [
            [[[1]], [[3]], [[5]]],
            [[[2]], [[4]], [[6]]],
        ]),
    )

    def make_split(num_splits):
        # The axis is passed as the first operand, the value as the second.
        return lambda x, y: array_ops.split(
            value=y, num_or_size_splits=num_splits, axis=x)

    for dtype in self.numeric_types:
        for axes, num_splits, pieces in cases:
            for axis in axes:
                self._testBinary(
                    make_split(num_splits),
                    np.int32(axis),
                    np.array(data, dtype=dtype),
                    expected=[np.array(piece, dtype=dtype) for piece in pieces],
                    equality_test=self.ListsAreClose)
def cluster_feature_analysis(sess, user_ids):
    """Collect per-feature gate values for each user.

    The first two variables whose name starts with 'lstm' are read as the
    gate matrix and gate bias, each is split into four gate parts, and
    `feature_importance` is called once per user with those parts.

    Args:
        sess: session used to evaluate the variables and the split ops.
        user_ids: iterable of user ids; each id is printed as it is
            processed.

    Returns:
        Four dicts `(dict_i, dict_j, dict_f, dict_o)`. Each maps a feature
        index in `range(len(config.feature_desc))` to a list holding one
        value per user, in the order of `user_ids`.
    """
    # Get trained parameters
    lstm_vars = [v for v in tf.all_variables() if v.name.startswith('lstm')]
    matrix_var = sess.run(lstm_vars[0])
    bias_var = sess.run(lstm_vars[1])

    # Split the gates
    matrix_i, matrix_j, matrix_f, matrix_o = sess.run(
        array_ops.split(1, 4, matrix_var))
    bias_i, bias_j, bias_f, bias_o = sess.run(
        array_ops.split(0, 4, bias_var))

    num_features = len(config.feature_desc)
    # Every feature gets an entry even when user_ids is empty.
    dict_i = {feature: [] for feature in range(num_features)}
    dict_j = {feature: [] for feature in range(num_features)}
    dict_f = {feature: [] for feature in range(num_features)}
    dict_o = {feature: [] for feature in range(num_features)}

    for user_id in user_ids:
        # Function-call form prints identically on Python 2 and is valid
        # syntax on Python 3, unlike the print statement.
        print(user_id)
        gates_i, gates_j, gates_f, gates_o = feature_importance(
            sess, user_id,
            matrix_i, matrix_j, matrix_f, matrix_o,
            bias_i, bias_j, bias_f, bias_o)
        for feature in range(num_features):
            dict_i[feature].append(gates_i[feature])
            dict_j[feature].append(gates_j[feature])
            dict_f[feature].append(gates_f[feature])
            dict_o[feature].append(gates_o[feature])
    return dict_i, dict_j, dict_f, dict_o
def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM).

    Args:
        inputs: input tensor for the current step.
        state: a `(c, h)` pair when `self._state_is_tuple` is set,
            otherwise one tensor split in half along axis 1.
        scope: optional variable scope; defaults to the class name.

    Returns:
        `(new_h, new_state)`. Also stores the 'Linear' scope's 'Matrix'
        and 'Bias' variables on `self.W` and `self.b`.
    """
    with tf.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        if self._state_is_tuple:
            prev_c, prev_h = state
        else:
            prev_c, prev_h = array_ops.split(1, 2, state)

        # Parameters of gates are concatenated into one multiply for
        # efficiency.
        concat = _linear([inputs, prev_h], 4 * self._num_units, True, 0.,
                         self.weights_init, self.trainable, self.restore,
                         self.reuse)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = array_ops.split(1, 4, concat)

        gate = self._inner_activation
        kept = prev_c * gate(f + self._forget_bias)
        written = gate(i) * self._activation(j)
        new_c = kept + written
        new_h = self._activation(new_c) * gate(o)

        if self._state_is_tuple:
            new_state = _rnn_cell.LSTMStateTuple(new_c, new_h)
        else:
            new_state = array_ops.concat(1, [new_c, new_h])

        # Retrieve RNN Variables
        with tf.variable_scope('Linear', reuse=True):
            self.W = tf.get_variable('Matrix')
            self.b = tf.get_variable('Bias')

    return new_h, new_state
def call(self, inputs, states, training=None):
    """Run one step of the convolutional LSTM cell.

    Args:
        inputs: input tensor for the current step.
        states: sequence whose first entry is the previous memory state and
            whose second entry is the previous carry state.
        training: forwarded to the dropout-mask helpers.

    Returns:
        `(h, [h, c])`: the new memory state and the new state list.
    """
    prev_h = states[0]  # previous memory state
    prev_c = states[1]  # previous carry state

    # dropout matrices for input units
    dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=4)
    # dropout matrices for recurrent units
    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(
        prev_h, training, count=4)

    # One (possibly masked) copy of the input per gate, in i, f, c, o order.
    if 0 < self.dropout < 1.:
        gate_inputs = [inputs * dp_mask[k] for k in range(4)]
    else:
        gate_inputs = [inputs] * 4

    if 0 < self.recurrent_dropout < 1.:
        gate_states = [prev_h * rec_dp_mask[k] for k in range(4)]
    else:
        gate_states = [prev_h] * 4

    kernels = array_ops.split(self.kernel, 4, axis=3)
    recurrent_kernels = array_ops.split(self.recurrent_kernel, 4, axis=3)
    if self.use_bias:
        biases = array_ops.split(self.bias, 4)
    else:
        biases = [None, None, None, None]

    x_i, x_f, x_c, x_o = [
        self.input_conv(gate_in, kernel, bias, padding=self.padding)
        for gate_in, kernel, bias in zip(gate_inputs, kernels, biases)
    ]
    h_i, h_f, h_c, h_o = [
        self.recurrent_conv(gate_state, kernel)
        for gate_state, kernel in zip(gate_states, recurrent_kernels)
    ]

    i = self.recurrent_activation(x_i + h_i)
    f = self.recurrent_activation(x_f + h_f)
    c = f * prev_c + i * self.activation(x_c + h_c)
    o = self.recurrent_activation(x_o + h_o)
    h = o * self.activation(c)
    return h, [h, c]
def _tf_to_cudnn_biases(self, *tf_biases):
    r"""Reverse the operations in StitchBiases().

    Args:
        *tf_biases: exactly three tensors `(b_ir, b_wh, b_rh)`.

    Returns:
        `(b_wi, b_wr, b_wh, b_ri, b_rr, b_rh)`: `b_ir` is halved twice and
        each half is split in two along axis 0; `b_wh` and `b_rh` pass
        through unchanged.
    """
    # b_ir is the summed bias of reset and update gate.
    b_ir, b_wh, b_rh = tf_biases
    input_half = b_ir * 0.5
    recurrent_half = b_ir * 0.5
    b_wi, b_wr = array_ops.split(input_half, 2, axis=0)
    b_ri, b_rr = array_ops.split(recurrent_half, 2, axis=0)
    return b_wi, b_wr, b_wh, b_ri, b_rr, b_rh
def testVariableShapeFunction(self):
    """Checks static shape inference of split with an explicit size list."""
    # size_splits too big
    with self.assertRaises(ValueError):
        array_ops.split([0, 1], [3, -1], axis=0)

    # Correct inference of variable dimension
    s0, s1 = array_ops.split([0, 1, 2], [2, -1], axis=0)
    # Use the test-case assertions rather than bare `assert`, which is
    # stripped under `python -O` and reports no values on failure.
    self.assertEqual(s0.shape.as_list(), [2])
    self.assertEqual(s1.shape.as_list(), [1])
def testInvalidNumOutputs(self):
    """Checks that a split count below 1 is rejected with a clear message."""
    cases = (
        (-1, "Value for attr 'num_split' of -1 must be at least minimum 1"),
        (0, "Value for attr 'num_split' of 0 must be at least minimum 1"),
    )
    for num_splits, message in cases:
        with self.assertRaisesRegexp(Exception, message):
            array_ops.split(value=[1, 2, 3], num_or_size_splits=num_splits)
def _testSpecialCasesVariable(self):
    """Checks size-list splits: a single full-size piece and a -1 entry."""
    inp = np.random.rand(4, 4).astype("f")
    with test_util.device(use_gpu=True):
        # A single size covering the whole axis returns the input unchanged.
        whole = self.evaluate(array_ops.split(inp, [4], 0))
        self.assertAllEqual(whole[0], inp)

        # -1 takes whatever is left after the explicit size of 3.
        pieces = self.evaluate(array_ops.split(inp, [-1, 3], 0))
        self.assertAllEqual(pieces[0], inp[0:1, :])
        self.assertAllEqual(pieces[1], inp[1:4, :])
def _testSpecialCasesVariable(self, use_gpu):
    """Checks size-list splits: a single full-size piece and a -1 entry.

    Args:
        use_gpu: forwarded to `self.test_session`.
    """
    inp = np.random.rand(4, 4).astype("f")
    with self.test_session(use_gpu=use_gpu) as sess:
        # A single size covering the whole axis returns the input unchanged.
        whole = sess.run(array_ops.split(inp, [4], 0))
        self.assertAllEqual(whole[0], inp)

        # -1 takes whatever is left after the explicit size of 3.
        pieces = sess.run(array_ops.split(inp, [-1, 3], 0))
        self.assertAllEqual(pieces[0], inp[0:1, :])
        self.assertAllEqual(pieces[1], inp[1:4, :])
def _untransform_gru_canonical(self, transformed_weights, transformed_biases):
    """The reverse procedure of _fuse_gru_canonical().

    Args:
      transformed_weights: a list of tensors, 3 for each layer. The 1st for
        reset and update gates; the 2nd and 3rd for the new memory gate.
      transformed_biases: 5 tensors each layer. The first for
        reset_and_update gate; the next two in line for candidate gate. The
        last 2 are original tensors for reset_and_update gates, retained
        since cuDNN biases are not restorable from the fused version.

    Returns:
      Two lists of tensors for weights and biases respectively.
      There are 6 tensors per weight and per bias for each layer:
      tensor 0-2 are applied to the input from the previous layer and
      tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset
      gate; tensor 1 and 4 the update gate; tensor 2 and 5 the new memory
      gate.
    """
    weights, biases = [], []
    assert 5 * len(transformed_weights) == len(transformed_biases) * 3
    num_layers = len(transformed_weights) // 3
    for layer in range(num_layers):
        w_base = 3 * layer
        b_base = 5 * layer
        num_units = self._cudnn_rnn.num_units
        # Only the first layer sees the external input size.
        input_size = self._cudnn_rnn.input_size if layer == 0 else num_units

        fused_gates = transformed_weights[w_base]
        # reset and update gate weights applied on layer inputs.
        w_i = array_ops.slice(fused_gates, [0, 0],
                              [input_size, 2 * num_units])
        # reset and update gate weights applied on recurrent inputs.
        w_r = array_ops.slice(fused_gates, [input_size, 0],
                              [num_units, 2 * num_units])
        wi_parts = array_ops.split(w_i, 2, axis=1)
        wr_parts = array_ops.split(w_r, 2, axis=1)
        wi_list = [_flatten_transpose(w) for w in wi_parts]
        wr_list = [_flatten_transpose(w) for w in wr_parts]

        # candidate gate weights
        ih, hh = [
            _flatten_transpose(w)
            for w in transformed_weights[w_base + 1:w_base + 3]
        ]
        weights.extend(wi_list)
        weights.append(ih)
        weights.extend(wr_list)
        weights.append(hh)

        # Recover biases for reset and update gates.
        bi_list = array_ops.split(transformed_biases[b_base + 3], 2, axis=0)
        br_list = array_ops.split(transformed_biases[b_base + 4], 2, axis=0)
        biases.extend(bi_list)
        biases.append(transformed_biases[b_base + 1])
        biases.extend(br_list)
        biases.append(transformed_biases[b_base + 2])
    return weights, biases
def testExplicitNum(self):
    """Checks split with a tensor size list and an explicit `num`."""
    size_splits = array_ops.constant([2, 2, 6], dtype=dtypes.int32)
    value = list(range(1, 11))

    # Eager and Graph modes raise different exceptions
    expected_errors = (errors_impl.InvalidArgumentError, ValueError)
    with self.assertRaises(expected_errors):
        array_ops.split(value, size_splits, num=4)

    pieces = self.evaluate(array_ops.split(value, size_splits, num=3))
    expected = (value[0:2], value[2:4], value[4:])
    for index, want in enumerate(expected):
        self.assertAllEqual(pieces[index], want)
def __call__(self, inputs, state, scope=None):
    """Recurrent Highway Network cell (RHN).

    Args:
        inputs: input tensor for the current step.
        state: previous state; used as-is when `self._state_is_tuple` is
            set, otherwise passed through a one-way split along axis 1.
        scope: optional variable scope; defaults to the class name.

    Returns:
        The new output `new_y` after `self._recurrence_depth` highway
        layers.
    """
    with vs.variable_scope(scope or type(self).__name__):  # "BasicRHNCell"
        if self._state_is_tuple:
            y = state
        else:
            # split() returns a list even when asked for a single piece;
            # take that piece so `y` is a tensor, not a list nested inside
            # the `_linear` argument list below.
            y = array_ops.split(1, 1, state)[0]

        assert self._recurrence_depth > 0 and type(self._recurrence_depth) is int

        for i in range(self._recurrence_depth):
            if i == 0:
                # First layer mixes the external input with the state.
                concat = _linear([inputs, y], 2 * self._num_units, True)
                # h = nonlinear transform, t = transfer gate
                h, t = array_ops.split(1, 2, concat)
                t = sigmoid(t + self._transfer_bias)
                s = self._activation(h) * t + \
                    (1.0 - t) * _linear([inputs], 1 * self._num_units, False)
            else:
                # Deeper layers transform the previous layer's output.
                concat = _linear([s], 2 * self._num_units, True)
                # h = nonlinear transform, t = transfer gate
                h, t = array_ops.split(1, 2, concat)
                t = sigmoid(t + self._transfer_bias)
                s = self._activation(h) * t + \
                    (1.0 - t) * s

        new_y = s
        # NOTE(review): new_state is built but never returned; the return
        # value is left as a single tensor so existing callers keep
        # working. Confirm whether `(new_y, new_state)` was intended.
        if self._state_is_tuple:
            new_state = RHNStateTuple(new_y)
        else:
            new_state = array_ops.concat(1, new_y)
    return new_y
def testNonexistentDimTensor(self):
    """Checks split errors when the axis tensor is unknown or empty."""
    axis = array_ops.placeholder(dtypes.int32)
    values = np.zeros([5, 30])

    # With a size placeholder of unknown shape the split is rejected when
    # the op is built.
    unknown_sizes = array_ops.placeholder(dtypes.int32)
    with self.assertRaisesRegexp(ValueError, "Cannot infer"):
        y = array_ops.split(values, unknown_sizes, axis=axis)

    # With a known number of sizes the op builds; feeding an empty axis
    # then fails at run time.
    sizes = array_ops.placeholder(dtypes.int32, [3])
    y = array_ops.split(values, sizes, axis=axis)
    with self.test_session(use_gpu=True) as sess:
        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                     "must have exactly one element"):
            feed = {axis: np.array([], dtype=np.int32), sizes: [4, 11, 15]}
            sess.run(y, feed)
def _padded_split(tensor, pieces):
    """Like split for 1D tensors but pads-out case where len % pieces != 0.

    Args:
      tensor: T `tf.Tensor` that must be 1D.
      pieces: a positive integer specifying the number of pieces into which
        tensor should be split.

    Returns:
      A pair `(parts, pad_len)`. `parts` is a list of T `tf.Tensor` of
      length pieces, which hold the values of the input tensor, in order;
      zeros are appended at the end where needed so that all parts have
      equal size. `pad_len` is the number of zeros added (0 when the length
      is already a multiple of pieces).

    Raises:
      ValueError: The input tensor is not 1D.
    """
    shape = tensor.shape
    if len(shape) != 1:
        raise ValueError("input tensor must be 1D")
    tensor_len = shape.dims[0].value

    def zero_pad(part, pad_len):
        # Append pad_len zeros of the input's dtype to a 1D tensor.
        return array_ops.concat(
            [part, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)

    with ops.colocate_with(tensor):
        if tensor_len % pieces == 0:
            return array_ops.split(tensor, pieces), 0

        # pad to an even length
        chunk_size = 1 + tensor_len // pieces
        if pieces > tensor_len:
            # This is an edge case that should not come up in practice,
            # i.e. a different reduction algorithm would be better,
            # but we'll make it work just for completeness.
            pad_len = pieces - tensor_len
            return array_ops.split(zero_pad(tensor, pad_len), pieces), pad_len

        if (pieces - 1) * chunk_size >= tensor_len:
            # Another edge case of limited real interest.
            pad_len = (pieces * chunk_size) % tensor_len
            return array_ops.split(zero_pad(tensor, pad_len), pieces), pad_len

        # Common case: split unevenly, then pad only the short last piece.
        last_chunk_size = tensor_len - (pieces - 1) * chunk_size
        pad_len = chunk_size - last_chunk_size
        piece_lens = [chunk_size] * (pieces - 1) + [last_chunk_size]
        parts = array_ops.split(tensor, piece_lens)
        parts[-1] = zero_pad(parts[-1], pad_len)
        return parts, pad_len
def _tensor_to_sparse_feature_column(dense_tensor):
    """Returns SparseFeatureColumn for the input dense_tensor.

    Entries equal to 0 are ignored; the indices of the remaining entries
    are split into their two columns and passed, flattened, together with
    the float-cast values to `sdca_ops.SparseFeatureColumn`.
    """
    ignore_value = 0.0
    sparse_indices = array_ops.where(
        math_ops.not_equal(
            dense_tensor, math_ops.cast(ignore_value, dense_tensor.dtype)))
    sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
    # Split the index matrix once and reuse both columns instead of
    # building the same split op twice.
    first_index, second_index = array_ops.split(1, 2, sparse_indices)
    # TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Makes this efficient, as now SDCA supports
    # very sparse features with weights and not weights.
    return sdca_ops.SparseFeatureColumn(
        array_ops.reshape(first_index, [-1]),
        array_ops.reshape(second_index, [-1]),
        array_ops.reshape(math_ops.to_float(sparse_values), [-1]),
    )
def testShapeFunctionEdgeCases(self):
    """Checks split's shape function on invalid and unknown arguments."""
    matrix = [[0, 1], [2, 3]]

    # split_dim greater than rank of input, then
    # split dim less than -(rank of input).
    for bad_axis in (2, -3):
        with self.assertRaises(ValueError):
            array_ops.split(value=matrix, num_or_size_splits=4, axis=bad_axis)

    # num_split does not evenly divide the size in split_dim.
    with self.assertRaisesRegexp(ValueError, "should evenly divide"):
        array_ops.split(value=[0, 1, 2, 3], num_or_size_splits=3, axis=0)

    # Unknown split_dim.
    unknown_axis = array_ops.placeholder(dtypes.int32)
    splits = array_ops.split(
        value=[[0, 1, 2, 3]], num_or_size_splits=4, axis=unknown_axis)
    for piece in splits:
        self.assertEqual([None, None], piece.get_shape().as_list())

    # Unknown split_dim and input shape.
    unknown_value = array_ops.placeholder(dtypes.float32)
    unknown_axis = array_ops.placeholder(dtypes.int32)
    splits = array_ops.split(
        value=unknown_value, num_or_size_splits=4, axis=unknown_axis)
    for piece in splits:
        self.assertEqual(None, piece.get_shape().ndims)
def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM).

    Args:
        inputs: input tensor for the current step.
        state: tensor that is split in half along axis 1 into c and h.
        scope: optional variable scope; defaults to the class name.

    Returns:
        `(new_h, new_state)` where `new_state` is the axis-1 concatenation
        of new_c and new_h.
    """
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        prev_c, prev_h = array_ops.split(1, 2, state)
        # Parameters of gates are concatenated into one multiply for
        # efficiency.
        concat = linear([inputs, prev_h], 4 * self._num_units, True)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = array_ops.split(1, 4, concat)

        kept = prev_c * sigmoid(f + self._forget_bias)
        written = sigmoid(i) * tanh(j)
        new_c = kept + written
        new_h = tanh(new_c) * sigmoid(o)

    new_state = array_ops.concat(1, [new_c, new_h])
    return new_h, new_state
def __call__(self, inputs, state, scope):
    """Run one LSTM step and refresh the stored weight and bias.

    Args:
        inputs: input tensor for the current step.
        state: tensor that is split in half along axis 1 into c and h.
        scope: forwarded to `_linear`.

    Returns:
        `(new_h, new_state)` where `new_state` is the axis-1 concatenation
        of new_c and new_h. `self.W` and `self.b` are replaced by the
        values `_linear` returns.
    """
    prev_c, prev_h = array_ops.split(1, 2, state)

    # Parameters of gates are concatenated into one multiply for efficiency.
    weights, bias, concat = _linear(
        [inputs, prev_h], 4 * self._num_units, self.bias, self.W, self.b,
        self.W_init, trainable=self.trainable, scope=scope)
    self.W = weights
    self.b = bias

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(1, 4, concat)

    # In this cell `activation` is applied to the gates and
    # `inner_activation` to the candidate and the cell state.
    kept = prev_c * self.activation(f + self._forget_bias)
    written = self.activation(i) * self.inner_activation(j)
    new_c = kept + written
    new_h = self.inner_activation(new_c) * self.activation(o)
    return new_h, array_ops.concat(1, [new_c, new_h])
def _untransform_lstm_canonical(self, transformed_weights, transformed_biases):
    """The reverse procedure of _transform_lstm_canonical().

    Args:
      transformed_weights: a list of tensors, one for each layer.
      transformed_biases: a list of tensors , 3 for each layer: the 2nd for
        layer input, the 3rd for recurrent input, the 1st is the sum of the
        latter two.

    Returns:
      Two lists of tensors for weights and biases respectively.
      There are 8 tensors per weight and per bias for each layer:
      tensor 0-3 are applied to the input from the previous layer;
      tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input
      gate; tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory
      gate; tensor 3 and 7 the output gate.
    """

    def flatten_transposed(weight):
        # Transpose, then collapse to 1D.
        return array_ops.reshape(array_ops.transpose(weight), [-1])

    weights, biases = [], []
    assert 3 * len(transformed_weights) == len(transformed_biases)
    for layer, fused in enumerate(transformed_weights):
        num_units = self._cudnn_rnn.num_units
        # Only the first layer sees the external input size.
        input_size = self._cudnn_rnn.input_size if layer == 0 else num_units

        # weights applied on layer inputs.
        wi = array_ops.slice(fused, [0, 0], [input_size, 4 * num_units])
        # weights applied on recurrent inputs.
        wr = array_ops.slice(fused, [input_size, 0],
                             [num_units, 4 * num_units])
        wi_list = array_ops.split(wi, 4, axis=1)
        wr_list = array_ops.split(wr, 4, axis=1)

        for gate in range(len(wi_list)):
            wi_list[gate] = flatten_transposed(wi_list[gate])
            wr_list[gate] = flatten_transposed(wr_list[gate])

        # canonical weights are in icfo order, convert to ifco order for
        # cuDNN.
        self._switch_inner(wi_list, 0)
        self._switch_inner(wr_list, 0)
        weights.extend(wi_list)
        weights.extend(wr_list)

        b_base = 3 * layer
        bi_list = array_ops.split(transformed_biases[b_base + 1], 4, axis=0)
        br_list = array_ops.split(transformed_biases[b_base + 2], 4, axis=0)
        # The biases get the same icfo -> ifco reordering as the weights.
        self._switch_inner(bi_list, 0)
        self._switch_inner(br_list, 0)
        biases.extend(bi_list)
        biases.extend(br_list)
    return weights, biases
def tfsplits(_):
    """A more complex graph, including splits.

    Builds three rounds of split / add / matmul / concat on two 2x2
    placeholders named 'x' and 'y' and exposes the final value through an
    identity op named 'result'. The argument is ignored.
    """
    x = array_ops.placeholder(dtypes.float32, shape=[2, 2], name='x')
    y = array_ops.placeholder(dtypes.float32, shape=[2, 2], name='y')
    for _ in range(3):
        x_first, x_second = array_ops.split(x, 2, 0)
        y_first, y_second = array_ops.split(y, 2, 0)
        x_first = x_first + 1
        y_first = y_first + 1
        product = math_ops.matmul(x, y, name='x_y_prod')
        left = array_ops.concat([x_first, y_second], axis=0,
                                name='concat_x0_y1')
        right = array_ops.concat([y_first, x_second], axis=0,
                                 name='concat_y0_x1')
        x = math_ops.matmul(left, right, name='a_b')
        y = math_ops.add(x, product)
    array_ops.identity(y, name='result')
def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM).

    The input and hidden projections are each passed through `ln` with
    their own scale (`s1`, `s2`) and shift (`b1`, `b2`) variables before
    being summed into the gate pre-activations.

    Args:
        inputs: input tensor for the current step.
        state: a `(c, h)` pair when `self._state_is_tuple` is set,
            otherwise one tensor split in half along axis 1.
        scope: optional variable scope; defaults to the class name.

    Returns:
        `(new_h, new_state)`.
    """
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        if self._state_is_tuple:
            prev_c, prev_h = state
        else:
            prev_c, prev_h = array_ops.split(1, 2, state)

        gate_width = 4 * self._num_units
        s1 = vs.get_variable("s1", initializer=tf.ones([gate_width]),
                             dtype=tf.float32)
        s2 = vs.get_variable("s2", initializer=tf.ones([gate_width]),
                             dtype=tf.float32)
        # s3 / b3 are created but currently unused, see the note on new_c.
        s3 = vs.get_variable("s3", initializer=tf.ones([self._num_units]),
                             dtype=tf.float32)
        b1 = vs.get_variable("b1", initializer=tf.zeros([gate_width]),
                             dtype=tf.float32)
        b2 = vs.get_variable("b2", initializer=tf.zeros([gate_width]),
                             dtype=tf.float32)
        b3 = vs.get_variable("b3", initializer=tf.zeros([self._num_units]),
                             dtype=tf.float32)

        input_part = rnn_cell._linear([inputs], gate_width, False,
                                      scope="out_1")
        input_part = ln(input_part, s1, b1)
        hidden_part = rnn_cell._linear([prev_h], gate_width, False,
                                       scope="out_2")
        hidden_part = ln(hidden_part, s2, b2)
        lstm_matrix = tf.add(input_part, hidden_part)

        i, j, f, o = array_ops.split(1, 4, lstm_matrix)
        kept = prev_c * sigmoid(f)
        written = sigmoid(i) * self._activation(j)
        new_c = kept + written

        # Currently normalizing c causes lot of nan's in the model, thus
        # the cell state is used without normalization for now.
        new_h = self._activation(new_c) * sigmoid(o)

        if self._state_is_tuple:
            new_state = LSTMStateTuple(new_c, new_h)
        else:
            new_state = array_ops.concat(1, [new_c, new_h])
    return new_h, new_state
def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) with nunits cells.

    Args:
        inputs: input tensor for the current step.
        state: previous hidden state.
        scope: optional variable scope; defaults to the class name.

    Returns:
        `(new_h, new_h)`. Also stores the 'Matrix' and 'Bias' variables of
        the gate and candidate layers, in that order, on `self.W` and
        `self.b`.
    """
    with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            gates = _linear([inputs, state], 2 * self._num_units, True, 1.0,
                            self.weights_init, self.trainable, self.restore,
                            self.reuse)
            r, u = array_ops.split(1, 2, gates)
            r = self._inner_activation(r)
            u = self._inner_activation(u)
        with tf.variable_scope("Candidate"):
            candidate = _linear([inputs, r * state], self._num_units, True,
                                0., self.weights_init, self.trainable,
                                self.restore, self.reuse)
            c = self._activation(candidate)
        new_h = u * state + (1 - u) * c

        # Retrieve RNN Variables
        self.W, self.b = [], []
        for layer_scope in ('Gates/Linear', 'Candidate/Linear'):
            with tf.variable_scope(layer_scope, reuse=True):
                self.W.append(tf.get_variable('Matrix'))
                self.b.append(tf.get_variable('Bias'))
    return new_h, new_h
def split(self, value, lengths, name=None):
    """See TensorArray."""
    # error checking to match graph-mode errors
    value = constant_op.constant(value)
    lengths = constant_op.constant(lengths)
    sum_lengths = math_ops.reduce_sum(lengths)

    if lengths.shape.ndims != 1:
        raise errors_impl.InvalidArgumentError(
            None, None,
            "Expected lengths to be a vector, received shape: %s" %
            lengths.shape.as_list())

    if value.shape.ndims == 0:
        raise errors_impl.InvalidArgumentError(
            None, None,
            "Expected value to be at least a vector, "
            "but received shape: %s" % value.shape.as_list())

    if sum_lengths.numpy() != value.shape.as_list()[0]:
        raise errors_impl.InvalidArgumentError(
            None, None,
            "Expected sum of lengths to be equal to "
            "values.shape[0], but sum of lengths is %d and "
            "value's shape is: %s " % (sum_lengths.numpy(),
                                       value.shape.as_list()))

    # A fixed-size array must receive exactly one length per element.
    if not self._dynamic_size and lengths.shape[0] != len(self._tensor_array):
        raise errors_impl.InvalidArgumentError(
            None, None,
            "TensorArray's size is not equal to the size of "
            "lengths (%d vs. %d), and the TensorArray is not marked as "
            "dynamically resizeable" % (len(self._tensor_array),
                                        lengths.shape[0]))

    ta = self._identity_without_array()
    pieces = array_ops.split(value, lengths, name=name)
    ta._implementation._tensor_array = pieces  # pylint: disable=protected-access
    return ta
def call(self, inputs, state):
    """Gated recurrent unit (GRU) with nunits cells.

    The candidate is built from separate input and hidden projections; the
    reset gate is applied to the hidden projection's output.

    Args:
        inputs: input tensor for the current step.
        state: previous hidden state.

    Returns:
        `(new_h, new_h)`.
    """
    with vs.variable_scope("gates"):  # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update, unless a
        # bias initializer was supplied.
        gate_bias_init = self._bias_initializer
        if gate_bias_init is None:
            gate_bias_init = init_ops.constant_initializer(
                1.0, dtype=inputs.dtype)
        # pylint: disable=protected-access
        gates = rnn_cell_impl._linear([inputs, state], 2 * self._num_units,
                                      True, gate_bias_init,
                                      self._kernel_initializer)
        # pylint: enable=protected-access
        value = math_ops.sigmoid(gates)
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)

    with vs.variable_scope("candidate"):
        # pylint: disable=protected-access
        with vs.variable_scope("input_projection"):
            hi = rnn_cell_impl._linear(inputs, self._num_units, True,
                                       self._bias_initializer,
                                       self._kernel_initializer)
        with vs.variable_scope("hidden_projection"):
            hidden = rnn_cell_impl._linear(state, self._num_units, True,
                                           self._bias_initializer,
                                           self._kernel_initializer)
            hh = r * hidden
        # pylint: enable=protected-access
        c = self._activation(hi + hh)

    new_h = u * state + (1 - u) * c
    return new_h, new_h
def testReuse(self):
    """Checks that rebuilding a rev_block with reuse adds no variables."""

    def f(x):
        return core_layers.dense(x, self.CHANNELS // 2)

    def g(x):
        return core_layers.dense(x, self.CHANNELS // 2)

    x = random_ops.random_uniform(
        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
    x1, x2 = array_ops.split(x, 2, axis=-1)

    def rebuild_with_reuse():
        # Builds the block again in the same scope with reuse enabled.
        with variable_scope.variable_scope("test", reuse=True):
            return rev_block_lib.rev_block(
                x1, x2, f, g, num_layers=self.NUM_LAYERS)

    with variable_scope.variable_scope("test"):
        y1, y2 = rev_block_lib.rev_block(
            x1, x2, f, g, num_layers=self.NUM_LAYERS)
    num_vars_before = len(variables.global_variables())

    y1, y2 = rebuild_with_reuse()
    num_vars_after = len(variables.global_variables())
    self.assertEqual(num_vars_before, num_vars_after)

    loss = math_ops.reduce_mean(y1 + y2)
    _ = gradients_impl.gradients(loss, [x] + variables.trainable_variables())

    # Rebuilding after the gradients were created must not add variables.
    y1, y2 = rebuild_with_reuse()
    num_vars_after = len(variables.global_variables())
    self.assertEqual(num_vars_before, num_vars_after)
def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: state Tensor, 2D, batch x state_size.
      scope: VariableScope for the created subgraph; defaults to "LSTMCell".

    Returns:
      A tuple containing:
      - A 2D, batch x output_dim, Tensor representing the output of the
        LSTM after reading "inputs" when previous state was "state".
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - A 2D, batch x state_size, Tensor representing the new state of LSTM
        after reading "inputs" when previous state was "state".

    Raises:
      ValueError: if an input_size was specified and the provided inputs
        have a different dimension.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    # The state holds the cell state first, followed by the output.
    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    actual_input_size = inputs.get_shape().as_list()[1]
    if self._input_size and self._input_size != actual_input_size:
        # Both values must be passed as one tuple: `%` binds tighter than
        # the comma, so the unparenthesised form formats with a single
        # argument and raises TypeError instead of this ValueError.
        raise ValueError(
            "Actual input size not same as specified: %d vs %d." %
            (actual_input_size, self._input_size))

    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "LSTMCell"
        concat_w = _get_concat_variable(
            "W", [actual_input_size + num_proj, 4 * self._num_units],
            dtype, self._num_unit_shards)

        b = vs.get_variable(
            "B", shape=[4 * self._num_units],
            initializer=array_ops.zeros_initializer, dtype=dtype)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        cell_inputs = array_ops.concat(1, [inputs, m_prev])
        lstm_matrix = nn_ops.bias_add(
            math_ops.matmul(cell_inputs, concat_w), b)
        i, j, f, o = array_ops.split(1, 4, lstm_matrix)

        # Diagonal connections
        if self._use_peepholes:
            w_f_diag = vs.get_variable(
                "W_F_diag", shape=[self._num_units], dtype=dtype)
            w_i_diag = vs.get_variable(
                "W_I_diag", shape=[self._num_units], dtype=dtype)
            w_o_diag = vs.get_variable(
                "W_O_diag", shape=[self._num_units], dtype=dtype)

        # The forget gate pre-activation is offset by a constant 1.
        if self._use_peepholes:
            c = (sigmoid(f + 1 + w_f_diag * c_prev) * c_prev +
                 sigmoid(i + w_i_diag * c_prev) * tanh(j))
        else:
            c = (sigmoid(f + 1) * c_prev + sigmoid(i) * tanh(j))

        if self._cell_clip is not None:
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

        if self._use_peepholes:
            m = sigmoid(o + w_o_diag * c) * tanh(c)
        else:
            m = sigmoid(o) * tanh(c)

        if self._num_proj is not None:
            concat_w_proj = _get_concat_variable(
                "W_P", [self._num_units, self._num_proj],
                dtype, self._num_proj_shards)
            m = math_ops.matmul(m, concat_w_proj)

    return m, array_ops.concat(1, [c, m])
def call(self, inputs, states, training=None):
    """Run one step of the stacked convolutional LSTM cell.

    The four gate streams (i, f, c, o) are carried through
    `self.number_of_layer` layers. In every layer each input stream goes
    through `self.input_conv` and each recurrent stream through
    `self.recurrent_conv`. On all but the last layer the streams are passed
    through per-layer activations and fed to the next layer; on the last
    layer they are combined into the new carry and hidden states.

    Args:
      inputs: input tensor for this step. The original author notes the
        shape as [batch, window_size, number_of_sensor, 1] -- TODO confirm.
      states: sequence whose first element is the previous hidden state and
        whose second element is the previous carry state.
      training: forwarded to the dropout-mask helpers.

    Returns:
      A pair `(h, [h, c])` with the new hidden state and the new state list.
    """
    h_state = states[0]  # previous memory state
    c_state = states[1]  # previous carry state
    # c_shape = c_tm1.get_shape().as_list()
    # [BATCH, conv_rest, 1, LAST_FILTER]
    #
    # dropout matrices for input units
    dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=4)
    # dropout matrices for recurrent units
    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(h_state,
                                                           training,
                                                           count=4)

    # One independently-masked copy of the input per gate.
    if 0 < self.dropout < 1.:
        x_i = inputs * dp_mask[0]
        x_f = inputs * dp_mask[1]
        x_c = inputs * dp_mask[2]
        x_o = inputs * dp_mask[3]
    else:
        x_i = inputs
        x_f = inputs
        x_c = inputs
        x_o = inputs

    # One independently-masked copy of the hidden state per gate.
    if 0 < self.recurrent_dropout < 1.:
        h_i = h_state * rec_dp_mask[0]
        h_f = h_state * rec_dp_mask[1]
        h_c = h_state * rec_dp_mask[2]
        h_o = h_state * rec_dp_mask[3]
    else:
        h_i = h_state
        h_f = h_state
        h_c = h_state
        h_o = h_state

    for index in range(self.number_of_layer):
        # weights for inputs in FOUR GATES
        (kernel_i, kernel_f,
         kernel_c, kernel_o) = array_ops.split(self.kernel[index], 4, axis=3)
        # weights for hidden states in FOUR GATES
        if index == self.number_of_layer - 1:
            # The last layer's recurrent kernel carries a fifth slice that
            # is applied to the previous carry state below.
            (recurrent_kernel_i,
             recurrent_kernel_f,
             recurrent_kernel_c,
             recurrent_kernel_o,
             recurrent_kernel_c_1) = array_ops.split(
                 self.recurrent_kernel[index], 5, axis=3)
        else:
            (recurrent_kernel_i,
             recurrent_kernel_f,
             recurrent_kernel_c,
             recurrent_kernel_o) = array_ops.split(
                 self.recurrent_kernel[index], 4, axis=3)

        # weights for BIAS in FOUR GATES
        if self.use_bias:
            bias_i, bias_f, bias_c, bias_o = array_ops.split(
                self.bias[index], 4)
        else:
            bias_i, bias_f, bias_c, bias_o = None, None, None, None

        # Each stream overwrites itself, so layer `index` consumes the
        # output of layer `index - 1`.
        x_i = self.input_conv(x_i, kernel_i, bias_i, padding=self.padding)
        x_f = self.input_conv(x_f, kernel_f, bias_f, padding=self.padding)
        x_c = self.input_conv(x_c, kernel_c, bias_c, padding=self.padding)
        x_o = self.input_conv(x_o, kernel_o, bias_o, padding=self.padding)

        h_i = self.recurrent_conv(h_i, recurrent_kernel_i)
        h_f = self.recurrent_conv(h_f, recurrent_kernel_f)
        h_c = self.recurrent_conv(h_c, recurrent_kernel_c)
        h_o = self.recurrent_conv(h_o, recurrent_kernel_o)

        if index == self.number_of_layer - 1:
            # Last layer: combine the streams into the LSTM gates.
            c_c = self.recurrent_conv(c_state, recurrent_kernel_c_1)

            i = self.recurrent_activation[index](x_i + h_i)
            f = self.recurrent_activation[index](x_f + h_f)
            o = self.recurrent_activation[index](x_o + h_o)

            c = f * c_c + i * self.activation(x_c + h_c)
            h = o * self.activation(c)
        else:
            # Intermediate layer: apply the per-layer activations and keep
            # the streams separate for the next layer.
            x_i = self.conv_activation[index](x_i)
            x_f = self.conv_activation[index](x_f)
            x_c = self.conv_activation[index](x_c)
            x_o = self.conv_activation[index](x_o)

            h_i = self.recurrent_activation[index](h_i)
            h_f = self.recurrent_activation[index](h_f)
            h_c = self.recurrent_activation[index](h_c)
            h_o = self.recurrent_activation[index](h_o)

        # NOTE(review): `data_format` is mutated on the instance mid-loop and
        # reset after it; presumably the conv helpers read it -- confirm the
        # intent and whether this is safe across repeated calls.
        if index == 1:
            self.data_format = "channels_last"
    self.data_format = None
    # NOTE(review): `h` and `c` are only bound on the last loop iteration, so
    # this assumes number_of_layer >= 1.
    return h, [h, c]
def frechet_classifier_distance(real_images,
                                generated_images,
                                classifier_fn,
                                num_batches=1):
    """Frechet distance between classifier activations of two image sets.

    This generalizes the Frechet Inception distance
    (https://arxiv.org/abs/1706.08500) to an arbitrary classifier. The
    activations of the real and the generated images are each summarized by a
    mean vector and a covariance matrix (m, C) and (m_w, C_w), and the
    function evaluates

        |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

    Unlike the Inception score, this is a true distance and uses information
    about real images.

    The estimate built from sample means and sample covariances is biased,
    and more so for small samples: even for identical distributions the
    expected value is large when few samples are used. Always compare
    generative models using the same sample size.

    Args:
      real_images: Real images to use to compute Frechet Inception distance.
      generated_images: Generated images to use to compute Frechet Inception
        distance.
      classifier_fn: A function that takes images and produces activations
        based on a classifier.
      num_batches: Number of batches to split images in to in order to
        efficiently run them through the classifier network.

    Returns:
      The Frechet Inception distance. A floating-point scalar.
    """
    real_batches = array_ops.split(
        real_images, num_or_size_splits=num_batches)
    generated_batches = array_ops.split(
        generated_images, num_or_size_splits=num_batches)
    stacked_batches = array_ops.stack(real_batches + generated_batches)

    # Run the classifier one batch at a time to keep memory usage bounded.
    activations = functional_ops.map_fn(
        fn=classifier_fn,
        elems=stacked_batches,
        parallel_iterations=1,
        back_prop=False,
        swap_memory=True,
        name='RunClassifier')

    # The first `num_batches` entries are real, the remaining are generated.
    real_acts, gen_acts = array_ops.split(
        activations, [num_batches, num_batches], 0)

    # Merge the batch axis back so each set is [examples, features].
    real_acts = array_ops.concat(array_ops.unstack(real_acts), 0)
    gen_acts = array_ops.concat(array_ops.unstack(gen_acts), 0)
    real_acts.shape.assert_has_rank(2)
    gen_acts.shape.assert_has_rank(2)

    # First and second moments of both activation sets.
    real_mean = math_ops.reduce_mean(real_acts, 0)
    gen_mean = math_ops.reduce_mean(gen_acts, 0)
    num_examples = math_ops.to_float(array_ops.shape(real_acts)[0])
    unbiased_denominator = num_examples - 1

    # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
    real_cov = math_ops.matmul(
        real_acts - real_mean, real_acts - real_mean,
        transpose_a=True) / unbiased_denominator
    gen_cov = math_ops.matmul(
        gen_acts - gen_mean, gen_acts - gen_mean,
        transpose_a=True) / unbiased_denominator

    # Tr(sqrt(sigma sigma_v)) term.
    sqrt_trace_component = trace_sqrt_product(real_cov, gen_cov)

    # Covariance term; trace(A + B) = trace(A) + trace(B).
    covariance_term = (
        math_ops.trace(real_cov + gen_cov) - 2.0 * sqrt_trace_component)

    # Squared L2 distance between the two means.
    mean_term = math_ops.square(linalg_ops.norm(real_mean - gen_mean))
    return covariance_term + mean_term
def _bag_features(self, tree_num, input_data):
    """Keep only the feature columns bagged for tree `tree_num`.

    `input_data` is cut along axis 1 into `self.params.num_features` pieces,
    and the pieces listed in `self.params.bagged_features[tree_num]` are
    concatenated back together, in that order, along axis 1.
    """
    columns = array_ops.split(
        value=input_data,
        num_or_size_splits=self.params.num_features,
        axis=1)
    selected = []
    for feature_index in self.params.bagged_features[tree_num]:
        selected.append(columns[feature_index])
    return array_ops.concat(selected, 1)
def call(self, inputs, state):
    """Run one step of the LSTM cell with master forget/input gates.

    Args:
      inputs: `2-D` tensor with shape `[batch_size, input_size]`.
      state: An `LSTMStateTuple` of state tensors, each shaped
        `[batch_size, self.state_size]`, if `state_is_tuple` has been set to
        `True`. Otherwise, a `Tensor` shaped
        `[batch_size, 2 * self.state_size]`.

    Returns:
      A pair containing the new hidden state, and the new state (either a
        `LSTMStateTuple` or a concatenated state, depending on
        `state_is_tuple`).
    """
    sigmoid = math_ops.sigmoid
    # `add`/`multiply` are used instead of `+`/`*` for performance, at the
    # cost of readability.
    add = math_ops.add
    multiply = math_ops.multiply
    levels = self._levels
    axis_one = constant_op.constant(1, dtype=dtypes.int32)

    if self._state_is_tuple:
        prev_c, prev_h = state
    else:
        prev_c, prev_h = array_ops.split(
            value=state, num_or_size_splits=2, axis=axis_one)

    # All gate parameters live in one kernel so a single matmul covers them.
    projected = math_ops.matmul(
        array_ops.concat([inputs, prev_h], 1), self._kernel)
    projected = nn_ops.bias_add(projected, self._bias)

    # The first `levels` columns drive the master forget gate, the next
    # `levels` columns the master input gate.
    master_f = array_ops.expand_dims(
        _cumsoftmax(projected[:, :levels], 'l2r'), 2)
    master_i = array_ops.expand_dims(
        _cumsoftmax(projected[:, levels:levels * 2], 'r2l'), 2)

    # The remaining columns hold the four regular gates, chunked per level.
    chunked = gen_array_ops.reshape(
        projected[:, levels * 2:],
        [-1, levels * 4, self._chunk_size])
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(
        value=chunked, num_or_size_splits=4, axis=axis_one)

    forget_bias = constant_op.constant(self._forget_bias, dtype=f.dtype)

    overlap = multiply(master_f, master_i)
    prev_c = gen_array_ops.reshape(prev_c, [-1, levels, self._chunk_size])

    # Standard LSTM update, applied where both master gates overlap.
    kept = multiply(prev_c, sigmoid(add(f, forget_bias)))
    written = multiply(sigmoid(i), self._activation(j))
    standard_c = add(kept, written)

    # Outside the overlap, copy the old cell (forget side) or write the new
    # candidate (input side).
    blended = add(
        multiply(overlap, standard_c),
        multiply((master_f - overlap), prev_c))
    next_c = add(
        blended,
        multiply((master_i - overlap), self._activation(j)))
    next_h = multiply(self._activation(next_c), sigmoid(o))

    # Flatten the [levels, chunk_size] layout back to `num_units`.
    next_c = gen_array_ops.reshape(next_c, [-1, self._num_units])
    next_h = gen_array_ops.reshape(next_h, [-1, self._num_units])

    if self._state_is_tuple:
        return next_h, LSTMStateTuple(next_c, next_h)
    return next_h, array_ops.concat([next_c, next_h], 1)
def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
    """Gradient for concat op.

    The incoming gradient is cut back into one piece per concatenated input,
    and `None` is placed in the position of the concat-dimension input.

    Args:
      op: An operation.
      grad: `Tensor` or `IndexedSlices` representing the gradients with
        respect to each output of the op.
      start_value_index: An integer index of the first value in the op.inputs.
      end_value_index: An integer index of the last value in the op.inputs.
      dim_index: An integer index of concat_dim or axis parameter in
        op.inputs.

    Returns:
      Tensors representing the partial gradients with respect to each input
      of the op.

    Raises:
      ValueError: if concat_dim/axis is not statically known.
      TypeError: if `grad` is neither a `Tensor` nor an `IndexedSlices`.
    """

    def _CreateDenseMaskAndBegin(sizes, concat_dim):
        """Create variables for iteratively slicing a dense gradients tensor."""
        # Since shape is 1-D, shape_of_shape = [rank-of-inputs]
        shape_of_shape = array_ops.shape(sizes[0])
        # Make a vector of length equal to the input's dimensions,
        # with 0's everywhere and 1 in the concat dim position.
        # Note: Can't use sparse_to_dense since it isn't GPU-capable (for now)
        mask = array_ops.concat([
            array_ops.fill(array_ops.expand_dims(concat_dim, 0), 0), [1],
            array_ops.fill(shape_of_shape - concat_dim - 1, 0)
        ], 0)
        begin = array_ops.fill(shape_of_shape, 0)
        return mask, begin

    def _ExtractInputShapes(inputs):
        """Extract the shapes of a set of input tensors."""
        sizes = []
        fully_known = True
        for x in inputs:
            input_shape = array_ops.shape(x)
            # Only keep the per-input shapes when every one of them is a
            # constant; otherwise fall back to a single shape_n call.
            if not isinstance(input_shape,
                              ops.Tensor) or input_shape.op.type != "Const":
                fully_known = False
                break
            else:
                sizes.append(input_shape)

        if fully_known:
            return sizes
        else:
            return array_ops.shape_n(inputs)

    # Degenerate concatenation, just return grad.
    # NOTE(review): this branch concatenates `grad` with a list, so it
    # presumably expects `grad` to be a list here -- confirm against callers.
    if len(op.inputs) == 2:
        return grad + [None] if end_value_index <= dim_index else [None] + grad

    concat_dim = op.inputs[dim_index]
    input_values = op.inputs[start_value_index:end_value_index]
    # Using mod here for convenience since concat_dim is already verified
    # in concat implementation to be within the allowed [-rank, rank) range.
    non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])

    out_grads = []
    if isinstance(grad, ops.Tensor):
        # Get the inputs' tensor shapes
        sizes = _ExtractInputShapes(input_values)
        # The magic number of 16 was found through benchmarking a range of
        # sizes on CPUs and a Maxwell TitanX.  A speedup was seen in a large
        # majority of cases when switching implementations at N=16, but it is
        # possible that there will be a small number of performance
        # regressions.
        # pylint: disable=protected-access
        if len(sizes) > 16:
            # extract the size of each input along the concat dimension
            sizes = array_ops.squeeze(
                array_ops.slice(array_ops.stack(sizes, axis=1),
                                [non_neg_concat_dim, 0], [1, -1]))
            out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
        else:
            offset = gen_array_ops._concat_offset(non_neg_concat_dim, sizes)
            for (begin, size) in zip(offset, sizes):
                out_grads.append(array_ops.slice(grad, begin, size))
        # pylint: enable=protected-access
    elif isinstance(grad, ops.IndexedSlices):
        concat_dim_static = tensor_util.constant_value(concat_dim)
        if concat_dim_static is None:
            raise ValueError("Can only compute IndexedSlices gradient with "
                             "statically-known concat_dim")
        if concat_dim_static < 0:
            # A negative axis can only be normalized when the rank of the
            # first value is known statically.
            rank = tensor_util.constant_value(array_ops.rank(input_values[0]))
            if rank is None:
                raise ValueError(
                    "Can only compute IndexedSlices gradient with "
                    "negative concat_dim when first value rank is "
                    "statically-known.")
            concat_dim_static %= rank
        # Get the inputs' tensor shapes
        sizes = [array_ops.shape(x) for x in input_values]
        if concat_dim_static > 0:
            # IndexedSlices, non_neg_concat_dim > 0. Each input gets
            # IndexedSlices gradients with all the indices, but with
            # grad.values sliced accordingly. This is like the Tensor case,
            # except shape(grad.values)[0] is not equal to
            # shape(sizes[i])[0], since only a subset of the dim-0 values are
            # stored.
            mask, begin = _CreateDenseMaskAndBegin(sizes, non_neg_concat_dim)
            for size in sizes:
                new_values = array_ops.slice(
                    grad.values, begin,
                    array_ops.concat(
                        [[-1], array_ops.slice(size, [1], [-1])], 0))
                out_grads.append(
                    ops.IndexedSlices(new_values, grad.indices, size))
                # Lint complains begin = begin + ...
                begin = math_ops.add(begin, size * mask)
        else:
            # IndexedSlices, concat_dim == 0. Each input gets IndexedSlices
            # gradients only for the relevant indices.
            start = constant_op.constant(0, dtype=grad.indices.dtype)
            for size in sizes:
                size_concat_dim = array_ops.gather(size, non_neg_concat_dim)
                if size_concat_dim.dtype != grad.indices.dtype:
                    size_concat_dim = math_ops.cast(size_concat_dim,
                                                    dtype=grad.indices.dtype)
                end = start + size_concat_dim
                # Compute the 1-D Tensor of indices relevant for this input.
                indices_to_select = array_ops.squeeze(array_ops.where(
                    math_ops.logical_and(grad.indices >= start,
                                         grad.indices < end)),
                                                      squeeze_dims=[1])
                # Re-base the selected indices so they are relative to this
                # input rather than to the concatenated output.
                new_indices = array_ops.gather(grad.indices,
                                               indices_to_select) - start
                new_values = array_ops.gather(grad.values, indices_to_select)
                out_grads.append(
                    ops.IndexedSlices(new_values, new_indices, size))
                start = end
    else:
        raise TypeError("Expected Tensor or IndexedSlices, got %s" % type(grad))

    # The `None` stands for the (non-differentiable) concat-dimension input
    # and goes on whichever side of the values that input sits.
    return (out_grads + [None] if end_value_index <= dim_index
            else [None] + out_grads)
def __call__(self, inputs, state, scope=None):
    """Run one step of the phased LSTM.

    The last column of `inputs` is read as the time stamp of the step; the
    remaining columns are the features fed to the LSTM. A time gate `k`
    computed from the time stamp blends the freshly computed cell and output
    with the previous ones.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be
        a tuple of state Tensors, both `2-D`, with column sizes `c_state` and
        `m_state`.
      scope: VariableScope for the created subgraph; defaults to "LSTMCell".

    Returns:
      A tuple containing:
      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs`
        when the previous state was `state`.  Same type and shape(s) as
        `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError(
            "Could not infer input size from inputs.get_shape()[-1]")

    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "LSTMCell"
        i_size = input_size.value - 1  # -1 to extract time
        times = array_ops.slice(inputs, [0, i_size], [-1, 1])
        filtered_inputs = array_ops.slice(inputs, [0, 0], [-1, i_size])

        # --------------------------------------- #
        # ------------- PHASED LSTM ------------- #
        # ---------------- BEGIN ---------------- #
        # --------------------------------------- #
        tau = vs.get_variable(
            "T", shape=[self._num_units],
            initializer=random_exp_initializer(0, self.tau_init),
            dtype=dtype)

        r_on = vs.get_variable(
            "R", shape=[self._num_units],
            initializer=init_ops.constant_initializer(self.r_on_init),
            dtype=dtype)

        s = vs.get_variable(
            "S", shape=[self._num_units],
            initializer=init_ops.random_uniform_initializer(
                0., tau.initialized_value()),
            dtype=dtype)
        # for backward compatibility (v < 0.12.0) use the following line
        # instead of the above
        # initializer = init_ops.random_uniform_initializer(0., tau), dtype = dtype)

        tau_broadcast = tf.expand_dims(tau, dim=0)
        r_on_broadcast = tf.expand_dims(r_on, dim=0)
        s_broadcast = tf.expand_dims(s, dim=0)

        # Period and open ratio are used through their absolute values.
        r_on_broadcast = tf.abs(r_on_broadcast)
        tau_broadcast = tf.abs(tau_broadcast)
        times = tf.tile(times, [1, self._num_units])

        # calculate kronos gate
        # The double mod keeps the phase non-negative before normalizing.
        phi = tf.div(
            tf.mod(
                tf.mod(times - s_broadcast, tau_broadcast) + tau_broadcast,
                tau_broadcast),
            tau_broadcast)
        is_up = tf.less(phi, (r_on_broadcast * 0.5))
        is_down = tf.logical_and(tf.less(phi, r_on_broadcast),
                                 tf.logical_not(is_up))

        # Rising ramp, falling ramp, or leak (`alpha * phi`) otherwise.
        k = tf.select(
            is_up, phi / (r_on_broadcast * 0.5),
            tf.select(is_down, 2. - 2. * (phi / r_on_broadcast),
                      self.alpha * phi))
        # --------------------------------------- #
        # ------------- PHASED LSTM ------------- #
        # ----------------- END ----------------- #
        # --------------------------------------- #

        concat_w = _get_concat_variable(
            "W", [i_size + num_proj, 4 * self._num_units],
            dtype, self._num_unit_shards)

        b = vs.get_variable(
            "B", shape=[4 * self._num_units],
            initializer=init_ops.zeros_initializer, dtype=dtype)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        cell_inputs = array_ops.concat(1, [filtered_inputs, m_prev])
        lstm_matrix = nn_ops.bias_add(
            math_ops.matmul(cell_inputs, concat_w), b)
        i, j, f, o = array_ops.split(1, 4, lstm_matrix)

        # Diagonal connections
        if self._use_peepholes:
            w_f_diag = vs.get_variable(
                "W_F_diag", shape=[self._num_units], dtype=dtype)
            w_i_diag = vs.get_variable(
                "W_I_diag", shape=[self._num_units], dtype=dtype)
            w_o_diag = vs.get_variable(
                "W_O_diag", shape=[self._num_units], dtype=dtype)

        if self._use_peepholes:
            c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
                 sigmoid(i + w_i_diag * c_prev) * self._activation(j))
        else:
            c = (sigmoid(f + self._forget_bias) * c_prev +
                 sigmoid(i) * self._activation(j))

        if self._cell_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
            # pylint: enable=invalid-unary-operand-type

        if self._use_peepholes:
            m = sigmoid(o + w_o_diag * c) * self._activation(c)
        else:
            m = sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            concat_w_proj = _get_concat_variable(
                "W_P", [self._num_units, self._num_proj],
                dtype, self._num_proj_shards)

            # Use the module-level `math_ops` like the rest of this method;
            # `math_ops` is not a documented attribute of the `tf` namespace.
            m = math_ops.matmul(m, concat_w_proj)
            if self._proj_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                m = clip_ops.clip_by_value(m, -self._proj_clip,
                                           self._proj_clip)
                # pylint: enable=invalid-unary-operand-type

        # APPLY KRONOS GATE
        c = k * c + (1. - k) * c_prev
        m = k * m + (1. - k) * m_prev
        # END KRONOS GATE

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple
                 else array_ops.concat(1, [c, m]))
    return m, new_state
def call(self, inputs, states, training=None):
    """Run one step of the two-stage convolutional LSTM cell.

    Each gate stream is convolved twice: the input goes through
    `input_conv` and then `input_conv_u`, and the hidden state goes through
    `recurrent_conv` and then `recurrent_conv_u`. The previous carry state is
    convolved once with the fifth slice of `cov_recurrent_kernel` before it
    enters the cell update.

    Args:
      inputs: input tensor for this step.
      states: sequence whose first element is the previous hidden state and
        whose second element is the previous carry state.
      training: forwarded to the dropout-mask helpers.

    Returns:
      A pair `(h, [h, c])` with the new hidden state and the new state list.
    """
    h_tm1 = states[0]  # previous memory state
    c_tm1 = states[1]  # previous carry state

    # dropout matrices for input units
    dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=4)
    # dropout matrices for recurrent units
    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(
        h_tm1, training, count=4)

    # One independently-masked copy of the input per gate.
    if 0 < self.dropout < 1.:
        inputs_i = inputs * dp_mask[0]
        inputs_f = inputs * dp_mask[1]
        inputs_c = inputs * dp_mask[2]
        inputs_o = inputs * dp_mask[3]
    else:
        inputs_i = inputs
        inputs_f = inputs
        inputs_c = inputs
        inputs_o = inputs

    # One independently-masked copy of the hidden state per gate.
    if 0 < self.recurrent_dropout < 1.:
        h_tm1_i = h_tm1 * rec_dp_mask[0]
        h_tm1_f = h_tm1 * rec_dp_mask[1]
        h_tm1_c = h_tm1 * rec_dp_mask[2]
        h_tm1_o = h_tm1 * rec_dp_mask[3]
    else:
        h_tm1_i = h_tm1
        h_tm1_f = h_tm1
        h_tm1_c = h_tm1
        h_tm1_o = h_tm1

    # First-stage and second-stage kernels, one slice per gate.
    (kernel_i, kernel_f,
     kernel_c, kernel_o) = array_ops.split(self.kernel, 4, axis=3)
    (cov_kernel_i, cov_kernel_f,
     cov_kernel_c, cov_kernel_o) = array_ops.split(
         self.cov_kernel, 4, axis=3)
    (recurrent_kernel_i,
     recurrent_kernel_f,
     recurrent_kernel_c,
     recurrent_kernel_o) = array_ops.split(self.recurrent_kernel, 4, axis=3)
    # The fifth slice is reserved for the previous carry state.
    (recurrent_kernel_i_c,
     recurrent_kernel_f_c,
     recurrent_kernel_c_c,
     recurrent_kernel_o_c,
     recurrent_kernel_c_1) = array_ops.split(
         self.cov_recurrent_kernel, 5, axis=3)

    if self.use_bias:
        bias_i, bias_f, bias_c, bias_o = array_ops.split(self.bias, 4)
    else:
        bias_i, bias_f, bias_c, bias_o = None, None, None, None

    # Input path, first stage.
    x_i = self.input_conv(inputs_i, kernel_i, bias_i, padding=self.padding)
    x_f = self.input_conv(inputs_f, kernel_f, bias_f, padding=self.padding)
    x_c = self.input_conv(inputs_c, kernel_c, bias_c, padding=self.padding)
    x_o = self.input_conv(inputs_o, kernel_o, bias_o, padding=self.padding)

    # Input path, second stage.
    x_i_c = self.input_conv_u(x_i, cov_kernel_i, padding=self.padding)
    x_f_c = self.input_conv_u(x_f, cov_kernel_f, padding=self.padding)
    x_c_c = self.input_conv_u(x_c, cov_kernel_c, padding=self.padding)
    x_o_c = self.input_conv_u(x_o, cov_kernel_o, padding=self.padding)

    # Recurrent path, first stage.
    h_i = self.recurrent_conv(h_tm1_i, recurrent_kernel_i)
    h_f = self.recurrent_conv(h_tm1_f, recurrent_kernel_f)
    h_c = self.recurrent_conv(h_tm1_c, recurrent_kernel_c)
    h_o = self.recurrent_conv(h_tm1_o, recurrent_kernel_o)

    # Recurrent path, second stage, plus the carry-state convolution.
    h_i_c = self.recurrent_conv_u(h_i, recurrent_kernel_i_c)
    h_f_c = self.recurrent_conv_u(h_f, recurrent_kernel_f_c)
    h_c_c = self.recurrent_conv_u(h_c, recurrent_kernel_c_c)
    h_o_c = self.recurrent_conv_u(h_o, recurrent_kernel_o_c)
    c_c = self.recurrent_conv_u(c_tm1, recurrent_kernel_c_1)

    # LSTM gate arithmetic on the doubly-convolved streams.
    i = self.recurrent_activation(x_i_c + h_i_c)
    f = self.recurrent_activation(x_f_c + h_f_c)
    c = f * c_c + i * self.activation(x_c_c + h_c_c)
    o = self.recurrent_activation(x_o_c + h_o_c)
    h = o * self.activation(c)
    return h, [h, c]
def loop_fn(i):
    """Split the `i`-th slice of `x` two ways.

    Returns a pair: the slice cut into pieces of size 2, 1 and 3 along
    axis 0, and the slice "split" into a single piece of size 3 along the
    last axis.
    """
    row = array_ops.gather(x, i)
    uneven_pieces = array_ops.split(row, [2, 1, 3], axis=0)
    single_piece = array_ops.split(row, [3], axis=-1)
    return (uneven_pieces, single_piece)
def pack(self, grouped_grads_and_vars):
    """Pack each device's gradients into `self.num_packs` flat tensors.

    For every device, all gradients are flattened, concatenated into one
    1-D tensor and split into `self.num_packs` pieces. The original shapes
    and sizes are recorded on `self` (`all_device_shapes`,
    `all_device_sizes`) alongside the input (`grouped_grads_and_vars`).

    Args:
      grouped_grads_and_vars: per-device sequences of `(grad, var)` pairs.

    Returns:
      A list with one entry per device; each entry is a list of
      `(grad_pack, None)` pairs.
    """
    self.grouped_grads_and_vars = grouped_grads_and_vars
    self.all_device_shapes = []
    self.all_device_sizes = []

    device_grad_packs = []
    for device_grads_and_vars in grouped_grads_and_vars:
        with ops.colocate_with(device_grads_and_vars[0][0]):
            # Flatten all the grads.
            flat_grads = [
                array_ops.reshape(g, [-1]) for g, _ in device_grads_and_vars
            ]
            # Remember the original shape of all the grads.
            device_shapes = [
                array_ops.shape(g) for g, _ in device_grads_and_vars
            ]
            # Remember the original sizes of all the grads.
            device_sizes = [
                array_ops.size(g) for g, _ in device_grads_and_vars
            ]
            # Concat all the flat grads into a big flat tensor.
            concat_grads = array_ops.concat(flat_grads, 0)

            # Split the big tensor into num_splits packs. In cases where the
            # total size is not divisible num_splits, the last pack gets
            # more elements.
            # TODO(zhengxq): it is also possible to optimize away all the
            # concat as well.
            num_splits = self.num_packs

            # The array_ops.size function will sometimes remove static
            # shapes. So if all gradient shapes are defined, we use another
            # method to get the total size.
            # TODO(yuefengz): move this logic to array_ops.size.
            if all(g.shape.is_fully_defined()
                   for g, _ in device_grads_and_vars):
                total_grad_size = sum(
                    g.shape.num_elements() for g, _ in device_grads_and_vars)
            else:
                total_grad_size = array_ops.size(concat_grads)

            split_size = total_grad_size // num_splits
            split_size_last = total_grad_size - split_size * (num_splits - 1)
            split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
            grad_packs = array_ops.split(concat_grads, split_sizes)

            # Ready to aggregate the repacked gradients, with fake variables.
            # TODO(zhengxq): It is hacky to have to use fake variables.
            # We should remove the need for variables in
            # aggregate_gradients_using*.
            # Materialize the pairs: a bare zip() is a one-shot iterator on
            # Python 3 and would be silently empty on a second traversal.
            device_grad_packs.append(
                list(zip(grad_packs, [None] * num_splits)))
            self.all_device_shapes.append(device_shapes)
            self.all_device_sizes.append(device_sizes)

    return device_grad_packs
def call(self, inputs, state):
    """Run one step of LSTM using the masked kernel.

    Args:
      inputs: input Tensor, 2D, `[batch, num_units]`.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, [batch, state_size]`.  If `state_is_tuple` is True, this must
        be a tuple of state Tensors, both `2-D`, with column sizes `c_state`
        and `m_state`.

    Returns:
      A tuple containing:

      - A `2-D, [batch, output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs`
        when the previous state was `state`.  Same type and shape(s) as
        `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    if self._num_proj is None:
        num_proj = self._num_units
    else:
        num_proj = self._num_proj
    sigmoid = math_ops.sigmoid
    activation = self._activation
    peepholes = self._use_peepholes

    if self._state_is_tuple:
        (prev_cell, prev_output) = state
    else:
        # Packed state: cell columns first, then the output columns.
        prev_cell = array_ops.slice(state, [0, 0], [-1, self._num_units])
        prev_output = array_ops.slice(
            state, [0, self._num_units], [-1, num_proj])

    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    gates = math_ops.matmul(
        array_ops.concat([inputs, prev_output], 1), self._masked_kernel)
    gates = nn_ops.bias_add(gates, self._bias)
    i, j, f, o = array_ops.split(value=gates, num_or_size_splits=4, axis=1)

    # Diagonal connections
    if peepholes:
        retained = sigmoid(
            f + self._forget_bias + self._w_f_diag * prev_cell) * prev_cell
        written = sigmoid(i + self._w_i_diag * prev_cell) * activation(j)
    else:
        retained = sigmoid(f + self._forget_bias) * prev_cell
        written = sigmoid(i) * activation(j)
    cell = retained + written

    if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        cell = clip_ops.clip_by_value(cell, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

    if peepholes:
        output = sigmoid(o + self._w_o_diag * cell) * activation(cell)
    else:
        output = sigmoid(o) * activation(cell)

    if self._num_proj is not None:
        output = math_ops.matmul(output, self._proj_kernel)

        if self._proj_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            output = clip_ops.clip_by_value(
                output, -self._proj_clip, self._proj_clip)
            # pylint: enable=invalid-unary-operand-type

    if self._state_is_tuple:
        new_state = tf_rnn.LSTMStateTuple(cell, output)
    else:
        new_state = array_ops.concat([cell, output], 1)
    return output, new_state
def odd_input(off, size):
    """Handle an odd-sized `off` by setting aside its first column.

    The first column along axis 1 is split off, the remaining `size - 1`
    columns are processed by `even_input`, and the set-aside column is
    concatenated back in front of the result.
    """
    remaining = size - 1
    helper, rest = array_ops.split(off, [1, remaining], 1)
    rest = even_input(rest, remaining)
    return array_ops.concat([helper, rest], 1)
def split_fn(inp, num_shards, axis, name):
    """Split `inp` into `num_shards` along `axis`, colocated with `inp`."""
    with ops.colocate_with(inp):
        shards = array_ops.split(inp, num_shards, axis=axis, name=name)
    return shards
def batch_all_reduce(self,
                     input_tensor_packs,
                     communication_hint='AUTO',
                     timeout=0):
    """All-reduce several packs of dense tensors.

    The input is a list of packs (lists) of tensors. Processing pack by pack
    means the all-reduce can start before every input is ready.

    This can be called in eager mode if an async executor is supplied when
    creating the launcher.

    Args:
      input_tensor_packs: a list of lists of dense tensors.
      communication_hint: string providing hint to runtime for choosing
        collective implementation.
      timeout: a float. The timeout in seconds.

    Returns:
      A flat list of reduced tensors.
    """
    # Concat-based batching is skipped in eager mode: it is easy to get
    # wrong there because any numpy() call on values produced by the async
    # executor must be avoided. That effectively disables batching in eager,
    # where all-reducing a large number of tensors is unlikely anyway.
    batch_with_concat = not (self._use_scoped_allocator() or
                             context.executing_eagerly())
    outputs = []
    for pack in input_tensor_packs:
        # TODO(b/169168846): inserts a parallel all_gather to verify packings
        # are the same on each replica.
        if not batch_with_concat:
            # Keeping every CollectiveReduce op of the pack under one name
            # scope lets the `ScopedAllocator` grappler optimizer pick them
            # up and fuse them into a single all-reduce.
            with ops.name_scope('allreduce'):
                for input_tensor in pack:
                    # With NCCL, chain each launch on the previous output.
                    control_input = (
                        outputs[-1]
                        if communication_hint == 'NCCL' and outputs else None)
                    outputs.append(
                        self.all_reduce(input_tensor, control_input,
                                        communication_hint, timeout))
            continue

        with ops.device(self._device):
            flat_tensors = [array_ops.reshape(t, [-1]) for t in pack]
            shapes = [array_ops.shape(t) for t in pack]
            # With NCCL, chain this launch on the previous output.
            control_input = (
                outputs[-1]
                if communication_hint == 'NCCL' and outputs else None)
            reduced = self.all_reduce(
                array_ops.concat(flat_tensors, axis=0), control_input,
                communication_hint, timeout)
            # Cut the reduced buffer back into the original tensors.
            num_elements = [math_ops.reduce_prod(s) for s in shapes]
            flat_outputs = array_ops.split(reduced, num_elements, axis=0)
            outputs.extend(
                array_ops.reshape(flat_output, shape)
                for shape, flat_output in zip(shapes, flat_outputs))

    return outputs
def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0,
          axis=-1, name=None):
    """Expands `signal`'s `axis` dimension into frames of `frame_length`.

    Slides a window of size `frame_length` over `signal`'s `axis` dimension
    with a stride of `frame_step`, replacing the `axis` dimension with
    `[frames, frame_length]` frames.

    If `pad_end` is True, window positions that are past the end of the `axis`
    dimension are padded with `pad_value` until the window moves fully past
    the end of the dimension. Otherwise, only window positions that fully
    overlap the `axis` dimension are produced.

    For example:

    ```python
    # A batch size 3 tensor of 9152 audio samples.
    audio = tf.random.normal([3, 9152])

    # Compute overlapping frames of length 512 with a step of 180 (frames
    # overlap by 332 samples). By default only
    # 1 + (9152 - 512) // 180 = 49 frames are generated; a window starting at
    # 49 * 180 = 8820 would run past the end of the signal.
    frames = tf.signal.frame(audio, 512, 180)
    frames.shape.assert_is_compatible_with([3, 49, 512])

    # When pad_end is enabled, ceil(9152 / 180) = 51 frames are produced and
    # the overhanging ones are padded with zeros.
    frames = tf.signal.frame(audio, 512, 180, pad_end=True)
    frames.shape.assert_is_compatible_with([3, 51, 512])
    ```

    Args:
      signal: A `[..., samples, ...]` `Tensor`. The rank and dimensions may be
        unknown. Rank must be at least 1.
      frame_length: The frame length in samples. An integer or scalar
        `Tensor`.
      frame_step: The frame hop size in samples. An integer or scalar
        `Tensor`.
      pad_end: Whether to pad the end of `signal` with `pad_value`.
      pad_value: An optional scalar `Tensor` to use where the input signal
        does not exist when `pad_end` is True.
      axis: A scalar integer `Tensor` indicating the axis to frame. Defaults
        to the last axis. Supports negative values for indexing from the end.
      name: An optional name for the operation.

    Returns:
      A `Tensor` of frames with shape `[..., frames, frame_length, ...]`.

    Raises:
      ValueError: If `frame_length`, `frame_step`, `pad_value`, or `axis` are
        not scalar.
    """
    with ops.name_scope(name, "frame", [signal, frame_length, frame_step,
                                        pad_value]):
        signal = ops.convert_to_tensor(signal, name="signal")
        frame_length = ops.convert_to_tensor(frame_length, name="frame_length")
        frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
        axis = ops.convert_to_tensor(axis, name="axis")

        # Static rank checks on the converted arguments.
        signal.shape.with_rank_at_least(1)
        frame_length.shape.assert_has_rank(0)
        frame_step.shape.assert_has_rank(0)
        axis.shape.assert_has_rank(0)

        # Computed up front from the unmodified inputs; applied to the result
        # at the very end when it is truthy.
        result_shape = _infer_frame_shape(signal, frame_length, frame_step,
                                          pad_end, axis)

        def maybe_constant(val):
            # Returns (static value, True) when `val` can be constant-folded,
            # otherwise (val, False).
            val_static = tensor_util.constant_value(val)
            return (val_static, True) if val_static is not None else (val, False)

        signal_shape, signal_shape_is_static = maybe_constant(
            array_ops.shape(signal))
        axis, axis_is_static = maybe_constant(axis)

        # Split the shape into the dimensions before `axis`, the framed
        # dimension itself, and the dimensions after it.
        if signal_shape_is_static and axis_is_static:
            # Axis can be negative. Convert it to positive.
            axis = range(len(signal_shape))[axis]
            outer_dimensions, length_samples, inner_dimensions = np.split(
                signal_shape, indices_or_sections=[axis, axis + 1])
            length_samples = length_samples.item()
        else:
            signal_rank = array_ops.rank(signal)
            # Axis can be negative. Convert it to positive.
            axis = math_ops.range(signal_rank)[axis]
            outer_dimensions, length_samples, inner_dimensions = array_ops.split(
                signal_shape, [axis, 1, signal_rank - 1 - axis])
            length_samples = array_ops.reshape(length_samples, [])
        num_outer_dimensions = array_ops.size(outer_dimensions)
        num_inner_dimensions = array_ops.size(inner_dimensions)

        # If padding is requested, pad the input signal tensor with pad_value.
        if pad_end:
            pad_value = ops.convert_to_tensor(pad_value, signal.dtype)
            pad_value.shape.assert_has_rank(0)

            # Calculate number of frames, using double negatives to round up.
            num_frames = -(-length_samples // frame_step)

            # Pad the signal so that the last of the `num_frames` windows
            # fits entirely inside it (never a negative amount).
            pad_samples = math_ops.maximum(
                0, frame_length + frame_step * (num_frames - 1) - length_samples)

            # Only the framed axis is padded, and only at its end; the zero
            # rows leave the outer and inner dimensions untouched.
            paddings = array_ops.concat(
                [array_ops.zeros([num_outer_dimensions, 2],
                                 dtype=pad_samples.dtype),
                 [[0, pad_samples]],
                 array_ops.zeros([num_inner_dimensions, 2],
                                 dtype=pad_samples.dtype)],
                0)
            signal = array_ops.pad(signal, paddings, constant_values=pad_value)

            # Refresh the shape information to reflect the padded signal.
            signal_shape = array_ops.shape(signal)
            length_samples = signal_shape[axis]
        else:
            # Only windows that lie fully inside the signal are produced.
            num_frames = math_ops.maximum(
                0, 1 + (length_samples - frame_length) // frame_step)

        # Frames are assembled from "subframes" whose length is the GCD of
        # frame_length and frame_step, so both are whole multiples of it.
        subframe_length, _ = maybe_constant(util_ops.gcd(frame_length, frame_step))
        subframes_per_frame = frame_length // subframe_length
        subframes_per_hop = frame_step // subframe_length
        num_subframes = length_samples // subframe_length

        # Drop any trailing samples that do not fill a whole subframe, then
        # reshape the framed axis into [num_subframes, subframe_length].
        slice_shape = array_ops.concat([outer_dimensions,
                                        [num_subframes * subframe_length],
                                        inner_dimensions], 0)
        subframe_shape = array_ops.concat([outer_dimensions,
                                           [num_subframes, subframe_length],
                                           inner_dimensions], 0)
        subframes = array_ops.reshape(array_ops.strided_slice(
            signal, array_ops.zeros_like(signal_shape),
            slice_shape), subframe_shape)

        # frame_selector is a [num_frames, 1] tensor holding the index of the
        # first subframe of every frame; it broadcasts against
        # subframe_selector below. For example: [[0], [2], [4]].
        frame_selector = array_ops.reshape(
            math_ops.range(num_frames) * subframes_per_hop, [num_frames, 1])

        # subframe_selector is a [1, subframes_per_frame] tensor holding the
        # offset of each subframe within a frame. For example: [[0, 1, 2, 3]].
        subframe_selector = array_ops.reshape(
            math_ops.range(subframes_per_frame), [1, subframes_per_frame])

        # Adding the 2 selector tensors together produces a [num_frames,
        # subframes_per_frame] tensor of indices to use with tf.gather to
        # select subframes from subframes. We then reshape the inner-most
        # subframes_per_frame dimension to stitch the subframes together into
        # frames. For example: [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7]].
        selector = frame_selector + subframe_selector

        frames = array_ops.reshape(
            array_ops.gather(subframes, selector, axis=axis),
            array_ops.concat([outer_dimensions, [num_frames, frame_length],
                              inner_dimensions], 0))

        if result_shape:
            frames.set_shape(result_shape)
        return frames
def __call__(self, inputs, state, scope=None):
    """Run one step of G-LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: a tuple `(c_state, m_state)` of `2-D` state Tensors.
      scope: optional variable scope; defaults to "glstm_cell". It is also
        forwarded to the `linear` helper.

    Returns:
      A tuple containing:

      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        G-LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is num_proj if num_proj was set, num_units otherwise.
      - An `LSTMStateTuple` holding the new cell state and the new output.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    c_prev, m_prev = state

    if inputs.get_shape().with_rank(2)[1].value is None:
        raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
    dtype = inputs.dtype

    with vs.variable_scope(scope or "glstm_cell",
                           initializer=self._initializer):
        # Per-group slices of the pre-activations, kept in gate order:
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate.
        gate_parts = ([], [], [], [])
        for group_id in xrange(self._number_of_groups):
            with vs.variable_scope("group%d"%group_id):
                group_sources = [
                    self._get_input_for_group(source, group_id,
                                              self._group_shape[0])
                    for source in (inputs, m_prev)
                ]
                x_g_id = array_ops.concat(group_sources, axis=1)
                # Biases are added per gate further down, not per group.
                R_k = linear(x_g_id, 4 * self._group_shape[1], bias=False,
                             scope=scope)
                group_gates = array_ops.split(R_k, 4, 1)
            for parts, gate_slice in zip(gate_parts, group_gates):
                parts.append(gate_slice)

        def zero_bias(bias_name):
            # One zero-initialised bias vector covering all units of a gate.
            return vs.get_variable(
                name=bias_name, shape=[self._num_units], dtype=dtype,
                initializer=init_ops.constant_initializer(0.0, dtype=dtype))

        # It is more efficient to have per-gate biases than per-gate,
        # per-group ones.
        biases = [zero_bias(bias_name) for bias_name in
                  ("biases_i", "biases_j", "biases_f", "biases_o")]

        i, j, f, o = [
            nn_ops.bias_add(array_ops.concat(parts, axis=1), bias)
            for parts, bias in zip(gate_parts, biases)
        ]

        retained = math_ops.sigmoid(f + self._forget_bias) * c_prev
        written = math_ops.sigmoid(i) * math_ops.tanh(j)
        c = retained + written
        m = math_ops.sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection"):
                m = linear(m, self._num_proj, bias=False, scope=scope)

    return m, LSTMStateTuple(c, m)
def call(self, inputs, states, constants, training=False):
    """Run one step of the GRU-style cell, optionally with attention.

    Args:
      inputs: input for the current step. It is normalised with
        `self.input_norm` before use and also added back, un-normalised, to
        the final output.
      states: list of recurrent states; it is concatenated in front of
        `constants`, so `states[0]` is read as the previous hidden state.
      constants: extra values appended after `states`. The attention branch
        is taken when the second element of the combined list is not None.
      training: forwarded to the dropout-mask helper and the attention
        wrapper; when truthy the output is also multiplied by the dropout
        mask.

    Returns:
      A tuple `(h, [h])`: the step output and the new state list.
    """
    org = inputs  # kept for the residual connection at the end
    constants = states + list(constants)
    inputs = self.input_norm(inputs)
    h_tm1 = constants[0]  # previous memory
    # Three masks are requested but only `dp_mask[0]` is used in this method.
    dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=3)
    if 0. < self.dropout < 1.:
        inputs = inputs * dp_mask[0]

    # A single matmul each for the input and recurrent parts; both results
    # are cut into three equal slices (z, r, h) along the last axis below.
    matrix_x = K.dot(inputs, self.kernel)
    matrix_inner = K.dot(h_tm1, self.recurrent_kernel)
    if self.use_bias:
        # Only the input projection receives a bias.
        matrix_x = K.bias_add(matrix_x, self.bias)
    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=-1)
    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3,
                                                            axis=-1)
    if constants[1] is not None:
        # Attention branch: the attention context contributes one slice to
        # each of the z, r and h pre-activations.
        attention_context = self.heads_attention_wrapper(
            h_tm1, constants, training)
        att_z, att_r, att_h = array_ops.split(attention_context, 3, axis=-1)
        # NOTE(review): the residual additions below (`z + org_z`,
        # `hh + org_hh`) are assumed to be unconditional rather than part of
        # the dropout branches - confirm against the intended layout.
        org_z = x_z + recurrent_z + att_z
        z = self.z_heads_filter(self.z_layer_norm(org_z))
        if 0. < self.dropout < 1.:
            z = z * dp_mask[0]
        z = z + org_z
        r = x_r + recurrent_r + att_r
        r = self.r_heads_filter(r)
        org_hh = x_h + r * recurrent_h + att_h
        hh = self.h_heads_filter(self.hh_layer_norm(org_hh))
        if 0. < self.dropout < 1.:
            hh = hh * dp_mask[0]
        hh = hh + org_hh
    else:
        # No attention: gates come from the input and recurrent parts only.
        z = self.z_heads_filter(x_z + recurrent_z)
        r = self.recurrent_activation(x_r + recurrent_r)
        hh = self.h_heads_filter(x_h + r * recurrent_h)
    # previous and candidate state mixed by update gate
    h = z * h_tm1 + (1 - z) * hh
    # NOTE(review): unlike the other uses of `dp_mask`, this one is guarded
    # by `training` only, not by `0. < self.dropout < 1.` - confirm that
    # `dp_mask` is always subscriptable when `training` is truthy.
    if training:
        h = h * dp_mask[0]
    # Residual connection with the raw (un-normalised) input.
    h = org + h
    return h, [h]
def linear_to_mel_weight_matrix(num_mel_bins=20,
                                num_spectrogram_bins=129,
                                sample_rate=8000,
                                lower_edge_hertz=125.0,
                                upper_edge_hertz=3800.0,
                                dtype=dtypes.float32,
                                name=None):
    """Returns a matrix to warp linear scale spectrograms to the [mel scale][mel].

    The result re-weights a `Tensor` holding `num_spectrogram_bins` linearly
    sampled frequency bins covering `[0, sample_rate / 2]` into `num_mel_bins`
    bands covering `[lower_edge_hertz, upper_edge_hertz]` on the
    [mel scale][mel].

    Right-multiplying a `[frames, num_spectrogram_bins]` spectrogram `S` of
    linear scale spectrum values (e.g. STFT magnitudes) by the returned
    matrix `A` yields a `[frames, num_mel_bins]` "mel spectrogram" `M`:

        # `S` has shape [frames, num_spectrogram_bins]
        # `M` has shape [frames, num_mel_bins]
        M = tf.matmul(S, A)

    For inputs of arbitrary rank use `tf.tensordot`:

        # S has shape [..., num_spectrogram_bins].
        # M has shape [..., num_mel_bins].
        M = tf.tensordot(S, A, 1)
        # tf.tensordot does not support shape inference for this case yet.
        M.set_shape(S.shape[:-1].concatenate(A.shape[-1:]))

    Args:
      num_mel_bins: Python int. How many bands in the resulting mel spectrum.
      num_spectrogram_bins: An integer `Tensor`. How many bins there are in
        the source spectrogram data, which is understood to be
        `fft_size // 2 + 1`, i.e. the spectrogram only contains the
        nonredundant FFT bins.
      sample_rate: Python float. Samples per second of the input signal used
        to create the spectrogram. It determines the actual frequency of
        each spectrogram bin and hence how bins map onto the mel scale.
      lower_edge_hertz: Python float. Lower bound on the frequencies to be
        included in the mel spectrum. This corresponds to the lower edge of
        the lowest triangular band.
      upper_edge_hertz: Python float. The desired top edge of the highest
        frequency band.
      dtype: The `DType` of the result matrix. Must be a floating point type.
      name: An optional name for the operation.

    Returns:
      A `Tensor` of shape `[num_spectrogram_bins, num_mel_bins]`.

    Raises:
      ValueError: If num_mel_bins/num_spectrogram_bins/sample_rate are not
        positive, lower_edge_hertz is negative, frequency edges are
        incorrectly ordered, or upper_edge_hertz is larger than the Nyquist
        frequency.

    [mel]: https://en.wikipedia.org/wiki/Mel_scale
    """
    with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
        # num_spectrogram_bins is not validated here: it is handed to
        # `math_ops.linspace`, which already validates it (both in its shape
        # function and in its kernel).
        _validate_arguments(num_mel_bins, sample_rate,
                            lower_edge_hertz, upper_edge_hertz, dtype)

        # There are no Tensor inputs, so graph optimization can constant-fold
        # this whole function.
        rate = ops.convert_to_tensor(sample_rate, dtype, name='sample_rate')
        low_hz = ops.convert_to_tensor(
            lower_edge_hertz, dtype, name='lower_edge_hertz')
        high_hz = ops.convert_to_tensor(
            upper_edge_hertz, dtype, name='upper_edge_hertz')
        zero = ops.convert_to_tensor(0.0, dtype)

        # HTK excludes the spectrogram DC bin.
        num_dropped_bins = 1
        nyquist = rate / 2.0
        all_bin_hz = math_ops.linspace(zero, nyquist, num_spectrogram_bins)
        kept_bin_hz = all_bin_hz[num_dropped_bins:]
        bin_mel = array_ops.expand_dims(_hertz_to_mel(kept_bin_hz), 1)

        # Build num_mel_bins triples of (lower_edge, center, upper_edge). The
        # center of a band is the upper edge of the band below it and the
        # lower edge of the band above it, so num_mel_bins + 2 evenly spaced
        # mel points are framed into overlapping triples.
        edge_points_mel = math_ops.linspace(
            _hertz_to_mel(low_hz), _hertz_to_mel(high_hz), num_mel_bins + 2)
        band_triples = shape_ops.frame(
            edge_points_mel, frame_length=3, frame_step=1)

        # Separate the triples into three [1, num_mel_bins] row vectors.
        lower_mel, center_mel, upper_mel = [
            array_ops.reshape(column, [1, num_mel_bins])
            for column in array_ops.split(band_triples, 3, axis=1)
        ]

        # Rising and falling slopes for every spectrogram bin. The segments
        # are linear in the mel domain, not in Hertz.
        rising = (bin_mel - lower_mel) / (center_mel - lower_mel)
        falling = (upper_mel - bin_mel) / (upper_mel - center_mel)

        # Intersect the line segments with each other and with zero.
        triangles = math_ops.maximum(zero, math_ops.minimum(rising, falling))

        # Re-add the zeroed lower bins that were sliced out above.
        return array_ops.pad(
            triangles, [[num_dropped_bins, 0], [0, 0]], name=name)
def _split(self, params):
    """Split `params` into two pieces along its last axis."""
    halves = array_ops.split(value=params, num_or_size_splits=2, axis=-1)
    return halves
def inverse_mdct(mdcts,
                 window_fn=window_ops.vorbis_window,
                 norm=None,
                 name=None):
    """Computes the inverse modified DCT of `mdcts`.

    To reconstruct an original waveform, the same window function should
    be used with `mdct` and `inverse_mdct`.

    Example usage:

    >>> @tf.function
    ... def compare_round_trip():
    ...   samples = 1000
    ...   frame_length = 400
    ...   halflen = frame_length // 2
    ...   waveform = tf.random.normal(dtype=tf.float32, shape=[samples])
    ...   waveform_pad = tf.pad(waveform, [[halflen, 0],])
    ...   mdct = tf.signal.mdct(waveform_pad, frame_length, pad_end=True,
    ...                         window_fn=tf.signal.vorbis_window)
    ...   inverse_mdct = tf.signal.inverse_mdct(mdct,
    ...                                         window_fn=tf.signal.vorbis_window)
    ...   inverse_mdct = inverse_mdct[halflen: halflen + samples]
    ...   return waveform, inverse_mdct
    >>> waveform, inverse_mdct = compare_round_trip()
    >>> np.allclose(waveform.numpy(), inverse_mdct.numpy(), rtol=1e-3, atol=1e-4)
    True

    Implemented with TPU/GPU-compatible ops and supports gradients.

    Args:
      mdcts: A `float32`/`float64` `[..., frames, frame_length // 2]`
        `Tensor` of MDCT bins representing a batch of `frame_length // 2`-point
        MDCTs.
      window_fn: A callable that takes a frame_length and a `dtype` keyword
        argument and returns a `[frame_length]` `Tensor` of samples in the
        provided datatype. If set to `None`, a rectangular window with a scale
        of 1/sqrt(2) is used. For perfect reconstruction of a signal from
        `mdct` followed by `inverse_mdct`, please use
        `tf.signal.vorbis_window`, `tf.signal.kaiser_bessel_derived_window` or
        `None`. If using another window function, make sure that
        w[n]^2 + w[n + frame_length // 2]^2 = 1 and
        w[n] = w[frame_length - n - 1] for n = 0,...,frame_length // 2 - 1 to
        achieve perfect reconstruction.
      norm: If "ortho", orthonormal inverse DCT4 is performed, if it is None,
        a regular dct4 followed by scaling of `1/frame_length` is performed.
      name: An optional name for the operation.

    Returns:
      A `[..., samples]` `Tensor` of `float32`/`float64` signals representing
      the inverse MDCT for each input MDCT in `mdcts` where `samples` is
      `(frames - 1) * (frame_length // 2) + frame_length`.

    Raises:
      ValueError: If `mdcts` is not at least rank 2, or if `norm` is neither
        `None` nor "ortho".

    [mdct]: https://en.wikipedia.org/wiki/Modified_discrete_cosine_transform
    """
    # Reject unsupported normalisations before building any ops. Without this
    # check neither branch below would assign `result_idct4` and the function
    # would fail later with an unrelated UnboundLocalError.
    if norm is not None and norm != 'ortho':
        raise ValueError(
            'Unsupported norm: %r. Must be None or "ortho".' % (norm,))

    with ops.name_scope(name, 'inverse_mdct', [mdcts]):
        mdcts = ops.convert_to_tensor(mdcts, name='mdcts')
        mdcts.shape.with_rank_at_least(2)
        # The last dimension holds frame_length // 2 MDCT bins.
        half_len = math_ops.cast(mdcts.shape[-1], dtype=dtypes.int32)

        if norm is None:
            # 0.5 / half_len == 1 / frame_length.
            half_len_float = math_ops.cast(half_len, dtype=mdcts.dtype)
            result_idct4 = (0.5 / half_len_float) * dct_ops.dct(mdcts, type=4)
        else:
            result_idct4 = dct_ops.dct(mdcts, type=4, norm='ortho')

        # Unfold each half-length DCT-IV result into a full-length frame from
        # its two halves and their negated/reversed copies.
        split_result = array_ops.split(result_idct4, 2, axis=-1)
        real_frames = array_ops.concat(
            (split_result[1],
             -array_ops.reverse(split_result[1], [-1]),
             -array_ops.reverse(split_result[0], [-1]),
             -split_result[0]), axis=-1)

        # Optionally window and overlap-add the inner 2 dimensions of
        # real_frames into a single [samples] dimension.
        if window_fn is not None:
            window = window_fn(2 * half_len, dtype=mdcts.dtype)
            real_frames *= window
        else:
            # Rectangular window with a scale of 1/sqrt(2).
            real_frames *= 1.0 / np.sqrt(2)
        return reconstruction_ops.overlap_and_add(real_frames, half_len)
def _bag_features(self, tree_num, input_data):
    """Select the bagged feature columns of `input_data` for one tree.

    `input_data` is cut into `self.params.num_features` pieces along axis 1,
    and the pieces listed in `self.params.bagged_features[tree_num]` are
    concatenated back together along axis 1, in the listed order.

    Args:
      tree_num: index into `self.params.bagged_features`.
      input_data: tensor whose axis 1 is split into `num_features` pieces.

    Returns:
      The concatenation of the selected pieces along axis 1.
    """
    # Keyword arguments make the order explicit. The previous positional
    # calls used the legacy `(axis, num, value)` / `(axis, values)` order,
    # which does not match the other `array_ops.split`/`concat` calls in
    # this file.
    split_data = array_ops.split(
        value=input_data,
        num_or_size_splits=self.params.num_features,
        axis=1)
    selected = [split_data[ind]
                for ind in self.params.bagged_features[tree_num]]
    return array_ops.concat(selected, axis=1)
def mdct(signals, frame_length, window_fn=window_ops.vorbis_window,
         pad_end=False, norm=None, name=None):
    """Computes the [Modified Discrete Cosine Transform][mdct] of `signals`.

    Implemented with TPU/GPU-compatible ops and supports gradients.

    Args:
      signals: A `[..., samples]` `float32`/`float64` `Tensor` of real-valued
        signals.
      frame_length: An integer scalar `Tensor`. The window length in samples
        which must be divisible by 4.
      window_fn: A callable that takes a frame_length and a `dtype` keyword
        argument and returns a `[frame_length]` `Tensor` of samples in the
        provided datatype. If set to `None`, a rectangular window with a scale
        of 1/sqrt(2) is used. For perfect reconstruction of a signal from
        `mdct` followed by `inverse_mdct`, please use
        `tf.signal.vorbis_window`, `tf.signal.kaiser_bessel_derived_window` or
        `None`. If using another window function, make sure that
        w[n]^2 + w[n + frame_length // 2]^2 = 1 and
        w[n] = w[frame_length - n - 1] for n = 0,...,frame_length // 2 - 1 to
        achieve perfect reconstruction.
      pad_end: Whether to pad the end of `signals` with zeros when the
        provided frame length and step produces a frame that lies partially
        past its end.
      norm: If it is None, unnormalized dct4 is used, if it is "ortho"
        orthonormal dct4 is used.
      name: An optional name for the operation.

    Returns:
      A `[..., frames, frame_length // 2]` `Tensor` of `float32`/`float64`
      MDCT values where `frames` is roughly `samples // (frame_length // 2)`
      when `pad_end=False`.

    Raises:
      ValueError: If `signals` is not at least rank 1, `frame_length` is
        not scalar, or `frame_length` is not a multiple of `4`.

    [mdct]: https://en.wikipedia.org/wiki/Modified_discrete_cosine_transform
    """
    with ops.name_scope(name, 'mdct', [signals, frame_length]):
        signals = ops.convert_to_tensor(signals, name='signals')
        signals.shape.with_rank_at_least(1)
        frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
        frame_length.shape.assert_has_rank(0)

        # Consecutive frames overlap by half. Divisibility by 4 can only be
        # checked when the frame length is known statically.
        static_length = tensor_util.constant_value(frame_length)
        if static_length is None:
            hop = frame_length // 2
        else:
            if static_length % 4 != 0:
                raise ValueError('The frame length must be a multiple of 4.')
            hop = ops.convert_to_tensor(static_length // 2,
                                        dtype=frame_length.dtype)

        framed = shape_ops.frame(signals, frame_length, hop, pad_end=pad_end)

        # Apply the requested window, or the 1/sqrt(2)-scaled rectangular
        # window when none is given.
        if window_fn is None:
            framed = framed * (1.0 / np.sqrt(2))
        else:
            framed = framed * window_fn(frame_length, dtype=framed.dtype)

        # Fold the four quarters of every frame into a half-length sequence.
        quarter_0, quarter_1, quarter_2, quarter_3 = array_ops.split(
            framed, 4, axis=-1)
        folded_head = -array_ops.reverse(quarter_2, [-1]) - quarter_3
        folded_tail = quarter_0 - array_ops.reverse(quarter_1, [-1])
        folded = array_ops.concat((folded_head, folded_tail), axis=-1)

        # The type-4 DCT of the folded frames gives the (frame_length // 2)
        # unique MDCT components; `norm` selects its normalisation.
        return dct_ops.dct(folded, type=4, norm=norm)
def prediction_ops(self, times, values, exogenous_regressors):
    """Compute model predictions given input data.

    Builds the feature tensors for the input window and for the output
    window and hands them to `self._output_window_predictions`.

    Args:
      times: A [batch size, self.window_size] integer Tensor, the first
        self.input_window_size times in each part of the batch indicating
        input features, and the last self.output_window_size times indicating
        prediction times.
      values: A [batch size, self.input_window_size, self.num_features]
        Tensor with input features.
      exogenous_regressors: A [batch size, self.window_size,
        self.exogenous_size] Tensor with exogenous features.

    Returns:
      Tuple (predicted_mean, predicted_covariance), where each element is a
      Tensor with shape [batch size, self.output_window_size,
      self.num_features].
    """
    times.get_shape().assert_is_compatible_with([None, self.window_size])
    batch_size = array_ops.shape(times)[0]
    if self.input_window_size:
        values.get_shape().assert_is_compatible_with(
            [None, self.input_window_size, self.num_features])
    if exogenous_regressors is not None:
        exogenous_regressors.get_shape().assert_is_compatible_with(
            [None, self.window_size, self.exogenous_size])

    # Feature pieces for each window, plus the running width of their last
    # dimension (used for the static shapes set at the end).
    input_pieces, output_pieces = [], []
    input_width, output_width = 0, 0

    if self._periodicities:
        # Time features cover the whole window and feed both halves.
        _, time_features = self._compute_time_features(times)
        time_width = self._buckets * len(self._periodicities)
        time_features = array_ops.reshape(
            time_features, [batch_size, self.window_size, time_width])
        time_in, time_out = array_ops.split(
            time_features,
            (self.input_window_size, self.output_window_size),
            axis=1)
        input_width += time_width
        output_width += time_width
        input_pieces.append(time_in)
        output_pieces.append(time_out)

    if self.input_window_size:
        # Observed values are only available for the input window.
        observed = array_ops.slice(
            values, [0, 0, 0], [-1, self.input_window_size, -1])
        observed = array_ops.reshape(
            observed, [batch_size, self.input_window_size, self.num_features])
        input_pieces.append(observed)
        input_width += self.num_features

    if self.exogenous_size:
        # Exogenous regressors cover the whole window and feed both halves.
        exogenous_in, exogenous_out = array_ops.split(
            exogenous_regressors,
            (self.input_window_size, self.output_window_size),
            axis=1)
        input_width += self.exogenous_size
        output_width += self.exogenous_size
        input_pieces.append(exogenous_in)
        output_pieces.append(exogenous_out)

    assert input_pieces
    input_window_features = array_ops.concat(input_pieces, axis=2)
    if output_pieces:
        output_window_features = array_ops.concat(output_pieces, axis=2)
    else:
        # No output-window features: use a tensor with a zero-width last
        # dimension so downstream shapes stay consistent.
        output_window_features = array_ops.zeros(
            [batch_size, self.output_window_size, 0], dtype=self.dtype)

    static_batch_size = times.get_shape().dims[0].value
    input_window_features.set_shape(
        [static_batch_size, self.input_window_size, input_width])
    output_window_features.set_shape(
        [static_batch_size, self.output_window_size, output_width])
    return self._output_window_predictions(input_window_features,
                                           output_window_features)
def __call__(self, inputs, state, scope=None):
    """Run one step of the phased long short-term memory cell (P-LSTM).

    Args:
      inputs: 2-D tensor carrying both signals: column 0 is the `x` input
        and column 1 is the time input `t`.
      state: the previous cell state and hidden state, either as a tuple
        `(c, h)` or as one tensor that is split in two along axis 1.
      scope: optional variable scope; defaults to the class name.

    Returns:
      A tuple `(new_h, (new_c, new_h))`: the new hidden state and the new
      state tuple.
    """
    with vs.variable_scope(scope or type(self).__name__):
        # A tuple state is unpacked directly; anything else is treated as a
        # single tensor and split along its second dimension. This must be
        # an `isinstance` check: `state is tuple` compares against the
        # `tuple` type object itself and is never true for a tuple instance.
        if isinstance(state, tuple):
            c_prev, h_prev = state
        else:
            c_prev, h_prev = array_ops.split(
                value=state, num_or_size_splits=2, axis=1)

        # NB: here we explicitly give t as input.
        # Column 0 of `inputs` is x, reshaped into a single-column matrix.
        x = tf.reshape(inputs[:, 0], (-1, 1))
        # Column 1 holds the timestamps; only the last batch entry is used.
        # Now we only accept one id. We have a batch so it's a bit more
        # complex. Maybe the information should come from the outside.
        # To be defined later.
        t = inputs[:, 1][-1]

        # Parameters of gates are concatenated into one multiply for
        # efficiency. No activation is applied yet, so the peephole terms
        # can still be added to the pre-activations below.
        concat = _linear([x, h_prev], 4 * self._num_units, True)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = array_ops.split(
            value=concat, num_or_size_splits=4, axis=1)

        dtype = inputs.dtype
        # The time-gate parameters have one entry per unit: tau is the
        # period, r_on the open ratio and s the phase shift.
        tau = vs.get_variable(
            'tau', shape=[self._num_units],
            initializer=random_exp_initializer(0, self.tau_init),
            dtype=dtype)
        r_on = vs.get_variable(
            'r_on', shape=[self._num_units],
            initializer=init_ops.constant_initializer(self.r_on_init),
            dtype=dtype)
        s = vs.get_variable(
            's', shape=[self._num_units],
            initializer=init_ops.random_uniform_initializer(
                0., tau.initialized_value()),
            dtype=dtype)

        # Repeat the timestamp once per unit so it lines up with the
        # per-unit time-gate parameters.
        times = tf.tile(tf.reshape(t, [-1, 1]), [1, self._num_units])
        phase = phi(times, s, tau)
        # element-wise calculation
        kappa = time_gate_fast(phase, r_on, self._leak_rate,
                               self._training_phase)

        w_o_peephole = None
        # With peephole connections, the previous cell state contributes to
        # the forget and input gate pre-activations.
        if self._use_peepholes:
            w_i_peephole = vs.get_variable(
                'W_I_peephole', shape=[self._num_units], dtype=dtype)
            w_f_peephole = vs.get_variable(
                'W_F_peephole', shape=[self._num_units], dtype=dtype)
            w_o_peephole = vs.get_variable(
                'W_O_peephole', shape=[self._num_units], dtype=dtype)
            f += w_f_peephole * c_prev
            i += w_i_peephole * c_prev

        new_c_tilde = sigmoid(f) * c_prev + sigmoid(i) * self._activation(j)
        if self._use_peepholes:
            # The output peephole reads c_tilde, not the time-gated c. Per
            # the paper author's clarification, Equation 4 of the paper
            # should reference c_tilde: the input, forget and output gates
            # must be blind to the time (khronos) gate and behave as in a
            # normal LSTM cell, while the time gate only interpolates
            # linearly between the updated and the previous state.
            # Otherwise the cell would depend on the time gate's own
            # operation.
            o += w_o_peephole * new_c_tilde

        new_h_tilde = sigmoid(o) * self._activation(new_c_tilde)

        # Apply Khronos gate: blend the proposed update with the previous
        # state according to the time-gate openness kappa.
        new_h = kappa * new_h_tilde + (1 - kappa) * h_prev
        new_c = kappa * new_c_tilde + (1 - kappa) * c_prev
        new_state = (new_c, new_h)
        return new_h, new_state
def _process_input_helper(self,
                          update_row_factors,
                          sp_input=None,
                          transpose_input=False,
                          row_weights=None):
    """Creates the graph for processing a sparse slice of input.

    Builds three things for the given sparse slice: the solve that yields
    new factor values, the scatter update that writes them back, and the
    loss subgraph.

    Args:
      update_row_factors: if True, update or project the row_factors, else
        update or project the column factors.
      sp_input: Please refer to comments for update_row_factors,
        update_col_factors, project_row_factors, and project_col_factors for
        restrictions. Must be a SparseTensor (asserted below).
      transpose_input: If True, the input is logically transposed and then the
        corresponding rows/columns of the transposed input are updated.
      row_weights: If not None, this is the row/column weights to be used for
        the update or projection. If None, use the corresponding weights from
        the model. Note that the feature (column/row) weights will be
        determined by the model. When not None, it can either be a scalar or
        a rank-1 tensor with the same number of elements as the number of rows
        or columns to be updated/projected.

    Returns:
      A tuple consisting of the following three elements:
      new_values: New values for the row/column factors.
      update_op: An op that assigns the newly computed values to the
        row/column factors.
      loss: A tensor (scalar) that contains the normalized minibatch loss,
        corresponding to sp_input.
    """
    assert isinstance(sp_input, sparse_tensor.SparseTensor)

    # Pick the factors being solved for ("left") and the cached opposite-side
    # factors, weights and gramian ("right") depending on the update direction.
    if update_row_factors:
        left = self._row_factors
        right_factors = self._col_factors_cache
        row_wt = self._row_wt_cache
        col_wt = self._col_wt_cache
        total_rows = self._input_rows
        sharding_func = WALSModel._get_sharding_func(self._input_rows,
                                                     self._num_row_shards)
        gramian = self._col_gramian_cache
    else:
        left = self._col_factors
        right_factors = self._row_factors_cache
        row_wt = self._col_wt_cache
        col_wt = self._row_wt_cache
        total_rows = self._input_cols
        sharding_func = WALSModel._get_sharding_func(self._input_cols,
                                                     self._num_col_shards)
        gramian = self._row_gramian_cache
        # A column update is handled as a row update on the transposed input.
        transpose_input = not transpose_input

    # Note that the row indices of sp_input are based on the original full input
    # Here we reindex the rows and give them contiguous ids starting at 0.
    # We use tf.unique to achieve this reindexing. Note that this is done so
    # that the downstream kernel can assume that the input is "dense" along the
    # row dimension.
    row_ids, col_ids = array_ops.split(
        value=sp_input.indices, num_or_size_splits=2, axis=1)
    update_row_indices, all_row_ids = array_ops.unique(row_ids[:, 0])
    update_col_indices, all_col_ids = array_ops.unique(col_ids[:, 0])
    col_ids = array_ops.expand_dims(math_ops.cast(all_col_ids, dtypes.int64), 1)
    row_ids = array_ops.expand_dims(math_ops.cast(all_row_ids, dtypes.int64), 1)

    # update_indices selects the factors being rewritten; gather_indices
    # selects the opposite-side factors looked up for the solve.
    if transpose_input:
        update_indices = update_col_indices
        row_shape = [
            math_ops.cast(array_ops.shape(update_row_indices)[0], dtypes.int64)
        ]
        gather_indices = update_row_indices
    else:
        update_indices = update_row_indices
        row_shape = [
            math_ops.cast(array_ops.shape(update_col_indices)[0], dtypes.int64)
        ]
        gather_indices = update_col_indices

    num_rows = math_ops.cast(array_ops.shape(update_indices)[0], dtypes.int64)
    col_shape = [num_rows]
    right = embedding_ops.embedding_lookup(
        right_factors, gather_indices, partition_strategy="div")
    # Rebuild the sparse input on the compacted (re-indexed) coordinates.
    new_sp_indices = array_ops.concat([row_ids, col_ids], 1)
    new_sp_shape = (array_ops.concat([row_shape, col_shape], 0)
                    if transpose_input else
                    array_ops.concat([col_shape, row_shape], 0))
    new_sp_input = sparse_tensor.SparseTensor(
        indices=new_sp_indices,
        values=sp_input.values,
        dense_shape=new_sp_shape)

    # Compute lhs and rhs of the normal equations
    total_lhs = (self._unobserved_weight * gramian)
    if self._regularization_matrix is not None:
        total_lhs += self._regularization_matrix
    if self._row_weights is None:
        # Special case of ALS. Use a much simpler update rule.
        total_rhs = (self._unobserved_weight *
                     sparse_ops.sparse_tensor_dense_matmul(
                         new_sp_input, right, adjoint_a=transpose_input))
        # TODO(rmlarsen): handle transposing in tf.matrix_solve instead of
        # transposing explicitly.
        # TODO(rmlarsen): multi-thread tf.matrix_solve.
        new_left_values = array_ops.transpose(
            linalg_ops.matrix_solve(total_lhs, array_ops.transpose(total_rhs)))
    else:
        if row_weights is None:
            # TODO(yifanchen): Add special handling for single shard without using
            # embedding_lookup and perform benchmarks for those cases. Same for
            # col_weights lookup below.
            row_weights_slice = embedding_ops.embedding_lookup(
                row_wt, update_indices, partition_strategy="div")
        else:
            num_indices = array_ops.shape(update_indices)[0]
            # Caller-supplied weights must be rank 0 or 1; a scalar is
            # expanded to one weight per updated index.
            with ops.control_dependencies(
                [check_ops.assert_less_equal(array_ops.rank(row_weights), 1)]):
                row_weights_slice = control_flow_ops.cond(
                    math_ops.equal(array_ops.rank(row_weights), 0),
                    lambda: (array_ops.ones([num_indices]) * row_weights),
                    lambda: math_ops.cast(row_weights, dtypes.float32))

        col_weights = embedding_ops.embedding_lookup(
            col_wt, gather_indices, partition_strategy="div")
        partial_lhs, total_rhs = (
            gen_factorization_ops.wals_compute_partial_lhs_and_rhs(
                right,
                col_weights,
                self._unobserved_weight,
                row_weights_slice,
                new_sp_input.indices,
                new_sp_input.values,
                num_rows,
                transpose_input,
                name="wals_compute_partial_lhs_rhs"))
        # The shared lhs gets a leading dimension so it broadcasts against
        # partial_lhs; the rhs gets a trailing one, squeezed out after solving.
        total_lhs = array_ops.expand_dims(total_lhs, 0) + partial_lhs
        total_rhs = array_ops.expand_dims(total_rhs, -1)
        new_left_values = array_ops.squeeze(
            linalg_ops.matrix_solve(total_lhs, total_rhs), [2])

    update_op_name = "row_update" if update_row_factors else "col_update"
    update_op = self.scatter_update(
        left,
        update_indices,
        new_left_values,
        sharding_func,
        name=update_op_name)

    # Create the loss subgraph
    loss_sp_input = (sparse_ops.sparse_transpose(new_sp_input)
                     if transpose_input else new_sp_input)
    # sp_approx is the low rank estimate of the input matrix, formed by
    # computing the product <u_i, v_j> for (i, j) in loss_sp_input.indices.
    sp_approx_vals = gen_factorization_ops.masked_matmul(
        new_left_values,
        right,
        loss_sp_input.indices,
        transpose_a=False,
        transpose_b=True)
    sp_approx = sparse_tensor.SparseTensor(
        loss_sp_input.indices, sp_approx_vals, loss_sp_input.dense_shape)
    sp_approx_sq = math_ops.square(sp_approx)
    sp_residual = sparse_ops.sparse_add(loss_sp_input, sp_approx * (-1))
    sp_residual_sq = math_ops.square(sp_residual)
    row_wt_mat = (constant_op.constant(0.)
                  if self._row_weights is None else
                  array_ops.expand_dims(row_weights_slice, 1))
    # NOTE(review): col_weights is only bound in the weighted branch above, so
    # this presumably relies on _col_weights being None whenever _row_weights
    # is None -- confirm against the constructor's contract.
    col_wt_mat = (constant_op.constant(0.)
                  if self._col_weights is None else
                  array_ops.expand_dims(col_weights, 0))
    # We return the normalized loss
    partial_row_gramian = math_ops.matmul(
        new_left_values, new_left_values, transpose_a=True)
    # Scales the minibatch loss by (total rows) / (rows in this slice).
    normalization_factor = total_rows / math_ops.cast(num_rows, dtypes.float32)

    loss = (
        self._unobserved_weight * (
            sparse_ops.sparse_reduce_sum(sp_residual_sq) -
            sparse_ops.sparse_reduce_sum(sp_approx_sq) +
            math_ops.trace(math_ops.matmul(partial_row_gramian, gramian))
        ) +
        sparse_ops.sparse_reduce_sum(row_wt_mat * (sp_residual_sq * col_wt_mat))
    ) * normalization_factor
    if self._regularization is not None:
        loss += self._regularization * (
            math_ops.trace(partial_row_gramian) * normalization_factor +
            math_ops.trace(gramian)
        )

    return (new_left_values, update_op, loss)
def _training_examples_and_variables():
    """Returns dictionaries for training examples and variables.

    Walks the feature columns in key order and sorts each into the dense or
    sparse feature lists handed to SDCA, together with the matching weight
    variable of each column.

    Returns:
      A pair ``(examples, sdca_variables)``: ``examples`` holds the sparse
      and dense features plus example labels, weights and ids;
      ``sdca_variables`` holds the sparse and dense feature weights.

    Raises:
      ValueError: if a column is of a type SDCA does not support.
    """
    batch_size = targets.get_shape()[0]

    # Iterate over all feature columns and create appropriate lists for dense
    # and sparse features as well as dense and sparse weights (variables) for
    # SDCA.
    # TODO(sibyl-vie3Poto): Reshape variables stored as values in column_to_variables
    # dict as 1-dimensional tensors.
    dense_features, sparse_features, sparse_feature_with_values = [], [], []
    dense_feature_weights = []
    sparse_feature_weights, sparse_feature_with_values_weights = [], []
    # pylint: disable=protected-access
    for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
        transformed_tensor = features[column]
        if isinstance(column, layers.feature_column._RealValuedColumn):
            # A real-valued column corresponds to a dense feature in SDCA. A
            # transformed tensor corresponding to a RealValuedColumn has rank 2
            # (its shape is typically [batch_size, column.dimension]) and so it
            # can be passed to SDCA as is.
            dense_features.append(transformed_tensor)
            # For real valued columns, the variables list contains exactly one
            # element.
            dense_feature_weights.append(columns_to_variables[column][0])
        elif isinstance(column, layers.feature_column._BucketizedColumn):
            # A bucketized column corresponds to a sparse feature in SDCA. The
            # bucketized feature is "sparsified" for SDCA by converting it to a
            # SparseFeatureColumn representing the one-hot encoding of the
            # bucketized feature.
            dense_bucket_tensor = column.to_dnn_input_layer(transformed_tensor)
            sparse_feature_column = _tensor_to_sparse_feature_column(
                dense_bucket_tensor)
            sparse_feature_with_values.append(sparse_feature_column)
            # For bucketized columns, the variables list contains exactly one
            # element.
            sparse_feature_with_values_weights.append(
                columns_to_variables[column][0])
        elif isinstance(column, (layers.feature_column._CrossedColumn,
                                 layers.feature_column._SparseColumn)):
            # Keep only the first column of the indices matrix. The split uses
            # the (value, num_or_size_splits, axis) signature; with the old
            # positional order (1, 2, indices) the constant 1 would be taken
            # as the value to split.
            sparse_features.append(sdca_ops.SparseFeatureColumn(
                array_ops.reshape(
                    array_ops.split(
                        value=transformed_tensor.indices,
                        num_or_size_splits=2,
                        axis=1)[0], [-1]),
                array_ops.reshape(transformed_tensor.values, [-1]),
                None))
            sparse_feature_weights.append(columns_to_variables[column][0])
        elif isinstance(column, layers.feature_column._WeightedSparseColumn):
            id_tensor = column.id_tensor(transformed_tensor)
            weight_tensor = column.weight_tensor(transformed_tensor)
            sparse_feature_with_values.append(sdca_ops.SparseFeatureColumn(
                array_ops.reshape(
                    array_ops.split(
                        value=id_tensor.indices,
                        num_or_size_splits=2,
                        axis=1)[0], [-1]),
                array_ops.reshape(id_tensor.values, [-1]),
                array_ops.reshape(weight_tensor.values, [-1])))
            sparse_feature_with_values_weights.append(
                columns_to_variables[column][0])
        else:
            raise ValueError('SDCAOptimizer does not support column type %s.' %
                             type(column).__name__)
    # pylint: enable=protected-access

    # Without a weight column every example gets weight 1.
    example_weights = array_ops.reshape(
        features[weight_column_name],
        shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
    example_ids = features[self._example_id_column]
    # Value-less sparse features are appended after the valued ones; the
    # weights are extended in the same order so positions stay aligned.
    sparse_feature_with_values.extend(sparse_features)
    sparse_feature_with_values_weights.extend(sparse_feature_weights)
    examples = dict(
        sparse_features=sparse_feature_with_values,
        dense_features=dense_features,
        example_labels=math_ops.to_float(
            array_ops.reshape(targets, shape=[-1])),
        example_weights=example_weights,
        example_ids=example_ids)
    sdca_variables = dict(
        sparse_features_weights=sparse_feature_with_values_weights,
        dense_features_weights=dense_feature_weights)
    return examples, sdca_variables
def call(self, inputs, state):
    """Run one step of the layer-normalized LSTM cell.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        2D, batch x state_size. If `state_is_tuple` is True, this must be a
        tuple of state Tensors, both 2D, with column sizes `c_state` and
        `m_state`.

    Returns:
      A pair `(m, new_state)`: the cell output and the new state, the latter
      an `LSTMStateTuple(c, m)` when `state_is_tuple` is True and the
      concatenation of `c` and `m` along axis 1 otherwise.

    Raises:
      ValueError: if the input size cannot be inferred from the static shape
        of `inputs`.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj
    sigmoid = math_ops.sigmoid

    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError(
            'Could not infer input size from inputs.get_shape()[-1]')

    # The gate projection is built once and reused on later calls.
    if self._linear1 is None:
        scope = vs.get_variable_scope()
        with vs.variable_scope(scope, initializer=self._initializer):
            self._linear1 = _Linear([inputs, m_prev], 4 * self._num_units, True)

    lstm_matrix = self._linear1([inputs, m_prev])
    # i=input_gate, j=new_input, f=forget_gate, o=output_gate
    i, j, f, o = array_ops.split(
        value=lstm_matrix, num_or_size_splits=4, axis=1)

    # One layer-normalization object per gate pre-activation, created lazily.
    if self._ln_i is None:
        self._ln_i = Layer_Normalization([self._num_units], scope='i_norm')
    if self._ln_j is None:
        self._ln_j = Layer_Normalization([self._num_units], scope='j_norm')
    if self._ln_f is None:
        self._ln_f = Layer_Normalization([self._num_units], scope='f_norm')
    if self._ln_o is None:
        self._ln_o = Layer_Normalization([self._num_units], scope='o_norm')

    i = self._ln_i(i)
    j = self._ln_j(j)
    f = self._ln_f(f)
    o = self._ln_o(o)

    # diagonal connections
    if self._use_peepholes and not self._w_f_diag:
        scope = vs.get_variable_scope()
        with vs.variable_scope(
                scope, initializer=self._initializer) as unit_scope:
            with vs.variable_scope(unit_scope):
                self._w_f_diag = vs.get_variable(
                    "w_f_diag", shape=[self._num_units], dtype=dtype)
                self._w_i_diag = vs.get_variable(
                    "w_i_diag", shape=[self._num_units], dtype=dtype)
                self._w_o_diag = vs.get_variable(
                    "w_o_diag", shape=[self._num_units], dtype=dtype)

    if self._ln_p1 is None:
        self._ln_p1 = Layer_Normalization([self._num_units], scope='p1_norm')
    if self._ln_p2 is None:
        self._ln_p2 = Layer_Normalization([self._num_units], scope='p2_norm')

    if self._use_peepholes:
        peep1 = self._w_f_diag * c_prev
        peep2 = self._w_i_diag * c_prev
        # The forget gate must scale the previous cell state, exactly as in
        # the non-peephole branch below. The `* c_prev` factor was missing
        # here, so the old cell state was never carried forward.
        c = (sigmoid(f + self._forget_bias + self._ln_p1(peep1)) * c_prev +
             sigmoid(i + self._ln_p2(peep2)) * self._activation(j))
    else:
        c = (sigmoid(f + self._forget_bias) * c_prev +
             sigmoid(i) * self._activation(j))

    if self._ln_c is None:
        self._ln_c = Layer_Normalization([self._num_units], scope='c_norm')
    c = self._ln_c(c)

    if self._use_peepholes:
        m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
    else:
        m = sigmoid(o) * self._activation(c)

    if self._num_proj is not None:
        if self._linear2 is None:
            scope = vs.get_variable_scope()
            with vs.variable_scope(scope, initializer=self._initializer):
                with vs.variable_scope("projection"):
                    self._linear2 = _Linear(m, self._num_proj, False)
        m = self._linear2(m)

        if self._proj_clip is not None:
            m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                 array_ops.concat([c, m], 1))
    return m, new_state
def call(self, inputs, state):
    """Advance the LSTM by a single time step.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: when `state_is_tuple` is False, one state Tensor of shape
        batch x state_size; when it is True, a tuple of two 2-D state
        Tensors with column sizes `c_state` and `m_state`.

    Returns:
      A tuple `(output, new_state)`:
      - `output` is a 2-D `[batch x output_dim]` Tensor, where output_dim is
        num_proj if num_proj was set and num_units otherwise.
      - `new_state` has the same type and shape(s) as `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    sigmoid = math_ops.sigmoid
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    # Recover the previous cell state and output from whichever layout the
    # state uses.
    if self._state_is_tuple:
        c_prev, m_prev = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    if inputs.get_shape().with_rank(2)[1].value is None:
        raise ValueError(
            "Could not infer input size from inputs.get_shape()[-1]")

    # Build the fused gate projection on first use only.
    if self._linear1 is None:
        with vs.variable_scope(
                vs.get_variable_scope(),
                initializer=self._initializer) as cell_scope:
            if self._num_unit_shards is not None:
                cell_scope.set_partitioner(
                    partitioned_variables.fixed_size_partitioner(
                        self._num_unit_shards))
            self._linear1 = _Linear([inputs, m_prev], 4 * self._num_units, True)

    # Pre-activations, in order: input gate, new input, forget gate,
    # output gate.
    fused = self._linear1([inputs, m_prev])
    in_pre, cand_pre, forget_pre, out_pre = array_ops.split(
        value=fused, num_or_size_splits=4, axis=1)

    peepholes = self._use_peepholes

    # Diagonal (peephole) weights are created the first time they are needed.
    if peepholes and not self._w_f_diag:
        with vs.variable_scope(
                vs.get_variable_scope(),
                initializer=self._initializer) as cell_scope:
            with vs.variable_scope(cell_scope):
                diag_shape = [self._num_units]
                self._w_f_diag = vs.get_variable(
                    "w_f_diag", shape=diag_shape, dtype=dtype)
                self._w_i_diag = vs.get_variable(
                    "w_i_diag", shape=diag_shape, dtype=dtype)
                self._w_o_diag = vs.get_variable(
                    "w_o_diag", shape=diag_shape, dtype=dtype)

    # New cell state: retained part of the old state plus the gated candidate.
    if peepholes:
        retained = sigmoid(
            forget_pre + self._forget_bias + self._w_f_diag * c_prev) * c_prev
        written = (sigmoid(in_pre + self._w_i_diag * c_prev) *
                   self._activation(cand_pre))
    else:
        retained = sigmoid(forget_pre + self._forget_bias) * c_prev
        written = sigmoid(in_pre) * self._activation(cand_pre)
    c = retained + written

    if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

    # Output gate, optionally peeking at the new cell state.
    if peepholes:
        out_gate = sigmoid(out_pre + self._w_o_diag * c)
    else:
        out_gate = sigmoid(out_pre)
    m = out_gate * self._activation(c)

    if self._num_proj is not None:
        # The projection layer is likewise built once and reused.
        if self._linear2 is None:
            with vs.variable_scope(
                    vs.get_variable_scope(), initializer=self._initializer):
                with vs.variable_scope("projection") as proj_scope:
                    if self._num_proj_shards is not None:
                        proj_scope.set_partitioner(
                            partitioned_variables.fixed_size_partitioner(
                                self._num_proj_shards))
                    self._linear2 = _Linear(m, self._num_proj, False)
        m = self._linear2(m)

        if self._proj_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
            # pylint: enable=invalid-unary-operand-type

    if self._state_is_tuple:
        new_state = LSTMStateTuple(c, m)
    else:
        new_state = array_ops.concat([c, m], 1)
    return m, new_state
def loop_fn(i):
    """Gather slice `i` of `x` and split it two ways.

    Returns a pair: the slice split into 2 pieces along axis 0, and the same
    slice split into 3 pieces along its last axis.
    """
    gathered = array_ops.gather(x, i)
    along_first = array_ops.split(gathered, 2, axis=0)
    along_last = array_ops.split(gathered, 3, axis=-1)
    return along_first, along_last
def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) with nunits cells and normalized gates.

    The reset gate, update gate and candidate are each normalized with
    `tf.nn.batch_normalization`, using moments taken over axis 1 of that
    same tensor and a learned beta/gamma pair per tensor.

    Args:
      inputs: input tensor for this step.
      state: previous hidden state.
      scope: optional variable scope name; defaults to "gru_cell".

    Returns:
      A pair `(new_h, new_h)`: the new hidden state, returned as both the
      output and the state.
    """

    def _norm_params(scope_name, prefix, size):
        # Fetch the <prefix>beta / <prefix>gamma pair under `scope_name`,
        # switching the scope to reuse when creation raises ValueError.
        with vs.variable_scope(scope_name) as norm_scope:
            try:
                beta = tf.get_variable(
                    prefix + "beta", size, dtype=tf.float32,
                    initializer=tf.constant_initializer(0.0))
                gamma = tf.get_variable(
                    prefix + "gamma", size, dtype=tf.float32,
                    initializer=tf.constant_initializer(0.1))
            except ValueError:
                norm_scope.reuse_variables()
                beta = tf.get_variable(
                    prefix + "beta", size, dtype=tf.float32,
                    initializer=tf.constant_initializer(0.0))
                gamma = tf.get_variable(
                    prefix + "gamma", size, dtype=tf.float32,
                    initializer=tf.constant_initializer(0.1))
        return beta, gamma

    with _checked_scope(self, scope or "gru_cell", reuse=self._reuse):
        with vs.variable_scope("gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            value = sigmoid(
                _linear([inputs, state], 2 * self._num_units, True, 1.0))
            r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)

            # Initialization values are from the paper
            # "Recurrent Batch Normalization".
            r_mean, r_var = tf.nn.moments(
                r, [1], name="r_moments", keep_dims=True)
            # Each tensor is normalized with its own statistics; the update
            # gate previously used the reset gate's moments.
            u_mean, u_var = tf.nn.moments(
                u, [1], name="u_moments", keep_dims=True)

            rbeta, rgamma = _norm_params("r_beta", "r", r.get_shape()[1])
            ubeta, ugamma = _norm_params("u_beta", "u", u.get_shape()[1])

            r = tf.nn.batch_normalization(
                r, r_mean, r_var, rbeta, rgamma, 0.000001)
            u = tf.nn.batch_normalization(
                u, u_mean, u_var, ubeta, ugamma, 0.000001)

        with vs.variable_scope("candidate"):
            # The activation is applied after normalization, not directly to
            # the linear output.
            c = _linear([inputs, r * state], self._num_units, True)
            # Moments of the candidate itself; these were previously taken
            # from the already-normalized reset gate.
            c_mean, c_var = tf.nn.moments(
                c, [1], name="c_moments", keep_dims=True)
            cbeta, cgamma = _norm_params("c_beta", "c", c.get_shape()[1])
            c = self._activation(
                tf.nn.batch_normalization(
                    c, c_mean, c_var, cbeta, cgamma, 0.000001))

        new_h = u * state + (1 - u) * c
    return new_h, new_h