def get_online_sequences(sequence_length, batch_size, pattern_length=10):
  """Gets tensors which produce new random examples every time they are evaluated.

  Args:
    sequence_length: the length of the time-lag the model has to remember
      the sequence for.
    batch_size: how many sequences per batch.
    pattern_length: the length of the pattern that has to be remembered
      and regurgitated.

  Returns:
    (data, targets): data is `[sequence_length + 2*pattern_length, batch_size, 1]`,
      targets are also `[sequence_length + 2*pattern_length, batch_size, 1]`.
  """
  # first we need a pattern to remember
  pattern = tf.random_uniform([pattern_length, batch_size, 1], maxval=8,
                              dtype=tf.int32)
  central_fillers = tf.fill([sequence_length - 1, batch_size, 1], 8)
  go = tf.fill([1, batch_size, 1], 9)
  final_fillers = tf.fill([pattern_length, batch_size, 1], 8)
  inputs = tf.concat(axis=0, values=[pattern, central_fillers, go, final_fillers])
  fillers = tf.fill([sequence_length + pattern_length, batch_size, 1], 8)
  targets = tf.concat(axis=0, values=[fillers, pattern])
  return inputs, targets
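# --- Added usage sketch (not part of the original): drawing one batch from the
# copy-task pipeline above, assuming TensorFlow 1.x. Shapes follow the
# docstring: [sequence_length + 2*pattern_length, batch_size, 1].
import tensorflow as tf

inputs, targets = get_online_sequences(sequence_length=20, batch_size=4)
with tf.Session() as sess:
    x, y = sess.run([inputs, targets])
    print(x.shape, y.shape)  # (40, 4, 1) (40, 4, 1)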
def thresholding(inputs):
    # find the mean for each example in the batch
    mean_output = tf.reduce_mean(inputs, axis=1)

    # scale each mean based on a factor
    threshold_scalar = tf.Variable(utils.threshold_scalar, tf.float32)
    scaled_mean = tf.scalar_mul(threshold_scalar, mean_output)
    scaled_mean = tf.reshape(scaled_mean, [utils.batch_size])

    # set up per-example lower/upper bounds and clamp each threshold between them
    min_thresh_for_max = tf.fill([utils.batch_size], 0.05)
    max_thresh_for_min = tf.fill([utils.batch_size], 0.15)  # 0.4
    thresholds = tf.maximum(min_thresh_for_max, scaled_mean)
    thresholds = tf.minimum(max_thresh_for_min, thresholds)

    # zero values under the thresholds using a bitmask
    thresholds = tf.reshape(thresholds, [128, 1, 1])
    threshold_mask = tf.cast(tf.greater(inputs, thresholds), tf.float32)
    thresholded_input = tf.multiply(inputs, threshold_mask)

    # peak picking:
    # select beats by x[i-1] < x[i] > x[i+1] (local maximum)
    x_minus_1 = tf.cast(tf.greater(thresholded_input, tf.manip.roll(thresholded_input, shift=-1, axis=1)), tf.float32)
    x_plus_1 = tf.cast(tf.greater(thresholded_input, tf.manip.roll(thresholded_input, shift=1, axis=1)), tf.float32)
    output = tf.multiply(x_minus_1, x_plus_1)

    return output
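# Standalone sketch of the peak-picking rule above (x[i-1] < x[i] > x[i+1]),
# assuming TensorFlow 1.x with tf.manip.roll. Note the comparison wraps around
# at the sequence edges, as in the snippet.
import tensorflow as tf

x = tf.constant([[0.1, 0.5, 0.2, 0.7, 0.3]])
left = tf.cast(tf.greater(x, tf.manip.roll(x, shift=1, axis=1)), tf.float32)
right = tf.cast(tf.greater(x, tf.manip.roll(x, shift=-1, axis=1)), tf.float32)
with tf.Session() as sess:
    print(sess.run(left * right))  # [[0. 1. 0. 1. 0.]]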
def _variance(self): # We need to put the tf.where inside the outer tf.where to ensure we never # hit a NaN in the gradient. denom = tf.where(tf.greater(self.df, 2.), self.df - 2., tf.ones_like(self.df)) # Abs(scale) superfluous. var = (tf.ones(self.batch_shape_tensor(), dtype=self.dtype) * tf.square(self.scale) * self.df / denom) # When 1 < df <= 2, variance is infinite. inf = np.array(np.inf, dtype=self.dtype.as_numpy_dtype()) result_where_defined = tf.where( self.df > tf.fill(self.batch_shape_tensor(), 2.), var, tf.fill(self.batch_shape_tensor(), inf, name="inf")) if self.allow_nan_stats: nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype()) return tf.where( tf.greater( self.df, tf.ones(self.batch_shape_tensor(), dtype=self.dtype)), result_where_defined, tf.fill(self.batch_shape_tensor(), nan, name="nan")) else: return control_flow_ops.with_dependencies( [ tf.assert_less( tf.ones([], dtype=self.dtype), self.df, message="variance not defined for components of df <= 1"), ], result_where_defined)
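# Quick numeric check of the formula encoded above (plain Python, no TF):
# for df > 2 the Student-t variance is scale**2 * df / (df - 2).
df, scale = 5.0, 2.0
print(scale ** 2 * df / (df - 2.0))  # 6.666...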
def testParallelAssignWithLocking(self): with self.test_session() as sess: zeros_t = tf.fill([1024, 1024], 0.0) ones_t = tf.fill([1024, 1024], 1.0) p = tf.Variable(zeros_t) assigns = [tf.assign(p, tf.mul(ones_t, float(i)), use_locking=True) for i in range(1, 21)] p.initializer.run() def run_assign(assign_op): sess.run(assign_op) threads = [self.checkedThread(target=run_assign, args=(assign_op,)) for assign_op in assigns] for t in threads: t.start() for t in threads: t.join() vals = p.eval() # Assert every element is the same, and taken from one of the assignments. self.assertTrue(vals[0, 0] > 0) self.assertTrue(vals[0, 0] <= 20) self.assertAllEqual(vals, np.ones([1024, 1024]) * vals[0, 0])
def language_model(input, vocab_size):
  """Forms p(x[0], ..., x[timesteps - 1]) = \prod_{t=0}^{timesteps - 1} p(x[t] | x[:t]).

  To calculate the probability, we call log_prob on
  x = [x[0], ..., x[timesteps - 1]] given
  `input` = [0, x[0], ..., x[timesteps - 2]].

  We implement this separately from the generative model so the forward pass,
  e.g., embedding/dense layers, can be parallelized.

  [batch_size, timesteps] -> [batch_size, timesteps]
  """
  x = tf.one_hot(input, depth=vocab_size, dtype=tf.float32)
  h = tf.fill(tf.stack([tf.shape(x)[0], FLAGS.hidden_size]), 0.0)
  c = tf.fill(tf.stack([tf.shape(x)[0], FLAGS.hidden_size]), 0.0)
  hs = []
  reuse = None
  for t in range(FLAGS.timesteps):
    if t > 0:
      reuse = True
    xt = x[:, t, :]
    h, c = lstm_cell(xt, h, c, name="lstm", reuse=reuse)
    hs.append(h)
  h = tf.stack(hs, 1)
  logits = tf.layers.dense(h, vocab_size, name="dense")
  output = Categorical(logits=logits)
  return output
def getLoss(trueCosSim, falseCosSim, margin): zero = tf.fill(tf.shape(trueCosSim), 0.0) tfMargin = tf.fill(tf.shape(trueCosSim), margin) with tf.name_scope("loss"): losses = tf.maximum(zero, tf.subtract(tfMargin, tf.subtract(trueCosSim, falseCosSim))) loss = tf.reduce_sum(losses) return loss
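# Hedged usage sketch for the hinge loss above (TF 1.x assumed): the per-pair
# loss is max(0, margin - (trueCosSim - falseCosSim)), summed over the batch.
import tensorflow as tf

true_sim = tf.constant([0.9, 0.4])
false_sim = tf.constant([0.2, 0.5])
loss = getLoss(true_sim, false_sim, margin=0.3)
with tf.Session() as sess:
    print(sess.run(loss))  # max(0, -0.4) + max(0, 0.4) = 0.4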
def _create_state(self, batch_size, dtype, cell_state=None): cand_symbols = tf.fill([batch_size, self.max_len], tf.constant(self.start_token, dtype=tf.int32)) cand_logprobs = tf.ones((batch_size,), dtype=tf.float32) * -float('inf') cand_symbols.set_shape([batch_size, self.max_len]) if cell_state is None: cell_state = self.cell.zero_state(batch_size*self.beam_size, dtype=dtype) else: cell_state = BeamDecoder._tile_along_beam(self.beam_size, cell_state) full_size = batch_size * self.beam_size first_in_beam_mask = tf.equal(tf.range(full_size) % self.beam_size, 0) beam_symbols = tf.fill([full_size, self.max_len], tf.constant(self.start_token, dtype=tf.int32)) beam_logprobs = tf.select( first_in_beam_mask, tf.fill([full_size], 0.0), tf.fill([full_size], -1e18), # top_k does not play well with -inf # TODO: dtype-dependent value here ) return ( cand_symbols, cand_logprobs, beam_symbols, beam_logprobs, cell_state )
def compute_ans(op_embedding, comparison): op_embedding = tf.expand_dims(op_embedding, 0) #dot product of operation embedding with hidden state to the left of the number occurrence first = tf.transpose( tf.matmul(op_embedding, tf.transpose( tf.reduce_sum(hidden_vectors * tf.tile( tf.expand_dims( tf.transpose(self.batch_ordinal_question), 2), [1, 1, self.utility.FLAGS.embedding_dims]), 0)))) second = self.batch_question_number_one_mask + tf.transpose( tf.matmul(op_embedding, tf.transpose( tf.reduce_sum(hidden_vectors * tf.tile( tf.expand_dims( tf.transpose(self.batch_ordinal_question_one), 2 ), [1, 1, self.utility.FLAGS.embedding_dims]), 0)))) question_number_softmax = tf.nn.softmax(tf.concat(axis=1, values=[first, second])) if (self.mode == "test"): cond = tf.equal(question_number_softmax, tf.reshape( tf.reduce_max(question_number_softmax, 1), [self.batch_size, 1])) question_number_softmax = tf.where( cond, tf.fill(tf.shape(question_number_softmax), 1.0), tf.fill(tf.shape(question_number_softmax), 0.0)) question_number_softmax = tf.cast(question_number_softmax, self.data_type) ans = tf.reshape( tf.reduce_sum(question_number_softmax * tf.concat( axis=1, values=[self.batch_question_number, self.batch_question_number_one]), 1), [self.batch_size, 1]) return ans
def _chain_backprop(n): """Creates forward backward graph using tf.gradients. A0->A1->A2->..->An / / / B0<-B1<-B2<-..<-Bn """ def forward(A0, n): """Takes A0, applies n operations to it, returns An.""" A = A0 for L in range(1, n+1): # op_i produces A_i A = tf.tanh(A, name="A"+str(L)) return A def backward(A0, An, Bn, n): B0 = tf.gradients([An], [A0], grad_ys=[Bn])[0] return B0 A0 = tf.fill((size,), 1.0, name="A0") An = forward(A0, n) Bn = tf.fill((size,), 1.0, name="Bn") B0 = tf.gradients([An], [A0], grad_ys=[Bn])[0] return B0
def add_model(self, input_data): """Adds a linear-layer plus a softmax transformation The core transformation for this model which transforms a batch of input data into a batch of predictions. In this case, the mathematical transformation effected is y = softmax(xW + b) Hint: Make sure to create tf.Variables as needed. Also, make sure to use tf.name_scope to ensure that your name spaces are clean. Hint: For this simple use-case, it's sufficient to initialize both weights W and biases b with zeros. Args: input_data: A tensor of shape (batch_size, n_features). Returns: out: A tensor of shape (batch_size, n_classes) """ ### YOUR CODE HERE with tf.variable_scope("linear-transform"): weight = tf.Variable(tf.fill([self.config.n_features,self.config.n_classes],0.0)) bias = tf.Variable(tf.fill([self.config.n_classes],0.0)) z = tf.matmul(input_data,weight) + bias out = softmax(z) ### END YOUR CODE return out
def testInitRequiredAssignAdd(self): with self.test_session(): p = tf.Variable(tf.fill([1024, 1024], 1), tf.int32) a = tf.assign_add(p, tf.fill([1024, 1024], 0)) with self.assertRaisesOpError("use uninitialized"): a.op.run()
def make_hard_softmax(self, softmax):
    # converts a soft selection to a hard (one-hot) selection; used at test time
    cond = tf.equal(
        softmax, tf.reshape(tf.reduce_max(softmax, 1), [self.batch_size, 1]))
    softmax = tf.where(
        cond, tf.fill(tf.shape(softmax), 1.0), tf.fill(tf.shape(softmax), 0.0))
    softmax = tf.cast(softmax, self.data_type)
    return softmax
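# Standalone sketch of the same hard-selection idea with concrete numbers
# (TF 1.x assumed); only the row-wise argmax survives.
import tensorflow as tf

softmax = tf.constant([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]])
cond = tf.equal(softmax, tf.reshape(tf.reduce_max(softmax, 1), [2, 1]))
hard = tf.where(cond, tf.fill(tf.shape(softmax), 1.0), tf.fill(tf.shape(softmax), 0.0))
with tf.Session() as sess:
    print(sess.run(hard))  # [[0. 1. 0.] [1. 0. 0.]]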
def LSTMBiasInit(shape, dtype): """Returns ones for forget-gate, and zeros for the others.""" shape = np.array(shape) # Check internal consistencies. assert shape.shape == (1,), shape assert shape[0] % 4 == 0, shape n = shape[0] // 4 ones = tf.fill([n], tf.constant(1, dtype=dtype)) zeros = tf.fill([3 * n], tf.constant(0, dtype=dtype)) return tf.concat([ones, zeros], 0)
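# Sketch of wiring the initializer above into a variable (an assumption: a
# cell with n = 3 units per gate, so the bias has 4 * 3 = 12 entries; the
# leading quarter, the forget gate here, comes out as ones).
import tensorflow as tf

bias = tf.Variable(LSTMBiasInit([12], tf.float32), name="lstm_bias")
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(bias))  # [1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]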
def testFillNegative(self): with self.test_session(): for shape in (-1,), (2, -1), (-1, 2): with self.assertRaises(ValueError): tf.fill(shape, 7) # Using a placeholder so this won't be caught in Python. dims = tf.placeholder(tf.int32) fill_t = tf.fill(dims, 3.0) for shape in (-1,), (2, -1), (-1, 2): with self.assertRaises(tf.errors.InvalidArgumentError): fill_t.eval({dims: shape})
def testShapeFunctionEdgeCases(self): # Non-vector dimensions. with self.assertRaises(ValueError): tf.fill([[0, 1], [2, 3]], 1.0) # Non-scalar value. with self.assertRaises(ValueError): tf.fill([3, 2], [1.0, 2.0]) # Partial dimension information. f = tf.fill(tf.placeholder(tf.int32, shape=(4,)), 3.0) self.assertEqual([None, None, None, None], f.get_shape().as_list())
def testAssignNonStrictShapeChecking(self): with self.test_session(): data = tf.fill([1024, 1024], 0) p = tf.Variable([1]) a = tf.assign(p, data, validate_shape=False) a.op.run() self.assertAllEqual(p.eval(), data.eval()) # Assign to yet another shape data2 = tf.fill([10, 10], 1) a2 = tf.assign(p, data2, validate_shape=False) a2.op.run() self.assertAllEqual(p.eval(), data2.eval())
def testInitialStateComputation(self, tuple_state, mask):
    if tuple_state:
      initial_state = (tf.fill([BATCH_SIZE, 6], 2),
                       (tf.fill([BATCH_SIZE, 7], 3), tf.fill([BATCH_SIZE, 8], 4)))
    else:
      initial_state = tf.fill([BATCH_SIZE, 9], 10)

    trainable_state_module = snt.TrainableInitialState(initial_state, mask=mask)
    trainable_state = trainable_state_module()
    flat_trainable_state = nest.flatten(trainable_state)
    nest.assert_same_structure(initial_state, trainable_state)
    flat_initial_state = nest.flatten(initial_state)
    if mask is not None:
      flat_mask = nest.flatten(mask)
    else:
      flat_mask = (True,) * len(flat_initial_state)

    self.evaluate(tf.global_variables_initializer())

    # Check that all variables are initialized correctly and that the returned
    # state has the same value as the state provided.
    for trainable_state, initial_state in zip(flat_trainable_state,
                                              flat_initial_state):
      self.assertAllEqual(
          self.evaluate(trainable_state), self.evaluate(initial_state))

    # Change the value of all the trainable variables to ones.
    for variable in tf.trainable_variables():
      self.evaluate(tf.assign(variable, tf.ones_like(variable)))

    # In eager mode, to re-evaluate the module we must re-connect it.
    trainable_state = trainable_state_module()
    flat_trainable_state = nest.flatten(trainable_state)

    # Check that the values of the initial_states have changed if and only if
    # they are trainable.
    for trainable_state, initial_state, mask in zip(flat_trainable_state,
                                                    flat_initial_state,
                                                    flat_mask):
      trainable_state_value = self.evaluate(trainable_state)
      initial_state_value = self.evaluate(initial_state)
      if mask:
        expected_value = np.ones_like(initial_state_value)
      else:
        expected_value = initial_state_value
      self.assertAllEqual(trainable_state_value, expected_value)
def rnn_decoder(decoder_inputs, initial_state, cell, word_dropout_keep_prob=1, replace_inp=None, loop_function=None, scope=None): """RNN decoder for the sequence-to-sequence model. Args: decoder_inputs: A list of 2D Tensors [batch_size x input_size]. initial_state: 2D Tensor with shape [batch_size x cell.state_size]. cell: rnn_cell.RNNCell defining the cell function and size. loop_function: If not None, this function will be applied to the i-th output in order to generate the i+1-st input, and decoder_inputs will be ignored, except for the first element ("GO" symbol). This can be used for decoding, but also for training to emulate http://arxiv.org/abs/1506.03099. Signature -- loop_function(prev, i) = next * prev is a 2D Tensor of shape [batch_size x output_size], * i is an integer, the step number (when advanced control is needed), * next is a 2D Tensor of shape [batch_size x input_size]. scope: VariableScope for the created subgraph; defaults to "rnn_decoder". Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x output_size] containing generated outputs. state: The state of each cell at the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. (Note that in some cases, like basic RNN cell or GRU cell, outputs and states can be the same. They are different for LSTM cells though.) """ with variable_scope.variable_scope(scope or "rnn_decoder"): state = initial_state outputs = [] prev = None seq_len = len(decoder_inputs) keep = tf.select(tf.random_uniform([seq_len]) < word_dropout_keep_prob, tf.fill([seq_len], True), tf.fill([seq_len], False)) for i, inp in enumerate(decoder_inputs): if loop_function is not None and prev is not None: with variable_scope.variable_scope("loop_function", reuse=True): if word_dropout_keep_prob < 1: inp = tf.cond(keep[i], lambda: loop_function(prev, i), lambda: replace_inp) else: inp = loop_function(prev, i) if i > 0: variable_scope.get_variable_scope().reuse_variables() output, state = cell(inp, state) outputs.append(output) if loop_function is not None: prev = output return outputs, state
def grad(grad_ys): large_float_like_x = np.sqrt(np.finfo(x.dtype.as_numpy_dtype()).max) safe_grads = tf.where( tf.equal(x, 0), tf.fill(x.shape, large_float_like_x), 0.5 * tf.rsqrt(x)) return grad_ys * safe_grads
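# The `grad` closure above plausibly lives inside a tf.custom_gradient wrapper
# like the following sketch (an assumption about the surrounding code; the name
# `safe_sqrt` is illustrative, and x needs a fully defined shape for tf.fill).
import numpy as np
import tensorflow as tf

@tf.custom_gradient
def safe_sqrt(x):
    def grad(grad_ys):
        # At x == 0 the true gradient is infinite; cap it at sqrt(dtype max).
        large_float_like_x = np.sqrt(np.finfo(x.dtype.as_numpy_dtype()).max)
        safe_grads = tf.where(tf.equal(x, 0),
                              tf.fill(x.shape, large_float_like_x),
                              0.5 * tf.rsqrt(x))
        return grad_ys * safe_grads
    return tf.sqrt(x), grad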
def calculate_reshape(original_shape, new_shape, validate=False, name=None): """Calculates the reshaped dimensions (replacing up to one -1 in reshape).""" batch_shape_static = tensor_util.constant_value_as_shape(new_shape) if batch_shape_static.is_fully_defined(): return np.int32(batch_shape_static.as_list()), batch_shape_static, [] with tf.name_scope(name, "calculate_reshape", [original_shape, new_shape]): original_size = tf.reduce_prod(original_shape) implicit_dim = tf.equal(new_shape, -1) size_implicit_dim = ( original_size // tf.maximum(1, -tf.reduce_prod(new_shape))) new_ndims = tf.shape(new_shape) expanded_new_shape = tf.where( # Assumes exactly one `-1`. implicit_dim, tf.fill(new_ndims, size_implicit_dim), new_shape) validations = [] if not validate else [ tf.assert_rank( original_shape, 1, message="Original shape must be a vector."), tf.assert_rank(new_shape, 1, message="New shape must be a vector."), tf.assert_less_equal( tf.count_nonzero(implicit_dim, dtype=tf.int32), 1, message="At most one dimension can be unknown."), tf.assert_positive( expanded_new_shape, message="Shape elements must be >=-1."), tf.assert_equal( tf.reduce_prod(expanded_new_shape), original_size, message="Shape sizes do not match."), ] return expanded_new_shape, batch_shape_static, validations
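# Usage sketch (an assumption about the call site): resolving the single -1
# in a reshape spec for a tensor of 24 elements.
import tensorflow as tf

new_shape, _, _ = calculate_reshape(tf.constant([6, 4]), tf.constant([8, -1]))
with tf.Session() as sess:
    print(sess.run(new_shape))  # [8 3]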
def BatchClipByL2norm(t, upper_bound, name=None):
  """Clip an array of tensors by L2 norm.

  Shrink each dimension-0 slice of tensor (for matrix it is each row) such
  that the l2 norm is at most upper_bound. Here we clip each row as it
  corresponds to each example in the batch.

  Args:
    t: the input tensor.
    upper_bound: the upper bound of the L2 norm.
    name: optional name.
  Returns:
    the clipped tensor.
  """
  assert upper_bound > 0
  with tf.op_scope([t, upper_bound], name, "batch_clip_by_l2norm") as name:
    saved_shape = tf.shape(t)
    batch_size = tf.slice(saved_shape, [0], [1])
    t2 = tf.reshape(t, tf.concat(0, [batch_size, [-1]]))
    upper_bound_inv = tf.fill(tf.slice(saved_shape, [0], [1]),
                              tf.constant(1.0 / upper_bound))
    # Add a small number to avoid divide by 0
    l2norm_inv = tf.rsqrt(tf.reduce_sum(t2 * t2, [1]) + 0.000001)
    scale = tf.minimum(l2norm_inv, upper_bound_inv) * upper_bound
    clipped_t = tf.matmul(tf.diag(scale), t2)
    clipped_t = tf.reshape(clipped_t, saved_shape, name=name)
  return clipped_t
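# Hedged usage sketch (old-TF API, matching the snippet above): rows whose L2
# norm exceeds upper_bound are rescaled onto the bound (up to the 1e-6 fudge
# term); shorter rows pass through unchanged.
import tensorflow as tf

t = tf.constant([[3.0, 4.0], [0.3, 0.4]])
clipped = BatchClipByL2norm(t, upper_bound=1.0)
with tf.Session() as sess:
    print(sess.run(clipped))  # approximately [[0.6 0.8] [0.3 0.4]]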
def testLargeFetch(self): server = tf.train.Server.create_local_server() with tf.Session(server.target) as sess: c = tf.fill([10000, 3000], 0.5) expected_val = np.empty([10000, 3000], dtype=np.float32) expected_val.fill(0.5) self.assertAllEqual(expected_val, sess.run(c))
def get_variable(constraint): if constraint is None: i = next(index) return inputs[:, i:i+1] else: return tf.fill(constant_shape, tf.constant(constraint, dtype=inputs.dtype))
def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False): """Run an LSTM, either forward or backward. This is a 1D LSTM implementation using dynamic_rnn and the TensorFlow LSTM op. Args: inputs: input sequence (length, batch_size, ninput) noutput: depth of output scope: optional scope name reverse: run LSTM in reverse Returns: Output sequence (length, batch_size, noutput) """ with tf.variable_scope(scope, "SeqLstm", [inputs]): # TODO(tmb) make batch size, sequence_length dynamic # example: sequence_length = tf.shape(inputs)[0] _, batch_size, _ = _shape(inputs) lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False) state = tf.zeros([batch_size, lstm_cell.state_size]) sequence_length = int(inputs.get_shape()[0]) sequence_lengths = tf.to_int64(tf.fill([batch_size], sequence_length)) if reverse: inputs = tf.reverse(inputs, [True, False, False]) outputs, _ = tf.nn.dynamic_rnn(lstm_cell, inputs, sequence_lengths, state, time_major=True) if reverse: outputs = tf.reverse(outputs, [True, False, False]) return outputs
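# Compatibility note (not from the original): tf.reverse with a boolean mask,
# as used above, is the pre-1.0 TensorFlow API; on TF 1.x the same reversal
# names the axes explicitly.
import tensorflow as tf

seq = tf.reshape(tf.range(6, dtype=tf.float32), [3, 2, 1])  # (time, batch, depth)
rev = tf.reverse(seq, axis=[0])  # TF 1.x form of tf.reverse(seq, [True, False, False])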
def init_memory(N, W, R): """ returns the initial values of the memory matrix, usage vector, precedence vector, link matrix, read weightings, write weightings, and the read vectors """ M0 = tf.fill([N, W], 1e-6) u0 = tf.zeros([N]) p0 = tf.zeros([N]) L0 = tf.zeros([N, N]) wr0 = tf.fill([N, R], 1e-6) # initial read weightings ww0 = tf.fill([N], 1e-6) # initial write weightings r0 = tf.fill([W, R], 1e-6) # initial read vector return M0, u0, p0, L0, wr0, ww0, r0
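# Quick shape check for the DNC-style memory state above (assumed sizes:
# N=4 slots, W=5 word width, R=2 read heads); the 1e-6 fills plausibly keep
# content-based (cosine-similarity) addressing away from zero-norm vectors.
M0, u0, p0, L0, wr0, ww0, r0 = init_memory(N=4, W=5, R=2)
print(M0.shape, wr0.shape, ww0.shape, r0.shape)  # (4, 5) (4, 2) (4,) (5, 2)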
def testDtype(self): with self.test_session(): d = tf.fill([2, 3], 12., name="fill") self.assertEqual(d.get_shape(), [2, 3]) # Test default type for both constant size and dynamic size z = tf.ones([2, 3]) self.assertEqual(z.dtype, tf.float32) self.assertEqual([2, 3], z.get_shape()) self.assertAllEqual(z.eval(), np.ones([2, 3])) z = tf.ones(tf.shape(d)) self.assertEqual(z.dtype, tf.float32) self.assertEqual([2, 3], z.get_shape()) self.assertAllEqual(z.eval(), np.ones([2, 3])) # Test explicit type control for dtype in (tf.float32, tf.float64, tf.int32, tf.uint8, tf.int16, tf.int8, tf.complex64, tf.complex128, tf.int64, tf.bool): z = tf.ones([2, 3], dtype=dtype) self.assertEqual(z.dtype, dtype) self.assertEqual([2, 3], z.get_shape()) self.assertAllEqual(z.eval(), np.ones([2, 3])) z = tf.ones(tf.shape(d), dtype=dtype) self.assertEqual(z.dtype, dtype) self.assertEqual([2, 3], z.get_shape()) self.assertAllEqual(z.eval(), np.ones([2, 3]))
def K(self, X, X2=None): if X2 is None: d = tf.fill(tf.pack([tf.shape(X)[0]]), tf.squeeze(self.variance)) return tf.diag(d) else: shape = tf.pack([tf.shape(X)[0], tf.shape(X2)[0]]) return tf.zeros(shape, tf.float64)
def set_logp_to_neg_inf(X, logp, bounds):
    """Set `logp` to negative infinity when `X` is outside the allowed bounds.

    # Arguments
        X: tensorflow.Tensor
            The variable to apply the bounds to
        logp: tensorflow.Tensor
            The log probability corresponding to `X`
        bounds: list of `Region` objects
            The regions corresponding to allowed regions of `X`

    # Returns
        logp: tensorflow.Tensor
            The newly bounded log probability
    """
    conditions = []
    for l, u in bounds:
        lower_is_neg_inf = not isinstance(l, tf.Tensor) and np.isneginf(l)
        upper_is_pos_inf = not isinstance(u, tf.Tensor) and np.isposinf(u)

        if not lower_is_neg_inf and upper_is_pos_inf:
            conditions.append(tf.greater(X, l))
        elif lower_is_neg_inf and not upper_is_pos_inf:
            conditions.append(tf.less(X, u))
        elif not (lower_is_neg_inf or upper_is_pos_inf):
            conditions.append(tf.logical_and(tf.greater(X, l), tf.less(X, u)))

    if len(conditions) > 0:
        is_inside_bounds = conditions[0]
        for condition in conditions[1:]:
            is_inside_bounds = tf.logical_or(is_inside_bounds, condition)

        logp = tf.select(is_inside_bounds, logp,
                         tf.fill(tf.shape(X), config.dtype(-np.inf)))

    return logp
def conv_relu(self, policy_input, target_input, kernel_shape, stride, layer_num): ''' Build a convolutional layer Args: input_layer: input to convolutional layer - must be 4d target_input: input to layer of target network - must also be 4d kernel_shape: tuple for filter shape: (filter_height, filter_width, in_channels, out_channels) stride: tuple for stride: (1, vert_stride. horiz_stride, 1) ''' name = 'conv' + str(layer_num + 1) with tf.variable_scope(name): # fan_in = tf.reduce_prod(tf.slice(policy_input.get_shape(), [1], [-1])) weights = tf.Variable(tf.truncated_normal(kernel_shape, stddev=0.01), name=(name + "_weights")) # weights = self.get_weights(kernel_shape, fan_in, name + "_weights") biases = tf.Variable(tf.fill([kernel_shape[-1]], 0.1), name=(name + "_biases")) # biases = self.get_biases([kernel_shape[-1]], fan_in, name + "_biases") activation = tf.nn.relu(tf.nn.conv2d(policy_input, weights, stride, 'VALID') + biases) target_weights = tf.Variable(weights.initialized_value(), trainable=False, name=("target_" + name + "_weights")) target_biases = tf.Variable(biases.initialized_value(), trainable=False, name=("target_" + name + "_biases")) target_activation = tf.nn.relu(tf.nn.conv2d(target_input, target_weights, stride, 'VALID') + target_biases) self.update_target.append(target_weights.assign(weights)) self.update_target.append(target_biases.assign(biases)) self.policy_network_params.append(weights) self.policy_network_params.append(biases) self.param_names.append(name + "_weights") self.param_names.append(name + "_biases") return [activation, target_activation]
def dense_linear(self, policy_input, target_input, shape): ''' Build the fully-connected linear output layer Args: input_layer: last hidden layer target_input: last hidden layer of target network shape: tuple for weight shape (num_input_nodes, num_actions) ''' name = 'q_layer' with tf.variable_scope(name): # fan_in = tf.reduce_prod(tf.slice(policy_input.get_shape(), [1], [-1])) weights = tf.Variable(tf.truncated_normal(shape, stddev=0.01), name=(name + "_weights")) # weights = self.get_weights(shape, fan_in, name + "_weights") biases = tf.Variable(tf.fill([shape[-1]], 0.1), name=(name + "_biases")) # biases = self.get_biases([shape[-1]], fan_in, name + "_biases") activation = tf.matmul(policy_input, weights) + biases target_weights = tf.Variable(weights.initialized_value(), trainable=False, name=("target_" + name + "_weights")) target_biases = tf.Variable(biases.initialized_value(), trainable=False, name=("target_" + name + "_biases")) target_activation = tf.matmul(target_input, target_weights) + target_biases self.update_target.append(target_weights.assign(weights)) self.update_target.append(target_biases.assign(biases)) self.policy_network_params.append(weights) self.policy_network_params.append(biases) self.param_names.append(name + "_weights") self.param_names.append(name + "_biases") return [activation, target_activation]
def quadrature_scheme_lognormal_quantiles(loc, scale, quadrature_size,
                                          validate_args=False, name=None):
  """Use LogNormal quantiles to form quadrature on positive-reals.

  Args:
    loc: `float`-like (batch of) scalar `Tensor`; the location parameter of
      the LogNormal prior.
    scale: `float`-like (batch of) scalar `Tensor`; the scale parameter of
      the LogNormal prior.
    quadrature_size: Python `int` scalar representing the number of quadrature
      points.
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
    name: Python `str` name prefixed to Ops created by this class.

  Returns:
    grid: (Batch of) length-`quadrature_size` vectors representing the
      `log_rate` parameters of a `Poisson`.
    probs: (Batch of) length-`quadrature_size` vectors representing the
      weight associated with each `grid` value.
  """
  with tf.name_scope(name, "quadrature_scheme_lognormal_quantiles",
                     [loc, scale]):
    # Create a LogNormal distribution.
    dist = transformed_lib.TransformedDistribution(
        distribution=tf.distributions.Normal(loc=loc, scale=scale),
        bijector=Exp(),
        validate_args=validate_args)
    batch_ndims = dist.batch_shape.ndims
    if batch_ndims is None:
      batch_ndims = tf.shape(dist.batch_shape_tensor())[0]

    def _compute_quantiles():
      """Helper to build quantiles."""
      # Omit {0, 1} since they might lead to Inf/NaN.
      zero = tf.zeros([], dtype=dist.dtype)
      edges = tf.linspace(zero, 1., quadrature_size + 3)[1:-1]
      # Expand edges so it's broadcast across batch dims.
      edges = tf.reshape(
          edges,
          shape=tf.concat([[-1], tf.ones([batch_ndims], dtype=tf.int32)],
                          axis=0))
      quantiles = dist.quantile(edges)
      # Cyclically permute left by one.
      perm = tf.concat([tf.range(1, 1 + batch_ndims), [0]], axis=0)
      quantiles = tf.transpose(quantiles, perm)
      return quantiles

    quantiles = _compute_quantiles()

    # Compute grid as quantile midpoints.
    grid = (quantiles[..., :-1] + quantiles[..., 1:]) / 2.
    # Set shape hints.
    grid.set_shape(dist.batch_shape.concatenate([quadrature_size]))

    # By construction probs is constant, i.e., `1 / quadrature_size`. This is
    # important, because non-constant probs leads to non-reparameterizable
    # samples.
    probs = tf.fill(dims=[quadrature_size],
                    value=1. / tf.cast(quadrature_size, dist.dtype))

    return grid, probs
def __init__(self, hparams, input_tensor, label_tensor, is_train):
    self.num_classes = hparams.n
    self.batch_size = hparams.batch_size
    self.seq_len = hparams.seq_len
    self.input_dim = hparams.input_dim
    self.num_gcn_blocks = hparams.num_gcn_blocks
    self.lr = hparams.lr
    self.hop = hparams.hop
    self.label_cut = hparams.label_cut
    # self.input_placeholder = tf.nn.l2_normalize(tf.cast(input_tensor, tf.float32), axis=-1)
    self.input_placeholder = tf.cast(input_tensor, tf.float32)
    self.label_placeholder = label_tensor
    self.is_train = is_train
    if self.is_train:
        self.global_step = tf.get_variable("global_step", initializer=0, trainable=False)
    else:
        self.global_step = None
    feed_label, target_label = tf.split(self.label_placeholder,
                                        [self.seq_len - 1, 1], axis=1)
    self.target_label = tf.reshape(target_label, shape=[-1])
    # self.target_label = target_label
    # self.target_label = tf.one_hot(self.target_label, depth=self.num_classes, dtype=tf.float32)
    feed_label_one_hot_without_target = tf.one_hot(feed_label,
                                                   depth=self.num_classes,
                                                   dtype=tf.float32)
    self.feed_label_one_hot_with_target = tf.concat([
        feed_label_one_hot_without_target,
        tf.fill([self.batch_size, 1, self.num_classes], 1.0 / self.num_classes)
    ], axis=1)
    self.concated_input = tf.concat(
        [self.input_placeholder, self.feed_label_one_hot_with_target], axis=2)
    data_store = self.input_placeholder
    label_store = self.feed_label_one_hot_with_target
    '''for test only'''
    # name = 'GCN_Blocks'
    # with tf.variable_scope(name):
    #     data_store, _, self.diff, label_store, propagation_store, self.Lap, self.simi, self.cmpr = self._gcn_block(input_data=data_store, input_label=label_store, add_dim=self.num_classes, drop=False)
    for i in range(self.num_gcn_blocks):
        # whether to share the similarity function and receptive-field ratio across blocks
        # name = 'GCN_Blocks'
        name = f"GCN_Block_{i}"
        with tf.variable_scope(name):
            _, data_store, label_store, _ = self._gcn_block(
                input_data=data_store,
                input_label=label_store,
                add_dim=int(self.input_dim / 2))
    with tf.variable_scope('last_Block'):
        data_store, _, label_store, propagation_store = self._gcn_block(
            input_data=data_store,
            input_label=label_store,
            add_dim=self.num_classes)
    self.label_store = label_store
    if self.label_cut == 'yes':
        print('use cut')
        self.predict_label = label_store[:, -1, :]
    elif self.label_cut == 'no':
        self.predict_label = data_store[:, -1, :]
    else:
        self.predict_label = self._add_nn_block(
            x=tf.concat([data_store[:, -1, :], label_store[:, -1, :]], axis=-1),
            out_channel=self.num_classes)
    self.propagation = propagation_store
    ce_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.target_label, logits=self.predict_label))
    self.loss = ce_loss
    self.train_step = tf.train.AdamOptimizer(self.lr).minimize(
        self.loss, global_step=self.global_step)
    self.accuracy = self._calc_accuracy()
import numpy as np
import tensorflow as tf

# Square matrix A of rank 2
A = tf.constant([[1., 2.], [3., 4.]])
# 2x2 square, diagonal, symmetric matrix B
B = tf.diag([5., 6.])
# 2x2 square matrix
C = tf.constant([[1., 2.], [2., 4.]])
# 2x1 vector with all elements equal to 1
x = tf.ones([2, 1])
# 2x1 vector with all elements equal to 2.0
b = tf.fill([2, 1], 2.)
# 2x1 vector
y = tf.constant([[-1.], [1.]])

# run within a session and print
with tf.Session() as session:
    print("Tensorflow version: " + tf.__version__)
    tf.global_variables_initializer().run()
    print("A = ")
    print(A.eval())
    print("B = ")
    print(B.eval())
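# Added note (not in the original): unlike tf.constant, the fill value of
# tf.fill may itself be a tensor supplied at run time.
v = tf.placeholder(tf.float32, [])
dynamic = tf.fill([2, 2], v)  # scalar value provided through feed_dict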
def beam_loop(self, time, cell_output, cell_state, loop_state):
    (
        past_cand_symbols,  # [batch_size, time-1]
        past_cand_logprobs,  # [batch_size]
        past_beam_symbols,  # [batch_size*beam_size, time-1], right-aligned
        past_beam_logprobs,  # [batch_size*beam_size]
    ) = loop_state

    # We don't actually use this, but emit_output is required to match the
    # cell output size specification. Otherwise we would leave this as None.
    emit_output = cell_output

    # 1. Get scores for all candidate sequences
    logprobs = self.outputs_to_score_fn(cell_output)
    try:
        num_classes = int(logprobs.get_shape()[-1])
    except:
        # Shape inference failed
        num_classes = tf.shape(logprobs)[-1]

    logprobs_batched = tf.reshape(
        logprobs + tf.expand_dims(
            tf.reshape(past_beam_logprobs, [self.batch_size, self.beam_size]), 2),
        [self.batch_size, self.beam_size * num_classes])

    # 2. Determine which states to pass to next iteration
    # TODO(nikita): consider using slice+fill+concat instead of adding a mask
    nondone_mask = tf.reshape(
        tf.cast(tf.equal(tf.range(num_classes), self.stop_token), tf.float32)
        * self.INVALID_SCORE,
        [1, 1, num_classes])
    nondone_mask = tf.reshape(
        tf.tile(nondone_mask, [1, self.beam_size, 1]),
        [-1, self.beam_size * num_classes])

    beam_logprobs, indices = tf.nn.top_k(logprobs_batched + nondone_mask,
                                         self.beam_size)
    beam_logprobs = tf.reshape(beam_logprobs, [-1])

    # For continuing to the next symbols
    symbols = indices % num_classes  # [batch_size, self.beam_size]
    parent_refs = indices // num_classes  # [batch_size, self.beam_size]

    symbols_history = flat_batch_gather(past_beam_symbols, parent_refs,
                                        batch_size=self.batch_size,
                                        options_size=self.beam_size)
    beam_symbols = tf.concat(1, [symbols_history, tf.reshape(symbols, [-1, 1])])

    # Handle the output and the cell state shuffling
    next_cell_state = nest_map(
        lambda element: batch_gather(element, parent_refs,
                                     batch_size=self.batch_size,
                                     options_size=self.beam_size),
        cell_state)
    next_input = self.tokens_to_inputs_fn(
        tf.reshape(symbols, [-1, self.beam_size]))

    # 3. Update the candidate pool to include entries that just ended with a stop token
    logprobs_done = tf.reshape(
        logprobs_batched, [-1, self.beam_size, num_classes])[:, :, self.stop_token]
    done_parent_refs = tf.argmax(logprobs_done, 1)
    done_symbols = flat_batch_gather(past_beam_symbols, done_parent_refs,
                                     batch_size=self.batch_size,
                                     options_size=self.beam_size)

    logprobs_done_max = tf.reduce_max(logprobs_done, 1)
    cand_symbols_unpadded = tf.select(logprobs_done_max > past_cand_logprobs,
                                      done_symbols,
                                      past_cand_symbols)
    cand_logprobs = tf.maximum(logprobs_done_max, past_cand_logprobs)
    cand_symbols = tf.concat(1, [cand_symbols_unpadded,
                                 tf.fill([self.batch_size, 1], self.stop_token)])

    # 4. Check the stopping criteria
    if self.max_len is not None:
        elements_finished_clip = (time >= self.max_len)
    if self.score_upper_bound is not None:
        elements_finished_bound = tf.reduce_max(
            tf.reshape(beam_logprobs, [-1, self.beam_size]),
            1) < (cand_logprobs - self.score_upper_bound)
    if self.max_len is not None and self.score_upper_bound is not None:
        elements_finished = elements_finished_clip | elements_finished_bound
    elif self.score_upper_bound is not None:
        elements_finished = elements_finished_bound
    elif self.max_len is not None:
        # this broadcasts elements_finished_clip to the correct shape
        elements_finished = tf.zeros([self.batch_size], dtype=tf.bool) | elements_finished_clip
    else:
        assert False, "Lack of stopping criterion should have been caught in constructor"

    # 5. Prepare return values
    # While loops require strict shape invariants, so we manually set shapes
    # in case the automatic shape inference can't calculate these. Even when
    # this is redundant it has the benefit of helping catch shape bugs.
    for tensor in list(nest.flatten(next_input)) + list(nest.flatten(next_cell_state)):
        tensor.set_shape(
            tf.TensorShape((self.inferred_batch_size, self.beam_size)).concatenate(
                tensor.get_shape()[2:]))

    for tensor in [cand_symbols, cand_logprobs, elements_finished]:
        tensor.set_shape(
            tf.TensorShape((self.inferred_batch_size,)).concatenate(
                tensor.get_shape()[1:]))

    for tensor in [beam_symbols, beam_logprobs]:
        tensor.set_shape(
            tf.TensorShape((self.inferred_batch_size_times_beam_size,)).concatenate(
                tensor.get_shape()[1:]))

    next_loop_state = (
        cand_symbols,
        cand_logprobs,
        beam_symbols,
        beam_logprobs,
    )

    return (elements_finished, next_input, next_cell_state, emit_output,
            next_loop_state)
def _build_model(self):
    # CNN
    with tf.variable_scope('cnn'):
        x = self.inputs
        filters = [1, 64, 128, 128, self._out_channels]
        for i in range(self._cnn_count):
            with tf.variable_scope('unit-%d' % (i + 1)):
                x = self._conv2d(x, 'cnn-%d' % (i + 1), 3, filters[i],
                                 filters[i + 1], strides=1)
                x = self._batch_norm(is_train=self._is_train,
                                     name='bn%d' % (i + 1), x=x)
                x = self._leaky_relu(x)
                x = self._max_pool(x, 2, strides=2)

        _, feature_h, feature_w, _ = x.get_shape().as_list()
        print('\nfeature_h: {}, feature_w: {}'.format(feature_h, feature_w))

    # A 1-D tensor of length batch_size whose entries all equal feature_w:
    # the time-step length of each example.
    self.seq_len = tf.fill([self._batch_size], feature_w)

    # LSTM
    with tf.variable_scope('lstm'):
        x = tf.transpose(x, [0, 2, 1, 3])
        x = tf.reshape(x, [self._batch_size, feature_w,
                           feature_h * self._out_channels])
        print('lstm input shape: {}'.format(x.get_shape().as_list()))

        cell = tf.nn.rnn_cell.LSTMCell(self._num_hidden, state_is_tuple=True)
        cell1 = tf.nn.rnn_cell.LSTMCell(self._num_hidden, state_is_tuple=True)
        if self._is_train:
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell=cell, output_keep_prob=self._output_keep_prob)
            cell1 = tf.nn.rnn_cell.DropoutWrapper(
                cell=cell1, output_keep_prob=self._output_keep_prob)

        # Stacking rnn cells
        stack = tf.nn.rnn_cell.MultiRNNCell([cell, cell1], state_is_tuple=True)
        initial_state = stack.zero_state(self._batch_size, dtype=tf.float32)

        # The second output is the last state and we will not use that
        outputs, _ = tf.nn.dynamic_rnn(cell=stack,
                                       inputs=x,
                                       sequence_length=self.seq_len,
                                       initial_state=initial_state,
                                       dtype=tf.float32,
                                       time_major=False)

    outputs = tf.reshape(outputs, [-1, self._num_hidden])

    w = tf.get_variable('W_out', [self._num_hidden, self._num_classes],
                        tf.float32, tf.glorot_uniform_initializer())
    b = tf.get_variable('b_out', shape=[self._num_classes], dtype=tf.float32,
                        initializer=tf.constant_initializer())

    self.logits = tf.add(tf.matmul(outputs, w), b)
    self.logits = tf.reshape(self.logits,
                             [tf.shape(x)[0], -1, self._num_classes])
    # Time major
    self.logits = tf.transpose(self.logits, (1, 0, 2))
def test_fill(self): # computation f = tf.fill([2, 3], 5) # test self.run(f)
def build(): """Builds the Tensorflow graph.""" inputs, labels, lengths = None, None, None if mode in ('train', 'eval'): if isinstance(no_event_label, numbers.Number): label_shape = [] else: label_shape = [len(no_event_label)] inputs, labels, lengths = magenta.common.get_padded_batch( sequence_example_file_paths, hparams.batch_size, input_size, label_shape=label_shape, shuffle=mode == 'train') elif mode == 'generate': inputs = tf.placeholder(tf.float32, [hparams.batch_size, None, input_size]) if isinstance(encoder_decoder, magenta.music.OneHotIndexEventSequenceEncoderDecoder): expanded_inputs = tf.one_hot( tf.cast(tf.squeeze(inputs, axis=-1), tf.int64), encoder_decoder.input_depth) else: expanded_inputs = inputs dropout_keep_prob = 1.0 if mode == 'generate' else hparams.dropout_keep_prob if hparams.use_cudnn: outputs, initial_state, final_state = make_cudnn( expanded_inputs, hparams.rnn_layer_sizes, hparams.batch_size, mode, dropout_keep_prob=dropout_keep_prob, residual_connections=hparams.residual_connections) else: cell = make_rnn_cell( hparams.rnn_layer_sizes, dropout_keep_prob=dropout_keep_prob, attn_length=hparams.attn_length, residual_connections=hparams.residual_connections) initial_state = cell.zero_state(hparams.batch_size, tf.float32) outputs, final_state = tf.nn.dynamic_rnn( cell, inputs, sequence_length=lengths, initial_state=initial_state, swap_memory=True) outputs_flat = magenta.common.flatten_maybe_padded_sequences( outputs, lengths) if isinstance(num_classes, numbers.Number): num_logits = num_classes else: num_logits = sum(num_classes) logits_flat = contrib_layers.linear(outputs_flat, num_logits) if mode in ('train', 'eval'): labels_flat = magenta.common.flatten_maybe_padded_sequences( labels, lengths) if isinstance(num_classes, numbers.Number): softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=labels_flat, logits=logits_flat) predictions_flat = tf.argmax(logits_flat, axis=1) else: logits_offsets = np.cumsum([0] + num_classes) softmax_cross_entropy = [] predictions = [] for i in range(len(num_classes)): softmax_cross_entropy.append( tf.nn.sparse_softmax_cross_entropy_with_logits( labels=labels_flat[:, i], logits=logits_flat[:, logits_offsets[i]: logits_offsets[i + 1]])) predictions.append( tf.argmax( logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]], axis=1)) predictions_flat = tf.stack(predictions, 1) correct_predictions = tf.to_float( tf.equal(labels_flat, predictions_flat)) event_positions = tf.to_float( tf.not_equal(labels_flat, no_event_label)) no_event_positions = tf.to_float( tf.equal(labels_flat, no_event_label)) # Compute the total number of time steps across all sequences in the # batch. For some models this will be different from the number of RNN # steps. 
def batch_labels_to_num_steps(batch_labels, lengths): num_steps = 0 for labels, length in zip(batch_labels, lengths): num_steps += encoder_decoder.labels_to_num_steps( labels[:length]) return np.float32(num_steps) num_steps = tf.py_func(batch_labels_to_num_steps, [labels, lengths], tf.float32) if mode == 'train': loss = tf.reduce_mean(softmax_cross_entropy) perplexity = tf.exp(loss) accuracy = tf.reduce_mean(correct_predictions) event_accuracy = ( tf.reduce_sum(correct_predictions * event_positions) / tf.reduce_sum(event_positions)) no_event_accuracy = ( tf.reduce_sum(correct_predictions * no_event_positions) / tf.reduce_sum(no_event_positions)) loss_per_step = tf.reduce_sum( softmax_cross_entropy) / num_steps perplexity_per_step = tf.exp(loss_per_step) optimizer = tf.train.AdamOptimizer( learning_rate=hparams.learning_rate) train_op = contrib_slim.learning.create_train_op( loss, optimizer, clip_gradient_norm=hparams.clip_norm) tf.add_to_collection('train_op', train_op) vars_to_summarize = { 'loss': loss, 'metrics/perplexity': perplexity, 'metrics/accuracy': accuracy, 'metrics/event_accuracy': event_accuracy, 'metrics/no_event_accuracy': no_event_accuracy, 'metrics/loss_per_step': loss_per_step, 'metrics/perplexity_per_step': perplexity_per_step, } elif mode == 'eval': vars_to_summarize, update_ops = contrib_metrics.aggregate_metric_map( { 'loss': tf.metrics.mean(softmax_cross_entropy), 'metrics/accuracy': tf.metrics.accuracy(labels_flat, predictions_flat), 'metrics/per_class_accuracy': tf.metrics.mean_per_class_accuracy( labels_flat, predictions_flat, num_classes), 'metrics/event_accuracy': tf.metrics.recall(event_positions, correct_predictions), 'metrics/no_event_accuracy': tf.metrics.recall(no_event_positions, correct_predictions), 'metrics/loss_per_step': tf.metrics.mean(tf.reduce_sum(softmax_cross_entropy) / num_steps, weights=num_steps), }) for updates_op in update_ops.values(): tf.add_to_collection('eval_ops', updates_op) # Perplexity is just exp(loss) and doesn't need its own update op. vars_to_summarize['metrics/perplexity'] = tf.exp( vars_to_summarize['loss']) vars_to_summarize['metrics/perplexity_per_step'] = tf.exp( vars_to_summarize['metrics/loss_per_step']) for var_name, var_value in six.iteritems(vars_to_summarize): tf.summary.scalar(var_name, var_value) tf.add_to_collection(var_name, var_value) elif mode == 'generate': temperature = tf.placeholder(tf.float32, []) if isinstance(num_classes, numbers.Number): softmax_flat = tf.nn.softmax( tf.div(logits_flat, tf.fill([num_classes], temperature))) softmax = tf.reshape(softmax_flat, [hparams.batch_size, -1, num_classes]) else: logits_offsets = np.cumsum([0] + num_classes) softmax = [] for i in range(len(num_classes)): sm = tf.nn.softmax( tf.div( logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]], tf.fill([num_classes[i]], temperature))) sm = tf.reshape(sm, [hparams.batch_size, -1, num_classes[i]]) softmax.append(sm) tf.add_to_collection('inputs', inputs) tf.add_to_collection('temperature', temperature) tf.add_to_collection('softmax', softmax) # Flatten state tuples for metagraph compatibility. for state in tf_nest.flatten(initial_state): tf.add_to_collection('initial_state', state) for state in tf_nest.flatten(final_state): tf.add_to_collection('final_state', state)
def _get_initial_state(self, batch_size): return tf.fill([batch_size], 0)
def decode_outputs(self, target_words_vocab, target_input, batch_size, batched_contexts, valid_mask, is_evaluating=False): num_contexts_per_example = tf.count_nonzero(valid_mask, axis=-1) start_fill = tf.fill([batch_size], self.target_to_index[Common.SOS]) # (batch, ) decoder_cell = tf.nn.rnn_cell.MultiRNNCell([ tf.nn.rnn_cell.LSTMCell(self.config.DECODER_SIZE) for _ in range(self.config.NUM_DECODER_LAYERS) ]) contexts_sum = tf.reduce_sum( batched_contexts * tf.expand_dims(valid_mask, -1), axis=1) # (batch_size, dim * 2 + rnn_size) contexts_average = tf.divide( contexts_sum, tf.to_float(tf.expand_dims(num_contexts_per_example, -1))) fake_encoder_state = tuple( tf.nn.rnn_cell.LSTMStateTuple(contexts_average, contexts_average) for _ in range(self.config.NUM_DECODER_LAYERS)) projection_layer = tf.layers.Dense(self.target_vocab_size, use_bias=False) if is_evaluating and self.config.BEAM_WIDTH > 0: batched_contexts = tf.contrib.seq2seq.tile_batch( batched_contexts, multiplier=self.config.BEAM_WIDTH) num_contexts_per_example = tf.contrib.seq2seq.tile_batch( num_contexts_per_example, multiplier=self.config.BEAM_WIDTH) attention_mechanism = tf.contrib.seq2seq.LuongAttention( num_units=self.config.DECODER_SIZE, memory=batched_contexts) # TF doesn't support beam search with alignment history should_save_alignment_history = is_evaluating and self.config.BEAM_WIDTH == 0 decoder_cell = tf.contrib.seq2seq.AttentionWrapper( decoder_cell, attention_mechanism, attention_layer_size=self.config.DECODER_SIZE, alignment_history=should_save_alignment_history) if is_evaluating: if self.config.BEAM_WIDTH > 0: decoder_initial_state = decoder_cell.zero_state( dtype=tf.float32, batch_size=batch_size * self.config.BEAM_WIDTH) decoder_initial_state = decoder_initial_state.clone( cell_state=tf.contrib.seq2seq.tile_batch( fake_encoder_state, multiplier=self.config.BEAM_WIDTH)) decoder = tf.contrib.seq2seq.BeamSearchDecoder( cell=decoder_cell, embedding=target_words_vocab, start_tokens=start_fill, end_token=self.target_to_index[Common.PAD], initial_state=decoder_initial_state, beam_width=self.config.BEAM_WIDTH, output_layer=projection_layer, length_penalty_weight=0.0) else: helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( target_words_vocab, start_fill, 0) initial_state = decoder_cell.zero_state( batch_size, tf.float32).clone(cell_state=fake_encoder_state) decoder = tf.contrib.seq2seq.BasicDecoder( cell=decoder_cell, helper=helper, initial_state=initial_state, output_layer=projection_layer) else: decoder_cell = tf.nn.rnn_cell.DropoutWrapper( decoder_cell, output_keep_prob=self.config.RNN_DROPOUT_KEEP_PROB) target_words_embedding = tf.nn.embedding_lookup( target_words_vocab, tf.concat( [tf.expand_dims(start_fill, -1), target_input], axis=-1)) # (batch, max_target_parts, dim * 2 + rnn_size) helper = tf.contrib.seq2seq.TrainingHelper( inputs=target_words_embedding, sequence_length=tf.ones([batch_size], dtype=tf.int32) * (self.config.MAX_TARGET_PARTS + 1)) initial_state = decoder_cell.zero_state( batch_size, tf.float32).clone(cell_state=fake_encoder_state) decoder = tf.contrib.seq2seq.BasicDecoder( cell=decoder_cell, helper=helper, initial_state=initial_state, output_layer=projection_layer) outputs, final_states, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode( decoder, maximum_iterations=self.config.MAX_TARGET_PARTS + 1) return outputs, final_states
def decoder_with_caching(self, encoded, len_encoded):
    """
    greedy search, used for self-learning training or inference
    """
    batch_size = tf.shape(encoded)[0]
    token_init = tf.fill([batch_size, 1], self.start_token)
    logits_init = tf.zeros([batch_size, 1, self.dim_output], dtype=tf.float32)
    finished_init = tf.zeros([batch_size], dtype=tf.bool)
    len_decoded_init = tf.ones([batch_size], dtype=tf.int32)
    cache_decoder_init = tf.zeros(
        [batch_size, 0, self.num_blocks, self.num_cell_units])
    encoder_padding = tf.equal(
        tf.sequence_mask(len_encoded, maxlen=tf.shape(encoded)[1]), False)  # bool tensor
    encoder_attention_bias = common_attention.attention_bias_ignore_padding(encoder_padding)

    def step(i, preds, cache_decoder, logits, len_decoded, finished):
        preds_emb = self.embedding(preds)
        decoder_input = preds_emb

        decoder_output, cache_decoder = self.decoder_with_caching_impl(
            decoder_input, cache_decoder, encoded, encoder_attention_bias)

        cur_logit = tf.layers.dense(inputs=decoder_output[:, -1, :],
                                    units=self.dim_output,
                                    activation=None,
                                    use_bias=False,
                                    name='decoder_fc')

        cur_ids = tf.to_int32(tf.argmax(cur_logit, -1))
        preds = tf.concat([preds, cur_ids[:, None]], axis=1)
        logits = tf.concat([logits, cur_logit[:, None]], 1)

        # Whether sequences have finished.
        has_eos = tf.equal(cur_ids, self.end_token)
        finished = tf.logical_or(finished, has_eos)
        len_decoded += 1 - tf.to_int32(finished)

        return i + 1, preds, cache_decoder, logits, len_decoded, finished

    def not_finished(i, preds, cache, logit, len_decoded, finished):
        return tf.logical_and(
            tf.reduce_any(tf.logical_not(finished)),
            tf.less(
                i,
                tf.reduce_min([tf.shape(encoded)[1], self.args.max_len])  # maxlen = 25
            ))

    i, preds, cache_decoder, logits, len_decoded, finished = tf.while_loop(
        cond=not_finished,
        body=step,
        loop_vars=[0, token_init, cache_decoder_init, logits_init,
                   len_decoded_init, finished_init],
        shape_invariants=[
            tf.TensorShape([]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None, None, None]),
            tf.TensorShape([None, None, self.dim_output]),
            tf.TensorShape([None]),
            tf.TensorShape([None])
        ])

    # len_decoded = tf.Print(len_decoded, [finished], message='finished: ', summarize=1000)
    len_decoded -= 1 - tf.to_int32(finished)  # adjust lengths for sequences cut off by the encoder length limit
    logits = logits[:, 1:, :]
    preds = preds[:, 1:]
    not_padding = tf.sequence_mask(len_decoded, dtype=tf.int32)
    preds = tf.multiply(tf.to_int32(preds), not_padding)

    return logits, preds, len_decoded
def beam_decode_rerank(self, encoded, len_encoded):
    """
    beam search, reranked at the end with language model integration
    (self-attention model).
    The input to the scorer is <sos> + tokens!
    """
    beam_size = self.beam_size
    batch_size = tf.shape(len_encoded)[0]

    # beam search initialization
    # repeat each sample in the batch along the batch axis: [1,2,3,4] -> [1,1,2,2,3,3,4,4]
    encoded = tf.tile(encoded[:, None, :, :],
                      multiples=[1, beam_size, 1, 1])  # [batch_size, beam_size, *, hidden_units]
    encoded = tf.reshape(encoded,
                         [batch_size * beam_size, -1, encoded.get_shape()[-1].value])
    len_encoded = tf.reshape(
        tf.tile(len_encoded[:, None], multiples=[1, beam_size]), [-1])  # [batch_size * beam_size]

    # [[<S>, <S>, ..., <S>]], shape: [batch_size * beam_size, 1]
    token_init = tf.fill([batch_size * beam_size, 1], self.args.sos_idx)
    logits_init = tf.zeros([batch_size * beam_size, 0, self.dim_output],
                           dtype=tf.float32)
    len_decoded_init = tf.ones_like(len_encoded, dtype=tf.int32)
    # the scores must be [0, -inf, -inf, ...] at init, because the predictions
    # are identical across the beam at the first step!
    scores_init = tf.constant([0.0] + [-inf] * (beam_size - 1),
                              dtype=tf.float32)  # [beam_size]
    scores_init = tf.tile(scores_init, multiples=[batch_size])  # [batch_size * beam_size]
    finished_init = tf.zeros_like(scores_init, dtype=tf.bool)

    cache_decoder_init = tf.zeros(
        [batch_size * beam_size, 0, self.num_blocks, self.num_cell_units])
    if self.lm:
        cache_lm_init = tf.zeros([batch_size * beam_size, 0,
                                  self.lm.args.model.decoder.num_blocks,
                                  self.lm.args.model.decoder.num_cell_units])
    else:
        cache_lm_init = tf.zeros([0, 0, 0, 0])

    # collect the initial states of the lstms used in the decoder.
    base_indices = tf.reshape(
        tf.tile(tf.range(batch_size)[:, None], multiples=[1, beam_size]),
        shape=[-1])

    encoder_padding = tf.equal(
        tf.sequence_mask(len_encoded, maxlen=tf.shape(encoded)[1]), False)  # bool tensor
    encoder_attention_bias = common_attention.attention_bias_ignore_padding(encoder_padding)

    def step(i, preds, scores, cache_decoder, cache_lm, logits, len_decoded, finished):
        """
        the cache has no specific shape, so it cannot be put in all_states
        """
        preds_emb = self.embedding(preds)
        decoder_input = preds_emb

        decoder_output, cache_decoder = self.decoder_with_caching_impl(
            decoder_input, cache_decoder, encoded, encoder_attention_bias)

        cur_logit = tf.layers.dense(inputs=decoder_output[:, -1, :],
                                    units=self.dim_output,
                                    activation=None,
                                    use_bias=False,
                                    name='decoder_fc')

        logits = tf.concat([logits, cur_logit[:, None]], 1)
        z = tf.nn.log_softmax(cur_logit)  # [batch*beam, size_output]

        # language model inference
        if self.args.model.shallow_fusion:
            assert self.lm
            preds_emb = self.lm.decoder.embedding(preds)

            with tf.variable_scope(self.args.top_scope, reuse=True):
                with tf.variable_scope(self.args.lm_scope):
                    lm_output, cache_lm = self.lm.decoder.decoder_with_caching_impl(
                        preds_emb, cache_lm)
                    logit_lm = dense(inputs=lm_output[:, -1, :],
                                     units=self.dim_output,
                                     kernel=tf.transpose(self.lm.decoder.fully_connected),
                                     use_bias=False)
            z_lm = self.lambda_lm * tf.nn.log_softmax(logit_lm)  # [batch*beam, size_output]
        else:
            z_lm = tf.zeros_like(z)

        # rank the combined scores
        next_scores, next_preds = tf.nn.top_k(z + z_lm, k=beam_size, sorted=True)
        next_preds = tf.to_int32(next_preds)

        # beamed scores & pruning
        scores = scores[:, None] + next_scores  # [batch_size * beam_size, beam_size]
        scores = tf.reshape(scores, shape=[batch_size, beam_size * beam_size])

        _, k_indices = tf.nn.top_k(scores, k=beam_size)
        k_indices = base_indices * beam_size * beam_size + tf.reshape(
            k_indices, shape=[-1])  # [batch_size * beam_size]
        # Update scores.
        scores = tf.reshape(scores, [-1])
        scores = tf.gather(scores, k_indices)

        # Update predictions.
        next_preds = tf.reshape(next_preds, shape=[-1])
        next_preds = tf.gather(next_preds, indices=k_indices)

        # k_indices ranges over [0, batch*beam*beam); preds over [0, batch*beam).
        # preds, cache_lm, cache_decoder are shared while the beam expands over the vocabulary.
        preds = tf.gather(preds, indices=k_indices // beam_size)
        cache_lm = tf.gather(cache_lm, indices=k_indices // beam_size)
        cache_decoder = tf.gather(cache_decoder, indices=k_indices // beam_size)
        preds = tf.concat([preds, next_preds[:, None]], axis=1)  # [batch_size * beam_size, i]

        has_eos = tf.equal(next_preds, self.end_token)
        finished = tf.logical_or(finished, has_eos)
        len_decoded += 1 - tf.to_int32(finished)
        # i = tf.Print(i, [i], message='i: ', summarize=1000)

        return i + 1, preds, scores, cache_decoder, cache_lm, logits, len_decoded, finished

    def not_finished(i, preds, scores, cache_decoder, cache_lm, logit, len_decoded, finished):
        # i = tf.Print(i, [i], message='i: ', summarize=1000)
        return tf.logical_and(
            tf.reduce_any(tf.logical_not(finished)),
            tf.less(
                i,
                tf.reduce_min([tf.shape(encoded)[1], self.args.max_len])  # maxlen = 100
            ))

    _, preds, scores_am, _, _, logits, len_decoded, finished = tf.while_loop(
        cond=not_finished,
        body=step,
        loop_vars=[0, token_init, scores_init, cache_decoder_init, cache_lm_init,
                   logits_init, len_decoded_init, finished_init],
        shape_invariants=[
            tf.TensorShape([]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None]),
            tf.TensorShape([None, None, None, None]),
            tf.TensorShape([None, None, None, None]),
            tf.TensorShape([None, None, self.dim_output]),
            tf.TensorShape([None]),
            tf.TensorShape([None])
        ])

    # [batch_size * beam_size, ...]
    len_decoded -= 1 - tf.to_int32(finished)  # adjust lengths for sequences cut off by the encoder length limit
    preds = preds[:, 1:]
    not_padding = tf.sequence_mask(len_decoded, dtype=tf.int32)
    preds *= not_padding

    # [batch_size, beam_size, ...]
    if self.args.model.rerank:
        assert self.lm
        with tf.variable_scope(self.args.top_scope, reuse=True):
            with tf.variable_scope(self.args.lm_scope):
                scores_lm, distribution = self.lm.decoder.score(preds, len_decoded)

        scores_lm = self.args.lambda_rerank * scores_lm
    else:
        scores_lm = tf.zeros_like(scores_am)

    scores = scores_am + scores_lm

    # tf.nn.top_k is used to sort `scores`
    scores_sorted, sorted = tf.nn.top_k(tf.reshape(scores, [batch_size, beam_size]),
                                        k=beam_size,
                                        sorted=True)
    sorted = base_indices * beam_size + tf.reshape(sorted, shape=[-1])  # [batch_size * beam_size]

    # [batch_size * beam_size, ...]
    logits_sorted = tf.gather(logits, sorted)
    preds_sorted = tf.gather(preds, sorted)
    len_decoded_sorted = tf.gather(len_decoded, sorted)
    scores_lm_sorted = tf.gather(scores_lm, sorted)
    scores_am_sorted = tf.gather(scores_am, sorted)

    # [batch_size, beam_size, ...]
    scores_lm_sorted = tf.reshape(scores_lm_sorted, shape=[batch_size, beam_size])
    scores_am_sorted = tf.reshape(scores_am_sorted, shape=[batch_size, beam_size])
    preds_sorted = tf.reshape(preds_sorted,
                              shape=[batch_size, beam_size, -1])  # [batch_size, beam_size, max_length]
    logits_sorted = tf.reshape(logits_sorted,
                               [batch_size, beam_size, -1, self.dim_output])
    len_decoded_sorted = tf.reshape(len_decoded_sorted, [batch_size, beam_size])

    # return logits, final_preds, len_encoded
    return [logits_sorted, preds_sorted, len_decoded_sorted, scores_am_sorted,
            scores_lm_sorted], preds_sorted[:, 0, :], len_decoded_sorted[:, 0]
def __init__(self, source_vocab_size, target_vocab_size, buckets,
             text_hidden_size, speech_hidden_size, parse_hidden_size,
             text_num_layers, speech_num_layers, parse_num_layers,
             embedding_size, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, optimizer, use_lstm=True,
             output_keep_prob=0.8, num_samples=512, forward_only=False):
    """Create the model."""
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.epoch = 0
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
        # The decoder is the parse cell, so the projection maps from the
        # parse hidden size to the target vocabulary.
        w = tf.get_variable("proj_w",
                            [parse_hidden_size, self.target_vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.target_vocab_size])
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                              num_samples,
                                              self.target_vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    def create_cell(hidden_size, num_layers):
        single_cell = tf.nn.rnn_cell.GRUCell(hidden_size)
        if use_lstm:
            single_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size,
                                                       state_is_tuple=True)
        if not forward_only:
            # always use dropout; set keep_prob=1 if not dropout
            print("Training mode; dropout used!")
            single_cell = tf.nn.rnn_cell.DropoutWrapper(
                single_cell, output_keep_prob=output_keep_prob)
        cell = single_cell
        if num_layers > 1:
            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers,
                                               state_is_tuple=True)
        return cell

    text_cell = create_cell(text_hidden_size, text_num_layers)
    speech_cell = create_cell(speech_hidden_size, speech_num_layers)
    parse_cell = create_cell(parse_hidden_size, parse_num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs_list, decoder_inputs, text_len, do_decode,
                  attn_vec_size):
        return many2one_seq2seq.many2one_attention_seq2seq(
            encoder_inputs_list, decoder_inputs, text_len, text_cell,
            speech_cell, parse_cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=embedding_size,
            output_projection=output_projection,
            feed_previous=do_decode,
            attention_vec_size=attn_vec_size)

    # Feeds for inputs.
    self.text_encoder_inputs = []
    self.speech_encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.text_encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="text_encoder{0}".format(i)))
    for i in xrange(buckets[-1][0] * spscale):
        self.speech_encoder_inputs.append(
            tf.placeholder(tf.float32, shape=[None, mfcc_num],
                           name="speech_encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))
    self.encoder_inputs_list = [
        self.text_encoder_inputs, self.speech_encoder_inputs
    ]

    # seq_len stuff:
    _batch_size = tf.shape(self.text_encoder_inputs[0])[0]
    self.seq_len = tf.fill(tf.expand_dims(_batch_size, 0),
                           tf.constant(2, dtype=tf.int64))

    # Our targets are decoder inputs shifted by one.
    targets = [
        self.decoder_inputs[i + 1]
        for i in xrange(len(self.decoder_inputs) - 1)
    ]

    # Training outputs and losses.
    if forward_only:
        self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
            self.encoder_inputs_list, self.decoder_inputs, targets,
            self.target_weights, self.seq_len, buckets,
            lambda x, y, z: seq2seq_f(x, y, z, True, attn_vec_size),
            softmax_loss_function=softmax_loss_function, spscale=spscale)
        # If we use output projection, we need to project outputs for decoding.
        if output_projection is not None:
            for b in xrange(len(buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1] for output in self.outputs[b]
                ]
    else:
        self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
            self.encoder_inputs_list, self.decoder_inputs, targets,
            self.target_weights, self.seq_len, buckets,
            lambda x, y, z: seq2seq_f(x, y, z, False, attn_vec_size),
            softmax_loss_function=softmax_loss_function, spscale=spscale)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        # Make optimizer a hyperparameter
        if optimizer == "momentum":
            opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)
        elif optimizer == "grad_descent":
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        elif optimizer == "adagrad":
            print("Using adagrad optimizer")
            opt = tf.train.AdagradOptimizer(self.learning_rate)
        else:
            print("Using Adam optimizer")
            opt = tf.train.AdamOptimizer(self.learning_rate)

        for b in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
def _build(self, obs_input, act_input, name=None): return_var = tf.compat.v1.get_variable( 'return_var', (), initializer=tf.constant_initializer(0.5)) return tf.fill((tf.shape(obs_input)[0], self.output_dim), return_var)
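# The snippet above broadcasts one learned scalar into a full [batch, output_dim]
# prediction; because tf.fill has a gradient (the sum over the filled positions),
# `return_var` remains trainable. A quick check with assumed shapes:
import tensorflow as tf

obs_input = tf.placeholder(tf.float32, [None, 4])  # hypothetical observation dim
return_var = tf.get_variable('return_var', (),
                             initializer=tf.constant_initializer(0.5))
out = tf.fill((tf.shape(obs_input)[0], 3), return_var)  # output_dim assumed 3

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(out, {obs_input: [[0., 0., 0., 0.], [1., 1., 1., 1.]]}))
    # [[0.5 0.5 0.5]
    #  [0.5 0.5 0.5]]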
def build_graph(mode, config, sequence_example_file_paths=None): """Builds the TensorFlow graph. Args: mode: 'train', 'eval', or 'generate'. Only mode related ops are added to the graph. config: An EventSequenceRnnConfig containing the encoder/decoder and HParams to use. sequence_example_file_paths: A list of paths to TFRecord files containing tf.train.SequenceExample protos. Only needed for training and evaluation. May be a sharded file of the form. Returns: A tf.Graph instance which contains the TF ops. Raises: ValueError: If mode is not 'train', 'eval', or 'generate'. """ if mode not in ('train', 'eval', 'generate'): raise ValueError("The mode parameter must be 'train', 'eval', " "or 'generate'. The mode parameter was: %s" % mode) hparams = config.hparams encoder_decoder = config.encoder_decoder tf.logging.info('hparams = %s', hparams.values()) input_size = encoder_decoder.input_size num_classes = encoder_decoder.num_classes no_event_label = encoder_decoder.default_event_label with tf.Graph().as_default() as graph: inputs, labels, lengths, = None, None, None state_is_tuple = True if mode == 'train' or mode == 'eval': inputs, labels, lengths, ids = magenta.common.get_padded_batch( sequence_example_file_paths, hparams.batch_size, input_size) tf.add_to_collection('ids', ids) elif mode == 'generate': inputs = tf.placeholder(tf.float32, [hparams.batch_size, None, input_size]) # If state_is_tuple is True, the output RNN cell state will be a tuple # instead of a tensor. During training and evaluation this improves # performance. However, during generation, the RNN cell state is fed # back into the graph with a feed dict. Feed dicts require passed in # values to be tensors and not tuples, so state_is_tuple is set to False. state_is_tuple = False if config.learn_initial_state: state_is_tuple = False cell = make_rnn_cell(hparams.rnn_layer_sizes, dropout_keep_prob=hparams.dropout_keep_prob, attn_length=hparams.attn_length, state_is_tuple=state_is_tuple) # Old: use zero if not config.learn_initial_state or mode == 'generate': initial_state = cell.zero_state(hparams.batch_size, tf.float32) # Learn initial state, complex variable/placeholder construction else: initial_state_size = cell.zero_state(hparams.batch_size, tf.float32).get_shape() initial_state_in = tf.placeholder(tf.float32, shape=initial_state_size) initial_state = tf.Variable(initial_state_in, tf.float32) tf.add_to_collection('initial_state_size', initial_state_size.as_list()) tf.add_to_collection('initial_state_in', initial_state_in) tf.add_to_collection('initial_state', initial_state) tf.add_to_collection('initial_state_init', tf.variables_initializer([initial_state])) outputs, final_state = tf.nn.dynamic_rnn( cell, inputs, lengths, initial_state, parallel_iterations=1, swap_memory=True) outputs_flat = tf.reshape(outputs, [-1, hparams.rnn_layer_sizes[-1]]) logits_flat = tf.contrib.layers.linear(outputs_flat, num_classes) if mode == 'train' or mode == 'eval': if hparams.skip_first_n_losses: logits = tf.reshape(logits_flat, [hparams.batch_size, -1, num_classes]) logits = logits[:, hparams.skip_first_n_losses:, :] logits_flat = tf.reshape(logits, [-1, num_classes]) labels = labels[:, hparams.skip_first_n_losses:] labels_flat = tf.reshape(labels, [-1]) softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=labels_flat, logits=logits_flat) loss = tf.reduce_mean(softmax_cross_entropy) perplexity = tf.reduce_mean(tf.exp(softmax_cross_entropy)) correct_predictions = tf.to_float( tf.nn.in_top_k(logits_flat, labels_flat, 
1)) accuracy = tf.reduce_mean(correct_predictions) * 100 event_positions = tf.to_float(tf.not_equal(labels_flat, no_event_label)) event_accuracy = tf.truediv( tf.reduce_sum(tf.multiply(correct_predictions, event_positions)), tf.reduce_sum(event_positions)) * 100 no_event_positions = tf.to_float(tf.equal(labels_flat, no_event_label)) no_event_accuracy = tf.truediv( tf.reduce_sum(tf.multiply(correct_predictions, no_event_positions)), tf.reduce_sum(no_event_positions)) * 100 global_step = tf.Variable(0, trainable=False, name='global_step') tf.add_to_collection('loss', loss) tf.add_to_collection('perplexity', perplexity) tf.add_to_collection('accuracy', accuracy) tf.add_to_collection('global_step', global_step) summaries = [ tf.summary.scalar('loss', loss), tf.summary.scalar('perplexity', perplexity), tf.summary.scalar('accuracy', accuracy), tf.summary.scalar( 'event_accuracy', event_accuracy), tf.summary.scalar( 'no_event_accuracy', no_event_accuracy), ] if mode == 'train': learning_rate = tf.train.exponential_decay( hparams.initial_learning_rate, global_step, hparams.decay_steps, hparams.decay_rate, staircase=True, name='learning_rate') opt = tf.train.AdamOptimizer(learning_rate) params = tf.trainable_variables() gradients = tf.gradients(loss, params) clipped_gradients, _ = tf.clip_by_global_norm(gradients, hparams.clip_norm) train_op = opt.apply_gradients(zip(clipped_gradients, params), global_step) tf.add_to_collection('learning_rate', learning_rate) tf.add_to_collection('train_op', train_op) m = tf.placeholder(tf.float32) v = tf.placeholder(tf.float32) assign_m = opt.get_slot(initial_state, 'm').assign(m) assign_v = opt.get_slot(initial_state, 'v').assign(v) read_m = opt.get_slot(initial_state, 'm') read_v = opt.get_slot(initial_state, 'v') tf.add_to_collection('m', m) tf.add_to_collection('v', v) tf.add_to_collection('assign_m', assign_m) tf.add_to_collection('assign_v', assign_v) tf.add_to_collection('read_m', read_m) tf.add_to_collection('read_v', read_v) summaries.append(tf.summary.scalar( 'learning_rate', learning_rate)) if mode == 'eval': summary_op = tf.summary.merge(summaries) tf.add_to_collection('summary_op', summary_op) elif mode == 'generate': temperature = tf.placeholder(tf.float32, []) softmax_flat = tf.nn.softmax( tf.div(logits_flat, tf.fill([num_classes], temperature))) softmax = tf.reshape(softmax_flat, [hparams.batch_size, -1, num_classes]) tf.add_to_collection('inputs', inputs) tf.add_to_collection('initial_state', initial_state) tf.add_to_collection('final_state', final_state) tf.add_to_collection('temperature', temperature) tf.add_to_collection('softmax', softmax) init_op = tf.global_variables_initializer() tf.add_to_collection('init_op', init_op) return graph
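# In the 'generate' branch above, dividing the logits by tf.fill([num_classes],
# temperature) is just elementwise division by a scalar: temperatures above 1
# flatten the softmax, temperatures below 1 sharpen it. A standalone sketch:
import tensorflow as tf

logits = tf.constant([[2.0, 1.0, 0.1]])
temperature = tf.placeholder(tf.float32, [])
softmax = tf.nn.softmax(logits / temperature)

with tf.Session() as sess:
    print(sess.run(softmax, {temperature: 1.0}))  # ~[[0.66 0.24 0.10]]
    print(sess.run(softmax, {temperature: 5.0}))  # ~[[0.40 0.33 0.27]]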
def _decode(self, input_dict): """Decodes representation into data. Args: input_dict (dict): Python dictionary with inputs to decoder. Config parameters: * **src_inputs** --- Decoder input Tensor of shape [batch_size, time, dim] or [time, batch_size, dim]. * **src_lengths** --- Decoder input lengths Tensor of shape [batch_size] * **tgt_inputs** --- Only during training. labels Tensor of the shape [batch_size, time] or [time, batch_size]. * **tgt_lengths** --- Only during training. labels lengths Tensor of the shape [batch_size]. Returns: dict: Python dictionary with: * outputs - [predictions, alignments, enc_src_lengths]. predictions are the final predictions of the model. tensor of shape [batch_size, time]. alignments are the attention probabilities if attention is used. None if 'plot_attention' in attention_params is set to False. enc_src_lengths are the lengths of the input. tensor of shape [batch_size]. * logits - logits with the shape=[batch_size, output_dim]. * tgt_length - tensor of shape [batch_size] indicating the predicted sequence lengths. """ encoder_outputs = input_dict['encoder_output']['outputs'] enc_src_lengths = input_dict['encoder_output']['src_length'] self._batch_size = int(encoder_outputs.get_shape()[0]) self._beam_width = self.params.get("beam_width", 1) tgt_inputs = None tgt_lengths = None if 'target_tensors' in input_dict: tgt_inputs = input_dict['target_tensors'][0] tgt_lengths = input_dict['target_tensors'][1] tgt_inputs = tf.concat([ tf.fill([self._batch_size, 1], self.GO_SYMBOL), tgt_inputs[:, :-1] ], -1) layer_type = self.params['rnn_type'] num_layers = self.params['num_layers'] attention_params = self.params['attention_params'] hidden_dim = self.params['hidden_dim'] dropout_keep_prob = self.params.get( 'dropout_keep_prob', 1.0) if self._mode == "train" else 1.0 # To-Do Seperate encoder and decoder position embeddings use_positional_embedding = self.params.get("pos_embedding", False) use_language_model = self.params.get("use_language_model", False) use_beam_search_decoder = (self._beam_width != 1) and (self._mode == "infer") self._target_emb_layer = tf.get_variable( name='TargetEmbeddingMatrix', shape=[self._tgt_vocab_size, self._tgt_emb_size], dtype=tf.float32, ) if use_positional_embedding: self.enc_pos_emb_size = int(encoder_outputs.get_shape()[-1]) self.enc_pos_emb_layer = tf.get_variable( name='EncoderPositionEmbeddingMatrix', shape=[1024, self.enc_pos_emb_size], dtype=tf.float32, ) encoder_output_positions = tf.range(0, tf.shape(encoder_outputs)[1], delta=1, dtype=tf.int32, name='positional_inputs') encoder_position_embeddings = tf.cast(tf.nn.embedding_lookup( self.enc_pos_emb_layer, encoder_output_positions), dtype=encoder_outputs.dtype) encoder_outputs += encoder_position_embeddings self.dec_pos_emb_size = self._tgt_emb_size self.dec_pos_emb_layer = tf.get_variable( name='DecoderPositionEmbeddingMatrix', shape=[1024, self.dec_pos_emb_size], dtype=tf.float32, ) output_projection_layer = FullyConnected( [self._tgt_vocab_size], dropout_keep_prob=dropout_keep_prob, mode=self._mode, ) rnn_cell = cells_dict[layer_type] dropout = tf.nn.rnn_cell.DropoutWrapper multirnn_cell = tf.nn.rnn_cell.MultiRNNCell([ dropout(rnn_cell(hidden_dim), output_keep_prob=dropout_keep_prob) for _ in range(num_layers) ]) if use_beam_search_decoder: encoder_outputs = tf.contrib.seq2seq.tile_batch( encoder_outputs, multiplier=self._beam_width, ) enc_src_lengths = tf.contrib.seq2seq.tile_batch( enc_src_lengths, multiplier=self._beam_width, ) attention_dim = 
attention_params["attention_dim"] attention_type = attention_params["attention_type"] num_heads = attention_params["num_heads"] plot_attention = attention_params["plot_attention"] if plot_attention: if use_beam_search_decoder: plot_attention = False print( "Plotting Attention is disabled for Beam Search Decoding") if num_heads != 1: plot_attention = False print( "Plotting Attention is disabled for Multi Head Attention") if self.params['dtype'] != tf.float32: plot_attention = False print( "Plotting Attention is disabled for Mixed Precision Mode") attention_params_dict = {} if attention_type == "bahadanu": AttentionMechanism = BahdanauAttention attention_params_dict["normalize"] = False, elif attention_type == "chorowski": AttentionMechanism = LocationSensitiveAttention attention_params_dict["use_coverage"] = attention_params[ "use_coverage"] attention_params_dict["location_attn_type"] = attention_type attention_params_dict["location_attention_params"] = { 'filters': 10, 'kernel_size': 101 } elif attention_type == "zhaopeng": AttentionMechanism = LocationSensitiveAttention attention_params_dict["use_coverage"] = attention_params[ "use_coverage"] attention_params_dict["query_dim"] = hidden_dim attention_params_dict["location_attn_type"] = attention_type attention_mechanism = [] for head in range(num_heads): attention_mechanism.append( AttentionMechanism(num_units=attention_dim, memory=encoder_outputs, memory_sequence_length=enc_src_lengths, probability_fn=tf.nn.softmax, dtype=tf.get_variable_scope().dtype, **attention_params_dict)) multirnn_cell_with_attention = AttentionWrapper( cell=multirnn_cell, attention_mechanism=attention_mechanism, attention_layer_size=[hidden_dim for i in range(num_heads)], output_attention=True, alignment_history=plot_attention, ) if self._mode == "train": decoder_output_positions = tf.range(0, tf.shape(tgt_inputs)[1], delta=1, dtype=tf.int32, name='positional_inputs') tgt_input_vectors = tf.nn.embedding_lookup(self._target_emb_layer, tgt_inputs) if use_positional_embedding: tgt_input_vectors += tf.nn.embedding_lookup( self.dec_pos_emb_layer, decoder_output_positions) tgt_input_vectors = tf.cast( tgt_input_vectors, dtype=self.params['dtype'], ) # helper = tf.contrib.seq2seq.TrainingHelper( helper = TrainingHelper( inputs=tgt_input_vectors, sequence_length=tgt_lengths, ) elif self._mode == "infer" or self._mode == "eval": embedding_fn = lambda ids: tf.cast( tf.nn.embedding_lookup(self._target_emb_layer, ids), dtype=self.params['dtype'], ) pos_embedding_fn = None if use_positional_embedding: pos_embedding_fn = lambda ids: tf.cast( tf.nn.embedding_lookup(self.dec_pos_emb_layer, ids), dtype=self.params['dtype'], ) # helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( helper = GreedyEmbeddingHelper( embedding=embedding_fn, start_tokens=tf.fill([self._batch_size], self.GO_SYMBOL), end_token=self.END_SYMBOL, positional_embedding=pos_embedding_fn) if self._mode != "infer": maximum_iterations = tf.reduce_max(tgt_lengths) else: maximum_iterations = tf.reduce_max(enc_src_lengths) if not use_beam_search_decoder: decoder = tf.contrib.seq2seq.BasicDecoder( cell=multirnn_cell_with_attention, helper=helper, initial_state=multirnn_cell_with_attention.zero_state( batch_size=self._batch_size, dtype=encoder_outputs.dtype, ), output_layer=output_projection_layer, ) else: batch_size_tensor = tf.constant(self._batch_size) decoder = BeamSearchDecoder( cell=multirnn_cell_with_attention, embedding=embedding_fn, start_tokens=tf.tile([self.GO_SYMBOL], [self._batch_size]), 
end_token=self.END_SYMBOL, initial_state=multirnn_cell_with_attention.zero_state( dtype=encoder_outputs.dtype, batch_size=batch_size_tensor * self._beam_width, ), beam_width=self._beam_width, output_layer=output_projection_layer, length_penalty_weight=0.0, ) final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode( decoder=decoder, impute_finished=self.mode != "infer", maximum_iterations=maximum_iterations, ) if plot_attention: alignments = tf.transpose(final_state.alignment_history[0].stack(), [1, 0, 2]) else: alignments = None if not use_beam_search_decoder: outputs = tf.argmax(final_outputs.rnn_output, axis=-1) logits = final_outputs.rnn_output return_outputs = [outputs, alignments, enc_src_lengths] else: outputs = final_outputs.predicted_ids[:, :, 0] logits = final_outputs.predicted_ids[:, :, 0] return_outputs = [outputs, enc_src_lengths] if self.mode == "eval": max_len = tf.reduce_max(tgt_lengths) logits = tf.while_loop( lambda logits: max_len > tf.shape(logits)[1], lambda logits: tf.concat([ logits, tf.fill([tf.shape(logits)[0], 1, tf.shape(logits)[2]], tf.cast(1.0, self.params['dtype'])) ], 1), loop_vars=[logits], back_prop=False, ) return { 'outputs': return_outputs, 'logits': logits, 'tgt_length': final_sequence_lengths, }
def _build_decoder(self, encoder_outputs, encoder_state, hparams): """Build and run a RNN decoder with a final projection layer. Args: encoder_outputs: The outputs of encoder for every time step. encoder_state: The final state of the encoder. hparams: The Hyperparameters configurations. Returns: A tuple of final logits and final decoder state: logits: size [time, batch_size, vocab_size] when time_major=True. """ tgt_sos_id = tf.cast( self.tgt_vocab_table.lookup(tf.constant(hparams.sos)), tf.int32) tgt_eos_id = tf.cast( self.tgt_vocab_table.lookup(tf.constant(hparams.eos)), tf.int32) num_layers = hparams.num_layers num_gpus = hparams.num_gpus iterator = self.iterator # maximum_iteration: The maximum decoding steps. maximum_iterations = self._get_infer_maximum_iterations( hparams, iterator.source_sequence_length) ## Decoder. with tf.variable_scope("decoder") as decoder_scope: cell, decoder_initial_state = self._build_decoder_cell( hparams, encoder_outputs, encoder_state, iterator.source_sequence_length) ## Train or eval if self.mode != tf.contrib.learn.ModeKeys.INFER: # decoder_emp_inp: [max_time, batch_size, num_units] target_input = iterator.target_input if self.time_major: target_input = tf.transpose(target_input) decoder_emb_inp = tf.reshape(target_input, [ self.get_max_time(target_input), hparams.batch_size, hparams.num_units ]) #tf.nn.embedding_lookup( #self.embedding_decoder, target_input) # Helper helper = tf.contrib.seq2seq.TrainingHelper( decoder_emb_inp, iterator.target_sequence_length, time_major=self.time_major) # Decoder my_decoder = tf.contrib.seq2seq.BasicDecoder( cell, helper, decoder_initial_state, ) # Dynamic decoding outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode( my_decoder, output_time_major=self.time_major, swap_memory=True, scope=decoder_scope) sample_id = outputs.sample_id # Note: there's a subtle difference here between train and inference. # We could have set output_layer when create my_decoder # and shared more code between train and inference. # We chose to apply the output_layer to all timesteps for speed: # 10% improvements for small models & 20% for larger ones. # If memory is a concern, we should apply output_layer per timestep. device_id = num_layers if num_layers < num_gpus else ( num_layers - 1) with tf.device(model_helper.get_device_str( device_id, num_gpus)): logits = self.output_layer(outputs.rnn_output) ## Inference else: beam_width = hparams.beam_width length_penalty_weight = hparams.length_penalty_weight start_tokens = tf.fill([self.batch_size], tgt_sos_id) end_token = tgt_eos_id if beam_width > 0: my_decoder = tf.contrib.seq2seq.BeamSearchDecoder( cell=cell, embedding=self.embedding_decoder, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state, beam_width=beam_width, output_layer=self.output_layer, length_penalty_weight=length_penalty_weight) else: # Helper helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( self.embedding_decoder, start_tokens, end_token) # Decoder my_decoder = tf.contrib.seq2seq.BasicDecoder( cell, helper, decoder_initial_state, output_layer=self.output_layer # applied per timestep ) # Dynamic decoding outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode( my_decoder, maximum_iterations=maximum_iterations, output_time_major=self.time_major, swap_memory=True, scope=decoder_scope) if beam_width > 0: logits = tf.no_op() sample_id = outputs.predicted_ids else: logits = outputs.rnn_output sample_id = outputs.sample_id return logits, sample_id, final_context_state
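# Both inference branches above seed decoding with the start symbol repeated
# across the batch; tf.fill([batch_size], sos_id) and tf.tile([sos_id],
# [batch_size]) produce the same tensor. A tiny illustration:
import tensorflow as tf

batch_size = tf.placeholder(tf.int32, [])
sos_id = tf.constant(1, tf.int32)
start_tokens = tf.fill([batch_size], sos_id)

with tf.Session() as sess:
    print(sess.run(start_tokens, {batch_size: 4}))  # [1 1 1 1]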
def custom_loss(y_true, y_pred): mask_shape = tf.shape(y_true)[:5] cell_x = tf.to_float( tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1))) # ridiculous equivalent of np.repeat cell_y = tf.reshape( tf.tile(tf.reshape(tf.range(GRID_H), [-1, 1]), [1, GRID_W]), [-1]) # tile and reshape in same way as cell_x cell_y = tf.to_float(tf.reshape(cell_y, (1, GRID_H, GRID_W, 1, 1))) # combine to give grid cell_grid = tf.tile(tf.concat([cell_x, cell_y], -1), [BATCH_SIZE, 1, 1, 5, 1]) seen = tf.Variable(0.) """ Adjust Predictions """ # adjust x and y pred_box_xy = tf.sigmoid(y_pred[..., :2]) # new line convert to whole image pred_box_xy_wi = pred_box_xy + cell_grid pred_box_xy_wi = tf.divide(pred_box_xy_wi, [GRID_W, GRID_H]) # adjust w and h pred_box_wh = y_pred[..., 2:4] # new line adjust so relative to whole image pred_box_wh_wi = tf.exp(y_pred[..., 2:4]) * tf.reshape( ANCHORS, [1, 1, 1, BOX, 2]) pred_box_wh_wi = tf.divide(pred_box_wh_wi, [GRID_W, GRID_H]) # adjust confidence pred_box_conf = tf.sigmoid(y_pred[..., 4]) # adjust class probabilities pred_box_class = tf.sigmoid(y_pred[..., 5]) """ Adjust ground truth for just cells with a centre of a ground truth """ # adjust x and y true_box_xy = y_true[..., 0:2] # relative position to the containing cell # add new line give relative to whole image true_box_xy_wi = tf.divide(tf.add(true_box_xy, cell_grid), [GRID_W, GRID_H]) # get w and h true_box_wh_wi = y_true[..., 2:4] # adjust w and h true_box_wh = tf.multiply(true_box_wh_wi, [GRID_W, GRID_H]) true_box_wh = true_box_wh / tf.reshape(ANCHORS, [1, 1, 1, BOX, 2]) true_box_wh = tf.log(true_box_wh + 0.00001) # the + 0.00001 takes out zeros which can't be logged these should then be multiplied by zero again later # adjust confidence true_wh_half = true_box_wh_wi / 2. true_mins = true_box_xy_wi - true_wh_half true_maxes = true_box_xy_wi + true_wh_half pred_wh_half = pred_box_wh_wi / 2. pred_mins = pred_box_xy_wi - pred_wh_half pred_maxes = pred_box_xy_wi + pred_wh_half intersect_mins = tf.maximum(pred_mins, true_mins) intersect_maxes = tf.minimum(pred_maxes, true_maxes) intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.) intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] true_areas = true_box_wh_wi[..., 0] * true_box_wh_wi[..., 1] pred_areas = pred_box_wh_wi[..., 0] * pred_box_wh_wi[..., 1] union_areas = pred_areas + true_areas - intersect_areas iou_scores = tf.truediv(intersect_areas, union_areas) """ Calculate IOU with any truth """ # confidence mask: penalize predictors + penalize boxes with low IOU # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6 true_xy = true_boxes[..., 0:2] true_xy_wi = tf.divide(tf.add(true_xy, tf.expand_dims(cell_grid, axis=4)), [GRID_W, GRID_H]) true_wh_wi = true_boxes[..., 2:4] true_wh_half2 = true_wh_wi / 2. true_mins2 = true_xy_wi - true_wh_half2 true_maxes2 = true_xy_wi + true_wh_half2 pred_xy_wi = tf.expand_dims(pred_box_xy_wi, 4) pred_wh_wi = tf.expand_dims(pred_box_wh_wi, 4) pred_wh_half2 = pred_wh_wi / 2. pred_mins2 = pred_xy_wi - pred_wh_half2 pred_maxes2 = pred_xy_wi + pred_wh_half2 intersect_mins2 = tf.maximum(pred_mins2, true_mins2) intersect_maxes2 = tf.minimum(pred_maxes2, true_maxes2) intersect_wh2 = tf.maximum(intersect_maxes2 - intersect_mins2, 0.) 
intersect_areas2 = intersect_wh2[..., 0] * intersect_wh2[..., 1] true_areas2 = true_wh_wi[..., 0] * true_wh_wi[..., 1] pred_areas2 = pred_wh_wi[..., 0] * pred_wh_wi[..., 1] union_areas2 = pred_areas2 + true_areas2 - intersect_areas2 iou_scores_all = tf.truediv(intersect_areas2, union_areas2) best_ious = tf.reduce_max(iou_scores_all, axis=4) # create masks ones and no ones noones = tf.to_float(best_ious < NO_OBJ_THRESHOLD) ones = y_true[..., 4] """ Warm-up training """ seen = tf.assign_add(seen, 1.) warm_xy = tf.fill(mask_shape, 0.5) warm_xy = warm_xy[..., 0:2] warm_wh = tf.fill(mask_shape, 0.) warm_wh = warm_wh[..., 2:4] warm_no = tf.fill(mask_shape[0:4], 1.) true_box_xy, true_box_wh, coord_scale, coord_mask = tf.cond( tf.less(seen, WARM_UP_BATCHES), lambda: [warm_xy, warm_wh, 0.01, warm_no], lambda: [true_box_xy, true_box_wh, COORD_SCALE, ones]) """ Finalize the loss """ loss_conf = tf.sqrt( tf.reduce_sum( tf.square((iou_scores - pred_box_conf) * ones * OBJECT_SCALE))) loss_noconf = tf.sqrt( tf.reduce_sum( tf.square((0. - pred_box_conf) * noones * NO_OBJECT_SCALE))) loss_class = tf.sqrt( tf.reduce_sum(tf.square((1. - pred_box_class) * ones * CLASS_SCALE))) coord_mask = tf.expand_dims(coord_mask, axis=-1) loss_xy = tf.sqrt( tf.reduce_sum( tf.square((true_box_xy - pred_box_xy) * coord_mask * COORD_SCALE))) loss_wh = tf.sqrt( tf.reduce_sum( tf.square((true_box_wh - pred_box_wh) * coord_mask * COORD_SCALE))) loss_all = loss_xy + loss_wh + loss_conf + loss_class + loss_noconf loss = tf.square(loss_all) """ Debugging code """ # test1 = pred_box_conf test2 = tf.reduce_max(pred_box_xy) test3 = tf.reduce_max(true_box_xy) loss = tf.Print(loss, [test2, test3], message='\t') loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000) loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000) loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000) loss = tf.Print(loss, [loss_noconf], message='Loss No Conf \t', summarize=1000) loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000) loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000) return loss
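# The warm-up block above uses tf.cond to substitute neutral targets (cell
# centers for xy, zeros for wh) during the first WARM_UP_BATCHES steps. A
# reduced sketch with assumed constants:
import tensorflow as tf

WARM_UP_BATCHES = 100.  # assumed constant
seen = tf.Variable(0.)
seen_inc = tf.assign_add(seen, 1.)

true_xy = tf.constant([[0.3, 0.7]])
warm_xy = tf.fill(tf.shape(true_xy), 0.5)  # neutral cell-center target

target_xy = tf.cond(tf.less(seen_inc, WARM_UP_BATCHES),
                    lambda: warm_xy,   # warm-up: pull boxes toward cell centers
                    lambda: true_xy)   # afterwards: real ground truth

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(target_xy))  # [[0.5 0.5]] during warm-up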
def __init__(self):
    # 1. Define the inputs
    self.X = tf.placeholder(tf.int32, [None, None])
    self.Y = tf.placeholder(tf.int32, [None, None])
    self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
    self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
    batch_size = tf.shape(self.X)[0]
    main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
    # prepend the GO token to the decoder input
    decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)

    logits = tf.reduce_mean(model_GPT2.model(params, self.X)['logits'], axis=1)
    state_proj = tf.layers.Dense(params.n_embd)
    init_state = state_proj(logits)  # build the decoder's initial state

    # word embeddings
    embedding = tf.Variable(
        tf.random_uniform([len(id2vocab_to), params.n_embd], -1, 1))
    cell = tf.nn.rnn_cell.LSTMCell(params.n_embd)
    vocab_proj = tf.layers.Dense(len(id2vocab_to))

    # decoding (training)
    helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=tf.nn.embedding_lookup(embedding, decoder_input),
        sequence_length=tf.to_int32(self.Y_seq_len))
    encoder_state = tf.nn.rnn_cell.LSTMStateTuple(c=init_state, h=init_state)
    decoder = tf.contrib.seq2seq.BasicDecoder(cell=cell,
                                              helper=helper,
                                              initial_state=encoder_state,
                                              output_layer=vocab_proj)
    decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder, maximum_iterations=tf.reduce_max(self.Y_seq_len))

    # inference: greedy search
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=embedding,
        start_tokens=tf.tile(tf.constant([GO], dtype=tf.int32),
                             [tf.shape(init_state)[0]]),
        end_token=EOS)
    decoder = tf.contrib.seq2seq.BasicDecoder(cell=cell,
                                              helper=helper,
                                              initial_state=encoder_state,
                                              output_layer=vocab_proj)
    predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder, maximum_iterations=2 * tf.reduce_max(self.X_seq_len))

    self.training_logits = decoder_output.rnn_output
    self.predicting_ids = predicting_decoder_output.sample_id
    self.logits = decoder_output.sample_id  # greedy ids from the training decoder
    masks = tf.sequence_mask(self.Y_seq_len,
                             tf.reduce_max(self.Y_seq_len),
                             dtype=tf.float32)
    self.cost = tf.contrib.seq2seq.sequence_loss(logits=self.training_logits,
                                                 targets=self.Y,
                                                 weights=masks)
    self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
    y_t = tf.argmax(self.training_logits, axis=2)
    y_t = tf.cast(y_t, tf.int32)
    self.prediction = tf.boolean_mask(y_t, masks)
    mask_label = tf.boolean_mask(self.Y, masks)
    correct_pred = tf.equal(self.prediction, mask_label)
    self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') attention_mask = inputs.get('attention_mask', attention_mask) token_type_ids = inputs.get('token_type_ids', token_type_ids) position_ids = inputs.get('position_ids', position_ids) head_mask = inputs.get('head_mask', head_mask) inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time" ) elif input_ids is not None: input_shape = shape_list(input_ids) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError( "You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = tf.fill(input_shape, 1) if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if not head_mask is None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) embedding_output = self.embeddings( [input_ids, position_ids, token_type_ids, inputs_embeds], training=training) encoder_outputs = self.encoder( [embedding_output, extended_attention_mask, head_mask], training=training) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output[:, 0]) # add hidden_states and attentions if they are here outputs = ( sequence_output, pooled_output, ) + encoder_outputs[1:] # sequence_output, pooled_output, (hidden_states), (attentions) return outputs
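# The mask construction above turns a [batch, seq] 0/1 padding mask into an
# additive [batch, 1, 1, seq] bias that broadcasts over heads and query
# positions: 0 where tokens are attended, -10000 where they are masked. A
# standalone sketch:
import tensorflow as tf

attention_mask = tf.constant([[1, 1, 1, 0]])             # [batch, to_seq_len]
extended = attention_mask[:, tf.newaxis, tf.newaxis, :]  # [batch, 1, 1, to_seq_len]
extended = tf.cast(extended, tf.float32)
additive_bias = (1.0 - extended) * -10000.0

with tf.Session() as sess:
    print(sess.run(additive_bias))  # [[[[-0. -0. -0. -10000.]]]]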
hparams.max_gradient_norm) # Optimization optimizer = tf.train.AdamOptimizer(hparams.learning_rate) train_op = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=global_step) #optimizer = tf.train.GradientDescentOptimizer(hparams.learning_rate) #train_op = optimizer.minimize(loss, global_step=global_step) else: # source_sequence_length = hparams.encoder_length # maximum_iterations = tf.round(tf.reduce_max(source_sequence_length) * 2) inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder( cell=decoder_cell, embedding=embedding_decoder, start_tokens=tf.fill([hparams.batch_size], tgt_sos_id), end_token=tgt_eos_id, initial_state=initial_state, beam_width=hparams.beam_width, output_layer=projection_layer, length_penalty_weight=0.0) # Dynamic decoding outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder, maximum_iterations=10) translations = outputs.predicted_ids #%% # Tweet with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver()
import matplotlib.pyplot as plt
import tensorflow as tf

sess = tf.Session()
x_vals = tf.linspace(-3., 5., 500)
target = tf.constant(1.)
targets = tf.fill([500], 1.)

# Hinge loss
hinge_y_vals = tf.maximum(0., 1. - tf.multiply(target, x_vals))
hinge_y_out = sess.run(hinge_y_vals)

# Cross-entropy loss
xentropy_y_vals = -tf.multiply(target, tf.log(x_vals)) - tf.multiply(
    (1. - target), tf.log(1. - x_vals))
xentropy_y_out = sess.run(xentropy_y_vals)

# Sigmoid cross-entropy loss
xentropy_sigmoid_y_vals = tf.nn.sigmoid_cross_entropy_with_logits(
    logits=x_vals, labels=targets)
xentropy_sigmoid_y_out = sess.run(xentropy_sigmoid_y_vals)

# Weighted cross-entropy loss
weight = tf.constant(0.5)
xentropy_weighted_y_vals = tf.nn.weighted_cross_entropy_with_logits(
    targets, x_vals, weight)
xentropy_weighted_y_out = sess.run(xentropy_weighted_y_vals)
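# The snippet imports matplotlib but stops before plotting; a plausible
# continuation (not part of the original) that draws the computed curves:
x_array = sess.run(x_vals)
plt.plot(x_array, hinge_y_out, 'b-', label='Hinge Loss')
plt.plot(x_array, xentropy_sigmoid_y_out, 'r--', label='Sigmoid Cross Entropy')
plt.plot(x_array, xentropy_weighted_y_out, 'g:',
         label='Weighted Cross Entropy (0.5)')
plt.ylim(-1.5, 3)
plt.legend(loc='lower right')
plt.show()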
def process_decoding_input(targets, word_to_int, batch_size): ending = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1]) decoder_input = tf.concat( [tf.fill([batch_size, 1], word_to_int['<GO>']), ending], 1) return decoder_input
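# A quick run of process_decoding_input (vocabulary and values hypothetical):
# the strided slice drops the last token of every row, then the <GO> id is
# prepended, so targets [[5 6 7], [8 9 7]] become [[0 5 6], [0 8 9]].
import tensorflow as tf

word_to_int = {'<GO>': 0}  # hypothetical vocabulary
targets = tf.constant([[5, 6, 7],
                       [8, 9, 7]])
decoder_input = process_decoding_input(targets, word_to_int, batch_size=2)

with tf.Session() as sess:
    print(sess.run(decoder_input))
    # [[0 5 6]
    #  [0 8 9]]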
def optimization(self, prev_W, selective=False, splitting=False, expansion=None):
    if selective:
        all_var = [var for var in tf.trainable_variables()
                   if 'layer%d' % self.n_layers in var.name]
    else:
        all_var = [var for var in tf.trainable_variables()]

    l2_losses = []
    for var in all_var:
        l2_losses.append(tf.nn.l2_loss(var))

    regular_terms = []
    if not splitting and expansion is None:
        for var in all_var:
            if var.name in prev_W.keys():
                prev_w = prev_W[var.name]
                regular_terms.append(tf.nn.l2_loss(var - prev_w))
    else:
        for var in all_var:
            if var.name in prev_W.keys():
                prev_w = prev_W[var.name]
                if len(prev_w.shape) == 1:
                    sliced = var[:prev_w.shape[0]]
                else:
                    sliced = var[:prev_w.shape[0], :prev_w.shape[1]]
                regular_terms.append(tf.nn.l2_loss(sliced - prev_w))

    losses = self.loss + self.l2_lambda * tf.reduce_sum(l2_losses) + \
        self.regular_lambda * tf.reduce_sum(regular_terms)

    opt = tf.train.AdamOptimizer(self.lr)
    grads = opt.compute_gradients(losses, all_var)
    apply_grads = opt.apply_gradients(grads, global_step=self.g_step)

    # L1 proximal step: soft-threshold every trainable variable
    l1_var = [var for var in tf.trainable_variables()]
    l1_op_list = []
    with tf.control_dependencies([apply_grads]):
        for var in l1_var:
            th_t = tf.fill(tf.shape(var), tf.convert_to_tensor(self.l1_lambda))
            zero_t = tf.zeros(tf.shape(var))
            var_temp = var - (th_t * tf.sign(var))
            l1_op = var.assign(
                tf.where(tf.less(tf.abs(var), th_t), zero_t, var_temp))
            l1_op_list.append(l1_op)

    # Group-lasso proximal step on the new input/output weight columns
    GL_var = [var for var in tf.trainable_variables()
              if 'new' in var.name and ('bw' in var.name or 'tw' in var.name)]
    gl_op_list = []
    with tf.control_dependencies([apply_grads]):
        for var in GL_var:
            g_sum = tf.sqrt(tf.reduce_sum(tf.square(var), 0))
            th_t = self.gl_lambda
            gw = []
            for i in range(var.get_shape()[1]):
                temp_gw = var[:, i] - (th_t * var[:, i] / g_sum[i])
                gw_gl = tf.where(tf.less(g_sum[i], th_t),
                                 tf.zeros(tf.shape(var[:, i])), temp_gw)
                gw.append(gw_gl)
            gl_op = var.assign(tf.stack(gw, 1))
            gl_op_list.append(gl_op)

    with tf.control_dependencies(l1_op_list + gl_op_list):
        self.opt = tf.no_op()
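# The l1_op above is soft-thresholding, i.e. the proximal operator of the L1
# penalty, applied right after the gradient step: weights with |w| < lambda
# snap to exactly zero, the rest shrink toward zero by lambda. Isolated sketch:
import tensorflow as tf

l1_lambda = 0.1  # assumed threshold
var = tf.Variable([0.05, -0.3, 0.08, 0.5])

th = tf.fill(tf.shape(var), tf.convert_to_tensor(l1_lambda))
shrunk = var - th * tf.sign(var)
l1_op = var.assign(tf.where(tf.less(tf.abs(var), th),
                            tf.zeros_like(var),  # small weights -> exactly zero
                            shrunk))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(l1_op))  # [ 0.  -0.2  0.   0.4]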
def _dynamic_decode( self, features, encoder_outputs, encoder_state, encoder_sequence_length, tflite_run=False, ): params = self.params batch_size = tf.shape(tf.nest.flatten(encoder_outputs)[0])[0] start_ids = tf.fill([batch_size], constants.START_OF_SENTENCE_ID) beam_size = params.get("beam_width", 1) if beam_size > 1: # Tile encoder outputs to prepare for beam search. encoder_outputs = tfa.seq2seq.tile_batch(encoder_outputs, beam_size) encoder_sequence_length = tfa.seq2seq.tile_batch( encoder_sequence_length, beam_size ) encoder_state = tf.nest.map_structure( lambda state: tfa.seq2seq.tile_batch(state, beam_size) if state is not None else None, encoder_state, ) # Dynamically decodes from the encoder outputs. initial_state = self.decoder.initial_state( memory=encoder_outputs, memory_sequence_length=encoder_sequence_length, initial_state=encoder_state, ) ( sampled_ids, sampled_length, log_probs, alignment, _, ) = self.decoder.dynamic_decode( self.labels_inputter, start_ids, initial_state=initial_state, decoding_strategy=decoding.DecodingStrategy.from_params(params), sampler=decoding.Sampler.from_params(params), maximum_iterations=params.get("maximum_decoding_length", 250), minimum_iterations=params.get("minimum_decoding_length", 0), tflite_output_size=params.get("tflite_output_size", 250) if tflite_run else None, ) if tflite_run: return sampled_ids target_tokens = self.labels_inputter.ids_to_tokens.lookup( tf.cast(sampled_ids, tf.int64) ) # Maybe replace unknown targets by the source tokens with the highest attention weight. if params.get("replace_unknown_target", False): if alignment is None: raise TypeError( "replace_unknown_target is not compatible with decoders " "that don't return alignment history" ) if not isinstance(self.features_inputter, inputters.WordEmbedder): raise TypeError( "replace_unknown_target is only defined when the source " "inputter is a WordEmbedder" ) source_tokens = features["tokens"] if beam_size > 1: source_tokens = tfa.seq2seq.tile_batch(source_tokens, beam_size) # Merge batch and beam dimensions. original_shape = tf.shape(target_tokens) target_tokens = tf.reshape(target_tokens, [-1, original_shape[-1]]) align_shape = misc.shape_list(alignment) attention = tf.reshape( alignment, [align_shape[0] * align_shape[1], align_shape[2], align_shape[3]], ) # We don't have attention for </s> but ensure that the attention time dimension matches # the tokens time dimension. attention = reducer.align_in_time(attention, tf.shape(target_tokens)[1]) replaced_target_tokens = replace_unknown_target( target_tokens, source_tokens, attention ) target_tokens = tf.reshape(replaced_target_tokens, original_shape) # Maybe add noise to the predictions. decoding_noise = params.get("decoding_noise") if decoding_noise: target_tokens, sampled_length = _add_noise( target_tokens, sampled_length, decoding_noise, params.get("decoding_subword_token", "■"), params.get("decoding_subword_token_is_spacer"), ) alignment = None # Invalidate alignments. 
predictions = {"log_probs": log_probs} if self.labels_inputter.tokenizer.in_graph: detokenized_text = self.labels_inputter.tokenizer.detokenize( tf.reshape(target_tokens, [batch_size * beam_size, -1]), sequence_length=tf.reshape(sampled_length, [batch_size * beam_size]), ) predictions["text"] = tf.reshape(detokenized_text, [batch_size, beam_size]) else: predictions["tokens"] = target_tokens predictions["length"] = sampled_length if alignment is not None: predictions["alignment"] = alignment # Maybe restrict the number of returned hypotheses based on the user parameter. num_hypotheses = params.get("num_hypotheses", 1) if num_hypotheses > 0: if num_hypotheses > beam_size: raise ValueError("n_best cannot be greater than beam_width") for key, value in predictions.items(): predictions[key] = value[:, :num_hypotheses] return predictions
def create_sampling_graph(model_fns, features, params, training = False): if isinstance(params, (list, tuple)): params_list = params params = params_list[0] else: params_list = [params] if not isinstance(model_fns, (list, tuple)): model_fns = [model_fns] decode_length = params.decode_length sample_num = params.mrt_sample top_beams = params.top_beams # [batch, decoded_ids] => [batch, vocab_size] def symbols_to_logits_fn(decoded_ids): features["target"] = tf.pad(decoded_ids[:, 1:], [[0, 0], [0, 1]]) features["target_length"] = tf.fill([tf.shape(features["target"])[0]], tf.shape(features["target"])[1]) results = [] for i, model_fn in enumerate(model_fns): results.append(model_fn(features, params_list[i])) return results batch_size = tf.shape(features["source"])[0] # append <bos> symbol bos_id = params.mapping["target"][params.bos] initial_ids = tf.fill([batch_size], tf.constant(bos_id, dtype=tf.int32)) inputs_old = features["source"] inputs_length_old = features["source_length"] if training: outputs_old = features["target"] outputs_length_old = features["target_length"] #return # Expand the inputs in to the number of samples # [batch, length] => [batch, sample_num, length] features["source"] = tf.expand_dims(features["source"], 1) features["source"] = tf.tile(features["source"], [1, sample_num, 1]) shape = tf.shape(features["source"]) # [batch, sample_num, length] => [batch * sample_num, length] features["source"] = tf.reshape(features["source"], [shape[0] * shape[1], shape[2]]) #return # For source sequence length features["source_length"] = tf.expand_dims(features["source_length"], 1) features["source_length"] = tf.tile(features["source_length"], [1, sample_num]) shape = tf.shape(features["source_length"]) # [batch, sample_num, length] => [batch * sample_num, length] features["source_length"] = tf.reshape(features["source_length"], [shape[0] * shape[1]]) vocab_size = len(params.vocabulary["target"]) # Setting decode length to input length + decode_length decode_length = tf.to_float(tf.shape(features["target"])[1]) \ * tf.constant(params.mrt_length_ratio) decode_length = tf.to_int32(decode_length) ids = sampler(symbols_to_logits_fn, initial_ids, params.mrt_sample, decode_length, vocab_size, eos_id=params.mapping["target"][params.eos], features=features) # Set inputs back to the unexpanded inputs to not to confuse the Estimator features["source"] = inputs_old features["source_length"] = inputs_length_old if training: features["target"] = outputs_old features["target_length"] = outputs_length_old return ids
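# The expand/tile/reshape pattern above replicates each source sequence
# sample_num times, producing a [batch * sample_num, length] tensor in which
# copies of the same source sit next to each other. Reduced sketch:
import tensorflow as tf

source = tf.constant([[1, 2], [3, 4]])  # [batch=2, length=2]
sample_num = 3

s = tf.expand_dims(source, 1)            # [batch, 1, length]
s = tf.tile(s, [1, sample_num, 1])       # [batch, sample_num, length]
shape = tf.shape(s)
s = tf.reshape(s, [shape[0] * shape[1], shape[2]])  # [batch * sample_num, length]

with tf.Session() as sess:
    print(sess.run(s))  # [[1 2] [1 2] [1 2] [3 4] [3 4] [3 4]]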
def __init__(self, data, args, embed): self.init_states = tf.placeholder(tf.float32, (None, args.ch_size), 'ctx_inps') # batch*ch_size self.posts = tf.placeholder(tf.int32, (None, None), 'enc_inps') # batch*len self.posts_length = tf.placeholder(tf.int32, (None, ), 'enc_lens') # batch self.origin_responses = tf.placeholder(tf.int32, (None, None), 'dec_inps') # batch*len self.origin_responses_length = tf.placeholder(tf.int32, (None, ), 'dec_lens') # batch # deal with original data to adapt encoder and decoder batch_size, decoder_len = tf.shape(self.origin_responses)[0], tf.shape( self.origin_responses)[1] self.responses = tf.split(self.origin_responses, [1, decoder_len - 1], 1)[1] # no go_id self.responses_length = self.origin_responses_length - 1 self.responses_input = tf.split(self.origin_responses, [decoder_len - 1, 1], 1)[0] # no eos_id self.responses_target = self.responses decoder_len = decoder_len - 1 self.posts_input = self.posts # batch*len self.decoder_mask = tf.reshape( tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len), reverse=True, axis=1), [-1, decoder_len]) # initialize the training process self.learning_rate = tf.Variable(float(args.lr), trainable=False, dtype=tf.float32) self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * args.lr_decay) self.global_step = tf.Variable(0, trainable=False) # build the embedding table and embedding input if embed is None: # initialize the embedding randomly self.embed = tf.get_variable( 'embed', [data.vocab_size, args.embedding_size], tf.float32) else: # initialize the embedding by pre-trained word vectors self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed) self.encoder_input = tf.nn.embedding_lookup( self.embed, self.posts_input) #batch*len*unit self.decoder_input = tf.nn.embedding_lookup(self.embed, self.responses_input) # build rnn_cell cell_enc = tf.nn.rnn_cell.GRUCell(args.eh_size) cell_ctx = tf.nn.rnn_cell.GRUCell(args.ch_size) cell_dec = tf.nn.rnn_cell.GRUCell(args.dh_size) # build encoder with tf.variable_scope('encoder'): encoder_output, encoder_state = dynamic_rnn(cell_enc, self.encoder_input, self.posts_length, dtype=tf.float32, scope="encoder_rnn") with tf.variable_scope('context'): _, self.context_state = cell_ctx(encoder_state, self.init_states) # get output projection function output_fn = MyDense(data.vocab_size, use_bias=True) sampled_sequence_loss = output_projection_layer( args.dh_size, data.vocab_size, args.softmax_samples) # construct helper and attention train_helper = tf.contrib.seq2seq.TrainingHelper( self.decoder_input, tf.maximum(self.responses_length, 1)) infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( self.embed, tf.fill([batch_size], data.go_id), data.eos_id) attn_mechanism = tf.contrib.seq2seq.LuongAttention( args.dh_size, encoder_output, memory_sequence_length=tf.maximum(self.posts_length, 1)) cell_dec_attn = tf.contrib.seq2seq.AttentionWrapper( cell_dec, attn_mechanism, attention_layer_size=args.dh_size) ctx_state_shaping = tf.layers.dense(self.context_state, args.dh_size, activation=None) dec_start = cell_dec_attn.zero_state( batch_size, dtype=tf.float32).clone(cell_state=ctx_state_shaping) # build decoder (train) with tf.variable_scope('decoder'): decoder_train = tf.contrib.seq2seq.BasicDecoder( cell_dec_attn, train_helper, dec_start) train_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode( decoder_train, impute_finished=True, scope="decoder_rnn") self.decoder_output = train_outputs.rnn_output self.decoder_distribution_teacher, 
self.decoder_loss = sampled_sequence_loss( self.decoder_output, self.responses_target, self.decoder_mask) # build decoder (test) with tf.variable_scope('decoder', reuse=True): decoder_infer = tf.contrib.seq2seq.BasicDecoder( cell_dec_attn, infer_helper, dec_start, output_layer=output_fn) infer_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode( decoder_infer, impute_finished=True, maximum_iterations=args.max_sent_length, scope="decoder_rnn") self.decoder_distribution = infer_outputs.rnn_output self.generation_index = tf.argmax( tf.split(self.decoder_distribution, [2, data.vocab_size - 2], 2)[1], 2) + 2 # for removing UNK # calculate the gradient of parameters and update self.params = [ k for k in tf.trainable_variables() if args.name in k.name ] opt = tf.train.AdamOptimizer(self.learning_rate) gradients = tf.gradients(self.decoder_loss, self.params) clipped_gradients, self.gradient_norm = tf.clip_by_global_norm( gradients, args.grad_clip) self.update = opt.apply_gradients(zip(clipped_gradients, self.params), global_step=self.global_step) # save checkpoint self.latest_saver = tf.train.Saver( write_version=tf.train.SaverDef.V2, max_to_keep=args.checkpoint_max_to_keep, pad_step_number=True, keep_checkpoint_every_n_hours=1.0) self.best_saver = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=1, pad_step_number=True, keep_checkpoint_every_n_hours=1.0) # create summary for tensorboard self.create_summary(args)
def __init__(self, size_layer, num_layers, embedded_size, from_dict_size,
             to_dict_size):
    def cells(reuse=False):
        return tf.nn.rnn_cell.LSTMCell(size_layer,
                                       initializer=tf.orthogonal_initializer(),
                                       reuse=reuse)

    self.X = tf.placeholder(tf.int32, [None, None])
    self.Y = tf.placeholder(tf.int32, [None, None])
    self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)  # sequence lengths
    self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)  # sequence lengths
    batch_size = tf.shape(self.X)[0]

    # word embeddings
    encoder_embedding = tf.Variable(
        tf.random_uniform([from_dict_size, embedded_size], -1, 1))
    decoder_embedding = tf.Variable(
        tf.random_uniform([to_dict_size, embedded_size], -1, 1))
    encoder_embedded = tf.nn.embedding_lookup(encoder_embedding, self.X)

    # encoder
    encoder_cells = tf.nn.rnn_cell.MultiRNNCell(
        [cells() for _ in range(num_layers)])
    self.encoder_out, self.encoder_state = tf.nn.dynamic_rnn(
        cell=encoder_cells,
        inputs=encoder_embedded,
        sequence_length=self.X_seq_len,
        dtype=tf.float32)
    # repeat the top layer's final state to initialize every decoder layer
    encoder_state = tuple(self.encoder_state[-1] for _ in range(num_layers))

    # drop the last token of each target sample and prepend the GO token
    main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
    decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)

    # output projection of the decoder
    dense = tf.layers.Dense(to_dict_size)

    # decoder network
    decoder_cells = tf.nn.rnn_cell.MultiRNNCell(
        [cells() for _ in range(num_layers)])
    training_helper = tf.contrib.seq2seq.TrainingHelper(
        # 1. embed the decoder inputs
        inputs=tf.nn.embedding_lookup(decoder_embedding, decoder_input),
        # 2. the target sequence lengths
        sequence_length=self.Y_seq_len,
        # 3. whether the leading axis is time
        time_major=False)
    training_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=decoder_cells,
        helper=training_helper,
        initial_state=self.encoder_state,
        output_layer=dense)
    training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=training_decoder,
        impute_finished=True,
        maximum_iterations=tf.reduce_max(self.Y_seq_len))
    self.training_logits = training_decoder_output.rnn_output

    predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=decoder_embedding,
        start_tokens=tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
        end_token=EOS)
    predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=decoder_cells,
        helper=predicting_helper,
        initial_state=encoder_state,
        output_layer=dense)
    predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=predicting_decoder,
        impute_finished=True,
        maximum_iterations=tf.reduce_max(self.X_seq_len))
    self.predicting_ids = predicting_decoder_output.sample_id

    masks = tf.sequence_mask(self.Y_seq_len,
                             tf.reduce_max(self.Y_seq_len),
                             dtype=tf.float32)
    self.cost = tf.contrib.seq2seq.sequence_loss(logits=self.training_logits,
                                                 targets=self.Y,
                                                 weights=masks)
    self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)
    y_t = tf.argmax(self.training_logits, axis=2)
    y_t = tf.cast(y_t, tf.int32)
    self.prediction = tf.boolean_mask(y_t, masks)
    mask_label = tf.boolean_mask(self.Y, masks)
    correct_pred = tf.equal(self.prediction, mask_label)
    self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
def body(time, outputs_ta, state, inputs, finished, sequence_lengths):
    """Internal while_loop body.

    Args:
      time: scalar int32 tensor.
      outputs_ta: structure of TensorArray.
      state: (structure of) state tensors and TensorArrays.
      inputs: (structure of) input tensors.
      finished: bool tensor (keeping track of what's finished).
      sequence_lengths: int32 tensor (keeping track of time of finish).

    Returns:
      `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
      next_sequence_lengths)`.
    """
    (next_outputs, decoder_state, next_inputs,
     decoder_finished) = decoder.step(time, inputs, state)
    if decoder.tracks_own_finished:
        next_finished = decoder_finished
    else:
        next_finished = tf.logical_or(decoder_finished, finished)
    next_sequence_lengths = tf.where(
        tf.logical_not(finished),
        tf.fill(tf.shape(sequence_lengths), time + 1),
        sequence_lengths)

    tf.contrib.framework.nest.assert_same_structure(state, decoder_state)
    tf.contrib.framework.nest.assert_same_structure(outputs_ta, next_outputs)
    tf.contrib.framework.nest.assert_same_structure(inputs, next_inputs)

    # Zero out output values past finish
    if impute_finished:
        emit = tf.contrib.framework.nest.map_structure(
            lambda out, zero: tf.where(finished, zero, out),
            next_outputs, zero_outputs)
    else:
        emit = next_outputs

    # Copy through states past finish
    def _maybe_copy_state(new, cur):
        # TensorArrays and scalar states get passed through.
        if isinstance(cur, tf.TensorArray):
            pass_through = True
        else:
            new.set_shape(cur.shape)
            pass_through = (new.shape.ndims == 0)
        return new if pass_through else tf.where(finished, cur, new)

    if impute_finished:
        next_state = tf.contrib.framework.nest.map_structure(
            _maybe_copy_state, decoder_state, state)
    else:
        next_state = decoder_state

    outputs_ta = tf.contrib.framework.nest.map_structure(
        lambda ta, out: ta.write(time, out), outputs_ta, emit)
    return (time + 1, outputs_ta, next_state, next_inputs, next_finished,
            next_sequence_lengths)
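# The next_sequence_lengths update above stamps `time + 1` into each row
# exactly once: rows that were still running this step record their length,
# rows that finished earlier keep the value already stored. Isolated sketch:
import tensorflow as tf

time = tf.constant(4)
finished = tf.constant([True, False, False])  # finished *before* this step
sequence_lengths = tf.constant([2, 0, 0])

next_sequence_lengths = tf.where(
    tf.logical_not(finished),
    tf.fill(tf.shape(sequence_lengths), time + 1),  # still running: stamp time+1
    sequence_lengths)                               # finished: keep old length

with tf.Session() as sess:
    print(sess.run(next_sequence_lengths))  # [2 5 5]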
def preprocess_targets(targets, word2int, batch_size): left_side = tf.fill([batch_size, 1], word2int['<SOS>']) right_side = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1]) preprocessed_targets = tf.concat([left_side, right_side], 1) return preprocessed_targets