def build_predict(self, Xnew, full_cov=False):
    """
    Xnew is a data matrix, point at which we want to predict

    This method computes p(F* | Y), where F* are points on the GP at Xnew,
    and Y are noisy observations at X.
    """
    Kx = self.kern.K(self.X, Xnew)
    K = self.kern.K(self.X) + eye(tf.shape(self.X)[0]) * self.likelihood.variance
    L = tf.cholesky(K)
    A = tf.matrix_triangular_solve(L, Kx, lower=True)
    V = tf.matrix_triangular_solve(L, self.Y - self.mean_function(self.X))
    fmean = tf.matmul(tf.transpose(A), V) + self.mean_function(Xnew)
    if full_cov:
        fvar = self.kern.K(Xnew) - tf.matmul(tf.transpose(A), A)
        shape = tf.pack([1, 1, tf.shape(self.Y)[1]])
        fvar = tf.tile(tf.expand_dims(fvar, 2), shape)
    else:
        fvar = self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.reshape(fvar, (-1, 1)), [1, tf.shape(self.Y)[1]])
    return fmean, fvar
def resize_images(X, height_factor, width_factor, dim_ordering):
    '''Resizes the images contained in a 4D tensor of shape
    - [batch, channels, height, width] (for 'th' dim_ordering)
    - [batch, height, width, channels] (for 'tf' dim_ordering)
    by a factor of (height_factor, width_factor). Both factors should be
    positive integers.
    '''
    if dim_ordering == 'th':
        original_shape = int_shape(X)
        new_shape = tf.shape(X)[2:]
        new_shape *= tf.constant(np.array([height_factor, width_factor]).astype('int32'))
        X = permute_dimensions(X, [0, 2, 3, 1])
        X = tf.image.resize_nearest_neighbor(X, new_shape)
        X = permute_dimensions(X, [0, 3, 1, 2])
        X.set_shape((None, None, original_shape[2] * height_factor,
                     original_shape[3] * width_factor))
        return X
    elif dim_ordering == 'tf':
        original_shape = int_shape(X)
        new_shape = tf.shape(X)[1:3]
        new_shape *= tf.constant(np.array([height_factor, width_factor]).astype('int32'))
        X = tf.image.resize_nearest_neighbor(X, new_shape)
        X.set_shape((None, original_shape[1] * height_factor,
                     original_shape[2] * width_factor, None))
        return X
    else:
        raise Exception('Invalid dim_ordering: ' + dim_ordering)
def din_fcn_shine(query, facts, attention_size, mask, stag='null', mode='SUM',
                  softmax_stag=1, time_major=False, return_alphas=False):
    if isinstance(facts, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN
        # outputs.
        facts = tf.concat(facts, 2)
    if time_major:
        # (T,B,D) => (B,T,D)
        facts = tf.transpose(facts, [1, 0, 2])
    # Trainable parameters
    mask = tf.equal(mask, tf.ones_like(mask))
    # D value - hidden size of the RNN layer
    facts_size = facts.get_shape().as_list()[-1]
    querry_size = query.get_shape().as_list()[-1]
    query = tf.layers.dense(
        query, facts_size, activation=None, name='f1_trans_shine' + stag)
    query = prelu(query)
    queries = tf.tile(query, [1, tf.shape(facts)[1]])
    queries = tf.reshape(queries, tf.shape(facts))
    din_all = tf.concat(
        [queries, facts, queries - facts, queries * facts], axis=-1)
    d_layer_1_all = tf.layers.dense(
        din_all, facts_size, activation=tf.nn.sigmoid, name='f1_shine_att' + stag)
    d_layer_2_all = tf.layers.dense(
        d_layer_1_all, facts_size, activation=tf.nn.sigmoid, name='f2_shine_att' + stag)
    d_layer_2_all = tf.reshape(d_layer_2_all, tf.shape(facts))
    output = d_layer_2_all
    return output
def attention_bah_block(hidden_for_sketch, hidden_for_attn_alignment, attention_depth):
    """ An implementation of the Bahdanau et al. attention mechanism with a concat score
        and the constrained softmax (csoftmax).
        Based on the papers:
            https://arxiv.org/abs/1409.0473 "Neural Machine Translation by Jointly Learning to Align and Translate"
            https://andre-martins.github.io/docs/emnlp2017_final.pdf "Learning What's Easy: Fully Differentiable Neural Easy-First Taggers"

        Args:
            hidden_for_sketch: A tensorflow tensor used to compute the sketch. It has
                dimensionality [None, max_num_tokens, sketch_hidden_size]
            hidden_for_attn_alignment: A tensorflow tensor that is aligned to produce the output.
                It has dimensionality [None, max_num_tokens, hidden_size_for_attn_alignment]
            attention_depth: Number of csoftmax applications

        Returns:
            final_aligned_hiddens: Output tensor with dimensionality
                [1, attention_depth, hidden_size_for_attn_alignment]
    """
    with tf.name_scope('attention_block'):
        sketch_dims = tf.shape(hidden_for_sketch)
        batch_size = sketch_dims[0]
        num_tokens = sketch_dims[1]
        hidden_size = sketch_dims[2]

        attn_alignment_dims = tf.shape(hidden_for_attn_alignment)
        attn_alignment_hidden_size = attn_alignment_dims[2]

        sketches = [tf.zeros(shape=[batch_size, hidden_size], dtype=tf.float32)]
        aligned_hiddens = []
        cum_att = tf.zeros(shape=[batch_size, num_tokens])  # cumulative attention
        for i in range(attention_depth):
            sketch, cum_att_, aligned_hidden = attention_bah_step(
                hidden_for_sketch, hidden_for_attn_alignment, sketches[-1], cum_att)
            sketches.append(sketch)                 # sketch
            aligned_hiddens.append(aligned_hidden)  # aligned hidden
            cum_att += cum_att_
        final_aligned_hiddens = tf.reshape(
            tf.transpose(tf.stack(aligned_hiddens), [1, 0, 2]),
            [1, attention_depth, attn_alignment_hidden_size])
    return final_aligned_hiddens
def __init__(self, name, input_shape, output_dim, hidden_dim, hidden_nonlinearity=tf.nn.relu,
             lstm_layer_cls=L.LSTMLayer,
             output_nonlinearity=None, input_var=None, input_layer=None, forget_bias=1.0,
             use_peepholes=False, layer_args=None):
    with tf.variable_scope(name):
        if input_layer is None:
            l_in = L.InputLayer(shape=(None, None) + input_shape, input_var=input_var, name="input")
        else:
            l_in = input_layer
        l_step_input = L.InputLayer(shape=(None,) + input_shape, name="step_input")
        # contains previous hidden and cell state
        l_step_prev_state = L.InputLayer(shape=(None, hidden_dim * 2), name="step_prev_state")
        if layer_args is None:
            layer_args = dict()
        l_lstm = lstm_layer_cls(l_in, num_units=hidden_dim, hidden_nonlinearity=hidden_nonlinearity,
                                hidden_init_trainable=False, name="lstm",
                                forget_bias=forget_bias,
                                cell_init_trainable=False, use_peepholes=use_peepholes, **layer_args)
        l_lstm_flat = L.ReshapeLayer(
            l_lstm, shape=(-1, hidden_dim),
            name="lstm_flat"
        )
        l_output_flat = L.DenseLayer(
            l_lstm_flat,
            num_units=output_dim,
            nonlinearity=output_nonlinearity,
            name="output_flat"
        )
        l_output = L.OpLayer(
            l_output_flat,
            op=lambda flat_output, l_input:
            tf.reshape(flat_output, tf.pack((tf.shape(l_input)[0], tf.shape(l_input)[1], -1))),
            shape_op=lambda flat_output_shape, l_input_shape:
            (l_input_shape[0], l_input_shape[1], flat_output_shape[-1]),
            extras=[l_in],
            name="output"
        )
        l_step_state = l_lstm.get_step_layer(l_step_input, l_step_prev_state, name="step_state")
        l_step_hidden = L.SliceLayer(l_step_state, indices=slice(hidden_dim), name="step_hidden")
        l_step_cell = L.SliceLayer(l_step_state, indices=slice(hidden_dim, None), name="step_cell")
        l_step_output = L.DenseLayer(
            l_step_hidden,
            num_units=output_dim,
            nonlinearity=output_nonlinearity,
            W=l_output_flat.W,
            b=l_output_flat.b,
            name="step_output"
        )

        self._l_in = l_in
        self._hid_init_param = l_lstm.h0
        self._cell_init_param = l_lstm.c0
        self._l_lstm = l_lstm
        self._l_out = l_output
        self._l_step_input = l_step_input
        self._l_step_prev_state = l_step_prev_state
        self._l_step_hidden = l_step_hidden
        self._l_step_cell = l_step_cell
        self._l_step_state = l_step_state
        self._l_step_output = l_step_output
        self._hidden_dim = hidden_dim
def feed_forward_cnn_small_categorical_fun(action_space, config, observations):
    """Small cnn network with categorical output."""
    del config
    obs_shape = observations.shape.as_list()
    x = tf.reshape(observations, [-1] + obs_shape[2:])

    with tf.variable_scope("policy"):
        x = tf.to_float(x) / 255.0
        x = tf.contrib.layers.conv2d(x, 32, [5, 5], [2, 2],
                                     activation_fn=tf.nn.relu, padding="SAME")
        x = tf.contrib.layers.conv2d(x, 32, [5, 5], [2, 2],
                                     activation_fn=tf.nn.relu, padding="SAME")

        flat_x = tf.reshape(
            x, [tf.shape(observations)[0], tf.shape(observations)[1],
                functools.reduce(operator.mul, x.shape.as_list()[1:], 1)])

        x = tf.contrib.layers.fully_connected(flat_x, 128, tf.nn.relu)
        logits = tf.contrib.layers.fully_connected(x, action_space.n,
                                                   activation_fn=None)

        value = tf.contrib.layers.fully_connected(x, 1, activation_fn=None)[..., 0]
        policy = tf.contrib.distributions.Categorical(logits=logits)

    return NetworkOutput(policy, value, lambda a: a)
def dist_info_sym(self, obs_var, state_info_vars):
    n_batches = tf.shape(obs_var)[0]
    n_steps = tf.shape(obs_var)[1]
    obs_var = tf.reshape(obs_var, tf.pack([n_batches, n_steps, -1]))
    obs_var = tf.cast(obs_var, tf.float32)
    if self.state_include_action:
        prev_action_var = state_info_vars["prev_action"]
        prev_action_var = tf.cast(prev_action_var, tf.float32)
        all_input_var = tf.concat(2, [obs_var, prev_action_var])
    else:
        all_input_var = obs_var
    if self.feature_network is None:
        return dict(
            prob=L.get_output(
                self.prob_network.output_layer,
                {self.l_input: all_input_var}
            )
        )
    else:
        flat_input_var = tf.reshape(all_input_var, (-1, self.input_dim))
        return dict(
            prob=L.get_output(
                self.prob_network.output_layer,
                {self.l_input: all_input_var,
                 self.feature_network.input_layer: flat_input_var}
            )
        )
def hinge_loss(self, y_true, y_pred):
    # Custom loss function
    margin = 0.1
    # loop counter
    i = tf.constant(0)
    # loop condition function
    c = lambda i, _: tf.less(i, tf.shape(y_true)[0])
    outer_sum_loss = tf.constant(0.0)

    def process_ele(i, outer_sum_loss):
        # Get a subtensor from batch
        y_true_one = y_true[i]
        y_pred_one = y_pred[i]
        # Stack margin to a num_class*1 matrix
        margin_stack = tf.reshape(tf.stack([tf.constant(0.1)] * self.num_classes),
                                  [self.num_classes, 1])
        # Stack true label to a word_dim*num_class matrix and transpose it
        y_true_one_stack = tf.stack([tf.transpose(y_true_one)] * self.num_classes)
        # Reshape predict from (word_dim,) to (word_dim,1)
        y_pred_one_t = tf.reshape(y_pred_one, [self.word_dim, 1])
        # Calculate loss
        r = margin_stack - tf.matmul(y_true_one_stack, y_pred_one_t) + tf.matmul(self.label_vec_tensor, y_pred_one_t)
        # Summation
        # We did not exclude true label inside summation, so we subtract extra margin
        sum_inner_loss = tf.reduce_sum(K.relu(r)) - margin
        # Return counter++ and accumulated loss
        return tf.add(i, 1), tf.add(outer_sum_loss, sum_inner_loss)

    _, outer_sum_loss = tf.while_loop(c, process_ele, [i, outer_sum_loss])
    # Return average loss over batch
    return outer_sum_loss / tf.cast(tf.shape(y_true)[0], dtype=tf.float32)
def _conv_layer(self, name, input_var, stride, in_channels, out_channels, options={}):
    activation = options.get('activation', 'relu')
    dropout = options.get('dropout', None)
    padding = options.get('padding', 'SAME')
    batchnorm = options.get('batchnorm', True)
    transpose = options.get('transpose', False)

    with tf.variable_scope(name) as scope:
        if not transpose:
            filter_shape = [KERNEL_SIZE, KERNEL_SIZE, in_channels, out_channels]
        else:
            filter_shape = [KERNEL_SIZE, KERNEL_SIZE, out_channels, in_channels]
        kernel = tf.get_variable(
            'weights',
            shape=filter_shape,
            initializer=tf.truncated_normal_initializer(
                stddev=math.sqrt(2.0 / KERNEL_SIZE / KERNEL_SIZE / in_channels)),
            dtype=tf.float32
        )
        biases = tf.get_variable(
            'biases',
            shape=[out_channels],
            initializer=tf.constant_initializer(0.0),
            dtype=tf.float32
        )
        if not transpose:
            output = tf.nn.bias_add(
                tf.nn.conv2d(
                    input_var,
                    kernel,
                    [1, stride, stride, 1],
                    padding=padding
                ),
                biases
            )
        else:
            batch = tf.shape(input_var)[0]
            side = tf.shape(input_var)[1]
            output = tf.nn.bias_add(
                tf.nn.conv2d_transpose(
                    input_var,
                    kernel,
                    [batch, side * stride, side * stride, out_channels],
                    [1, stride, stride, 1],
                    padding=padding
                ),
                biases
            )
        if batchnorm:
            output = tf.contrib.layers.batch_norm(
                output, center=True, scale=True, is_training=self.is_training, decay=0.99)
        if dropout is not None:
            output = tf.nn.dropout(output, keep_prob=1 - dropout)

        if activation == 'relu':
            return tf.nn.relu(output, name=scope.name)
        elif activation == 'sigmoid':
            return tf.nn.sigmoid(output, name=scope.name)
        elif activation == 'none':
            return output
        else:
            raise Exception('invalid activation {} specified'.format(activation))
def soft_alignment(U_AP, raw_question_rep, raw_answer_rep, tokens_question_non_zero, tokens_answer_non_zero):
    """Calculate the AP soft-alignment matrix (in a batch-friendly fashion)

    :param U_AP: The AP similarity matrix (to be learned)
    :param raw_question_rep:
    :param raw_answer_rep:
    :param tokens_question_non_zero:
    :param tokens_answer_non_zero:
    :return:
    """
    answer_transposed = tf.transpose(raw_answer_rep, [0, 2, 1])

    # Unfortunately, there is no clean way in TF to multiply a 3d tensor with a 2d tensor. We need to perform some
    # reshaping. Compare solution 2 on
    # http://stackoverflow.com/questions/38235555/tensorflow-matmul-of-input-matrix-with-batch-data
    raw_question_rep_flat = tf.reshape(raw_question_rep, [-1, tf.shape(raw_question_rep)[2]])
    QU_flat = tf.matmul(raw_question_rep_flat, U_AP)
    QU = tf.reshape(QU_flat, [-1, tf.shape(raw_question_rep)[1], tf.shape(raw_question_rep)[2]])
    QUA = tf.batch_matmul(QU, answer_transposed)
    G = tf.nn.tanh(QUA)

    # We are now removing all the fields of G that belong to zero padding. To achieve this, we are determining these
    # fields and adding a value of -2 to all of them (which is guaranteed to result in a smaller number than the
    # minimum of G, which is -1)
    additions_G_question = tf.transpose(
        tf.reshape((tokens_question_non_zero - 1) * 2, [-1, 1, tf.shape(tokens_question_non_zero)[1]]),
        [0, 2, 1]
    )
    additions_G_answer = tf.reshape((tokens_answer_non_zero - 1) * 2, [-1, 1, tf.shape(tokens_answer_non_zero)[1]])

    # G_non_zero contains values of less than -1 for all fields which have a relation to zero-padded token positions
    G_non_zero = G + additions_G_question + additions_G_answer

    return G_non_zero
def fast_rcnn_minibatch(self, reference_boxes):
    with tf.variable_scope('fast_rcnn_minibatch'):
        reference_boxes_mattached_gtboxes, object_mask, label = \
            self.fast_rcnn_find_positive_negative_samples(reference_boxes)

        positive_indices = tf.reshape(tf.where(tf.not_equal(object_mask, 0.)), [-1])
        num_of_positives = tf.minimum(tf.shape(positive_indices)[0],
                                      tf.cast(self.fast_rcnn_minibatch_size * self.fast_rcnn_positives_ratio,
                                              tf.int32))
        positive_indices = tf.random_shuffle(positive_indices)
        positive_indices = tf.slice(positive_indices, begin=[0], size=[num_of_positives])

        negative_indices = tf.reshape(tf.where(tf.equal(object_mask, 0.)), [-1])
        num_of_negatives = tf.minimum(tf.shape(negative_indices)[0],
                                      self.fast_rcnn_minibatch_size - num_of_positives)
        negative_indices = tf.random_shuffle(negative_indices)
        negative_indices = tf.slice(negative_indices, begin=[0], size=[num_of_negatives])

        minibatch_indices = tf.concat([positive_indices, negative_indices], axis=0)
        minibatch_indices = tf.random_shuffle(minibatch_indices)

        minibatch_reference_boxes_mattached_gtboxes = tf.gather(reference_boxes_mattached_gtboxes,
                                                                minibatch_indices)
        object_mask = tf.gather(object_mask, minibatch_indices)
        label = tf.gather(label, minibatch_indices)
        label_one_hot = tf.one_hot(label, self.num_classes + 1)

        return minibatch_indices, minibatch_reference_boxes_mattached_gtboxes, object_mask, label_one_hot
def test_get_multi_class_predictions_for_five_aspect_ratios_per_location(self):
    num_classes_without_background = 6
    image_features = tf.random_uniform([4, 8, 8, 64], dtype=tf.float32)
    conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
        is_training=False,
        num_classes=num_classes_without_background,
        conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
        min_depth=0,
        max_depth=32,
        num_layers_before_predictor=1,
        use_dropout=True,
        dropout_keep_prob=0.8,
        kernel_size=1,
        box_code_size=4
    )
    box_predictions = conv_box_predictor.predict(
        image_features, num_predictions_per_location=5, scope='BoxPredictor')
    box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
    class_predictions_with_background = box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
        sess.run(init_op)
        (box_encodings_shape,
         class_predictions_with_background_shape) = sess.run([
             tf.shape(box_encodings), tf.shape(class_predictions_with_background)])
        self.assertAllEqual(box_encodings_shape, [4, 320, 1, 4])
        self.assertAllEqual(class_predictions_with_background_shape,
                            [4, 320, num_classes_without_background + 1])
def test_get_boxes_for_five_aspect_ratios_per_location_fully_convolutional(self):
    image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
    conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
        is_training=False,
        num_classes=0,
        conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
        min_depth=0,
        max_depth=32,
        num_layers_before_predictor=1,
        use_dropout=True,
        dropout_keep_prob=0.8,
        kernel_size=1,
        box_code_size=4
    )
    box_predictions = conv_box_predictor.predict(
        image_features, num_predictions_per_location=5, scope='BoxPredictor')
    box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
    objectness_predictions = box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
    init_op = tf.global_variables_initializer()

    resolution = 32
    expected_num_anchors = resolution * resolution * 5
    with self.test_session() as sess:
        sess.run(init_op)
        (box_encodings_shape,
         objectness_predictions_shape) = sess.run(
             [tf.shape(box_encodings), tf.shape(objectness_predictions)],
             feed_dict={image_features:
                        np.random.rand(4, resolution, resolution, 64)})
        self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4])
        self.assertAllEqual(objectness_predictions_shape,
                            [4, expected_num_anchors, 1])
def _build_predict(self, Xnew, full_cov=False):
    """
    Compute the mean and variance of the latent function at some new points
    Xnew. For a derivation of the terms in here, see the associated SGPR
    notebook.
    """
    num_inducing = len(self.feature)
    err = self.Y - self.mean_function(self.X)
    Kuf = self.feature.Kuf(self.kern, self.X)
    Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
    Kus = self.feature.Kuf(self.kern, Xnew)
    sigma = tf.sqrt(self.likelihood.variance)
    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    B = tf.matmul(A, A, transpose_b=True) + tf.eye(num_inducing, dtype=settings.float_type)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
    tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
    tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
    mean = tf.matmul(tmp2, c, transpose_a=True)
    if full_cov:
        var = self.kern.K(Xnew) + tf.matmul(tmp2, tmp2, transpose_a=True) \
              - tf.matmul(tmp1, tmp1, transpose_a=True)
        shape = tf.stack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
              - tf.reduce_sum(tf.square(tmp1), 0)
        shape = tf.stack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean + self.mean_function(Xnew), var
def test_get_correct_box_encoding_and_class_prediction_shapes(self):
    image_features = tf.random_uniform([4, 8, 8, 64], dtype=tf.float32)
    proposal_boxes = tf.random_normal([4, 2, 4], dtype=tf.float32)
    rfcn_box_predictor = box_predictor.RfcnBoxPredictor(
        is_training=False,
        num_classes=2,
        conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
        num_spatial_bins=[3, 3],
        depth=4,
        crop_size=[12, 12],
        box_code_size=4
    )
    box_predictions = rfcn_box_predictor.predict(
        image_features, num_predictions_per_location=1, scope='BoxPredictor',
        proposal_boxes=proposal_boxes)
    box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
    class_predictions_with_background = box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
    init_op = tf.global_variables_initializer()

    with self.test_session() as sess:
        sess.run(init_op)
        (box_encodings_shape,
         class_predictions_shape) = sess.run(
             [tf.shape(box_encodings), tf.shape(class_predictions_with_background)])
        self.assertAllEqual(box_encodings_shape, [8, 1, 2, 4])
        self.assertAllEqual(class_predictions_shape, [8, 1, 3])
def test_get_boxes_with_five_classes_share_box_across_classes(self):
    image_features = tf.random_uniform([2, 7, 7, 3], dtype=tf.float32)
    mask_box_predictor = box_predictor.MaskRCNNBoxPredictor(
        is_training=False,
        num_classes=5,
        fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(),
        use_dropout=False,
        dropout_keep_prob=0.5,
        box_code_size=4,
        share_box_across_classes=True
    )
    box_predictions = mask_box_predictor.predict(
        [image_features], num_predictions_per_location=[1],
        scope='BoxPredictor')
    box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
    class_predictions_with_background = box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
        sess.run(init_op)
        (box_encodings_shape,
         class_predictions_with_background_shape) = sess.run(
             [tf.shape(box_encodings),
              tf.shape(class_predictions_with_background)])
        self.assertAllEqual(box_encodings_shape, [2, 1, 1, 4])
        self.assertAllEqual(class_predictions_with_background_shape, [2, 1, 6])
def K(self, X, X2=None):
    if X2 is None:
        d = tf.fill(tf.pack([tf.shape(X)[0]]), tf.squeeze(self.variance))
        return tf.diag(d)
    else:
        shape = tf.pack([tf.shape(X)[0], tf.shape(X2)[0]])
        return tf.zeros(shape, tf.float64)
def CombineArcAndRootPotentials(arcs, roots):
    """Combines arc and root potentials into a single set of potentials.

    Args:
      arcs: [B,N,N] tensor of batched arc potentials.
      roots: [B,N] matrix of batched root potentials.

    Returns:
      [B,N,N] tensor P of combined potentials where
        P_{b,s,t} = s == t ? roots[b,t] : arcs[b,s,t]
    """
    # All arguments must have statically-known rank.
    check.Eq(arcs.get_shape().ndims, 3, 'arcs must be rank 3')
    check.Eq(roots.get_shape().ndims, 2, 'roots must be a matrix')

    # All arguments must share the same type.
    dtype = arcs.dtype.base_dtype
    check.Same([dtype, roots.dtype.base_dtype], 'dtype mismatch')

    roots_shape = tf.shape(roots)
    arcs_shape = tf.shape(arcs)
    batch_size = roots_shape[0]
    num_tokens = roots_shape[1]
    with tf.control_dependencies([
        tf.assert_equal(batch_size, arcs_shape[0]),
        tf.assert_equal(num_tokens, arcs_shape[1]),
        tf.assert_equal(num_tokens, arcs_shape[2])]):
        return tf.matrix_set_diag(arcs, roots)
def test_get_predictions_with_feature_maps_of_dynamic_shape(self):
    image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
    conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
        is_training=False,
        num_classes=0,
        conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
        depth=32,
        num_layers_before_predictor=1,
        box_code_size=4)
    box_predictions = conv_box_predictor.predict(
        [image_features], num_predictions_per_location=[5],
        scope='BoxPredictor')
    box_encodings = tf.concat(
        box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
    objectness_predictions = tf.concat(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1)
    init_op = tf.global_variables_initializer()

    resolution = 32
    expected_num_anchors = resolution * resolution * 5
    with self.test_session() as sess:
        sess.run(init_op)
        (box_encodings_shape,
         objectness_predictions_shape) = sess.run(
             [tf.shape(box_encodings), tf.shape(objectness_predictions)],
             feed_dict={image_features:
                        np.random.rand(4, resolution, resolution, 64)})
        self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 4])
        self.assertAllEqual(objectness_predictions_shape,
                            [4, expected_num_anchors, 1])
def blend_images(data_folder1, data_folder2, out_folder, alpha=.5):
    filename_queue = tf.placeholder(dtype=tf.string)
    label = tf.placeholder(dtype=tf.int32)
    tensor_image = tf.read_file(filename_queue)
    image = tf.image.decode_jpeg(tensor_image, channels=3)

    multiplier = tf.div(tf.constant(224, tf.float32),
                        tf.cast(tf.maximum(tf.shape(image)[0], tf.shape(image)[1]), tf.float32))
    x = tf.cast(tf.round(tf.mul(tf.cast(tf.shape(image)[0], tf.float32), multiplier)), tf.int32)
    y = tf.cast(tf.round(tf.mul(tf.cast(tf.shape(image)[1], tf.float32), multiplier)), tf.int32)
    image = tf.image.resize_images(image, [x, y])

    image = tf.image.rot90(image, k=label)

    image = tf.image.resize_image_with_crop_or_pad(image, 224, 224)
    sess = tf.Session()
    sess.run(tf.local_variables_initializer())
    for root, folders, files in os.walk(data_folder1):
        for each in files:
            if each.find('.jpg') >= 0:
                img1 = Image.open(os.path.join(root, each))
                img2_path = os.path.join(root.replace(data_folder1, data_folder2), each.split("-")[-1])
                rotation = int(each.split("-")[1])
                img2 = sess.run(image, feed_dict={filename_queue: img2_path, label: rotation})
                imsave(os.path.join(os.getcwd(), "temp", "temp.jpg"), img2)
                img2 = Image.open(os.path.join(os.getcwd(), "temp", "temp.jpg"))
                out_image = Image.blend(img1, img2, alpha)
                outfile = os.path.join(root.replace(data_folder1, out_folder), each)
                if not os.path.exists(os.path.split(outfile)[0]):
                    os.makedirs(os.path.split(outfile)[0])
                out_image.save(outfile)
            else:
                print(each)
    sess.close()
def testDtype(self):
    with self.test_session():
        d = tf.fill([2, 3], 12., name="fill")
        self.assertEqual(d.get_shape(), [2, 3])
        # Test default type for both constant size and dynamic size
        z = tf.ones([2, 3])
        self.assertEqual(z.dtype, tf.float32)
        self.assertEqual([2, 3], z.get_shape())
        self.assertAllEqual(z.eval(), np.ones([2, 3]))
        z = tf.ones(tf.shape(d))
        self.assertEqual(z.dtype, tf.float32)
        self.assertEqual([2, 3], z.get_shape())
        self.assertAllEqual(z.eval(), np.ones([2, 3]))
        # Test explicit type control
        for dtype in (tf.float32, tf.float64, tf.int32,
                      tf.uint8, tf.int16, tf.int8,
                      tf.complex64, tf.complex128, tf.int64, tf.bool):
            z = tf.ones([2, 3], dtype=dtype)
            self.assertEqual(z.dtype, dtype)
            self.assertEqual([2, 3], z.get_shape())
            self.assertAllEqual(z.eval(), np.ones([2, 3]))
            z = tf.ones(tf.shape(d), dtype=dtype)
            self.assertEqual(z.dtype, dtype)
            self.assertEqual([2, 3], z.get_shape())
            self.assertAllEqual(z.eval(), np.ones([2, 3]))
def gauss_kl_diag(q_mu, q_sqrt, K, num_latent):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    We assume num_latent independent distributions, given by the columns of
    q_mu and q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a matrix, each column represents the diagonal of a square-root
        matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.

    num_latent is an integer: the number of independent distributions (equal
        to the columns of q_mu and q_sqrt).
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(q_sqrt)))  # Log-det of q-cov.
    L_inv = tf.matrix_triangular_solve(L, eye(tf.shape(L)[0]), lower=True)
    K_inv = tf.matrix_triangular_solve(tf.transpose(L), L_inv, lower=False)
    KL += 0.5 * tf.reduce_sum(tf.expand_dims(tf.diag_part(K_inv), 1)
                              * tf.square(q_sqrt))  # Trace term.
    return KL
def batch_displacement_warp2d(imgs, vector_fields):
    """
    warp images by free form transformation

    Parameters
    ----------
    imgs : tf.Tensor
        images to be warped
        [n_batch, xlen, ylen, n_channel]
    vector_fields : tf.Tensor
        [n_batch, 2, xlen, ylen]

    Returns
    -------
    output : tf.Tensor
        warped images
        [n_batch, xlen, ylen, n_channel]
    """
    n_batch = tf.shape(imgs)[0]
    xlen = tf.shape(imgs)[1]
    ylen = tf.shape(imgs)[2]

    grids = batch_mgrid(n_batch, xlen, ylen)

    T_g = grids + vector_fields
    output = batch_warp2d(imgs, T_g)
    return output
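# --- Usage sketch (not from the original module): a smoke test for
# batch_displacement_warp2d, assuming TF 1.x graph mode and that batch_mgrid /
# batch_warp2d from the same library are in scope. A zero displacement field
# should return the inputs unchanged.
imgs_ph = tf.placeholder(tf.float32, [None, 64, 64, 1])
zero_fields = tf.zeros(tf.stack([tf.shape(imgs_ph)[0], 2, 64, 64]))
warped = batch_displacement_warp2d(imgs_ph, zero_fields)  # [n_batch, 64, 64, 1]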
def autoencoder_5_5(net_in, LATENT_DIMS):
    data_shape = net_in.get_shape().as_list()
    NUM_CHANNELS = data_shape[4]
    strides = [1, 2, 2, 2, 1]

    # Define all encoding layers
    c0 = 5
    c1 = 5
    padding = "VALID"
    conv0, W0 = conv3d_layer("conv0", net_in, [c0, c0, c0, NUM_CHANNELS, 64], strides=strides, padding=padding)
    conv1, W1 = conv3d_layer("conv1", conv0, [c1, c1, c1, 64, 128], strides=strides, padding=padding)

    # Resolve input dim into fc0 from pool1-output
    #LATENT_DIMS = 100
    shape = conv1.get_shape().as_list()
    print "conv1 shape: ", shape
    fc0_inputdim = shape[1] * shape[2] * shape[3] * shape[4]
    fc0 = fc_layer("fc0", conv1, [fc0_inputdim, LATENT_DIMS])

    # Start going back by first reshaping into 4D image data again
    # Then two sets of depooling and convolutions
    fc1 = fc_layer("fc1", fc0, [LATENT_DIMS, fc0_inputdim])
    fc1_reshaped = tf.reshape(fc1, conv1.get_shape().as_list())

    deconv0 = conv3d_layer_transpose("deconv0", fc1_reshaped, W1, output_shape=tf.shape(conv0),
                                     strides=strides, padding=padding)
    deconv1 = conv3d_layer_transpose("deconv1", deconv0, W0, output_shape=tf.shape(net_in),
                                     strides=strides, padding=padding)

    return deconv1
def gauss_kl(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular square-root
        matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    num_latent = tf.cast(tf.shape(q_sqrt)[2], float_type)
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]), float_type)  # constant term
    Lq = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # force lower triangle
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(tf.matrix_diag_part(Lq))))  # logdet
    L_tiled = tf.tile(tf.expand_dims(L, 0), tf.pack([tf.shape(Lq)[0], 1, 1]))
    LiLq = tf.matrix_triangular_solve(L_tiled, Lq, lower=True)
    KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term
    return KL
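# For reference, the terms accumulated above are the standard closed-form Gaussian KL
# (a textbook identity, not something specific to this file). Writing Sigma_q = Lq Lq^T
# for each independent column of q_mu / slice of q_sqrt, and d for the dimension:
#
#   KL( N(q_mu, Sigma_q) || N(0, K) )
#     = 0.5 * (  q_mu^T K^{-1} q_mu         # Mahalanobis term
#              + tr(K^{-1} Sigma_q)         # trace term
#              - d                          # constant term
#              + log|K| - log|Sigma_q| )    # prior / q log-determinant terms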
def ae_latent_sample_beam(latents_dense_in, inputs, ed, embed, hparams):
    """Sample from the latent space in the autoencoder."""
    vocab_size = 2**hparams.z_size
    beam_size = 1  # TODO(lukaszkaiser): larger beam sizes seem to work bad.
    inputs = tf.tile(inputs, [beam_size, 1, 1])
    ed = tf.tile(ed, [beam_size, 1, 1, 1])

    def symbols_to_logits_fn(ids):
        """Go from ids to logits."""
        ids = tf.expand_dims(ids, axis=2)  # Ids start with added all-zeros.
        latents_discrete = tf.pad(ids[:, 1:], [[0, 0], [0, 1], [0, 0]])

        with tf.variable_scope(tf.get_variable_scope(), reuse=False):
            latents_dense = embed(latents_discrete)
            latents_pred = decode_transformer(
                inputs, ed, latents_dense, hparams, "extra")
            logits = tf.layers.dense(latents_pred, vocab_size, name="extra_logits")
            current_output_position = common_layers.shape_list(ids)[1] - 1
            logits = logits[:, current_output_position, :, :]
        return tf.squeeze(logits, axis=[1])

    initial_ids = tf.zeros([tf.shape(latents_dense_in)[0]], dtype=tf.int32)
    length = tf.shape(latents_dense_in)[1]
    ids, _ = beam_search.beam_search(
        symbols_to_logits_fn, initial_ids, beam_size, length,
        vocab_size, alpha=0.0, eos_id=-1, stop_early=False)

    res = tf.expand_dims(ids[:, 0, :], axis=2)  # Pick first beam.
    return res[:, 1:]  # Remove the added all-zeros from ids.
def autoencoder_3_3_3(net_in, LATENT_DIMS):
    data_shape = net_in.get_shape().as_list()
    NUM_CHANNELS = data_shape[4]
    strides = [1, 2, 2, 2, 1]

    # Define all encoding layers
    c0 = 3
    c1 = 3
    c2 = 3
    conv0, W0 = conv3d_layer("aeconv0", net_in, [c0, c0, c0, NUM_CHANNELS, 32], strides=strides)
    conv1, W1 = conv3d_layer("aeconv1", conv0, [c1, c1, c1, 32, 64], strides=strides)
    conv2, W2 = conv3d_layer("aeconv2", conv1, [c2, c2, c2, 64, 64], strides=[1, 1, 1, 1, 1])

    #W0 = tf.Variable(xavier_conv3_init(W0_old.get_shape().as_list()), name='weights')
    #W1 = tf.Variable(xavier_conv3_init(W1_old.get_shape().as_list()), name='weights')
    #W2 = tf.Variable(xavier_conv3_init(W2_old.get_shape().as_list()), name='weights')

    # Resolve input dim into fc0 from pool1-output
    #LATENT_DIMS = 100
    shape = conv2.get_shape().as_list()
    print "conv2 shape: ", shape
    fc0_inputdim = shape[1] * shape[2] * shape[3] * shape[4]
    fc0 = fc_layer("aefc0", conv2, [fc0_inputdim, LATENT_DIMS])

    # Start going back by first reshaping into 4D image data again
    # Then two sets of depooling and convolutions
    fc1 = fc_layer("aefc1", fc0, [LATENT_DIMS, fc0_inputdim])
    fc1_reshaped = tf.reshape(fc1, conv2.get_shape().as_list())

    deconv0 = conv3d_layer_transpose("aedeconv0", fc1_reshaped, W2, output_shape=tf.shape(conv1),
                                     strides=[1, 1, 1, 1, 1])
    deconv1 = conv3d_layer_transpose("aedeconv1", deconv0, W1, output_shape=tf.shape(conv0), strides=strides)
    deconv2 = conv3d_layer_transpose("aedeconv2", deconv1, W0, output_shape=tf.shape(net_in), strides=strides)

    return deconv2
def cross_entropy(u, label_u, alpha=0.5, normed=False):
    label_ip = tf.cast(
        tf.matmul(label_u, tf.transpose(label_u)), tf.float32)
    s = tf.clip_by_value(label_ip, 0.0, 1.0)

    # compute balance param
    # s_t \in {-1, 1}
    s_t = tf.multiply(tf.add(s, tf.constant(-0.5)), tf.constant(2.0))
    sum_1 = tf.reduce_sum(s)
    sum_all = tf.reduce_sum(tf.abs(s_t))
    balance_param = tf.add(tf.abs(tf.add(s, tf.constant(-1.0))),
                           tf.multiply(tf.div(sum_all, sum_1), s))

    if normed:
        # ip = tf.clip_by_value(tf.matmul(u, tf.transpose(u)), -1.5e1, 1.5e1)
        ip_1 = tf.matmul(u, tf.transpose(u))

        def reduce_shaper(t):
            return tf.reshape(tf.reduce_sum(t, 1), [tf.shape(t)[0], 1])

        mod_1 = tf.sqrt(tf.matmul(reduce_shaper(tf.square(u)),
                                  reduce_shaper(tf.square(u)), transpose_b=True))
        ip = tf.div(ip_1, mod_1)
    else:
        ip = tf.clip_by_value(tf.matmul(u, tf.transpose(u)), -1.5e1, 1.5e1)
    ones = tf.ones([tf.shape(u)[0], tf.shape(u)[0]])
    return tf.reduce_mean(
        tf.multiply(tf.log(ones + tf.exp(alpha * ip)) - s * alpha * ip, balance_param))
def loop(q_, mask, mass_, found_):
    q_list = tf.dynamic_partition(q_, mask, 2)
    # element 0 if False, element 1 if True
    condition_indices = tf.dynamic_partition(tf.range(tf.shape(q_)[0]), mask, 2)

    p = q_list[1] * (1.0 - mass_) / tf.reduce_sum(q_list[1])
    p_new = tf.dynamic_stitch(condition_indices, [q_list[0], p])

    # condition verification and mask modification
    less_mask = tf.cast(tf.less(u, p_new), tf.int32)  # 0 when u is bigger than p, 1 when u is less than p
    condition_indices = tf.dynamic_partition(tf.range(tf.shape(p_new)[0]), less_mask, 2)
    # 0 when u is bigger than p, 1 when u is less than p
    split_p_new = tf.dynamic_partition(p_new, less_mask, 2)
    split_u = tf.dynamic_partition(u, less_mask, 2)

    alpha = tf.dynamic_stitch(condition_indices, [split_p_new[0], split_u[1]])
    mass_ += tf.reduce_sum(split_u[1])

    mask = mask * (tf.ones_like(less_mask) - less_mask)

    found_ = tf.cond(tf.equal(tf.reduce_sum(less_mask), 0),
                     lambda: False,
                     lambda: True)

    alpha = tf.reshape(alpha, q_.shape)

    return alpha, mask, mass_, found_
def one_hot_matrix(tensor_in, num_classes, on_value=1.0, off_value=0.0):
    """Encodes indices from given tensor as one-hot tensor.

    TODO(ilblackdragon): Ideally implementation should be
                         part of TensorFlow with Eigen-native operation.

    Args:
        tensor_in: Input tensor of shape [N1, N2].
        num_classes: Number of classes to expand index into.
        on_value: Tensor or float, value to fill-in given index.
        off_value: Tensor or float, value to fill-in everything else.

    Returns:
        Tensor of shape [N1, N2, num_classes] with 1.0 for each id in original
        tensor.
    """
    tensor_in = tf.convert_to_tensor(tensor_in)
    sparse_values = tf.to_int64(tf.reshape(tensor_in, [-1, 1]))
    size = tf.shape(sparse_values)[0]
    dims = tf.shape(tensor_in)
    indices = tf.to_int64(tf.reshape(tf.range(0, size), [-1, 1]))
    indices_values = tf.concat(1, [indices, sparse_values])
    outshape = tf.to_int64(expand_concat(0, [size, num_classes]))
    one_hot_vector = tf.sparse_to_dense(indices_values, outshape, on_value, off_value)
    ret = tf.reshape(one_hot_vector, tf.concat(0, [dims, [num_classes]]))
    ret.set_shape(tensor_in.get_shape().concatenate(num_classes))
    return ret
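# --- Usage sketch (illustrative, not part of the original module): assumes the
# pre-1.0 TensorFlow API used above and that the helper expand_concat is in scope.
ids = tf.constant([[0, 1, 2], [3, 0, 1]])      # [N1=2, N2=3] class ids
one_hot = one_hot_matrix(ids, num_classes=4)   # -> shape [2, 3, 4]
with tf.Session() as sess:
    print(sess.run(one_hot)[0, 1])             # [0., 1., 0., 0.]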
def filter_too_long(features, labels):
    return tf.less_equal(tf.shape(features["inputs"])[1], hparams.max_input_length)
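# --- Usage sketch (hypothetical wiring, following the usual tf.data pattern):
# `dataset` is assumed to be a tf.data.Dataset of (features, labels) pairs; the
# predicate drops examples whose "inputs" exceed hparams.max_input_length.
dataset = dataset.filter(filter_too_long)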
def decode_infer(self, inputs, state):
    # state['enc']: [b * beam, l_s, e], state['dec']: [b * beam, q', e]
    # q' = previous decode output length
    # during infer, the following graph is constructed using beam search
    with self.graph.as_default():
        config = self.bert_config

        target_sequence = inputs['target']  # [b * beam, q']
        vocab_size = len(self.hps.vocab_out)
        # truncate word idx: change ids greater than vocab_size to unkid
        shape = target_sequence.shape
        unkid = self.hps.vocab_out[self.hps.unk]
        # target_sequence = tf_trunct(target_sequence, vocab_size, self.hps.unkId)
        target_sequence = tf_trunct(target_sequence, vocab_size, unkid)
        target_sequence.set_shape(shape)

        target_length = inputs['target_length']
        target_seg_ids = tf.zeros_like(target_sequence, dtype=tf.int32, name='target_seg_ids_infer')
        tgt_mask = tf.sequence_mask(target_length,
                                    maxlen=tf.shape(target_sequence)[1],
                                    dtype=tf.float32)  # [b, q']

        out_dict_size = len(self.hps.vocab_out)
        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (tgt_embed, _) = embedding_lookup(
                    input_ids=target_sequence,
                    vocab_size=out_dict_size,  # out vocab size
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                tgt_embed = embedding_postprocessor(
                    input_tensor=tgt_embed,
                    use_token_type=True,
                    token_type_ids=target_seg_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope('decode', reuse=True):
            # [b, q', e]
            masked_tgt_embed = tgt_embed * tf.expand_dims(tgt_mask, -1)
            dec_attn_bias = attention_bias(tf.shape(masked_tgt_embed)[1], "causal")
            decoder_input = tf.pad(masked_tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            infer_decoder_input = decoder_input[:, -1:, :]
            infer_dec_attn_bias = dec_attn_bias[:, :, -1:, :]

            ret = transformer_decoder_three(infer_decoder_input,
                                            self.enc_output,
                                            self.topic_memory,
                                            infer_dec_attn_bias,
                                            self.enc_attn_bias,
                                            self.hps,
                                            state=state['decoder'])

            all_att_weights, decoder_output, decoder_state = ret
            decoder_output = decoder_output[:, -1, :]  # [b * beam, e]
            vocab_logits = tf.matmul(decoder_output, self.decoder_weights, False, True)  # [b * beam, v]
            vocab_probs = tf.nn.softmax(vocab_logits)
            vocab_size = out_dict_size  # out vocab size
            # we have tiled source_id_oo before feed, so the last argument is set to 1
            with tf.variable_scope('copy'):
                logits = calculate_final_logits(decoder_output, all_att_weights,
                                                vocab_probs,
                                                self.input_ids_oo, self.max_out_oovs,
                                                self.input_mask, vocab_size,
                                                tgt_seq_len=1)
                log_prob = tf.log(logits)  # [b * beam, v + v']
    return log_prob, {'encoder': state['encoder'], 'decoder': decoder_state}
def _build_summarization_model(self):
    is_training = self.is_training
    config = self.bert_config

    gpu_pred_ids = []
    gpu_logits = []
    gpu_train_encoded = []
    gpu_loss = []
    gpu_out_embed = []
    gpu_grads = []
    self._add_placeholders()
    self._n_gpu_split_placeholders(self.hps.n_gpu)

    for i in range(self.hps.n_gpu):
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), \
                tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse):
            '''Creates a classification model.'''
            model = modeling.BertModel(
                config=self.bert_config,
                is_training=is_training,
                input_ids=self.input_ids_ngpu[i],
                input_mask=self.input_mask_ngpu[i],
                token_type_ids=self.segment_ids_ngpu[i],
                use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?

            encoder_output = model.get_sequence_output()  # [b, l_s, h]
            hidden_size = encoder_output.shape[2].value
            encoder_out_length = tf.shape(encoder_output)[1]
            expand_topic_id = tf.expand_dims(self.topic_ids_ngpu[i], -1)
            topic_input_sequence = tf.tile(expand_topic_id, [1, encoder_out_length])
            with tf.variable_scope('topic'):
                with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                    # Perform embedding lookup on the topic ids.
                    (self.topic_embed, self.topic_embeddings) = embedding_lookup(
                        input_ids=topic_input_sequence,
                        vocab_size=self.hps.num_topic,
                        embedding_size=self.hps.topic_embedding_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='topic_embeddings',
                        use_one_hot_embeddings=False)
            print('!!!!topic_embeddings', self.topic_embeddings, self.topic_embed)
            self.encoder_output = tf.concat([encoder_output, self.topic_embed], -1)
            self.enc_attn_bias = attention_bias(self.input_mask_ngpu[i], 'masking')

            out_dict_size = len(self.hps.vocab_out)
            # for topic word memory
            with tf.variable_scope('bert', reuse=True):
                with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                    # Perform embedding lookup on the topic word ids.
                    (self.topic_word_memory, _) = embedding_lookup(
                        input_ids=self.topic_words_ids_ngpu[i],
                        vocab_size=out_dict_size,  # decode dictionary modified
                        embedding_size=config.hidden_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='word_embeddings',
                        use_one_hot_embeddings=False)
                    self.topic_word_memory = embedding_postprocessor(
                        input_tensor=self.topic_word_memory,
                        use_token_type=True,
                        token_type_ids=self.mem_segment_ids_ngpu[i],
                        token_type_vocab_size=config.type_vocab_size,
                        token_type_embedding_name='token_type_embeddings',
                        use_position_embeddings=False,
                        position_embedding_name='position_embeddings',
                        initializer_range=config.initializer_range,
                        max_position_embeddings=config.max_position_embeddings,
                        dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope('bert', reuse=True):
                with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                    # Perform embedding lookup on the target word ids
                    # (the embedding input of the decoder has to be output_ids).
                    (self.out_embed, self.bert_embeddings) = embedding_lookup(
                        input_ids=self.output_ids_ngpu[i],
                        vocab_size=out_dict_size,  # decode dictionary modified
                        embedding_size=config.hidden_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='word_embeddings',
                        use_one_hot_embeddings=False)
                    # Add positional embeddings and token type embeddings, then layer
                    # normalize and perform dropout.
                    self.out_embed = embedding_postprocessor(
                        input_tensor=self.out_embed,
                        use_token_type=True,
                        token_type_ids=self.out_segment_ids_ngpu[i],
                        token_type_vocab_size=config.type_vocab_size,
                        token_type_embedding_name='token_type_embeddings',
                        use_position_embeddings=True,
                        position_embedding_name='position_embeddings',
                        initializer_range=config.initializer_range,
                        max_position_embeddings=config.max_position_embeddings,
                        dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope('decode'):
                self.decoder_weights = self.bert_embeddings
                self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask_ngpu[i], -1)
                self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
                self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
                self.all_att_weights, self.decoder_output = transformer_decoder_three(
                    self.decoder_input,
                    self.encoder_output,
                    self.topic_word_memory,
                    self.dec_attn_bias,
                    self.enc_attn_bias,
                    self.hps)
                # [b, l_t, e] => [b*l_t, v]
                self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
                self.vocab_logits = tf.matmul(self.decoder_output, self.decoder_weights, False, True)  # (b * l_t, v)
                self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
                # vocab_size = len(self.hps.vocab)
                with tf.variable_scope('copy'):
                    self.single_logits = calculate_final_logits(
                        self.decoder_output, self.all_att_weights, self.vocab_probs,
                        self.input_ids_oo_ngpu[i], self.max_out_oovs, self.input_mask_ngpu[i],
                        out_dict_size, self.tiled_len)  # [b * l_t, v + v']
                    self.single_pred_ids = tf.reshape(tf.argmax(self.single_logits, axis=-1),
                                                      [self.batch_size, -1])

            with tf.variable_scope('loss'):
                self.single_ce = smooth_cross_entropy(
                    self.single_logits,
                    self.output_label_ngpu[i],
                    self.hps.label_smoothing)
                self.single_ce = tf.reshape(self.single_ce, tf.shape(self.output_label_ngpu[i]))  # [b, l_t]
                self.single_loss = tf.reduce_sum(self.single_ce * self.output_mask_ngpu[i]) / tf.reduce_sum(
                    self.output_mask_ngpu[i])  # scalar

            gpu_pred_ids.append(self.single_pred_ids)
            gpu_logits.append(self.single_logits)
            gpu_train_encoded.append(self.encoder_output)
            gpu_loss.append(self.single_loss)
            gpu_out_embed.append(self.out_embed)

            params = tf.trainable_variables()
            grads = tf.gradients(self.single_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            # gpu_ops.append([loss, logits])

    self.pred_ids = tf.concat(gpu_pred_ids, axis=0)
    self.logits = tf.concat(gpu_logits, axis=0)
    self.loss = tf.reduce_mean(gpu_loss)
    self.encoder_output = tf.concat(gpu_train_encoded, axis=0)
    self.out_embed = tf.concat(gpu_out_embed, axis=0)
    # end for

    grads = sum_grads(gpu_grads)
    grads = [g for g, p in grads]
    self.total_gradient = grads

    tf.summary.scalar('loss', self.loss)
def knot_weights(positions,
                 num_knots,
                 degree,
                 cyclical,
                 sparse_mode=False,
                 name=None):
    """Function that converts cardinal B-spline positions to knot weights.

    Note:
      In the following, A1 to An are optional batch dimensions.

    Args:
      positions: A tensor with shape `[A1, .. An]`. Positions must be between
        `[0, C - D)` for non-cyclical and `[0, C)` for cyclical splines, where `C`
        is the number of knots and `D` is the spline degree.
      num_knots: A strictly positive `int` describing the number of knots in the
        spline.
      degree: An `int` describing the degree of the spline, which must be smaller
        than `num_knots`.
      cyclical: A `bool` describing whether the spline is cyclical.
      sparse_mode: A `bool` describing whether to return a result only for the
        knots with nonzero weights. If set to True, the function returns the
        weights of only the `degree` + 1 knots that are non-zero, as well as the
        indices of the knots.
      name: A name for this op. Defaults to "bspline_knot_weights".

    Returns:
      A tensor with dense weights for each control point, with the shape
      `[A1, ... An, C]` if `sparse_mode` is False.
      Otherwise, returns a tensor of shape `[A1, ... An, D + 1]` that contains the
      non-zero weights, and a tensor with the indices of the knots, with the type
      tf.int32.

    Raises:
      ValueError: If degree is greater than 4 or num_knots - 1, or less than 0.
      InvalidArgumentError: If positions are not in the right range.
    """
    with tf.compat.v1.name_scope(name, "bspline_knot_weights", [positions]):
        positions = tf.convert_to_tensor(value=positions)

        if degree > 4 or degree < 0:
            raise ValueError("Degree should be between 0 and 4.")
        if degree > num_knots - 1:
            raise ValueError("Degree cannot be >= number of knots.")

        if cyclical:
            positions = asserts.assert_all_in_range(positions, 0.0, float(num_knots))
        else:
            positions = asserts.assert_all_in_range(positions, 0.0,
                                                    float(num_knots - degree))

        all_basis_functions = {
            # Maps valid degrees to functions.
            Degree.CONSTANT: _constant,
            Degree.LINEAR: _linear,
            Degree.QUADRATIC: _quadratic,
            Degree.CUBIC: _cubic,
            Degree.QUARTIC: _quartic
        }
        basis_functions = all_basis_functions[degree]

        if not cyclical and num_knots - degree == 1:
            # In this case all weights are non-zero and we can just return them.
            if not sparse_mode:
                return basis_functions(positions)
            else:
                shift = tf.zeros_like(positions, dtype=tf.int32)
                return basis_functions(positions), shift

        # shape_batch = positions.shape.as_list()
        shape_batch = tf.shape(input=positions)
        positions = tf.reshape(positions, shape=(-1,))

        # Calculate the nonzero weights from the decimal parts of positions.
        shift = tf.floor(positions)
        sparse_weights = basis_functions(positions - shift)
        shift = tf.cast(shift, tf.int32)

        if sparse_mode:
            # Returns just the weights and the shift amounts, so that tf.gather_nd on
            # the knots can be used to sparsely activate knots if needed.
            shape_weights = tf.concat(
                (shape_batch, tf.constant((degree + 1,), dtype=tf.int32)), axis=0)
            sparse_weights = tf.reshape(sparse_weights, shape=shape_weights)
            shift = tf.reshape(shift, shape=shape_batch)
            return sparse_weights, shift

        num_positions = tf.size(input=positions)
        ind_row, ind_col = tf.meshgrid(
            tf.range(num_positions, dtype=tf.int32),
            tf.range(degree + 1, dtype=tf.int32),
            indexing="ij")

        tiled_shifts = tf.reshape(
            tf.tile(tf.expand_dims(shift, axis=-1), multiples=(1, degree + 1)),
            shape=(-1,))
        ind_col = tf.reshape(ind_col, shape=(-1,)) + tiled_shifts
        if cyclical:
            ind_col = tf.math.mod(ind_col, num_knots)
        indices = tf.stack((tf.reshape(ind_row, shape=(-1,)), ind_col), axis=-1)
        shape_indices = tf.concat((tf.reshape(num_positions, shape=(1,)),
                                   tf.constant((degree + 1, 2), dtype=tf.int32)),
                                  axis=0)
        indices = tf.reshape(indices, shape=shape_indices)
        shape_scatter = tf.concat((tf.reshape(num_positions, shape=(1,)),
                                   tf.constant((num_knots,), dtype=tf.int32)),
                                  axis=0)
        weights = tf.scatter_nd(indices, sparse_weights, shape_scatter)
        shape_weights = tf.concat(
            (shape_batch, tf.constant((num_knots,), dtype=tf.int32)), axis=0)
        return tf.reshape(weights, shape=shape_weights)
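# --- Usage sketch (illustrative only; assumes eager TF 2.x and the module the
# signature above matches, where Degree, asserts and the _constant.._quartic basis
# functions are defined).
positions = tf.constant([0.25, 1.5, 3.75])
dense_w = knot_weights(positions, num_knots=4, degree=2, cyclical=True)
# dense_w has shape [3, 4]; the B-spline weights in each row sum to 1.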
max_sent_len = seq_train_stories.shape[2]

# SEQUENTIAL CONFIGURATION
SEQ_EPOCHS = 1        # EPOCHS
seq_num_units = 20    # number of units in each LSTMCell
seq_num_layers = 2    # number of stacked LSTMs
SEQ_KEEP_PRB = 0.9    # dropout probability of keeping value
bidirectional = True  # enable bidirectional output layer

seq_story = tf.placeholder(tf.int64, [None, None, None], "seq_story")  # [seq_batch_size x 5 x max_seq_length]
seq_order = tf.placeholder(tf.int64, [None, None], "seq_order")        # [seq_batch_size x 5]
seq_lens = tf.placeholder(tf.int64, [None, None], "seq_lens")          # [seq_batch_size x 5]
seq_batch_size = tf.shape(seq_story)[0]
seq_keep_prob = tf.placeholder(tf.float64)  # dropout probability placeholder

with tf.variable_scope("seq"):
    # Word embeddings
    sentences = [tf.reshape(x, [seq_batch_size, -1])
                 for x in tf.split(1, 5, seq_story)]  # 5 times [seq_batch_size x max_sent_len]
    embeddings = tf.get_variable("embeddings", initializer=embeds, trainable=True)
    inputs = [tf.nn.embedding_lookup(embeddings, sentence)  # 5 times [seq_batch_size x max_sent_len x embedding_size]
              for sentence in sentences]

    with tf.variable_scope("lstms") as varscope:
        # first LSTM
        index = 0
        lstm1 = tf.nn.rnn_cell.LSTMCell(seq_num_units, state_is_tuple=True)
        lstm1 = tf.nn.rnn_cell.MultiRNNCell([lstm1] * seq_num_layers)
        lstm1 = tf.nn.rnn_cell.DropoutWrapper(lstm1, output_keep_prob=seq_keep_prob)
def unpool(inputs):
    return tf.image.resize_bilinear(
        inputs, size=[tf.shape(inputs)[1] * 2, tf.shape(inputs)[2] * 2])
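# --- Usage sketch (illustrative only): bilinear "unpooling" doubles the spatial
# dimensions of a feature map while leaving the channel count unchanged.
feat = tf.placeholder(tf.float32, [None, None, None, 32])
upsampled = unpool(feat)  # spatial size becomes (2*H, 2*W), still 32 channels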
learning_rate_decay = 0.9
min_learning_rate = 0.0001
keep_probablity = 0.5

# defining a session
tf.reset_default_graph()
session = tf.InteractiveSession()

# load model inputs
inputs, targets, lr, keep_prob = model_inputs()

# setting the sequence length
sequence_length = tf.placeholder_with_default(25, None, name='sequence_length')

# getting the shape of the input tensor
input_shape = tf.shape(inputs)

# getting the training and test predictions
training_predictions, test_predictions = seq2seq_model(
    tf.reverse(inputs, [-1]),
    targets,
    keep_prob,
    batch_size,
    sequence_length,
    len(answerswords2int),
    len(questionswords2int),
    encoding_embedding_size,
    decoding_embedding_size,
    rnn_size,
    num_layers,
    questionswords2int)

# setting up the loss error, the optimizer and gradient clipping
with tf.name_scope("optimization"):
    loss_error = tf.contrib.seq2seq.sequence_loss(
        training_predictions,
        targets,
        tf.ones([input_shape[0], sequence_length]))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(loss_error)
    clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable)
                         for grad_tensor, grad_variable in gradients
                         if grad_tensor is not None]
def simulate_step(batch_env, algo, log=True, reset=False):
    """Simulation step of a vectorized algorithm with in-graph environments.

    Integrates the operations implemented by the algorithm and the environments
    into a combined operation.

    Args:
      batch_env: In-graph batch environment.
      algo: Algorithm instance implementing required operations.
      log: Tensor indicating whether to compute and return summaries.
      reset: Tensor causing all environments to reset.

    Returns:
      Tuple of tensors containing done flags for the current episodes, possibly
      intermediate scores for the episodes, and a summary tensor.
    """

    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

        Args:
          agent_indices: Tensor containing batch indices starting an episode.

        Returns:
          Summary tensor, new score tensor, and new length tensor.
        """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        update_score = tf.scatter_update(score_var, agent_indices, zero_scores)
        update_length = tf.scatter_update(
            length_var, agent_indices, zero_durations)
        reset_ops = [
            batch_env.reset(agent_indices), update_score, update_length]
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices), update_score, update_length

    def _define_step():
        """Request actions from the algorithm and apply them to the environments.

        Increments the lengths of all episodes and increases their scores by the
        current reward. After stepping the environments, provides the full
        transition tuple to the algorithm.

        Returns:
          Summary tensor, new score tensor, and new length tensor.
        """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        agent_indices = tf.range(len(batch_env))
        action, step_summary = algo.perform(agent_indices, prevob)
        action.set_shape(batch_env.action.shape)
        with tf.control_dependencies([batch_env.step(action)]):
            add_score = score_var.assign_add(batch_env.reward)
            inc_length = length_var.assign_add(tf.ones(len(batch_env), tf.int32))
        with tf.control_dependencies([add_score, inc_length]):
            agent_indices = tf.range(len(batch_env))
            experience_summary = algo.experience(
                agent_indices, prevob,
                batch_env.action,
                batch_env.reward,
                batch_env.done,
                batch_env.observ)
            summary = tf.summary.merge([step_summary, experience_summary])
        return summary, add_score, inc_length

    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

        Also updates the mean score and length counters used for summaries.

        Args:
          agent_indices: Tensor holding batch indices that end their episodes.

        Returns:
          Summary tensor.
        """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        close_env = tf.py_func(batch_env.close, [], [])
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)

    def _define_summaries():
        """Reset the average score and duration, and return them as summary.

        Returns:
          Summary string.
        """
        score_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
            lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
        length_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
            lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
        return tf.summary.merge([score_summary, length_summary])

    with tf.name_scope('simulate'):
        log = tf.convert_to_tensor(log)
        reset = tf.convert_to_tensor(reset)
        with tf.variable_scope('simulate_temporary'):
            score_var = tf.get_variable(
                'score', (len(batch_env),), tf.float32,
                tf.constant_initializer(0),
                trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
            length_var = tf.get_variable(
                'length', (len(batch_env),), tf.int32,
                tf.constant_initializer(0),
                trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
        mean_score = streaming_mean.StreamingMean((), tf.float32, 'mean_score')
        mean_length = streaming_mean.StreamingMean((), tf.float32, 'mean_length')
        agent_indices = tf.cond(
            reset,
            lambda: tf.range(len(batch_env)),
            lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
        begin_episode, score, length = tf.cond(
            tf.cast(tf.shape(agent_indices)[0], tf.bool),
            lambda: _define_begin_episode(agent_indices),
            lambda: (str(), score_var, length_var))
        with tf.control_dependencies([begin_episode]):
            step, score, length = _define_step()
        with tf.control_dependencies([step]):
            agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
            end_episode = tf.cond(
                tf.cast(tf.shape(agent_indices)[0], tf.bool),
                lambda: _define_end_episode(agent_indices), str)
        with tf.control_dependencies([end_episode]):
            summary = tf.summary.merge([
                _define_summaries(), begin_episode, step, end_episode])
        with tf.control_dependencies([summary]):
            score = 0.0 + score
            done = batch_env.done
        return done, score, summary
def gram(x):
    shape_x = tf.shape(x)
    b = shape_x[0]
    c = shape_x[3]
    x = tf.reshape(x, [b, -1, c])
    return tf.matmul(tf.transpose(x, [0, 2, 1]), x) / tf.cast((tf.size(x) // b), tf.float32)
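# --- Usage sketch (an assumption, not part of the original): the Gram matrix is the
# usual ingredient of a neural style loss; `generated_features` and `style_features`
# stand for [B, H, W, C] activations taken from the same layer of a feature extractor.
style_loss = tf.reduce_mean(tf.square(gram(generated_features) - gram(style_features)))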
def max_length(self):
    time_dim = 0 if self.time_major_optimization else 1
    return tf.shape(self.inputs)[time_dim]
def __call__(self, input_, n_caps, kernel_size=9, stride_size=2, route_iter=3,
             reuse=False, initializer=tf.truncated_normal_initializer(stddev=0.02)):
    input_shape = input_.get_shape().as_list()
    batch_size = tf.shape(input_)[0]  # [None, height, width, channel]
    assert len(input_shape) >= 2

    if 'primary' in self.name:
        # Primary Capsule
        with tf.variable_scope(self.name) as scope:
            if reuse == True:
                scope.reuse_variables()

            if len(input_shape) == 3:
                input_ = tf.expand_dims(input_, axis=-1)  # for gray scale input
            self.input_shape = input_.get_shape().as_list()

            capsules = conv2d(input_, n_caps * self.d_caps, kernel_h=kernel_size,
                              stride_h=stride_size, initializer=initializer)
            # capsules: [None, height, width, channel]
            capsule_shape = capsules.get_shape().as_list()
            # for preventing an error when getting the shape in digit caps,
            # the number of capsules to reshape is defined explicitly
            total_num_cap = int((capsule_shape[1] * capsule_shape[2] * capsule_shape[3]) / self.d_caps)
            capsules = tf.reshape(capsules, [batch_size, total_num_cap, self.d_caps])
            print(capsules)
            capsules = squash(capsules)
            print(capsules)
            return capsules
    else:
        # Digit Capsule
        with tf.variable_scope(self.name) as scope:
            if reuse == True:
                scope.reuse_variables()

            # assume input_: [batch_size, # of capsules, d-capsules]
            # if len(input_shape) == 2: input_ = tf.expand_dims(input_, axis=-1)
            self.input_shape = input_.get_shape().as_list()

            input_tiled = tf.expand_dims(input_, axis=-1,
                                         name='input_expand_1')  # [batch_size, # of capsule, ..., d-capsules, 1]
            input_tiled = tf.expand_dims(input_tiled, axis=2,
                                         name='input_expand_2')  # [batch_size, # of capsule, ..., d-capsules, 1]
            print(input_tiled)
            input_tiled = tf.tile(input_tiled, [1, 1, n_caps, 1, 1],
                                  name='input_tile')  # [batch_size, # of capsule, # of next capsule, d_capsules, 1]
            print(input_tiled)

            W = tf.get_variable('prediction_w',
                                [1, self.input_shape[1], n_caps, self.d_caps, self.input_shape[2]],
                                initializer=initializer)
            W_tiled = tf.tile(W, [batch_size, 1, 1, 1, 1], name='W_tiled')

            prediction_vectors = tf.matmul(W_tiled, input_tiled)

            b = tf.zeros([batch_size, self.input_shape[1], n_caps, 1, 1])
            for i in range(route_iter):
                coupling_coeff = tf.nn.softmax(b, dim=2)
                s = tf.multiply(prediction_vectors, coupling_coeff, name='weighted_prediction')
                sum_s = tf.reduce_sum(s, axis=1, keep_dims=True, name='weighted_sum')
                capsules = squash(sum_s, axis=-2)  # (None, 1, # of next capsule, d_capsule, 1)
                caps_out_tile = tf.tile(capsules, [1, self.input_shape[1], 1, 1, 1],
                                        name='capsule_output_tiled')
                a = tf.matmul(prediction_vectors, caps_out_tile, transpose_a=True, name='agreement')
                b = tf.add(b, a, name='update_logit')

            capsules = tf.reshape(capsules, [batch_size, n_caps, self.d_caps])
            print(capsules)
            # return tf.squeeze(capsules)
            return capsules
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] label_ids = features["label_ids"] is_real_example = None if "is_real_example" in features: is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) else: is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) is_training = (mode == tf.estimator.ModeKeys.TRAIN) (total_loss, per_example_loss, logits, probabilities) = create_model(bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions, weights=is_real_example) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return { "eval_accuracy": accuracy, "eval_loss": loss, } eval_metrics = (metric_fn, [ per_example_loss, label_ids, logits, is_real_example ]) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions={"probabilities": probabilities}, scaffold_fn=scaffold_fn) return output_spec
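# A sketch of how a `model_fn` like the one above is usually handed to a
# TPUEstimator in the BERT reference setup (TF 1.x). The run config, batch
# sizes, input_fn and step count are assumptions for illustration only.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size,
    predict_batch_size=predict_batch_size)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)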
def _loss(self, experience, weights=None): """Computes loss for behavioral cloning. Args: experience: A `Trajectory` containing experience. weights: Optional scalar or element-wise (per-batch-entry) importance weights. Returns: loss: A `LossInfo` struct. Raises: ValueError: If the number of actions is greater than 1. """ with tf.name_scope('loss'): if self._nested_actions: actions = experience.action else: actions = tf.nest.flatten(experience.action)[0] batch_size = ( tf.compat.dimension_value(experience.step_type.shape[0]) or tf.shape(experience.step_type)[0]) logits, _ = self._cloning_network( experience.observation, experience.step_type, training=True, network_state=self._cloning_network.get_initial_state(batch_size)) error = self._loss_fn(logits, actions) error_dtype = tf.nest.flatten(error)[0].dtype boundary_weights = tf.cast(~experience.is_boundary(), error_dtype) error *= boundary_weights if nest_utils.is_batched_nested_tensors( experience.action, self.action_spec, num_outer_dims=2): # Do a sum over the time dimension. error = tf.reduce_sum(input_tensor=error, axis=1) # Average across the elements of the batch. # Note: We use an element wise loss above to ensure each element is always # weighted by 1/N where N is the batch size, even when some of the # weights are zero due to boundary transitions. Weighting by 1/K where K # is the actual number of non-zero weight would artificially increase # their contribution in the loss. Think about what would happen as # the number of boundary samples increases. agg_loss = common.aggregate_losses( per_example_loss=error, sample_weight=weights, regularization_loss=self._cloning_network.losses) total_loss = agg_loss.total_loss dict_losses = {'loss': agg_loss.weighted, 'reg_loss': agg_loss.regularization, 'total_loss': total_loss} common.summarize_scalar_dict(dict_losses, step=self.train_step_counter, name_scope='Losses/') if self._summarize_grads_and_vars: with tf.name_scope('Variables/'): for var in self._cloning_network.trainable_weights: tf.compat.v2.summary.histogram( name=var.name.replace(':', '_'), data=var, step=self.train_step_counter) if self._debug_summaries: common.generate_tensor_summaries('errors', error, self.train_step_counter) return tf_agent.LossInfo(total_loss, BehavioralCloningLossInfo(loss=error))
def train(train_dir, config, dataset, checkpoints_to_keep=5, keep_checkpoint_every_n_hours=1, num_steps=None, master='', num_sync_workers=0, num_ps_tasks=0, task=0): """Train loop.""" tf.gfile.MakeDirs(train_dir) is_chief = (task == 0) if is_chief: _trial_summary(config.hparams, config.train_examples_path, train_dir) with tf.Graph().as_default(): with tf.device(tf.train.replica_device_setter( num_ps_tasks, merge_devices=True)): model = config.model model.build(config.hparams, config.data_converter.output_depth, is_training=True) optimizer = model.train(**_get_input_tensors(dataset, config)) hooks = [] if num_sync_workers: optimizer = tf.train.SyncReplicasOptimizer( optimizer, num_sync_workers) hooks.append(optimizer.make_session_run_hook(is_chief)) grads, var_list = zip(*optimizer.compute_gradients(model.loss)) global_norm = tf.global_norm(grads) tf.summary.scalar('global_norm', global_norm) if config.hparams.clip_mode == 'value': g = config.hparams.grad_clip clipped_grads = [tf.clip_by_value(grad, -g, g) for grad in grads] elif config.hparams.clip_mode == 'global_norm': clipped_grads = tf.cond( global_norm < config.hparams.grad_norm_clip_to_zero, lambda: tf.clip_by_global_norm( # pylint:disable=g-long-lambda grads, config.hparams.grad_clip, use_norm=global_norm)[0], lambda: [tf.zeros(tf.shape(g)) for g in grads]) else: raise ValueError( 'Unknown clip_mode: {}'.format(config.hparams.clip_mode)) train_op = optimizer.apply_gradients( zip(clipped_grads, var_list), global_step=model.global_step, name='train_step') logging_dict = {'global_step': model.global_step, 'loss': model.loss} hooks.append(tf.train.LoggingTensorHook(logging_dict, every_n_iter=100)) if num_steps: hooks.append(tf.train.StopAtStepHook(last_step=num_steps)) scaffold = tf.train.Scaffold( saver=tf.train.Saver( max_to_keep=checkpoints_to_keep, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)) tf.contrib.training.train( train_op=train_op, logdir=train_dir, scaffold=scaffold, hooks=hooks, save_checkpoint_secs=60, master=master, is_chief=is_chief)
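# A self-contained sketch of the two gradient-clipping branches used in the
# train loop above (TF 1.x): 'value' clips each gradient element-wise, while
# 'global_norm' rescales all gradients jointly. The zero-out-above-threshold
# behaviour of the original code is omitted here; names are illustrative.
def clip_gradients_sketch(grads, clip_mode, grad_clip):
    if clip_mode == 'value':
        return [tf.clip_by_value(g, -grad_clip, grad_clip) for g in grads]
    if clip_mode == 'global_norm':
        clipped, _ = tf.clip_by_global_norm(grads, grad_clip)
        return clipped
    raise ValueError('Unknown clip_mode: {}'.format(clip_mode))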
def _batch_shape_tensor(self):
    return tf.broadcast_dynamic_shape(
        tf.shape(input=self.loc), tf.shape(input=self.scale))
def __init__(self, config, x, y, x_b, y_b, x_b_v, y_b_v, num_classes_a, num_classes_b, is_training=True, ext_wts=None, y_sel=None, w_class_a=None, b_class_a=None, nshot=None): self._config = config self._is_training = is_training self._num_classes_a = num_classes_a self._num_classes_b = num_classes_b if config.backbone_class == 'resnet_backbone': bb_config = config.resnet_config else: assert False, 'Not supported' opt_config = config.optimizer_config proto_config = config.protonet_config transfer_config = config.transfer_config self._backbone = get_model(config.backbone_class, bb_config) self._inputs = x self._labels = y # if opt_config.num_gpu > 1: # self._labels_all = allgather(self._labels) # else: self._labels_all = self._labels self._inputs_b = x_b self._labels_b = y_b self._inputs_b_v = x_b_v self._labels_b_v = y_b_v # if opt_config.num_gpu > 1: # self._labels_b_v_all = allgather(self._labels_b_v) # else: self._labels_b_v_all = self._labels_b_v self._y_sel = y_sel self._mask = tf.placeholder(tf.bool, [], name='mask') # global_step = tf.get_variable( # 'global_step', shape=[], dtype=tf.int64, trainable=False) global_step = tf.contrib.framework.get_or_create_global_step() self._global_step = global_step log.info('LR decay steps {}'.format(opt_config.lr_decay_steps)) log.info('LR list {}'.format(opt_config.lr_list)) learn_rate = tf.train.piecewise_constant( global_step, list( np.array(opt_config.lr_decay_steps).astype(np.int64)), list(opt_config.lr_list)) self._learn_rate = learn_rate opt = self.get_optimizer(opt_config.optimizer, learn_rate) # if opt_config.num_gpu > 1: # opt = hvd.DistributedOptimizer(opt) with tf.name_scope('TaskA'): h_a = self.backbone(x, is_training=is_training, ext_wts=ext_wts) self._h_a = h_a # Apply BN ops. bn_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.name_scope('TaskB'): x_b_all = tf.concat([x_b, x_b_v], axis=0) if ext_wts is not None: h_b_all = self.backbone( x_b_all, is_training=is_training, reuse=True, ext_wts=ext_wts) else: h_b_all = self.backbone(x_b_all, is_training=is_training, reuse=True) with tf.name_scope('TaskA'): # Calculates hidden activation size. 
h_shape = h_a.get_shape() h_size = 1 for ss in h_shape[1:]: h_size *= int(ss) if w_class_a is None: if ext_wts is not None: w_class_a = weight_variable( [h_size, num_classes_a], init_method='numpy', dtype=tf.float32, init_param={'val': np.transpose(ext_wts['w_class_a'])}, wd=config.wd, name='w_class_a') b_class_a = weight_variable([], init_method='numpy', dtype=tf.float32, init_param={'val': ext_wts['b_class_a']}, wd=0e0, name='b_class_a') else: w_class_a = weight_variable([h_size, num_classes_a], init_method='truncated_normal', dtype=tf.float32, init_param={'stddev': 0.01}, wd=bb_config.wd, name='w_class_a') b_class_a = weight_variable([num_classes_a], init_method='constant', init_param={'val': 0.0}, name='b_class_a') self._w_class_a_orig = w_class_a self._b_class_a_orig = b_class_a else: assert b_class_a is not None w_class_a_orig = weight_variable([h_size, num_classes_a], init_method='truncated_normal', dtype=tf.float32, init_param={'stddev': 0.01}, wd=bb_config.wd, name='w_class_a') b_class_a_orig = weight_variable([num_classes_a], init_method='constant', init_param={'val': 0.0}, name='b_class_a') self._w_class_a_orig = w_class_a_orig self._b_class_a_orig = b_class_a_orig self._w_class_a = w_class_a self._b_class_a = b_class_a num_classes_a_dyn = tf.cast(tf.shape(b_class_a)[0], tf.int64) num_classes_a_dyn32 = tf.shape(b_class_a)[0] if proto_config.cosine_a: if proto_config.cosine_tau: if ext_wts is None: init_val = 10.0 else: init_val = ext_wts['tau'][0] tau = weight_variable([], init_method='constant', init_param={'val': init_val}, name='tau') else: tau = tf.constant(1.0) w_class_a_norm = self._normalize(w_class_a, 0) h_a_norm = self._normalize(h_a, 1) dot = tf.matmul(h_a_norm, w_class_a_norm) if ext_wts is not None: dot += b_class_a logits_a = tau * dot else: logits_a = compute_euc(tf.transpose(w_class_a), h_a) self._prediction_a = logits_a # if opt_config.num_gpu > 1: # self._prediction_a_all = allgather(self._prediction_a) # else: self._prediction_a_all = self._prediction_a xent_a = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits_a, labels=y) cost_a = tf.reduce_mean(xent_a, name='xent') self._cost_a = cost_a cost_a += self._decay() correct_a = tf.equal(tf.argmax(logits_a, axis=1), y) self._correct_a = correct_a self._acc_a = tf.reduce_mean(tf.cast(correct_a, cost_a.dtype)) with tf.name_scope('TaskB'): h_b = h_b_all[:tf.shape(x_b)[0]] h_b_v = h_b_all[tf.shape(x_b)[0]:] # Add new axes for the `batch` dimension. 
h_b_ = tf.expand_dims(h_b, 0) h_b_v_ = tf.expand_dims(h_b_v, 0) y_b_ = tf.expand_dims(y_b, 0) y_b_v_ = tf.expand_dims(y_b_v, 0) if transfer_config.old_and_new: protos_b = self._compute_protos(num_classes_b, h_b_, y_b_ - num_classes_a) else: protos_b = self._compute_protos(num_classes_b, h_b_, y_b_) w_class_a_ = tf.expand_dims(tf.transpose(w_class_a), 0) if proto_config.protos_phi: w_p1 = weight_variable([h_size], init_method='constant', dtype=tf.float32, init_param={'val': 1.0}, wd=bb_config.wd, name='w_p1') if proto_config.cosine_attention: w_q = weight_variable([h_size, h_size], init_method='truncated_normal', dtype=tf.float32, init_param={'stddev': 0.1}, wd=bb_config.wd, name='w_q') k_b = weight_variable([num_classes_a, h_size], init_method='truncated_normal', dtype=tf.float32, init_param={'stddev': 0.1}, wd=bb_config.wd, name='k_b') tau_q = weight_variable([], init_method='constant', init_param={'val': 10.0}, name='tau_q') if transfer_config.old_and_new: w_class_b = self._compute_protos_attend_fix( num_classes_b, h_b_, y_b_ - num_classes_a_dyn, w_q, tau_q, k_b, self._w_class_a_orig) else: w_class_b = self._compute_protos_attend_fix( num_classes_b, h_b_, y_b_, w_q, tau_q, k_b, self._w_class_a_orig) assert proto_config.protos_phi w_p2 = weight_variable([h_size], init_method='constant', dtype=tf.float32, init_param={'val': 1.0}, wd=bb_config.wd, name='w_p2') self._k_b = tf.expand_dims(w_p2, 1) * self._w_class_a_orig self._k_b2 = k_b self.bias = w_class_b self.new_protos = w_p1 * protos_b self.new_bias = w_p2 * w_class_b w_class_b = w_p1 * protos_b + w_p2 * w_class_b self.protos = protos_b self.w_class_b_final = w_class_b else: w_class_b = protos_b if proto_config.protos_phi: w_class_b = w_p1 * w_class_b self._w_class_b = w_class_b if transfer_config.old_and_new: w_class_all = tf.concat([w_class_a_, w_class_b], axis=1) else: w_class_all = w_class_b if proto_config.cosine_softmax_tau: tau_b = weight_variable([], init_method='constant', init_param={'val': 10.0}, name='tau_b') else: tau_b = tf.constant(1.0) if proto_config.similarity == 'euclidean': logits_b_v = compute_logits(w_class_all, h_b_v_) elif proto_config.similarity == 'cosine': logits_b_v = tau_b * compute_logits_cosine(w_class_all, h_b_v_) else: raise ValueError('Unknown similarity') self._logits_b_v = logits_b_v self._prediction_b = logits_b_v[0] # if opt_config.num_gpu > 1: # self._prediction_b_all = allgather(self._prediction_b) # else: self._prediction_b_all = self._prediction_b # Mask out the old classes. 
def mask_fn(): bin_mask = tf.expand_dims( tf.reduce_sum( tf.one_hot(y_sel, num_classes_a + num_classes_b), 0, keep_dims=True), 0) logits_b_v_m = logits_b_v * (1.0 - bin_mask) logits_b_v_m -= bin_mask * 100.0 return logits_b_v_m # if transfer_config.old_and_new: # logits_b_v = tf.cond(self._mask, mask_fn, lambda: logits_b_v) xent_b_v = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits_b_v, labels=y_b_v_) cost_b = tf.reduce_mean(xent_b_v, name='xent') self._cost_b = cost_b if transfer_config.old_and_new: total_cost = cost_b else: total_cost = (transfer_config.cost_a_ratio * cost_a + transfer_config.cost_b_ratio * cost_b) self._total_cost = total_cost if not transfer_config.meta_only: # assert False, 'let us go for pretrained model first' var_list = tf.trainable_variables() var_list = list(filter(lambda x: 'phi' in x.name, var_list)) layers = self.config.transfer_config.meta_layers if layers == "all": pass elif layers == "4": keywords = ['TaskB', 'unit_4_'] filter_fn = lambda x: any([kw in x.name for kw in keywords]) var_list = list(filter(filter_fn, var_list)) else: raise ValueError('Unknown finetune layers {}'.format(layers)) [log.info('Slow weights {}'.format(v.name)) for v in var_list] else: var_list = [] if proto_config.cosine_softmax_tau: var_list += [tau_b] if proto_config.cosine_attention: var_list += [w_q, tau_q, k_b, w_p2] if proto_config.protos_phi: var_list += [w_p1] if transfer_config.train_wclass_a: if proto_config.similarity == 'euclidean': var_list += [w_class_a, b_class_a] elif proto_config.similarity == 'cosine': var_list += [w_class_a] if is_training: grads_and_vars = opt.compute_gradients(total_cost, var_list) with tf.control_dependencies(bn_ops): [log.info('BN op {}'.format(op.name)) for op in bn_ops] train_op = opt.apply_gradients(grads_and_vars, global_step=global_step) grads_and_vars_b = opt.compute_gradients(cost_b, var_list) with tf.control_dependencies(bn_ops): train_op_b = opt.apply_gradients( grads_and_vars_b, global_step=global_step) with tf.control_dependencies(bn_ops): train_op_a = opt.minimize(cost_a, global_step=global_step) self._train_op = train_op self._train_op_a = train_op_a self._train_op_b = train_op_b self._initializer = tf.global_variables_initializer() self._w_class_a = w_class_a
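# `_compute_protos` is referenced above but not defined in this snippet. A
# hedged sketch of the usual prototype computation (per-class mean embedding,
# as in Prototypical Networks), written for TF 1.x with the leading episode
# dimension of 1 used by the code above; the real helper may differ.
def compute_protos_sketch(num_classes, h, y):
    # h: [1, num_examples, dim]; y: [1, num_examples], labels in [0, num_classes).
    one_hot = tf.one_hot(y, num_classes)                        # [1, N, K]
    counts = tf.reduce_sum(one_hot, axis=1, keep_dims=True)     # [1, 1, K]
    summed = tf.matmul(one_hot, h, transpose_a=True)            # [1, K, dim]
    return summed / tf.maximum(tf.transpose(counts, [0, 2, 1]), 1.0)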
def _random_crop(image_list, crop_height, crop_width): """Crops the given list of images. The function applies the same crop to each image in the list. This can be effectively applied when there are multiple image inputs of the same dimension such as: image, depths, normals = _random_crop([image, depths, normals], 120, 150) Args: image_list: a list of image tensors of the same dimension but possibly varying channel. crop_height: the new height. crop_width: the new width. Returns: the image_list with cropped images. Raises: ValueError: if there are multiple image inputs provided with different size or the images are smaller than the crop dimensions. """ if not image_list: raise ValueError('Empty image_list.') # Compute the rank assertions. rank_assertions = [] for i in range(len(image_list)): image_rank = tf.rank(image_list[i]) rank_assert = tf.Assert( tf.equal(image_rank, 3), ['Wrong rank for tensor %s [expected] [actual]', image_list[i].name, 3, image_rank]) rank_assertions.append(rank_assert) with tf.control_dependencies([rank_assertions[0]]): image_shape = tf.shape(image_list[0]) image_height = image_shape[0] image_width = image_shape[1] crop_size_assert = tf.Assert( tf.logical_and( tf.greater_equal(image_height, crop_height), tf.greater_equal(image_width, crop_width)), ['Crop size greater than the image size.']) asserts = [rank_assertions[0], crop_size_assert] for i in range(1, len(image_list)): image = image_list[i] asserts.append(rank_assertions[i]) with tf.control_dependencies([rank_assertions[i]]): shape = tf.shape(image) height = shape[0] width = shape[1] height_assert = tf.Assert( tf.equal(height, image_height), ['Wrong height for tensor %s [expected][actual]', image.name, height, image_height]) width_assert = tf.Assert( tf.equal(width, image_width), ['Wrong width for tensor %s [expected][actual]', image.name, width, image_width]) asserts.extend([height_assert, width_assert]) # Create a random bounding box. # # Use tf.random_uniform and not numpy.random.rand as doing the former would # generate random numbers at graph eval time, unlike the latter which # generates random numbers at graph definition time. with tf.control_dependencies(asserts): max_offset_height = tf.reshape(image_height - crop_height + 1, []) with tf.control_dependencies(asserts): max_offset_width = tf.reshape(image_width - crop_width + 1, []) offset_height = tf.random_uniform( [], maxval=max_offset_height, dtype=tf.int32) offset_width = tf.random_uniform( [], maxval=max_offset_width, dtype=tf.int32) return [_crop(image, offset_height, offset_width, crop_height, crop_width) for image in image_list]
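# A short usage sketch for `_random_crop` (TF 1.x): the same random offsets
# are applied to an image and its label mask so that they stay aligned.
# `_crop` is assumed to be defined next to `_random_crop`, as in the original
# preprocessing module; the placeholders are illustrative.
image = tf.placeholder(tf.float32, [None, None, 3])
label = tf.placeholder(tf.float32, [None, None, 1])
cropped_image, cropped_label = _random_crop([image, label], 321, 321)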
def Deconv2D( name, input_dim, output_dim, filter_size, inputs, he_init=True, weightnorm=None, biases=True, gain=1., mask_type=None, ): """ inputs: tensor of shape (batch size, height, width, input_dim) returns: tensor of shape (batch size, 2*height, 2*width, output_dim) """ with tf.name_scope(name) as scope: if mask_type != None: raise Exception('Unsupported configuration') def uniform(stdev, size): return np.random.uniform(low=-stdev * np.sqrt(3), high=stdev * np.sqrt(3), size=size).astype('float32') stride = 2 fan_in = input_dim * filter_size**2 / (stride**2) fan_out = output_dim * filter_size**2 if he_init: filters_stdev = np.sqrt(4. / (fan_in + fan_out)) else: # Normalized init (Glorot & Bengio) filters_stdev = np.sqrt(2. / (fan_in + fan_out)) if _weights_stdev is not None: filter_values = uniform( _weights_stdev, (filter_size, filter_size, output_dim, input_dim)) else: filter_values = uniform( filters_stdev, (filter_size, filter_size, output_dim, input_dim)) filter_values *= gain filters = lib.param(name + '.Filters', filter_values) if weightnorm == None: weightnorm = _default_weightnorm if weightnorm: norm_values = np.sqrt( np.sum(np.square(filter_values), axis=(0, 1, 3))) target_norms = lib.param(name + '.g', norm_values) with tf.name_scope('weightnorm') as scope: norms = tf.sqrt( tf.reduce_sum(tf.square(filters), reduction_indices=[0, 1, 3])) filters = filters * tf.expand_dims(target_norms / norms, 1) #inputs = tf.transpose(inputs, [0, 3, 1, 2], name='NCHW_to_NHWC') #inputs = tf.transpose(inputs, [0,2,3,1], name='NCHW_to_NHWC') input_shape = tf.shape(inputs) try: # tf pre-1.0 (top) vs 1.0 (bottom) output_shape = tf.pack([ input_shape[0], 2 * input_shape[1], 2 * input_shape[2], output_dim ]) except Exception as e: output_shape = tf.stack([ input_shape[0], 2 * input_shape[1], 2 * input_shape[2], output_dim ]) result = tf.nn.conv2d_transpose(value=inputs, filter=filters, output_shape=output_shape, strides=[1, 2, 2, 1], padding='SAME') if biases: _biases = lib.param(name + '.Biases', np.zeros(output_dim, dtype='float32')) result = tf.nn.bias_add(result, _biases) #result = tf.transpose(result, [0,3,1,2], name='NHWC_to_NCHW') return result
def sample_z(mu, log_var):
    eps = tf.random_normal(shape=tf.shape(mu))
    return mu + tf.exp(log_var / 2) * eps
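# A minimal usage sketch (TF 1.x): `sample_z` is the VAE reparameterization
# trick, so it is typically paired with the closed-form KL divergence between
# q(z|x) = N(mu, exp(log_var)) and a standard normal prior. The placeholder
# shapes are illustrative.
mu = tf.placeholder(tf.float32, [None, 16])
log_var = tf.placeholder(tf.float32, [None, 16])
z = sample_z(mu, log_var)
kl = 0.5 * tf.reduce_sum(tf.exp(log_var) + tf.square(mu) - 1.0 - log_var, axis=1)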
def _fast_decode(self, features, decode_length, beam_size=1, top_beams=1, alpha=1.0): """Fast decoding. Implements both greedy and beam search decoding, uses beam search iff beam_size > 1, otherwise beam search related arguments are ignored. Args: features: a map of string to model features. decode_length: an integer. How many additional timesteps to decode. beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger the preference for slonger translations. Returns: samples: an integer `Tensor`. Top samples from the beam search Raises: NotImplementedError: If there are multiple data shards. """ if self._num_datashards != 1: raise NotImplementedError("Fast decoding only supports a single shard.") dp = self._data_parallelism hparams = self._hparams inputs = features["inputs"] batch_size = tf.shape(inputs)[0] target_modality = self._problem_hparams.target_modality if t2t_model.is_class_modality(target_modality): decode_length = 1 else: decode_length = tf.shape(inputs)[1] + decode_length # TODO(llion): Clean up this reshaping logic. inputs = tf.expand_dims(inputs, axis=1) if len(inputs.shape) < 5: inputs = tf.expand_dims(inputs, axis=4) s = tf.shape(inputs) inputs = tf.reshape(inputs, [s[0] * s[1], s[2], s[3], s[4]]) # _shard_features called to ensure that the variable names match inputs = self._shard_features({"inputs": inputs})["inputs"] input_modality = self._problem_hparams.input_modality["inputs"] with tf.variable_scope(input_modality.name): inputs = input_modality.bottom_sharded(inputs, dp) with tf.variable_scope("body"): encoder_output, encoder_decoder_attention_bias = dp( self.encode, inputs, features["target_space_id"], hparams) encoder_output = encoder_output[0] encoder_decoder_attention_bias = encoder_decoder_attention_bias[0] if hparams.pos == "timing": timing_signal = common_attention.get_timing_signal_1d( decode_length + 1, hparams.hidden_size) def preprocess_targets(targets, i): """Performs preprocessing steps on the targets to prepare for the decoder. This includes: - Embedding the ids. - Flattening to 3D tensor. - Optionally adding timing signals. Args: targets: inputs ids to the decoder. [batch_size, 1] i: scalar, Step number of the decoding loop. Returns: Processed targets [batch_size, 1, hidden_dim] """ # _shard_features called to ensure that the variable names match targets = self._shard_features({"targets": targets})["targets"] with tf.variable_scope(target_modality.name): targets = target_modality.targets_bottom_sharded(targets, dp)[0] targets = common_layers.flatten4d3d(targets) # TODO(llion): Explain! Is this even needed? 
targets = tf.cond( tf.equal(i, 0), lambda: tf.zeros_like(targets), lambda: targets) if hparams.pos == "timing": targets += timing_signal[:, i:i + 1] return targets decoder_self_attention_bias = ( common_attention.attention_bias_lower_triangle(decode_length)) if hparams.proximity_bias: decoder_self_attention_bias += common_attention.attention_bias_proximal( decode_length) def symbols_to_logits_fn(ids, i, cache): """Go from ids to logits for next symbol.""" ids = ids[:, -1:] targets = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3) targets = preprocess_targets(targets, i) bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1] with tf.variable_scope("body"): body_outputs = dp(self.decode, targets, cache["encoder_output"], cache["encoder_decoder_attention_bias"], bias, hparams, cache) with tf.variable_scope(target_modality.name): logits = target_modality.top_sharded(body_outputs, None, dp)[0] return tf.squeeze(logits, axis=[1, 2, 3]), cache key_channels = hparams.attention_key_channels or hparams.hidden_size value_channels = hparams.attention_value_channels or hparams.hidden_size num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers cache = { "layer_%d" % layer: { "k": tf.zeros([batch_size, 0, key_channels]), "v": tf.zeros([batch_size, 0, value_channels]), } for layer in range(num_layers) } # Set 2nd dim to None since it's not invariant in the tf.while_loop # Note: Tensor.set_shape() does not work here since it merges shape info. # TODO(llion); Find a more robust solution. # pylint: disable=protected-access for layer in cache: cache[layer]["k"]._shape = tf.TensorShape([None, None, key_channels]) cache[layer]["v"]._shape = tf.TensorShape([None, None, value_channels]) # pylint: enable=protected-access cache["encoder_output"] = encoder_output cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias if beam_size > 1: # Beam Search target_modality = ( self._hparams.problems[self._problem_idx].target_modality) vocab_size = target_modality.top_dimensionality initial_ids = tf.zeros([batch_size], dtype=tf.int32) decoded_ids, scores = beam_search.beam_search( symbols_to_logits_fn, initial_ids, beam_size, decode_length, vocab_size, alpha, states=cache, stop_early=(top_beams == 1)) if top_beams == 1: decoded_ids = decoded_ids[:, 0, 1:] else: decoded_ids = decoded_ids[:, :top_beams, 1:] else: # Greedy def inner_loop(i, next_id, decoded_ids, cache): logits, cache = symbols_to_logits_fn(next_id, i, cache) temperature = (0.0 if hparams.sampling_method == "argmax" else hparams.sampling_temp) next_id = tf.expand_dims( common_layers.sample_with_temperature(logits, temperature), axis=1) decoded_ids = tf.concat([decoded_ids, next_id], axis=1) return i + 1, next_id, decoded_ids, cache decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64) scores = None next_id = tf.zeros([batch_size, 1], dtype=tf.int64) _, _, decoded_ids, _ = tf.while_loop( # TODO(llion): Early stopping. lambda i, *_: tf.less(i, decode_length), inner_loop, [tf.constant(0), next_id, decoded_ids, cache], shape_invariants=[ tf.TensorShape([]), tf.TensorShape([None, None]), tf.TensorShape([None, None]), nest.map_structure(lambda t: tf.TensorShape(t.shape), cache), ]) return decoded_ids, scores
def train(train_data, test_data=None): G = train_data[0] features = train_data[1] id_map = train_data[2] if not features is None: # pad with dummy zero vector features = np.vstack([features, np.zeros((features.shape[1],))]) context_pairs = train_data[3] if FLAGS.random_context else None placeholders = construct_placeholders() minibatch = EdgeMinibatchIterator(G, id_map, placeholders, batch_size=FLAGS.batch_size, max_degree=FLAGS.max_degree, num_neg_samples=FLAGS.neg_sample_size, context_pairs=context_pairs) adj_info_ph = tf.placeholder(tf.int32, shape=minibatch.adj.shape) adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info") if FLAGS.model == 'graphsage_mean': # Create model sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)] model = SampleAndAggregate(placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, model_size=FLAGS.model_size, identity_dim=FLAGS.identity_dim, logging=True) elif FLAGS.model == 'gcn': # Create model sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, 2 * FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, 2 * FLAGS.dim_2)] model = SampleAndAggregate(placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="gcn", model_size=FLAGS.model_size, identity_dim=FLAGS.identity_dim, concat=False, logging=True) elif FLAGS.model == 'graphsage_seq': sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)] model = SampleAndAggregate(placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, identity_dim=FLAGS.identity_dim, aggregator_type="seq", model_size=FLAGS.model_size, logging=True) elif FLAGS.model == 'graphsage_maxpool': sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)] model = SampleAndAggregate(placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="maxpool", model_size=FLAGS.model_size, identity_dim=FLAGS.identity_dim, logging=True) elif FLAGS.model == 'graphsage_meanpool': sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)] model = SampleAndAggregate(placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="meanpool", model_size=FLAGS.model_size, identity_dim=FLAGS.identity_dim, logging=True) elif FLAGS.model == 'n2v': model = Node2VecModel(placeholders, features.shape[0], minibatch.deg, # 2x because graphsage uses concat nodevec_dim=2 * FLAGS.dim_1, lr=FLAGS.learning_rate) else: raise Exception('Error: model name unrecognized.') config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement) config.gpu_options.allow_growth = True # config.gpu_options.per_process_gpu_memory_fraction = GPU_MEM_FRACTION config.allow_soft_placement = True # Initialize session sess = tf.Session(config=config) merged = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) # Init variables sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj}) # Train model train_shadow_mrr = None shadow_mrr = None total_steps = 0 avg_time = 0.0 epoch_val_costs = [] 
train_adj_info = tf.assign(adj_info, minibatch.adj) val_adj_info = tf.assign(adj_info, minibatch.test_adj) for epoch in range(FLAGS.epochs): minibatch.shuffle() iter = 0 print('Epoch: %04d' % (epoch + 1)) epoch_val_costs.append(0) while not minibatch.end(): # Construct feed dictionary feed_dict = minibatch.next_minibatch_feed_dict() feed_dict.update({placeholders['dropout']: FLAGS.dropout}) t = time.time() # Training step outs = sess.run([merged, model.opt_op, model.loss, model.ranks, model.aff_all, model.mrr, model.outputs1], feed_dict=feed_dict) negative_sample=sess.run(model.neg_samples,feed_dict=feed_dict) train_cost = outs[2] train_mrr = outs[5] if train_shadow_mrr is None: train_shadow_mrr = train_mrr # else: train_shadow_mrr -= (1 - 0.99) * (train_shadow_mrr - train_mrr) if iter % FLAGS.validate_iter == 0: # Validation sess.run(val_adj_info.op) val_cost, ranks, val_mrr, duration = evaluate(sess, model, minibatch, size=FLAGS.validate_batch_size) sess.run(train_adj_info.op) epoch_val_costs[-1] += val_cost if shadow_mrr is None: shadow_mrr = val_mrr else: shadow_mrr -= (1 - 0.99) * (shadow_mrr - val_mrr) if total_steps % FLAGS.print_every == 0: summary_writer.add_summary(outs[0], total_steps) # Print results avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1) if total_steps % FLAGS.print_every == 0: print("Iter:", '%04d' % iter, "train_loss=", "{:.5f}".format(train_cost), "train_mrr=", "{:.5f}".format(train_mrr), "train_mrr_ema=", "{:.5f}".format(train_shadow_mrr), # exponential moving average "val_loss=", "{:.5f}".format(val_cost), "val_mrr=", "{:.5f}".format(val_mrr), "val_mrr_ema=", "{:.5f}".format(shadow_mrr), # exponential moving average "time=", "{:.5f}".format(avg_time)) iter += 1 total_steps += 1 if total_steps > FLAGS.max_total_steps: break if total_steps > FLAGS.max_total_steps: break print("Optimization Finished!") if FLAGS.save_embeddings: sess.run(val_adj_info.op) save_val_embeddings(sess, model, minibatch, FLAGS.validate_batch_size, log_dir()) if FLAGS.model == "n2v": # stopping the gradient for the already trained nodes train_ids = tf.constant( [[id_map[n]] for n in G.nodes_iter() if not G.node[n]['val'] and not G.node[n]['test']], dtype=tf.int32) test_ids = tf.constant([[id_map[n]] for n in G.nodes_iter() if G.node[n]['val'] or G.node[n]['test']], dtype=tf.int32) update_nodes = tf.nn.embedding_lookup(model.context_embeds, tf.squeeze(test_ids)) no_update_nodes = tf.nn.embedding_lookup(model.context_embeds, tf.squeeze(train_ids)) update_nodes = tf.scatter_nd(test_ids, update_nodes, tf.shape(model.context_embeds)) no_update_nodes = tf.stop_gradient( tf.scatter_nd(train_ids, no_update_nodes, tf.shape(model.context_embeds))) model.context_embeds = update_nodes + no_update_nodes sess.run(model.context_embeds) # run random walks from graphsage.utils import run_random_walks nodes = [n for n in G.nodes_iter() if G.node[n]["val"] or G.node[n]["test"]] start_time = time.time() pairs = run_random_walks(G, nodes, num_walks=50) walk_time = time.time() - start_time test_minibatch = EdgeMinibatchIterator(G, id_map, placeholders, batch_size=FLAGS.batch_size, max_degree=FLAGS.max_degree, num_neg_samples=FLAGS.neg_sample_size, context_pairs=pairs, n2v_retrain=True, fixed_n2v=True) start_time = time.time() print("Doing test training for n2v.") test_steps = 0 for epoch in range(FLAGS.n2v_test_epochs): test_minibatch.shuffle() while not test_minibatch.end(): feed_dict = test_minibatch.next_minibatch_feed_dict() feed_dict.update({placeholders['dropout']: FLAGS.dropout}) 
outs = sess.run([model.opt_op, model.loss, model.ranks, model.aff_all, model.mrr, model.outputs1], feed_dict=feed_dict) if test_steps % FLAGS.print_every == 0: print("Iter:", '%04d' % test_steps, "train_loss=", "{:.5f}".format(outs[1]), "train_mrr=", "{:.5f}".format(outs[-2])) test_steps += 1 train_time = time.time() - start_time save_val_embeddings(sess, model, minibatch, FLAGS.validate_batch_size, log_dir(), mod="-test") print("Total time: ", train_time + walk_time) print("Walk time: ", walk_time) print("Train time: ", train_time)
def unique_with_inverse(x):
    y, idx = tf.unique(x)
    num_segments = tf.shape(y)[0]
    num_elems = tf.shape(x)[0]
    return y, tf.math.unsorted_segment_min(tf.range(num_elems), idx, num_segments)
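# A small usage sketch: because the segment-min runs over element indices,
# the second return value is the position of the first occurrence of each
# unique value, aligned with `y`.
x = tf.constant([4, 7, 4, 9, 7])
y, first_occurrence = unique_with_inverse(x)
# y -> [4, 7, 9], first_occurrence -> [0, 1, 3]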
def forward(self, input, reuse=False): with tf.variable_scope('forward_model'): state = tf.cast(input[0], tf.float32) action = tf.cast(input[1], tf.float32) gru_state = tf.cast(input[2], tf.float32) if self.forward_model_type in ['kl-rssm', 'mmd-rssm']: hidden = tf.concat([action], -1) for i in range(2): hidden = tf.layers.dense(hidden, **dict(units=self.encoding_size, activation=tf.nn.elu), name='prior_enc_{}'.format(i), reuse=tf.AUTO_REUSE) belief, rnn_state = self._cell(hidden, tf.zeros_like(hidden)) prior = { 'belief': belief, } hidden = tf.concat([prior['belief'], state], -1) for i in range(2): hidden = tf.layers.dense(hidden, **dict(units=self.encoding_size, activation=tf.nn.elu), name='post_dec_{}'.format(i), reuse=tf.AUTO_REUSE) mean = tf.layers.dense(hidden, self.state_size, None, name='post_mean', reuse=tf.AUTO_REUSE) sample = mean gru_state = belief next_state = sample divergence_loss = 0. elif self.forward_model_type in ['transformer']: # State embedding state_embedder1 = ops.dense(state, self.state_size, self.encoding_size, tf.nn.relu, "encoder1_state", reuse) divergence_loss = 0. state_embedder2 = ops.dense(state_embedder1, self.encoding_size, self.encoding_size, tf.sigmoid, "encoder2_state", reuse) # Action embedding action_embedder1 = ops.dense(action, self.action_size, self.encoding_size, tf.nn.relu, "encoder1_action", reuse) action_embedder2 = ops.dense(action_embedder1, self.encoding_size, self.encoding_size, tf.sigmoid, "encoder2_action", reuse) # Multi-head if self.use_scale_dot_product: action_embedder3 = ops.dense(action_embedder1, self.encoding_size, self.encoding_size, tf.sigmoid, "value", reuse) batch_size = tf.shape(state)[0] state_embedder2_query = self.split_heads( state_embedder2, batch_size) # query action_embedder2 = self.split_heads( action_embedder2, batch_size) # key action_embedder3 = self.split_heads( action_embedder3, batch_size) # value scaled_attention = self.scaled_dot_product_attention( state_embedder2_query, action_embedder2, action_embedder3, mask=None) # scaled_attention = scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3]) joint_embedding = tf.reshape( scaled_attention, (batch_size, self.encoding_size)) # Skip Connection if self.use_skip_connection: joint_embedding = ops.dense(joint_embedding, self.encoding_size, self.encoding_size, None, "cross_att_dense", reuse) if self.use_dropout: joint_embedding = tf.nn.dropout( joint_embedding, keep_prob=1 - self.hidden_dropout_prob, ) joint_embedding = joint_embedding + state_embedder2 else: # Joint embedding joint_embedding = tf.multiply(state_embedder2, action_embedder2) # Next state prediction hidden1 = ops.dense(joint_embedding, self.encoding_size, self.encoding_size, tf.nn.relu, "encoder3", reuse) hidden2 = ops.dense(hidden1, self.encoding_size, self.encoding_size, tf.nn.relu, "encoder4", reuse) hidden3 = ops.dense(hidden2, self.encoding_size, self.encoding_size, tf.nn.relu, "decoder1", reuse) next_state = ops.dense(hidden3, self.encoding_size, self.state_size, None, "decoder2", reuse) gru_state = tf.cast(gru_state, tf.float64) else: # State embedding state_embedder1 = ops.dense(state, self.state_size, self.encoding_size, tf.nn.relu, "encoder1_state", reuse) gru_state = ops.gru(state_embedder1, gru_state, self.encoding_size, self.encoding_size, 'gru1', reuse) divergence_loss = 0. 
state_embedder2 = ops.dense(gru_state, self.encoding_size, self.encoding_size, tf.sigmoid, "encoder2_state", reuse) # Action embedding action_embedder1 = ops.dense(action, self.action_size, self.encoding_size, tf.nn.relu, "encoder1_action", reuse) action_embedder2 = ops.dense(action_embedder1, self.encoding_size, self.encoding_size, tf.sigmoid, "encoder2_action", reuse) # Joint embedding joint_embedding = tf.multiply(state_embedder2, action_embedder2) # Next state prediction hidden1 = ops.dense(joint_embedding, self.encoding_size, self.encoding_size, tf.nn.relu, "encoder3", reuse) hidden2 = ops.dense(hidden1, self.encoding_size, self.encoding_size, tf.nn.relu, "encoder4", reuse) hidden3 = ops.dense(hidden2, self.encoding_size, self.encoding_size, tf.nn.relu, "decoder1", reuse) next_state = ops.dense(hidden3, self.encoding_size, self.state_size, None, "decoder2", reuse) gru_state = tf.cast(gru_state, tf.float64) return next_state, gru_state, divergence_loss
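# `split_heads` and `scaled_dot_product_attention` are called above but not
# defined in this snippet. A hedged TF 1.x sketch of the standard scaled
# dot-product attention they are assumed to implement (Vaswani et al., 2017):
def scaled_dot_product_attention_sketch(q, k, v, mask=None):
    # q, k, v: [..., seq_len, depth]; mask (optional): 1.0 at positions to hide.
    scores = tf.matmul(q, k, transpose_b=True)              # [..., len_q, len_k]
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scores = scores / tf.sqrt(dk)
    if mask is not None:
        scores += mask * -1e9
    weights = tf.nn.softmax(scores)                         # softmax over last axis
    return tf.matmul(weights, v)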
def create_graph(self): print('\n[*] Defining encoder...') with tf.variable_scope('encoder_mean', reuse=self.reuse): Qz_x_mean = DenseNet(input_=self.x_batch_flat, hidden_dim=self.hidden_dim, output_dim=self.z_dim, num_layers=self.num_layers, transfer_fct=self.transfer_fct, act_out=None, reuse=self.reuse, kinit=self.kinit, bias_init=self.bias_init, drop_rate=self.drop_rate) self.encoder_mean = Qz_x_mean.output with tf.variable_scope('encoder_var', reuse=self.reuse): Qz_x_var = DenseNet(input_=self.x_batch_flat, hidden_dim=self.hidden_dim, output_dim=self.z_dim, num_layers=self.num_layers, transfer_fct=self.transfer_fct, act_out=tf.nn.softplus, reuse=self.reuse, kinit=self.kinit, bias_init=self.bias_init, drop_rate=self.drop_rate) self.encoder_var = Qz_x_var.output print('\n[*] Reparameterization trick...') self.encoder_logvar = tf.log(self.encoder_var) eps = tf.random_normal((self.batch_size, self.z_dim), 0, 1, dtype=tf.float32) self.z = tf.add(self.encoder_mean, tf.multiply(tf.sqrt(self.encoder_var), eps)) print('\n[*] Defining decoder...') with tf.variable_scope('decoder_mean', reuse=self.reuse): Px_z_mean = DenseNet(input_=self.z, hidden_dim=self.hidden_dim, output_dim=self.x_flat_dim, num_layers=2, transfer_fct=self.transfer_fct, act_out=tf.nn.sigmoid, reuse=self.reuse, kinit=self.kinit, bias_init=self.bias_init, drop_rate=self.drop_rate) self.decoder_mean_flat = Px_z_mean.output eps = tf.random_normal(tf.shape(self.decoder_mean_flat), 0, 1, dtype=tf.float32) self.decoder_x_flat = tf.add(self.decoder_mean_flat, tf.multiply(tf.sqrt(self.sigma), eps)) self.decoder_x = tf.reshape(self.decoder_x_flat , [-1,self.width, self.height, self.nchannel]) print('\n[*] Defining sampling...') self.z_sample = tf.random_normal((self.batch_size, self.z_dim), 0, 1, dtype=tf.float32) with tf.variable_scope('decoder_mean', reuse=True): Px_z_mean = DenseNet(input_=self.z_sample, hidden_dim=self.hidden_dim, output_dim=self.x_flat_dim, num_layers=2, transfer_fct=self.transfer_fct, act_out=tf.nn.sigmoid, reuse=True, kinit=self.kinit, bias_init=self.bias_init, drop_rate=self.drop_rate) self.samples_mean_flat = Px_z_mean.output eps = tf.random_normal(tf.shape(self.samples_mean_flat), 0, 1, dtype=tf.float32) self.samples_flat = tf.add(self.samples_mean_flat, tf.multiply(tf.sqrt(self.sigma), eps)) self.samples = tf.reshape(self.samples_flat , [-1,self.width, self.height, self.nchannel])
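# The graph above defines q(z|x) and p(x|z) but the training objective lives
# elsewhere. A hedged sketch of the ELBO that usually goes with this
# construction (Gaussian decoder with fixed variance `sigma`); the exact loss
# used by the original class is an assumption.
def vae_elbo_sketch(x_flat, decoder_mean_flat, encoder_mean, encoder_var, sigma):
    recon = tf.reduce_sum(
        tf.square(x_flat - decoder_mean_flat), axis=1) / (2.0 * sigma)
    kl = -0.5 * tf.reduce_sum(
        1.0 + tf.log(encoder_var) - tf.square(encoder_mean) - encoder_var, axis=1)
    return tf.reduce_mean(recon + kl)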
def image_summary_or_default_string(summary_name, image):
    """Returns image summaries for non-padded elements."""
    return tf.cond(tf.equal(tf.size(tf.shape(image)), 4),
                   lambda: tf.summary.image(summary_name, image),
                   lambda: tf.constant(''))
def multihead_attention(queries, keys, num_units = None, num_heads = 8, dropout_rate = 0, is_training = True, causality = False, scope = "multihead_attention", reuse = None): ''' Implement multihead attention Args: queries: [Tensor], A 3-dimensions tensor with shape of [N, T_q, S_q] keys: [Tensor], A 3-dimensions tensor with shape of [N, T_k, S_k] num_units: [Int], Attention size num_heads: [Int], Number of heads dropout_rate: [Float], A ratio of dropout is_training: [Boolean], If true, controller of mechanism for dropout causality: [Boolean], If true, units that reference the future are masked scope: [String], Optional scope for "variable_scope" reuse: [Boolean], If to reuse the weights of a previous layer by the same name Returns: A 3-dimensions tensor with shape of [N, T_q, S] ''' with tf.variable_scope(scope, reuse = reuse): if num_units is None: # length of sentence num_units = queries.get_shape().as_list()[-1] # Linear layers in Figure 2(right) # shape = [N, T_q, S] Q = tf.layers.dense(queries, num_units, activation = tf.nn.relu) # shape = [N, T_k, S] K = tf.layers.dense(keys, num_units, activation = tf.nn.relu) # shape = [N, T_k, S] V = tf.layers.dense(keys, num_units, activation = tf.nn.relu) # Split and concat # shape = [N*h, T_q, S/h] Q_ = tf.concat(tf.split(Q, num_heads, axis = 2), axis = 0) # shape = [N*h, T_k, S/h] K_ = tf.concat(tf.split(K, num_heads, axis = 2), axis = 0) # shape = [N*h, T_k, S/h] V_ = tf.concat(tf.split(V, num_heads, axis = 2), axis = 0) # shape = [N*h, T_q, T_k] outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # Scale outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) # Masking # shape = [N, T_k] key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis = -1))) # shape = [N*h, T_k] key_masks = tf.tile(key_masks, [num_heads, 1]) # shape = [N*h, T_q, T_k] key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # If key_masks == 0 outputs = [1]*length(outputs) paddings = tf.ones_like(outputs) * (-math.pow(2, 32) + 1) # shape = [N*h, T_q, T_k] outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) if causality: # reduce dims : shape = [T_q, T_k] diag_vals = tf.ones_like(outputs[0, :, :]) # shape = [T_q, T_k] # use triangular matrix to ignore the affect from future words # like : [[1,0,0] # [1,2,0] # [1,2,3]] tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # shape = [N*h, T_q, T_k] masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) paddings = tf.ones_like(masks) * (-math.pow(2, 32) + 1) # shape = [N*h, T_q, T_k] outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # Output Activation outputs = tf.nn.softmax(outputs) # Query Masking # shape = [N, T_q] query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis = -1))) # shape = [N*h, T_q] query_masks = tf.tile(query_masks, [num_heads, 1]) # shape = [N*h, T_q, T_k] query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) outputs *= query_masks # Dropouts outputs = tf.layers.dropout(outputs, rate = dropout_rate, training = tf.convert_to_tensor(is_training)) # Weighted sum # shape = [N*h, T_q, S/h] outputs = tf.matmul(outputs, V_) # Restore shape # shape = [N, T_q, S] outputs = tf.concat(tf.split(outputs, num_heads, axis = 0), axis = 2) # Residual connection outputs += queries # Normalize # shape = [N, T_q, S] outputs = normalize(outputs) return outputs
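# A short usage sketch for `multihead_attention` above (TF 1.x):
# self-attention over a batch of encoded sequences, with causal masking as in
# a decoder block. Shapes and names are illustrative.
encoded = tf.placeholder(tf.float32, [None, 20, 512])      # [N, T, S]
self_attended = multihead_attention(queries=encoded,
                                    keys=encoded,
                                    num_units=512,
                                    num_heads=8,
                                    dropout_rate=0.1,
                                    is_training=True,
                                    causality=True,
                                    scope="self_attention")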
def wct_tf(content, style, alpha, eps=1e-8): '''TensorFlow version of Whiten-Color Transform Assume that content/style encodings have shape 1xHxWxC See p.4 of the Universal Style Transfer paper for corresponding equations: https://arxiv.org/pdf/1705.08086.pdf ''' # Remove batch dim and reorder to CxHxW content_t = tf.transpose(tf.squeeze(content), (2, 0, 1)) style_t = tf.transpose(tf.squeeze(style), (2, 0, 1)) Cc, Hc, Wc = tf.unstack(tf.shape(content_t)) Cs, Hs, Ws = tf.unstack(tf.shape(style_t)) # CxHxW -> CxH*W content_flat = tf.reshape(content_t, (Cc, Hc * Wc)) style_flat = tf.reshape(style_t, (Cs, Hs * Ws)) # Content covariance mc = tf.reduce_mean(content_flat, axis=1, keep_dims=True) fc = content_flat - mc fcfc = tf.matmul(fc, fc, transpose_b=True) / ( tf.cast(Hc * Wc, tf.float32) - 1.) + tf.eye(Cc) * eps # Style covariance ms = tf.reduce_mean(style_flat, axis=1, keep_dims=True) fs = style_flat - ms fsfs = tf.matmul(fs, fs, transpose_b=True) / ( tf.cast(Hs * Ws, tf.float32) - 1.) + tf.eye(Cs) * eps # tf.svd is slower on GPU, see https://github.com/tensorflow/tensorflow/issues/13603 with tf.device('/cpu:0'): Sc, Uc, _ = tf.svd(fcfc) Ss, Us, _ = tf.svd(fsfs) # Filter small singular values k_c = tf.reduce_sum(tf.cast(tf.greater(Sc, 1e-5), tf.int32)) k_s = tf.reduce_sum(tf.cast(tf.greater(Ss, 1e-5), tf.int32)) # Whiten content feature Dc = tf.diag(tf.pow(Sc[:k_c], -0.5)) fc_hat = tf.matmul( tf.matmul(tf.matmul(Uc[:, :k_c], Dc), Uc[:, :k_c], transpose_b=True), fc) # Color content with style Ds = tf.diag(tf.pow(Ss[:k_s], 0.5)) fcs_hat = tf.matmul( tf.matmul(tf.matmul(Us[:, :k_s], Ds), Us[:, :k_s], transpose_b=True), fc_hat) # Re-center with mean of style fcs_hat = fcs_hat + ms # Blend whiten-colored feature with original content feature blended = alpha * fcs_hat + (1 - alpha) * (fc + mc) # CxH*W -> CxHxW blended = tf.reshape(blended, (Cc, Hc, Wc)) # CxHxW -> 1xHxWxC blended = tf.expand_dims(tf.transpose(blended, (1, 2, 0)), 0) return blended
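# A usage sketch for `wct_tf` (TF 1.x): blend content features toward the
# style statistics at strength alpha. The placeholders stand in for encoder
# activations of shape [1, H, W, C], as the docstring above assumes.
content_feats = tf.placeholder(tf.float32, [1, None, None, 256])
style_feats = tf.placeholder(tf.float32, [1, None, None, 256])
stylized_feats = wct_tf(content_feats, style_feats, alpha=0.6)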
def create_bilstm_classification_model(bert_config, is_training, response_input_ids, response_input_mask, response_segment_ids, response_text_len, response_labels, random_forward_input_ids, random_forward_input_mask, random_forward_segment_ids, random_forward_text_len, random_backward_input_ids, random_backward_input_mask, random_backward_segment_ids, random_backward_text_len, random_labels, swap_forward_input_ids, swap_forward_input_mask, swap_forward_segment_ids, swap_forward_text_len, swap_backward_input_ids, swap_backward_input_mask, swap_backward_segment_ids, swap_backward_text_len, swap_labels, nli_forward_input_ids, nli_forward_input_mask, nli_forward_segment_ids, nli_forward_text_len, nli_backward_input_ids, nli_backward_input_mask, nli_backward_segment_ids, nli_backward_text_len, nli_labels, num_nli_labels, use_one_hot_embeddings, l2_reg_lambda=0.1, dropout_rate=1.0, lstm_size=None, num_layers=1): config = copy.deepcopy(bert_config) if not is_training: config.hidden_dropout_prob = 0.0 config.attention_probs_dropout_prob = 0.0 with tf.variable_scope("bert", reuse=tf.AUTO_REUSE): with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE): (response_embedding_output, response_embedding_table) = modeling.embedding_lookup( input_ids=response_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) response_embedding_output = modeling.embedding_postprocessor( input_tensor=response_embedding_output, use_token_type=not config.roberta, token_type_ids=response_segment_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob, roberta=config.roberta) # random detection # Perform embedding lookup on the word ids. (random_foward_embedding_output, random_forward_embedding_table) = modeling.embedding_lookup( input_ids=random_forward_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) # Perform embedding lookup on the word ids. (random_backward_embedding_output, random_backward_embedding_table) = modeling.embedding_lookup( input_ids=random_backward_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) # Add positional embeddings and token type embeddings, then layer # normalize and perform dropout. 
random_foward_embedding_output = modeling.embedding_postprocessor( input_tensor=random_foward_embedding_output, use_token_type=not config.roberta, token_type_ids=random_forward_segment_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob, roberta=config.roberta) random_backward_embedding_output = modeling.embedding_postprocessor( input_tensor=random_backward_embedding_output, use_token_type=not config.roberta, token_type_ids=random_backward_segment_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob, roberta=config.roberta) # swap detection (swap_foward_embedding_output, swap_forward_embedding_table) = modeling.embedding_lookup( input_ids=swap_forward_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) (swap_backward_embedding_output, swap_backward_embedding_table) = modeling.embedding_lookup( input_ids=swap_backward_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) swap_foward_embedding_output = modeling.embedding_postprocessor( input_tensor=swap_foward_embedding_output, use_token_type=not config.roberta, token_type_ids=swap_forward_segment_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob, roberta=config.roberta) swap_backward_embedding_output = modeling.embedding_postprocessor( input_tensor=swap_backward_embedding_output, use_token_type=not config.roberta, token_type_ids=swap_backward_segment_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob, roberta=config.roberta) # # generic detection # (generic_foward_embedding_output, generic_forward_embedding_table) = modeling.embedding_lookup( # input_ids=generic_forward_input_ids, # vocab_size=config.vocab_size, # embedding_size=config.hidden_size, # initializer_range=config.initializer_range, # word_embedding_name="word_embeddings", # use_one_hot_embeddings=use_one_hot_embeddings) # (generic_backward_embedding_output, generic_backward_embedding_table) = modeling.embedding_lookup( # input_ids=generic_backward_input_ids, # vocab_size=config.vocab_size, # embedding_size=config.hidden_size, # initializer_range=config.initializer_range, # word_embedding_name="word_embeddings", # use_one_hot_embeddings=use_one_hot_embeddings) # generic_foward_embedding_output = modeling.embedding_postprocessor( # 
input_tensor=generic_foward_embedding_output, # use_token_type=not config.roberta, # token_type_ids=generic_forward_segment_ids, # token_type_vocab_size=config.type_vocab_size, # token_type_embedding_name="token_type_embeddings", # use_position_embeddings=True, # position_embedding_name="position_embeddings", # initializer_range=config.initializer_range, # max_position_embeddings=config.max_position_embeddings, # dropout_prob=config.hidden_dropout_prob, # roberta=config.roberta) # generic_backward_embedding_output = modeling.embedding_postprocessor( # input_tensor=generic_backward_embedding_output, # use_token_type=not config.roberta, # token_type_ids=generic_backward_segment_ids, # token_type_vocab_size=config.type_vocab_size, # token_type_embedding_name="token_type_embeddings", # use_position_embeddings=True, # position_embedding_name="position_embeddings", # initializer_range=config.initializer_range, # max_position_embeddings=config.max_position_embeddings, # dropout_prob=config.hidden_dropout_prob, # roberta=config.roberta) # nli detection (nli_foward_embedding_output, nli_forward_embedding_table) = modeling.embedding_lookup( input_ids=nli_forward_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) (nli_backward_embedding_output, nli_backward_embedding_table) = modeling.embedding_lookup( input_ids=nli_backward_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) nli_foward_embedding_output = modeling.embedding_postprocessor( input_tensor=nli_foward_embedding_output, use_token_type=not config.roberta, token_type_ids=nli_forward_segment_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob, roberta=config.roberta) nli_backward_embedding_output = modeling.embedding_postprocessor( input_tensor=nli_backward_embedding_output, use_token_type=not config.roberta, token_type_ids=nli_backward_segment_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob, roberta=config.roberta) with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE): response_attention_mask = modeling.create_attention_mask_from_input_mask( response_input_ids, response_input_mask) # [batch_size, from_seq_length, to_seq_length] # mask future tokens diag_vals = tf.ones_like(response_attention_mask[0, :, :]) tril = tf.linalg.LinearOperatorLowerTriangular( diag_vals).to_dense() future_masks = tf.tile(tf.expand_dims( tril, 0), [tf.shape(response_attention_mask)[0], 1, 1]) response_attention_mask = tf.math.multiply(response_attention_mask, future_masks) # Run the stacked transformer. # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 
response_all_encoder_layers = modeling.transformer_model( input_tensor=response_embedding_output, attention_mask=response_attention_mask, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=modeling.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) # random detection # This converts a 2D mask of shape [batch_size, seq_length] to a 3D # mask of shape [batch_size, seq_length, seq_length] which is used # for the attention scores. random_forward_attention_mask = modeling.create_attention_mask_from_input_mask( random_forward_input_ids, random_forward_input_mask) random_backward_attention_mask = modeling.create_attention_mask_from_input_mask( random_backward_input_ids, random_backward_input_mask) # Run the stacked transformer. # `sequence_output` shape = [batch_size, seq_length, hidden_size]. random_forward_all_encoder_layers = modeling.transformer_model( input_tensor=random_foward_embedding_output, attention_mask=random_forward_attention_mask, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=modeling.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) random_backward_all_encoder_layers = modeling.transformer_model( input_tensor=random_backward_embedding_output, attention_mask=random_backward_attention_mask, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=modeling.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) # swap detection swap_forward_attention_mask = modeling.create_attention_mask_from_input_mask( swap_forward_input_ids, swap_forward_input_mask) swap_backward_attention_mask = modeling.create_attention_mask_from_input_mask( swap_backward_input_ids, swap_backward_input_mask) swap_forward_all_encoder_layers = modeling.transformer_model( input_tensor=swap_foward_embedding_output, attention_mask=swap_forward_attention_mask, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=modeling.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. 
attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) swap_backward_all_encoder_layers = modeling.transformer_model( input_tensor=swap_backward_embedding_output, attention_mask=swap_backward_attention_mask, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=modeling.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) # # generic detection # generic_forward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_forward_input_ids, # generic_forward_input_mask) # generic_backward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_backward_input_ids, # generic_backward_input_mask) # generic_forward_all_encoder_layers = modeling.transformer_model( # input_tensor=generic_foward_embedding_output, # attention_mask=generic_forward_attention_mask, # hidden_size=config.hidden_size, # num_hidden_layers=config.num_hidden_layers, # num_attention_heads=config.num_attention_heads, # intermediate_size=config.intermediate_size, # intermediate_act_fn=modeling.get_activation(config.hidden_act), # hidden_dropout_prob=config.hidden_dropout_prob, # attention_probs_dropout_prob=config.attention_probs_dropout_prob, # initializer_range=config.initializer_range, # do_return_all_layers=True) # generic_backward_all_encoder_layers = modeling.transformer_model( # input_tensor=generic_backward_embedding_output, # attention_mask=generic_backward_attention_mask, # hidden_size=config.hidden_size, # num_hidden_layers=config.num_hidden_layers, # num_attention_heads=config.num_attention_heads, # intermediate_size=config.intermediate_size, # intermediate_act_fn=modeling.get_activation(config.hidden_act), # hidden_dropout_prob=config.hidden_dropout_prob, # attention_probs_dropout_prob=config.attention_probs_dropout_prob, # initializer_range=config.initializer_range, # do_return_all_layers=True) # nli detection nli_forward_attention_mask = modeling.create_attention_mask_from_input_mask( nli_forward_input_ids, nli_forward_input_mask) nli_backward_attention_mask = modeling.create_attention_mask_from_input_mask( nli_backward_input_ids, nli_backward_input_mask) nli_forward_all_encoder_layers = modeling.transformer_model( input_tensor=nli_foward_embedding_output, attention_mask=nli_forward_attention_mask, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=modeling.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) nli_backward_all_encoder_layers = modeling.transformer_model( input_tensor=nli_backward_embedding_output, attention_mask=nli_backward_attention_mask, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=modeling.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. 
attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) random_forward_embedding = random_forward_all_encoder_layers[-2] random_backward_embedding = random_backward_all_encoder_layers[-2] swap_forward_embedding = swap_forward_all_encoder_layers[-2] swap_backward_embedding = swap_backward_all_encoder_layers[-2] # generic_forward_embedding = generic_forward_all_encoder_layers[-2] # generic_backward_embedding = generic_backward_all_encoder_layers[-2] nli_forward_embedding = nli_forward_all_encoder_layers[-2] nli_backward_embedding = nli_backward_all_encoder_layers[-2] response_embedding = response_all_encoder_layers[-2] response_embedding_shape = modeling.get_shape_list(response_embedding, expected_rank=3) with tf.variable_scope("lm_head", reuse=tf.AUTO_REUSE): response_logits = tf.layers.dense(response_embedding, config.hidden_size, activation=None) response_logits = modeling.gelu(response_logits) response_logits = modeling.layer_norm(response_logits) response_outputs = tf.layers.dense( response_logits, config.vocab_size, activation=None, use_bias=True, bias_initializer=tf.zeros_initializer()) response_one_hot = tf.one_hot(response_labels, depth=config.vocab_size, dtype=tf.float32) lm_cost = tf.nn.softmax_cross_entropy_with_logits( labels=response_one_hot, logits=response_outputs) sequence_mask = tf.sequence_mask(response_text_len, maxlen=response_embedding_shape[1], dtype=tf.float32) masked_lm_cost = tf.math.multiply(lm_cost, sequence_mask) final_lm_loss = tf.reduce_mean( tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1), tf.cast(response_text_len, dtype=tf.float32))) perplexity = tf.exp( tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1), tf.cast(response_text_len, dtype=tf.float32))) random_forward_embedding_shape = modeling.get_shape_list( random_forward_embedding, expected_rank=3) random_backward_embedding_shape = modeling.get_shape_list( random_backward_embedding, expected_rank=3) assert random_forward_embedding_shape[ 2] == random_backward_embedding_shape[2] random_forward_embedding = tf.transpose(random_forward_embedding, [1, 0, 2]) random_backward_embedding = tf.transpose(random_backward_embedding, [1, 0, 2]) random_forward_input_mask = tf.cast( tf.transpose(random_forward_input_mask, [1, 0]), tf.float32) random_backward_input_mask = tf.cast( tf.transpose(random_backward_input_mask, [1, 0]), tf.float32) swap_forward_embedding_shape = modeling.get_shape_list( swap_forward_embedding, expected_rank=3) swap_backward_embedding_shape = modeling.get_shape_list( swap_backward_embedding, expected_rank=3) assert swap_forward_embedding_shape[2] == swap_backward_embedding_shape[2] swap_forward_embedding = tf.transpose(swap_forward_embedding, [1, 0, 2]) swap_backward_embedding = tf.transpose(swap_backward_embedding, [1, 0, 2]) swap_forward_input_mask = tf.cast( tf.transpose(swap_forward_input_mask, [1, 0]), tf.float32) swap_backward_input_mask = tf.cast( tf.transpose(swap_backward_input_mask, [1, 0]), tf.float32) # generic_forward_embedding_shape = modeling.get_shape_list(generic_forward_embedding, expected_rank=3) # generic_backward_embedding_shape = modeling.get_shape_list(generic_backward_embedding, expected_rank=3) # assert generic_forward_embedding_shape[2] == generic_backward_embedding_shape[2] # generic_forward_embedding = tf.transpose(generic_forward_embedding, [1, 0, 2]) # generic_backward_embedding = tf.transpose(generic_backward_embedding, [1, 0, 2]) # generic_forward_input_mask = tf.cast(tf.transpose(generic_forward_input_mask, [1, 
0]), tf.float32) # generic_backward_input_mask = tf.cast(tf.transpose(generic_backward_input_mask, [1, 0]), tf.float32) nli_forward_embedding_shape = modeling.get_shape_list( nli_forward_embedding, expected_rank=3) nli_backward_embedding_shape = modeling.get_shape_list( nli_backward_embedding, expected_rank=3) assert nli_forward_embedding_shape[2] == nli_backward_embedding_shape[2] nli_forward_embedding = tf.transpose(nli_forward_embedding, [1, 0, 2]) nli_backward_embedding = tf.transpose(nli_backward_embedding, [1, 0, 2]) nli_forward_input_mask = tf.cast( tf.transpose(nli_forward_input_mask, [1, 0]), tf.float32) nli_backward_input_mask = tf.cast( tf.transpose(nli_backward_input_mask, [1, 0]), tf.float32) model = HadeModel( x_random_forward=random_forward_embedding, x_random_mask_forward=random_forward_input_mask, x_random_length_forward=random_forward_text_len, x_random_backward=random_backward_embedding, x_random_mask_backward=random_backward_input_mask, x_random_length_backward=random_backward_text_len, y_random=random_labels, x_swap_forward=swap_forward_embedding, x_swap_mask_forward=swap_forward_input_mask, x_swap_length_forward=swap_forward_text_len, x_swap_backward=swap_backward_embedding, x_swap_mask_backward=swap_backward_input_mask, x_swap_length_backward=swap_backward_text_len, y_swap=swap_labels, # x_generic_forward=generic_forward_embedding, # x_generic_mask_forward=generic_forward_input_mask, # x_generic_length_forward=generic_forward_text_len, # x_generic_backward=generic_backward_embedding, # x_generic_mask_backward=generic_backward_input_mask, # x_generic_length_backward=generic_backward_text_len, y_generic=generic_labels, x_nli_forward=nli_forward_embedding, x_nli_mask_forward=nli_forward_input_mask, x_nli_length_forward=nli_forward_text_len, x_nli_backward=nli_backward_embedding, x_nli_mask_backward=nli_backward_input_mask, x_nli_length_backward=nli_backward_text_len, y_nli=nli_labels, embedding_dim=random_forward_embedding_shape[2], num_nli_labels=num_nli_labels, hidden_size=lstm_size, l2_reg_lambda=l2_reg_lambda, num_layers=num_layers, dropout_rate=dropout_rate, is_training=is_training) random_prob, swap_prob, nli_prob, total_cost = model.create_model() return random_prob, swap_prob, nli_prob, total_cost, final_lm_loss, perplexity
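# The "lm_head" block above averages the per-token cross-entropy over only the real
# (non-padding) tokens of each response and exponentiates that average to obtain a
# per-example perplexity. Below is a minimal, self-contained numpy sketch of the same
# bookkeeping (illustrative only; the function name and shapes are assumptions, not
# part of the original model).
import numpy as np


def masked_lm_loss_and_perplexity(token_nll, text_len):
    """token_nll: [batch, seq_len] per-token negative log-likelihoods.
    text_len: [batch] number of real (non-padding) tokens per example."""
    seq_len = token_nll.shape[1]
    # 1.0 for positions < text_len, 0.0 for padding (mirrors tf.sequence_mask).
    mask = (np.arange(seq_len)[None, :] < text_len[:, None]).astype(np.float32)
    per_example_nll = (token_nll * mask).sum(axis=1) / text_len.astype(np.float32)
    # Mean loss over the batch, plus a per-example perplexity vector.
    return per_example_nll.mean(), np.exp(per_example_nll)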
def parameter_attention(x, total_key_depth, total_value_depth, output_depth, memory_rows, num_heads, dropout_rate, name=None): """Attention over parameters. We use the same multi-headed attention as in the other layers, but the memory keys and values are model parameters. There is no linear transformation on the keys or values. We are also a bit more careful about memory usage, since the number of memory positions may be very large. Args: x: a Tensor with shape [batch, length_q, channels] total_key_depth: an integer total_value_depth: an integer output_depth: an integer memory_rows: an integer num_heads: an integer dividing total_key_depth and total_value_depth dropout_rate: a floating point number name: an optional string Returns: A Tensor. """ with tf.variable_scope(name, default_name="parameter_attention", values=[x]): head_size_k = total_key_depth // num_heads head_size_v = total_value_depth // num_heads var_shape_k = [num_heads, memory_rows, head_size_k] var_shape_v = [num_heads, memory_rows, head_size_v] k = tf.get_variable( "k", var_shape_k, initializer=tf.random_normal_initializer( 0, output_depth ** -0.5)) * (num_heads ** 0.5) v = tf.get_variable( "v", var_shape_v, initializer=tf.random_normal_initializer( 0, output_depth ** -0.5)) * (output_depth ** 0.5) batch_size = tf.shape(x)[0] length = tf.shape(x)[1] q = common_layers.conv1d(x, total_key_depth, 1, name="q_transform") if dropout_rate: # This is a cheaper form of attention dropout where we use # the same dropout decisions across batch elements and query positions, # but different decisions across heads and memory positions. v = tf.nn.dropout(v, 1.0 - dropout_rate, noise_shape=[num_heads, memory_rows, 1]) # query is [batch, length, hidden_size] # reshape and transpose it to [heads, batch * length, head_size] q = tf.reshape(q, [batch_size, length, num_heads, head_size_k]) q = tf.transpose(q, [2, 0, 1, 3]) q = tf.reshape(q, [num_heads, batch_size * length, head_size_k]) weights = tf.matmul(q, k, transpose_b=True) weights = tf.nn.softmax(weights) y = tf.matmul(weights, v) y = tf.reshape(y, [num_heads, batch_size, length, head_size_v]) y = tf.transpose(y, [1, 2, 0, 3]) y = tf.reshape(y, [batch_size, length, total_value_depth]) y.set_shape([None, None, total_value_depth]) y = common_layers.conv1d(y, output_depth, 1, name="output_transform") return y
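# A small numpy sketch (illustrative only, not the implementation above) of the core
# computation in parameter_attention: per head, the projected queries of shape
# [batch * length, head_size_k] attend over a learned memory of memory_rows keys and
# values that is shared across the batch; there is no key or value projection because
# k and v are themselves the parameters. The helper names here are assumptions.
import numpy as np


def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)


def parameter_attention_sketch(q, k, v):
    """q: [heads, batch*length, head_size_k], k: [heads, memory_rows, head_size_k],
    v: [heads, memory_rows, head_size_v] -> [heads, batch*length, head_size_v]."""
    weights = softmax(q @ np.transpose(k, (0, 2, 1)))  # [heads, batch*length, memory_rows]
    return weights @ v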
def _create_graph(self): """Creates the computational graph. :return: The computational graph. """ eps = 1e-5 with tf.Graph().as_default() as graph: tf.set_random_seed( self.seed ) # Fix the random seed for randomized tensorflow operations. self.x = tf.placeholder(tf.float32, shape=(None, 1)) self.z = tf.placeholder(tf.float32, shape=(None, 1)) with tf.variable_scope( 'generator'): # Create generator operations. self.G = self._create_generator() self.xg = self.G(self.z) if self.model == 'gan': with tf.variable_scope( 'discriminator'): # Create critic operations. D = self._create_discriminator() # D prob and D logit self.D_real = tf.sigmoid(D(self.x)) # discriminate real self.D_fake = tf.sigmoid(D(self.xg)) # discriminate fake epsilon = tf.random_uniform(shape=tf.shape(self.x), minval=0., maxval=1.) interpolation = epsilon * self.x + (1 - epsilon) * self.xg penalty = ( tf.norm(tf.gradients(D(interpolation), interpolation), axis=1) - 1)**2.0 #GAN self.loss_d = tf.reduce_mean(-tf.log(self.D_real + eps) - tf.log(1 - self.D_fake + eps)) * 0.5 self.loss_g = tf.reduce_mean(-tf.log(self.D_fake + eps)) elif self.model == 'rgan': with tf.variable_scope( 'encoder'): # Create encoder operations. self.E = self._create_encoder() self.ze = self.E(self.x) self.xr = self.G(self.ze) with tf.variable_scope( 'discriminator'): # Create critic operations. D = self._create_discriminator() # D prob and D logit self.D_real = tf.sigmoid(D(self.x)) # discriminate real self.D_fake = tf.sigmoid(D(self.xg)) # discriminate fake self.D_recon = tf.sigmoid(D(self.xr)) # discriminate recon self.mse = tf.reduce_sum(tf.square(self.x - self.xr), 1) lambda1 = 1e-2 lambda2 = 1e-2 self.loss_d = tf.reduce_mean(-tf.log(self.D_real + eps) - tf.log(1 - self.D_fake + eps)) self.loss_e = tf.reduce_mean(lambda1 * self.mse + lambda2 * self.D_recon) self.loss_g = tf.reduce_mean(-tf.log(self.D_fake + eps) + self.loss_e) elif self.model == 'mdgan': with tf.variable_scope( 'encoder'): # Create encoder operations. self.E = self._create_encoder() self.ze = self.E(self.x) self.xr = self.G(self.ze) with tf.variable_scope( 'discriminator1'): # Create critic operations. D1 = self._create_discriminator() # D prob and D logit self.D1_real = tf.sigmoid(D1(self.x)) # discriminate real self.D1_recon = tf.sigmoid(D1( self.xr)) # discriminate recon self.mse = tf.reduce_sum(tf.square(self.x - self.xr), 1) lambda1 = 1e-2 lambda2 = 1e-2 self.loss_d1 = tf.reduce_mean(-tf.log(self.D1_real + eps) - tf.log(1 - self.D1_recon + eps)) self.loss_g1 = tf.reduce_mean(self.mse - lambda1 * self.D1_recon) with tf.variable_scope( 'discriminator'): # Create critic operations. D = self._create_discriminator() # D prob and D logit self.D_fake = tf.sigmoid(D(self.xg)) # discriminate fake self.D_real = tf.sigmoid(D(self.xr)) # discriminate recon self.loss_d = tf.reduce_mean(-tf.log(self.D_real + eps) - tf.log(1 - self.D_fake + eps)) self.loss_g = tf.reduce_mean(-tf.log(self.D_fake + eps)) elif self.model == 'vaegan': self.ep = tf.random_normal(shape=[tf.shape(self.x)[0], 1]) with tf.variable_scope( 'encoder'): # Create encoder operations. self.Em, self.Es = self._create_encoder_vaegan() self.ze_m = self.Em(self.x) self.ze_s = self.Es(self.x) self.ze_x = tf.add(self.ze_m, tf.sqrt(tf.exp(self.ze_s)) * self.ep) self.xr = self.G(self.ze_x) with tf.variable_scope('discriminator'): D = self._create_discriminator() # D prob and D logit. self.D_real = tf.sigmoid(D(self.x)) self.D_recon = tf.sigmoid(D(self.xr)) self.D_fake = tf.sigmoid(D(self.xg)) # if want to use gradient penalty? 
# epsilon = tf.random_uniform(shape=tf.shape(self.x), minval=0., maxval=1.) # interpolation = epsilon * self.x + (1 - epsilon) * self.xg # penalty = (tf.norm(tf.gradients(D(interpolation), interpolation), axis=1) - 1) ** 2.0 # kl loss self.kl = self.kl_loss(self.ze_m, self.ze_s) # gan loss self.ld = tf.reduce_mean(-tf.log(self.D_real + eps) - tf.log(1 - self.D_recon + eps) - tf.log(1 - self.D_fake + eps)) # perceptual loss (feature loss or reconstruction loss) self.lr = tf.reduce_mean(self.nll_normal(self.xr, self.x)) # encoder loss self.le = -self.lr + self.kl / (self.n_batch) # generator loss self.lg = tf.reduce_mean(-tf.log(self.D_fake + eps) - tf.log(self.D_recon + eps)) - 1e-6 * self.lr self.loss_d = self.ld # + self.lambda_reg * penalty if using the gradient penalty as in the gaan branch self.loss_g = self.lg self.loss_e = self.le self.loss_d = tf.reshape(self.loss_d, []) # convert to scalar elif self.model == 'wgangp': with tf.variable_scope( 'discriminator'): # Create critic operations. D = self._create_discriminator() # D prob and D logit. self.D_real = D(self.x) # Criticize real data. self.D_fake = D(self.xg) # Criticize generated data. diff = tf.abs(tf.reduce_mean(self.D_real - self.D_fake)) # Create the gradient penalty operations. epsilon = tf.random_uniform(shape=tf.shape(self.x), minval=0., maxval=1.) interpolation = epsilon * self.x + (1 - epsilon) * self.xg penalty = ( tf.norm(tf.gradients(D(interpolation), interpolation), axis=1) - 1)**2.0 self.loss_d = tf.reduce_mean(self.D_fake - self.D_real + self.lambda_reg * penalty) self.loss_g = -tf.reduce_mean(self.D_fake) elif self.model == 'gaan': with tf.variable_scope( 'encoder'): # Create encoder operations. self.E = self._create_encoder() self.ze = self.E(self.x) self.xr = self.G(self.ze) # encode xg self.zg = self.E(self.xg) with tf.variable_scope( 'discriminator'): # Create critic operations. D = self._create_discriminator() # D prob and D logit self.D_real_logit = D(self.x) # discriminate real self.D_fake_logit = D(self.xg) # discriminate fake self.D_recon_logit = D(self.xr) # discriminate recon self.D_real = tf.sigmoid( self.D_real_logit) # discriminate real self.D_fake = tf.sigmoid( self.D_fake_logit) # discriminate fake self.D_recon = tf.sigmoid( self.D_recon_logit) # discriminate recon diff = tf.abs(tf.reduce_mean(self.D_real - self.D_fake)) # Create the gradient penalty operations. epsilon = tf.random_uniform(shape=tf.shape(self.x), minval=0., maxval=1.) interpolation = epsilon * self.x + (1 - epsilon) * self.xg penalty = ( tf.norm(tf.gradients(D(interpolation), interpolation), axis=1) - 1)**2.0 #penalty = (tf.norm(tf.gradients(D(interpolation), interpolation), axis=1) - tf.minimum(diff,1.0)) ** 2.0 self.recon = tf.reduce_mean( tf.square(self.x - self.xr)) # reconstruction self.loss_x = tf.reduce_mean(self.x - self.xg) self.loss_z = tf.reduce_mean(self.ze - self.z) self.reg = tf.square(self.loss_x - self.loss_z) self.ld = tf.reduce_mean(-0.5 * tf.log(self.D_real + eps) - 0.5 * tf.log(self.D_recon + eps) - tf.log(1 - self.D_fake + eps)) self.lg = tf.abs(tf.reduce_mean(self.D_real - self.D_fake)) self.loss_d = self.ld + self.lambda_reg * penalty self.loss_r = self.recon + 0.1 * self.reg self.loss_g = self.lg self.loss_d = tf.reshape(self.loss_d, []) # convert to scalar self.loss_g = tf.reshape(self.loss_g, []) # convert to scalar self.loss_r = tf.reshape(self.loss_r, []) # convert to scalar # Store the variables of the critic and the generator.
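# tf.get_collection with a `scope` argument filters the trainable variables by the
# name prefix introduced by the tf.variable_scope(...) blocks above, so the optimizers
# built below each update only their own sub-network (critic, generator, and, for the
# models that have one, the encoder).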
self.vars_d = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator') self.vars_g = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator') # Create optimizer operations for critic and generator. self.opt_d = self._create_optimizer(self.loss_d, self.vars_d, self.learning_rate, self.beta1, self.beta2) self.opt_g = self._create_optimizer(self.loss_g, self.vars_g, self.learning_rate, self.beta1, self.beta2) if self.model == 'rgan': self.vars_e = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='encoder') self.opt_e = self._create_optimizer(self.loss_e, self.vars_e, self.learning_rate, self.beta1, self.beta2) elif self.model == 'mdgan': self.vars_e = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='encoder') self.vars_d1 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator1') self.opt_d1 = self._create_optimizer(self.loss_d1, self.vars_d1, self.learning_rate, self.beta1, self.beta2) self.opt_g1 = self._create_optimizer(self.loss_g1, self.vars_g + self.vars_e, self.learning_rate, self.beta1, self.beta2) elif self.model == 'vaegan': self.vars_e = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='encoder') #for D self.trainer_D = tf.train.RMSPropOptimizer( learning_rate=self.learning_rate) self.gradients_D = self.trainer_D.compute_gradients( self.loss_d, var_list=self.vars_d) self.clipped_gradients_D = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in self.gradients_D] self.opti_D = self.trainer_D.apply_gradients( self.clipped_gradients_D) #for G self.trainer_G = tf.train.RMSPropOptimizer( learning_rate=self.learning_rate) self.gradients_G = self.trainer_G.compute_gradients( self.loss_g, var_list=self.vars_g) self.clipped_gradients_G = [(tf.clip_by_value(_[0], -1, 1.), _[1]) for _ in self.gradients_G] self.opti_G = self.trainer_G.apply_gradients( self.clipped_gradients_G) #for E self.trainer_E = tf.train.RMSPropOptimizer( learning_rate=self.learning_rate) self.gradients_E = self.trainer_E.compute_gradients( self.loss_e, var_list=self.vars_e) self.clipped_gradients_E = [(tf.clip_by_value(_[0], -1, 1.), _[1]) for _ in self.gradients_E] self.opti_E = self.trainer_E.apply_gradients( self.clipped_gradients_E) elif self.model == 'gaan': self.vars_e = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='encoder') self.opt_r = self._create_optimizer(self.loss_r, self.vars_e + self.vars_g, self.learning_rate, self.beta1, self.beta2) self.opt_recon = self._create_optimizer( self.recon, self.vars_e + self.vars_g, self.learning_rate, self.beta1, self.beta2) summary_enc_1_params = self.get_weights('encoder/dense/kernel') summary_enc_2_params = self.get_weights('encoder/dense_1/kernel') tf.summary.histogram('enc_1_params', summary_enc_1_params) tf.summary.histogram('enc_2_params', summary_enc_2_params) summary_gen_1_params = self.get_weights('generator/dense/kernel') summary_gen_2_params = self.get_weights('generator/dense_1/kernel') tf.summary.histogram('gen_1_params', summary_gen_1_params) tf.summary.histogram('gen_2_params', summary_gen_2_params) summary_disc_1_params = self.get_weights( 'discriminator/dense/kernel') summary_disc_2_params = self.get_weights( 'discriminator/dense_1/kernel') tf.summary.histogram('disc_1_params', summary_disc_1_params) tf.summary.histogram('disc_2_params', summary_disc_2_params) # Create variable initialization operation. self.init = tf.global_variables_initializer() #graph.finalize() return graph
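# The 'wgangp' and 'gaan' branches above both regularize the critic with a gradient
# penalty: the norm of the critic's gradient at random interpolates between real and
# generated samples is pushed toward 1. Below is a standalone sketch of that term,
# written against the TF1-style API used throughout this file; the callable `D` and
# the argument names are assumptions for illustration, not part of the original class.
import tensorflow as tf  # assumes TensorFlow 1.x, as used in this file


def gradient_penalty(D, x_real, x_fake):
    epsilon = tf.random_uniform(shape=tf.shape(x_real), minval=0., maxval=1.)
    x_hat = epsilon * x_real + (1. - epsilon) * x_fake   # random interpolates
    grads = tf.gradients(D(x_hat), x_hat)[0]             # dD/dx_hat
    slopes = tf.norm(grads, axis=1)                      # per-example gradient norm
    return tf.reduce_mean((slopes - 1.) ** 2)            # penalize deviation from 1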