def testBaseline(self, cls, num_microbatches, expected_answer):
    """With a huge clip norm and zero noise, DP gradients equal the plain mean."""
    with self.cached_session() as sess:
        params = tf.Variable([1.0, 2.0])
        examples = tf.Variable(
            [[3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [-1.0, 0.0]])

        ledger = privacy_ledger.PrivacyLedger(
            1e6, num_microbatches / 1e6, 50, 50)
        # l2_norm_clip=1e9 and stddev=0.0 make the query a no-op average.
        query = gaussian_query.GaussianAverageQuery(
            1.0e9, 0.0, num_microbatches, ledger)
        query = privacy_ledger.QueryWithLedger(query, ledger)

        opt = cls(query, num_microbatches=num_microbatches, learning_rate=2.0)

        self.evaluate(tf.global_variables_initializer())
        # Sanity-check the initial parameter values.
        self.assertAllClose([1.0, 2.0], self.evaluate(params))

        # Expected gradient is the sum of per-example differences divided by
        # the number of microbatches.
        gradient_op = opt.compute_gradients(loss(examples, params), [params])
        grads_and_vars = sess.run(gradient_op)
        self.assertAllCloseAccordingToType(
            expected_answer, grads_and_vars[0][0])
def rnn_model_fn(features, labels, mode):  # pylint: disable=unused-argument
    """Model function for an RNN byte-level language model.

    Predicts byte t+1 from bytes <= t with a single LSTM layer; supports
    DP-SGD training (per-example loss vector) when FLAGS.dpsgd is set.
    """
    # Build the network with tf.keras.layers.
    x = tf.reshape(features['x'], [-1, SEQ_LEN])
    input_one_hot = tf.one_hot(x[:, :-1], 256)
    lstm = tf.keras.layers.LSTM(
        256, return_sequences=True).apply(input_one_hot)
    logits = tf.keras.layers.Dense(256).apply(lstm)

    # Per-example loss vector (required to form microbatches in DP-SGD).
    vector_loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.cast(tf.one_hot(x[:, 1:], 256), dtype=tf.float32),
        logits=logits)
    # Minibatch mean, reported through tf.Estimator.
    scalar_loss = tf.reduce_mean(vector_loss)

    # Training op (TRAIN mode).
    if mode == tf.estimator.ModeKeys.TRAIN:
        if FLAGS.dpsgd:
            ledger = privacy_ledger.PrivacyLedger(
                population_size=NB_TRAIN,
                selection_probability=(FLAGS.batch_size / NB_TRAIN),
                max_samples=1e6,
                max_queries=1e6)
            optimizer = dp_optimizer.DPAdamGaussianOptimizer(
                l2_norm_clip=FLAGS.l2_norm_clip,
                noise_multiplier=FLAGS.noise_multiplier,
                num_microbatches=FLAGS.microbatches,
                ledger=ledger,
                learning_rate=FLAGS.learning_rate,
                unroll_microbatches=True)
            # The DP optimizer consumes the un-reduced loss vector.
            opt_loss = vector_loss
        else:
            optimizer = tf.train.AdamOptimizer(
                learning_rate=FLAGS.learning_rate)
            opt_loss = scalar_loss
        global_step = tf.train.get_global_step()
        train_op = optimizer.minimize(loss=opt_loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=scalar_loss, train_op=train_op)

    # Evaluation metrics (EVAL mode).
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy':
                tf.metrics.accuracy(
                    labels=tf.cast(x[:, 1:], dtype=tf.int32),
                    predictions=tf.argmax(input=logits, axis=2))
        }
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=scalar_loss, eval_metric_ops=eval_metric_ops)
def testNoiseMultiplier(self, cls):
    """Empirical gradient stddev should match l2_norm_clip * noise_multiplier."""
    with tf.GradientTape(persistent=True) as gradient_tape:
        params = tf.Variable([0.0])
        data = tf.Variable([[0.0]])

        ledger = privacy_ledger.PrivacyLedger(1e6, 1 / 1e6, 5000, 5000)
        # l2_norm_clip=4.0 with sum_stddev=8.0 is a noise multiplier of 2.0.
        query = gaussian_query.GaussianAverageQuery(4.0, 8.0, 1)
        query = privacy_ledger.QueryWithLedger(query, ledger)

        opt = cls(query, num_microbatches=1, learning_rate=2.0)

        self.evaluate(tf.global_variables_initializer())
        # Sanity-check the initial parameter values.
        self.assertAllClose([0.0], self.evaluate(params))

        # With zero data and zero params, each gradient is a pure noise draw.
        grads = []
        for _ in range(1000):
            grads_and_vars = opt.compute_gradients(
                lambda: self._loss_fn(params, data), [params],
                gradient_tape=gradient_tape)
            grads.append(grads_and_vars[0][0])

        # Standard deviation should be near l2_norm_clip * noise_multiplier.
        self.assertNear(np.std(grads), 2.0 * 4.0, 0.5)
def __init__(
    self,
    l2_norm_clip,
    noise_multiplier,
    num_microbatches,
    unroll_microbatches=False,
    *args,  # pylint: disable=keyword-arg-before-vararg
    **kwargs):
  """Builds a DP optimizer from clip/noise parameters.

  Constructs a Gaussian average query with stddev
  ``l2_norm_clip * noise_multiplier``. If the caller passes a
  ``population_size`` keyword, the query is additionally wrapped with a
  privacy ledger (``ledger_max_samples`` / ``ledger_max_queries`` default
  to 1e6 each).
  """
  dp_average_query = gaussian_query.GaussianAverageQuery(
      l2_norm_clip, l2_norm_clip * noise_multiplier, num_microbatches)

  # Optionally attach a privacy ledger when a population size is supplied.
  if 'population_size' in kwargs:
    population_size = kwargs.pop('population_size')
    max_queries = kwargs.pop('ledger_max_queries', 1e6)
    max_samples = kwargs.pop('ledger_max_samples', 1e6)
    ledger = privacy_ledger.PrivacyLedger(
        population_size,
        num_microbatches / population_size,
        max_samples,
        max_queries)
    dp_average_query = privacy_ledger.QueryWithLedger(
        dp_average_query, ledger)

  super(DPGaussianOptimizerClass, self).__init__(
      dp_average_query, num_microbatches, unroll_microbatches,
      *args, **kwargs)
def test_basic(self):
    """Two sum queries recorded in one sample are formatted correctly."""
    ledger = privacy_ledger.PrivacyLedger(10, 0.1, 50, 50)

    ledger.record_sum_query(5.0, 1.0)
    ledger.record_sum_query(2.0, 0.5)
    ledger.finalize_sample()

    formatted = ledger.get_formatted_ledger_eager()
    sample = formatted[0]

    self.assertAllClose(sample.population_size, 10.0)
    self.assertAllClose(sample.selection_probability, 0.1)
    # Query order within a sample is not guaranteed, so compare sorted.
    self.assertAllClose(
        sorted(sample.queries), sorted([[5.0, 1.0], [2.0, 0.5]]))
def model_fn(features, labels, mode):
    """Model function for a linear classifier with optional DP-SGD training.

    Fix: removed leftover debug ``print`` statements (banner and loss dtypes)
    that polluted stdout on every graph construction, and deleted
    commented-out dead code.

    Args:
      features: Input features fed to ``linear_layer``.
      labels: Integer class labels.
      mode: A ``tf.estimator.ModeKeys`` value.

    Returns:
      A ``tf.estimator.EstimatorSpec`` for TRAIN or EVAL mode (``None`` for
      any other mode, as before).
    """
    logits = linear_layer(features)

    # Per-example loss vector: one component per training point, needed for
    # per-example gradients in DP-SGD.
    vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=tf.cast(labels, dtype=tf.int64))
    # Minibatch mean, reported through tf.Estimator.
    scalar_loss = tf.reduce_mean(vector_loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        if FLAGS.dpsgd:
            ledger = privacy_ledger.PrivacyLedger(
                population_size=60000,
                selection_probability=(FLAGS.batch_size / 60000))
            optimizer = dp_optimizer.DPGradientDescentGaussianOptimizer(
                l2_norm_clip=FLAGS.l2_norm_clip,
                noise_multiplier=FLAGS.noise_multiplier,
                num_microbatches=FLAGS.microbatches,
                ledger=ledger,
                learning_rate=FLAGS.learning_rate)
            training_hooks = [EpsilonPrintingTrainingHook(ledger)]
            # The DP optimizer consumes the un-reduced loss vector.
            opt_loss = vector_loss
        else:
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=FLAGS.learning_rate)
            opt_loss = scalar_loss
            training_hooks = []
        global_step = tf.train.get_global_step()
        train_op = optimizer.minimize(loss=opt_loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=scalar_loss,
            train_op=train_op,
            training_hooks=training_hooks)

    elif mode == tf.estimator.ModeKeys.EVAL:
        pred_classes = tf.argmax(logits, axis=1)
        acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=scalar_loss,
            eval_metric_ops={'accuracy': acc_op})
def test_ledger(self):
    """Adaptive-clip sum queries are recorded across successive samples."""
    record1 = tf.constant([8.5])
    record2 = tf.constant([-7.25])

    population_size = tf.Variable(0)
    selection_probability = tf.Variable(0.0)
    ledger = privacy_ledger.PrivacyLedger(
        population_size, selection_probability, 50, 50)

    query = quantile_adaptive_clip_sum_query.QuantileAdaptiveClipSumQuery(
        initial_l2_norm_clip=10.0,
        noise_multiplier=1.0,
        target_unclipped_quantile=0.0,
        learning_rate=1.0,
        clipped_count_stddev=0.0,
        expected_num_records=2.0,
        ledger=ledger)
    query = privacy_ledger.QueryWithLedger(query, ledger)

    # First sample: population 10, selection probability 0.1.
    tf.assign(population_size, 10)
    tf.assign(selection_probability, 0.1)
    _, global_state = test_utils.run_query(query, [record1, record2])

    expected_queries = [[10.0, 10.0], [0.5, 0.0]]
    formatted = ledger.get_formatted_ledger_eager()
    sample_1 = formatted[0]
    self.assertAllClose(sample_1.population_size, 10.0)
    self.assertAllClose(sample_1.selection_probability, 0.1)
    self.assertAllClose(sample_1.queries, expected_queries)

    # Second sample: clip has adapted; population/probability doubled.
    tf.assign(population_size, 20)
    tf.assign(selection_probability, 0.2)
    test_utils.run_query(query, [record1, record2], global_state)

    formatted = ledger.get_formatted_ledger_eager()
    sample_1, sample_2 = formatted

    # The first sample must remain unchanged.
    self.assertAllClose(sample_1.population_size, 10.0)
    self.assertAllClose(sample_1.selection_probability, 0.1)
    self.assertAllClose(sample_1.queries, expected_queries)

    self.assertAllClose(sample_2.population_size, 20.0)
    self.assertAllClose(sample_2.selection_probability, 0.2)
    self.assertAllClose(sample_2.queries, [[9.0, 9.0], [0.5, 0.0]])
def test_nested_query(self):
    """Both sub-queries of a nested query are recorded for every sample."""
    population_size = tf.Variable(0)
    selection_probability = tf.Variable(0.0)
    ledger = privacy_ledger.PrivacyLedger(
        population_size, selection_probability, 50, 50)

    query1 = gaussian_query.GaussianAverageQuery(
        l2_norm_clip=4.0, sum_stddev=2.0, denominator=5.0, ledger=ledger)
    query2 = gaussian_query.GaussianAverageQuery(
        l2_norm_clip=5.0, sum_stddev=1.0, denominator=5.0, ledger=ledger)
    query = nested_query.NestedQuery([query1, query2])
    query = privacy_ledger.QueryWithLedger(query, ledger)

    record1 = [1.0, [12.0, 9.0]]
    record2 = [5.0, [1.0, 2.0]]
    # Each entry is (l2_norm_clip, sum_stddev) for one sub-query.
    expected_queries = [[4.0, 2.0], [5.0, 1.0]]

    # First sample.
    tf.assign(population_size, 10)
    tf.assign(selection_probability, 0.1)
    test_utils.run_query(query, [record1, record2])

    formatted = ledger.get_formatted_ledger_eager()
    sample_1 = formatted[0]
    self.assertAllClose(sample_1.population_size, 10.0)
    self.assertAllClose(sample_1.selection_probability, 0.1)
    # Query order within a sample is not guaranteed, so compare sorted.
    self.assertAllClose(sorted(sample_1.queries), sorted(expected_queries))

    # Second sample.
    tf.assign(population_size, 20)
    tf.assign(selection_probability, 0.2)
    test_utils.run_query(query, [record1, record2])

    formatted = ledger.get_formatted_ledger_eager()
    sample_1, sample_2 = formatted

    # First sample is unchanged.
    self.assertAllClose(sample_1.population_size, 10.0)
    self.assertAllClose(sample_1.selection_probability, 0.1)
    self.assertAllClose(sorted(sample_1.queries), sorted(expected_queries))

    self.assertAllClose(sample_2.population_size, 20.0)
    self.assertAllClose(sample_2.selection_probability, 0.2)
    self.assertAllClose(sorted(sample_2.queries), sorted(expected_queries))
def linear_model_fn(features, labels, mode):
    """Model function for a one-unit linear regressor trained with DP-SGD."""
    preds = tf.keras.layers.Dense(
        1, activation='linear', name='dense').apply(features['x'])

    vector_loss = tf.squared_difference(labels, preds)
    scalar_loss = tf.reduce_mean(vector_loss)

    ledger = privacy_ledger.PrivacyLedger(1e6, 1 / 1e6, 500, 500)
    # Clip norm 1.0, zero noise, a single microbatch.
    dp_average_query = gaussian_query.GaussianAverageQuery(1.0, 0.0, 1)
    dp_average_query = privacy_ledger.QueryWithLedger(
        dp_average_query, ledger)
    optimizer = dp_optimizer.DPGradientDescentOptimizer(
        dp_average_query, num_microbatches=1, learning_rate=1.0)

    global_step = tf.train.get_global_step()
    # The DP optimizer consumes the per-example loss vector; scalar_loss is
    # only reported through the EstimatorSpec.
    train_op = optimizer.minimize(loss=vector_loss, global_step=global_step)
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=scalar_loss, train_op=train_op)
def testClippingNorm(self, cls):
    """Per-example gradients are clipped to the configured l2 norm."""
    with self.cached_session() as sess:
        params = tf.Variable([0.0, 0.0])
        examples = tf.Variable([[3.0, 4.0], [6.0, 8.0]])

        ledger = privacy_ledger.PrivacyLedger(1e6, 1 / 1e6, 50, 50)
        # Clip norm 1.0, zero noise, one microbatch.
        query = gaussian_query.GaussianAverageQuery(1.0, 0.0, 1)
        query = privacy_ledger.QueryWithLedger(query, ledger)

        opt = cls(query, num_microbatches=1, learning_rate=2.0)

        self.evaluate(tf.global_variables_initializer())
        # Sanity-check the initial parameter values.
        self.assertAllClose([0.0, 0.0], self.evaluate(params))

        # Expected gradient is the clipped sum of differences.
        gradient_op = opt.compute_gradients(loss(examples, params), [params])
        grads_and_vars = sess.run(gradient_op)
        self.assertAllCloseAccordingToType([-0.6, -0.8],
                                           grads_and_vars[0][0])
def test_sum_query(self):
    """A Gaussian sum query is recorded once per sample with its parameters."""
    record1 = tf.constant([2.0, 0.0])
    record2 = tf.constant([-1.0, 1.0])

    population_size = tf.Variable(0)
    selection_probability = tf.Variable(0.0)
    ledger = privacy_ledger.PrivacyLedger(
        population_size, selection_probability, 50, 50)

    query = gaussian_query.GaussianSumQuery(
        l2_norm_clip=10.0, stddev=0.0, ledger=ledger)
    query = privacy_ledger.QueryWithLedger(query, ledger)

    # Each entry is (l2_norm_clip, stddev) for the sum query.
    expected_queries = [[10.0, 0.0]]

    # First sample.
    tf.assign(population_size, 10)
    tf.assign(selection_probability, 0.1)
    test_utils.run_query(query, [record1, record2])

    formatted = ledger.get_formatted_ledger_eager()
    sample_1 = formatted[0]
    self.assertAllClose(sample_1.population_size, 10.0)
    self.assertAllClose(sample_1.selection_probability, 0.1)
    self.assertAllClose(sample_1.queries, expected_queries)

    # Second sample.
    tf.assign(population_size, 20)
    tf.assign(selection_probability, 0.2)
    test_utils.run_query(query, [record1, record2])

    formatted = ledger.get_formatted_ledger_eager()
    sample_1, sample_2 = formatted

    # First sample is unchanged.
    self.assertAllClose(sample_1.population_size, 10.0)
    self.assertAllClose(sample_1.selection_probability, 0.1)
    self.assertAllClose(sample_1.queries, expected_queries)

    self.assertAllClose(sample_2.population_size, 20.0)
    self.assertAllClose(sample_2.selection_probability, 0.2)
    self.assertAllClose(sample_2.queries, expected_queries)
def cnn_model_fn(features, labels):
    """Builds loss, accuracy and train op for a CNN classifier.

    Architecture and optimizer are selected by FLAGS.dataset, FLAGS.model,
    FLAGS.method and FLAGS.dpsgd.

    Returns:
      A ``(train_op, scalar_loss, accuracy)`` tuple.
    """
    # Reshape the input according to the dataset.
    if FLAGS.dataset == "mnist":
        input_layer = tf.reshape(features, [-1, 28, 28, 1])
    elif FLAGS.dataset == "cifar10":
        input_layer = features
    elif FLAGS.dataset == "svhn":
        input_layer = tf.reshape(features, [-1, 32, 32, 3])

    # Select the network architecture.
    if FLAGS.model == "trival":
        logits = trival(input_layer=input_layer)
    elif FLAGS.model == "deep":
        logits = deep(input_layer=input_layer)
    elif FLAGS.model == "letnet":
        # NOTE(review): this branch also calls trival() — looks like a
        # copy-paste leftover; confirm whether a letnet() builder exists.
        logits = trival(input_layer=input_layer)

    # Minibatch accuracy.
    correct_pred = tf.equal(tf.argmax(logits, 1), labels)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Per-example loss vector (supports microbatches in DP-SGD) and its mean
    # (reported to the caller).
    vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    scalar_loss = tf.reduce_mean(vector_loss)

    if FLAGS.dpsgd:
        ledger = privacy_ledger.PrivacyLedger(
            population_size=60000,
            selection_probability=(FLAGS.batch_size / 60000))
        # DP optimizers wrap their vanilla counterparts; most optimizers
        # inheriting from tf.train.Optimizer can be wrapped this way.
        if FLAGS.method == 'sgd':
            optimizer = dp_optimizer.DPGradientDescentGaussianOptimizer(
                l2_norm_clip=FLAGS.l2_norm_clip,
                noise_multiplier=FLAGS.noise_multiplier,
                num_microbatches=FLAGS.microbatches,
                ledger=ledger,
                learning_rate=FLAGS.learning_rate)
        elif FLAGS.method == 'adam':
            optimizer = dp_optimizer.DPAdamGaussianOptimizer(
                l2_norm_clip=FLAGS.l2_norm_clip,
                noise_multiplier=FLAGS.noise_multiplier,
                num_microbatches=FLAGS.microbatches,
                ledger=ledger,
                learning_rate=FLAGS.learning_rate,
                unroll_microbatches=True)
        elif FLAGS.method == 'adagrad':
            optimizer = dp_optimizer.DPAdagradGaussianOptimizer(
                l2_norm_clip=FLAGS.l2_norm_clip,
                noise_multiplier=FLAGS.noise_multiplier,
                num_microbatches=FLAGS.microbatches,
                ledger=ledger,
                learning_rate=FLAGS.learning_rate)
        elif FLAGS.method == 'momentum':
            optimizer = dp_optimizer.DPMomentumGaussianOptimizer(
                l2_norm_clip=FLAGS.l2_norm_clip,
                noise_multiplier=FLAGS.noise_multiplier,
                num_microbatches=FLAGS.microbatches,
                ledger=ledger,
                learning_rate=FLAGS.learning_rate,
                momentum=FLAGS.momentum,
                use_nesterov=FLAGS.use_nesterov)
        else:
            raise ValueError(
                'method must be sgd or adam or adagrad or momentum')
        # DP-SGD needs the un-reduced loss vector.
        opt_loss = vector_loss
    else:
        if FLAGS.method == 'sgd':
            optimizer = GradientDescentOptimizer(
                learning_rate=FLAGS.learning_rate)
        elif FLAGS.method == 'adam':
            optimizer = AdamOptimizer(learning_rate=FLAGS.learning_rate)
        elif FLAGS.method == 'adagrad':
            optimizer = AdagradOptimizer(learning_rate=FLAGS.learning_rate)
        elif FLAGS.method == 'momentum':
            optimizer = MomentumOptimizer(
                learning_rate=FLAGS.learning_rate,
                momentum=FLAGS.momentum,
                use_nesterov=FLAGS.use_nesterov)
        else:
            raise ValueError(
                'method must be sgd or adam or adagrad or momentum')
        opt_loss = scalar_loss

    global_step = tf.train.get_global_step()
    train_op = optimizer.minimize(loss=opt_loss, global_step=global_step)
    # scalar_loss is only for reporting/debugging; the loss actually
    # minimized is opt_loss above.
    return train_op, scalar_loss, accuracy
def cnn_model_fn(features, labels, mode):
    """Model function for an MNIST CNN, optionally trained with DP-SGD."""
    # CNN architecture built with tf.keras.layers.
    input_layer = tf.reshape(features['x'], [-1, 28, 28, 1])
    y = tf.keras.layers.Conv2D(
        16, 8, strides=2, padding='same',
        activation='relu').apply(input_layer)
    y = tf.keras.layers.MaxPool2D(2, 1).apply(y)
    y = tf.keras.layers.Conv2D(
        32, 4, strides=2, padding='valid', activation='relu').apply(y)
    y = tf.keras.layers.MaxPool2D(2, 1).apply(y)
    y = tf.keras.layers.Flatten().apply(y)
    y = tf.keras.layers.Dense(32, activation='relu').apply(y)
    logits = tf.keras.layers.Dense(10).apply(y)

    # Per-example loss vector (supports microbatches in DP-SGD) and its mean
    # (reported through tf.Estimator).
    vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    scalar_loss = tf.reduce_mean(vector_loss)

    # Training op (TRAIN mode).
    if mode == tf.estimator.ModeKeys.TRAIN:
        if FLAGS.dpsgd:
            ledger = privacy_ledger.PrivacyLedger(
                population_size=60000,
                selection_probability=(FLAGS.batch_size / 60000))
            # DP counterpart of GradientDescentOptimizer; other optimizers
            # in dp_optimizer can be wrapped the same way.
            optimizer = dp_optimizer.DPGradientDescentGaussianOptimizer(
                l2_norm_clip=FLAGS.l2_norm_clip,
                noise_multiplier=FLAGS.noise_multiplier,
                num_microbatches=FLAGS.microbatches,
                ledger=ledger,
                learning_rate=FLAGS.learning_rate)
            training_hooks = [EpsilonPrintingTrainingHook(ledger)]
            opt_loss = vector_loss
        else:
            optimizer = GradientDescentOptimizer(
                learning_rate=FLAGS.learning_rate)
            training_hooks = []
            opt_loss = scalar_loss
        global_step = tf.train.get_global_step()
        train_op = optimizer.minimize(loss=opt_loss, global_step=global_step)
        # scalar_loss is only for reporting/debugging by tf.Estimator; the
        # loss actually minimized is opt_loss above.
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=scalar_loss,
            train_op=train_op,
            training_hooks=training_hooks)

    # Evaluation metrics (EVAL mode).
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy':
                tf.metrics.accuracy(
                    labels=labels,
                    predictions=tf.argmax(input=logits, axis=1))
        }
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=scalar_loss, eval_metric_ops=eval_metric_ops)
def cnn_model_fn(features, labels, mode):
    """Model function for an MNIST CNN; training always uses DP-SGD."""
    # CNN architecture built with tf.keras.layers.
    input_layer = tf.reshape(features['x'], [-1, 28, 28, 1])
    y = tf.keras.layers.Conv2D(
        16, 8, strides=2, padding='same',
        activation='relu').apply(input_layer)
    y = tf.keras.layers.MaxPool2D(2, 1).apply(y)
    y = tf.keras.layers.Conv2D(
        32, 4, strides=2, padding='valid', activation='relu').apply(y)
    y = tf.keras.layers.MaxPool2D(2, 1).apply(y)
    y = tf.keras.layers.Flatten().apply(y)
    y = tf.keras.layers.Dense(32, activation='relu').apply(y)
    logits = tf.keras.layers.Dense(10).apply(y)

    # Per-example loss vector and its minibatch mean.
    vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    scalar_loss = tf.reduce_mean(vector_loss)

    # Training op (TRAIN mode) — always differentially private.
    if mode == tf.estimator.ModeKeys.TRAIN:
        ledger = privacy_ledger.PrivacyLedger(
            population_size=60000,
            selection_probability=(FLAGS.batch_size / 60000),
            max_samples=1e6,
            max_queries=1e6)
        optimizer = optimizers.dp_optimizer.DPGradientDescentGaussianOptimizer(
            l2_norm_clip=FLAGS.l2_norm_clip,
            noise_multiplier=FLAGS.noise_multiplier,
            num_microbatches=FLAGS.microbatches,
            ledger=ledger,
            learning_rate=FLAGS.learning_rate)
        # NOTE(review): minimize() is called without global_step, so the
        # global step is never incremented here — confirm this is intended.
        train_op = optimizer.minimize(loss=vector_loss)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=scalar_loss, train_op=train_op)

    # Evaluation metrics (EVAL mode).
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy':
                tf.metrics.accuracy(
                    labels=labels,
                    predictions=tf.argmax(input=logits, axis=1))
        }
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=scalar_loss, eval_metric_ops=eval_metric_ops)
def __init__(self, sequence_length, num_classes, vocab_size, dis_emb_dim,
             d_rate, noise_multiplier, l2_norm_clip, population_size, delta,
             num_microbatches, filter_sizes, num_filters, batch_size,
             hidden_dim, start_token, goal_out_size, step_size,
             l2_reg_lambda=0.0):
    """Builds the discriminator graph with a DP-Adam training op.

    Fix: the parameter filter for ``D_params`` was
    ``if 'Discriminator' or 'FeatureExtractor' in param.name``, which Python
    parses as ``('Discriminator') or ('FeatureExtractor' in param.name)``.
    The non-empty string literal is always truthy, so EVERY trainable
    variable was selected. The intent expressed by the code — keep only
    discriminator / feature-extractor variables — is now tested explicitly
    per substring.
    """
    self.sequence_length = sequence_length
    self.num_classes = num_classes
    self.vocab_size = vocab_size
    self.dis_emb_dim = dis_emb_dim
    self.filter_sizes = filter_sizes
    self.num_filters = num_filters
    self.batch_size = batch_size
    self.hidden_dim = hidden_dim
    self.start_token = tf.constant(
        [start_token] * self.batch_size, dtype=tf.int32)
    self.l2_reg_lambda = l2_reg_lambda
    self.num_filters_total = sum(self.num_filters)
    self.temperature = 1.0
    self.grad_clip = 5.0  # Does not apply to d_optimizer.
    self.goal_out_size = goal_out_size
    self.step_size = step_size

    # Placeholders for labels, token sequences and dropout keep-prob.
    self.D_input_y = tf.placeholder(
        tf.float32, [None, num_classes], name="input_y")
    self.D_input_x = tf.placeholder(
        tf.int32, [None, sequence_length], name="input_x")
    self.dropout_keep_prob = tf.placeholder(
        tf.float32, name="dropout_keep_prob")

    # DP-SGD hyperparameters.
    self.d_rate = d_rate
    self.l2_norm_clip = l2_norm_clip
    self.noise_multiplier = noise_multiplier
    self.num_microbatches = num_microbatches
    self.population_size = population_size
    self.delta = delta

    with tf.name_scope('D_update'):
        self.D_l2_loss = tf.constant(0.0)
        self.FeatureExtractor_unit = self.FeatureExtractor()

        # Train for Discriminator.
        with tf.variable_scope("feature") as self.feature_scope:
            D_feature = self.FeatureExtractor_unit(
                self.D_input_x, self.dropout_keep_prob)
            self.feature_scope.reuse_variables()

        D_scores, D_predictions, self.ypred_for_auc = self.classification(
            D_feature)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=D_scores, labels=self.D_input_y)
        self.D_loss = tf.reduce_mean(
            losses) + self.l2_reg_lambda * self.D_l2_loss

        # BUG FIX (see docstring): test each substring against param.name.
        self.D_params = [
            param for param in tf.trainable_variables()
            if 'Discriminator' in param.name
            or 'FeatureExtractor' in param.name
        ]

        self.ledger = privacy_ledger.PrivacyLedger(
            population_size=self.population_size,
            selection_probability=(self.batch_size / self.population_size))
        d_optimizer = dp_optimizer.DPAdamGaussianOptimizer(
            l2_norm_clip=self.l2_norm_clip,
            noise_multiplier=self.noise_multiplier,
            num_microbatches=self.num_microbatches,
            ledger=self.ledger,
            learning_rate=self.d_rate)
        D_grads_and_vars = d_optimizer.compute_gradients(
            self.D_loss, self.D_params, aggregation_method=2)
        self.D_train_op = d_optimizer.apply_gradients(D_grads_and_vars)
def test_fail_on_probability_zero(self):
    """Constructing a ledger with selection probability 0 must raise."""
    expected_message = 'Selection probability cannot be 0.'
    with self.assertRaisesRegexp(ValueError, expected_message):
        privacy_ledger.PrivacyLedger(10, 0)
def __init__(self, sequence_length, num_classes, vocab_size, emb_dim, dis_emb_dim, noise_multiplier, l2_norm_clip, population_size, delta, num_microbatches, filter_sizes, num_filters, batch_size, hidden_dim, start_token, goal_out_size, goal_size, step_size, D_model, LSTMlayer_num=1, l2_reg_lambda=0.0, learning_rate=0.001): self.sequence_length = sequence_length self.num_classes = num_classes self.vocab_size = vocab_size self.emb_dim = emb_dim self.dis_emb_dim = dis_emb_dim self.noise_multiplier = noise_multiplier self.l2_norm_clip = l2_norm_clip self.population_size = population_size self.delta = delta self.num_microbatches = num_microbatches self.filter_sizes = filter_sizes self.num_filters = num_filters self.batch_size = batch_size self.hidden_dim = hidden_dim self.start_token = tf.constant([start_token] * self.batch_size, dtype=tf.int32) self.LSTMlayer_num = LSTMlayer_num self.l2_reg_lambda = l2_reg_lambda self.learning_rate = learning_rate self.num_filters_total = sum(self.num_filters) self.grad_clip = 5.0 self.goal_out_size = goal_out_size self.goal_size = goal_size self.step_size = step_size self.D_model = D_model self.FeatureExtractor_unit = self.D_model.FeatureExtractor_unit self.scope = self.D_model.feature_scope self.worker_params = [] self.manager_params = [] self.epis = 0.65 self.tem = 0.8 with tf.variable_scope('place_holder'): self.x = tf.placeholder( tf.int32, shape=[self.batch_size, self.sequence_length ]) # sequence of tokens generated by generator self.reward = tf.placeholder( tf.float32, shape=[self.batch_size, self.sequence_length / self.step_size ]) # sequence of tokens generated by generator self.given_num = tf.placeholder(tf.int32) self.drop_out = tf.placeholder(tf.float32, name="dropout_keep_prob") self.train = tf.placeholder(tf.int32, None, name="train") with tf.variable_scope('Worker'): self.g_embeddings = tf.Variable( tf.random_normal([self.vocab_size, self.emb_dim], stddev=0.1)) self.worker_params.append(self.g_embeddings) 
self.g_worker_recurrent_unit = self.create_Worker_recurrent_unit( self.worker_params) # maps h_tm1 to h_t for generator self.g_worker_output_unit = self.create_Worker_output_unit( self.worker_params) # maps h_t to o_t (output token logits) self.W_workerOut_change = tf.Variable( tf.random_normal([self.vocab_size, self.goal_size], stddev=0.1)) self.g_change = tf.Variable( tf.random_normal([self.goal_out_size, self.goal_size], stddev=0.1)) self.worker_params.extend([self.W_workerOut_change, self.g_change]) self.h0_worker = tf.zeros([self.batch_size, self.hidden_dim]) self.h0_worker = tf.stack([self.h0_worker, self.h0_worker]) with tf.variable_scope('Manager'): self.g_manager_recurrent_unit = self.create_Manager_recurrent_unit( self.manager_params) # maps h_tm1 to h_t for generator self.g_manager_output_unit = self.create_Manager_output_unit( self.manager_params) # maps h_t to o_t (output token logits) self.h0_manager = tf.zeros([self.batch_size, self.hidden_dim]) self.h0_manager = tf.stack([self.h0_manager, self.h0_manager]) self.goal_init = tf.get_variable( "goal_init", initializer=tf.truncated_normal( [self.batch_size, self.goal_out_size], stddev=0.1)) self.manager_params.extend([self.goal_init]) self.padding_array = tf.constant( -1, shape=[self.batch_size, self.sequence_length], dtype=tf.int32) with tf.name_scope("roll_out"): self.gen_for_reward = self.rollout(self.x, self.given_num) # processed for batch with tf.device("/cpu:0"): self.processed_x = tf.transpose( tf.nn.embedding_lookup(self.g_embeddings, self.x), perm=[1, 0, 2]) # seq_length x batch_size x emb_dim gen_o = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length, dynamic_size=False, infer_shape=True) gen_x = tensor_array_ops.TensorArray(dtype=tf.int32, size=1, dynamic_size=True, infer_shape=True, clear_after_read=False) goal = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length, dynamic_size=False, infer_shape=True, clear_after_read=False) feature_array = 
tensor_array_ops.TensorArray( dtype=tf.float32, size=self.sequence_length + 1, dynamic_size=False, infer_shape=True, clear_after_read=False) real_goal_array = tensor_array_ops.TensorArray( dtype=tf.float32, size=self.sequence_length / self.step_size, dynamic_size=False, infer_shape=True, clear_after_read=False) gen_real_goal_array = tensor_array_ops.TensorArray( dtype=tf.float32, size=self.sequence_length, dynamic_size=False, infer_shape=True, clear_after_read=False) gen_o_worker_array = tensor_array_ops.TensorArray( dtype=tf.float32, size=self.sequence_length / self.step_size, dynamic_size=False, infer_shape=True, clear_after_read=False) def _g_recurrence(i, x_t, h_tm1, h_tm1_manager, gen_o, gen_x, goal, last_goal, real_goal, step_size, gen_real_goal_array, gen_o_worker_array): ## padding sentence by -1 cur_sen = tf.cond( i > 0, lambda: tf.split( tf.concat([ tf.transpose(gen_x.stack(), perm=[1, 0]), self. padding_array ], 1), [self.sequence_length, i], 1)[0], lambda: self.padding_array) with tf.variable_scope(self.scope): feature = self.FeatureExtractor_unit(cur_sen, self.drop_out) h_t_Worker = self.g_worker_recurrent_unit( x_t, h_tm1) # hidden_memory_tuple o_t_Worker = self.g_worker_output_unit( h_t_Worker) # batch x vocab , logits not prob o_t_Worker = tf.reshape( o_t_Worker, [self.batch_size, self.vocab_size, self.goal_size]) h_t_manager = self.g_manager_recurrent_unit(feature, h_tm1_manager) sub_goal = self.g_manager_output_unit(h_t_manager) sub_goal = tf.nn.l2_normalize(sub_goal, 1) goal = goal.write(i, sub_goal) real_sub_goal = tf.add(last_goal, sub_goal) w_g = tf.matmul(real_goal, self.g_change) #batch x goal_size w_g = tf.nn.l2_normalize(w_g, 1) gen_real_goal_array = gen_real_goal_array.write(i, real_goal) w_g = tf.expand_dims(w_g, 2) #batch x goal_size x 1 gen_o_worker_array = gen_o_worker_array.write(i, o_t_Worker) x_logits = tf.matmul(o_t_Worker, w_g) x_logits = tf.squeeze(x_logits) log_prob = tf.log( tf.nn.softmax( tf.cond( i > 1, lambda: 
tf.cond(self.train > 0, lambda: self. tem, lambda: 1.5), lambda: 1.5) * x_logits)) next_token = tf.cast( tf.reshape(tf.multinomial(log_prob, 1), [self.batch_size]), tf.int32) x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token) # batch x emb_dim with tf.control_dependencies([cur_sen]): gen_x = gen_x.write(i, next_token) # indices, batch_size gen_o = gen_o.write(i, tf.reduce_sum( tf.multiply( tf.one_hot(next_token, self.vocab_size, 1.0, 0.0), tf.nn.softmax(x_logits)), 1)) # [batch_size] , prob return i+1,x_tp1,h_t_Worker,h_t_manager,gen_o,gen_x,goal,\ tf.cond(((i+1)%step_size)>0,lambda:real_sub_goal,lambda :tf.constant(0.0,shape=[self.batch_size,self.goal_out_size]))\ ,tf.cond(((i+1)%step_size)>0,lambda :real_goal,lambda :real_sub_goal),step_size,gen_real_goal_array,gen_o_worker_array _, _, _, _, self.gen_o, self.gen_x, _, _, _, _, self.gen_real_goal_array, self.gen_o_worker_array = control_flow_ops.while_loop( cond=lambda i, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11: i < self.sequence_length, body=_g_recurrence, loop_vars=(tf.constant(0, dtype=tf.int32), tf.nn.embedding_lookup(self.g_embeddings, self.start_token), self.h0_worker, self.h0_manager, gen_o, gen_x, goal, tf.zeros([self.batch_size, self.goal_out_size]), self.goal_init, step_size, gen_real_goal_array, gen_o_worker_array), parallel_iterations=1) self.gen_x = self.gen_x.stack() # seq_length x batch_size self.gen_x = tf.transpose(self.gen_x, perm=[1, 0]) # batch_size x seq_length self.gen_real_goal_array = self.gen_real_goal_array.stack( ) # seq_length x batch_size x goal self.gen_real_goal_array = tf.transpose( self.gen_real_goal_array, perm=[1, 0, 2]) # batch_size x seq_length x goal self.gen_o_worker_array = self.gen_o_worker_array.stack( ) # seq_length x batch_size* vocab*goal self.gen_o_worker_array = tf.transpose( self.gen_o_worker_array, perm=[1, 0, 2, 3]) # batch_size x seq_length * vocab*goal sub_feature = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length / 
self.step_size, dynamic_size=False, infer_shape=True, clear_after_read=False) all_sub_features = tensor_array_ops.TensorArray( dtype=tf.float32, size=self.sequence_length, dynamic_size=False, infer_shape=True, clear_after_read=False) all_sub_goals = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length, dynamic_size=False, infer_shape=True, clear_after_read=False) # supervised pretraining for generator g_predictions = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length, dynamic_size=False, infer_shape=True) ta_emb_x = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length) ta_emb_x = ta_emb_x.unstack(self.processed_x) def preTrain(i, x_t, g_predictions, h_tm1, input_x, h_tm1_manager, last_goal, real_goal, feature_array, real_goal_array, sub_feature, all_sub_features, all_sub_goals): ## padding sentence by -1 cur_sen = tf.split( tf.concat([ tf.split(input_x, [i, self.sequence_length - i], 1)[0], self.padding_array ], 1), [self.sequence_length, i], 1)[0] #padding sentence with tf.variable_scope(self.scope): feature = self.FeatureExtractor_unit(cur_sen, self.drop_out) feature_array = feature_array.write(i, feature) real_goal_array = tf.cond( i > 0, lambda: real_goal_array, lambda: real_goal_array.write(0, self.goal_init)) h_t_manager = self.g_manager_recurrent_unit(feature, h_tm1_manager) sub_goal = self.g_manager_output_unit(h_t_manager) sub_goal = tf.nn.l2_normalize(sub_goal, 1) h_t_Worker = tf.cond( i > 0, lambda: self.g_worker_recurrent_unit(x_t, h_tm1), lambda: h_tm1) # hidden_memory_tuple o_t_Worker = self.g_worker_output_unit( h_t_Worker) # batch x vocab , logits not prob o_t_Worker = tf.reshape( o_t_Worker, [self.batch_size, self.vocab_size, self.goal_size]) real_sub_goal = tf.cond(i > 0, lambda: tf.add(last_goal, sub_goal), lambda: real_goal) all_sub_goals = tf.cond( i > 0, lambda: all_sub_goals.write(i - 1, real_goal), lambda: all_sub_goals) w_g = tf.matmul(real_goal, self.g_change) # batch x 
goal_size w_g = tf.nn.l2_normalize(w_g, 1) w_g = tf.expand_dims(w_g, 2) # batch x goal_size x 1 x_logits = tf.matmul(o_t_Worker, w_g) x_logits = tf.squeeze(x_logits) g_predictions = tf.cond( i > 0, lambda: g_predictions.write(i - 1, tf.nn.softmax(x_logits)), lambda: g_predictions) sub_feature = tf.cond( ((((i) % step_size) > 0)), lambda: sub_feature, lambda: (tf.cond( i > 0, lambda: sub_feature.write( i / step_size - 1, tf.subtract(feature, feature_array.read(i - step_size)) ), lambda: sub_feature))) all_sub_features = tf.cond(i > 0,lambda: tf.cond((i % step_size) > 0, lambda :all_sub_features.write(i-1,tf.subtract(feature,feature_array.read(i-i%step_size))),\ lambda :all_sub_features.write(i-1,tf.subtract(feature,feature_array.read(i-step_size)))), lambda : all_sub_features) real_goal_array = tf.cond( ((i) % step_size) > 0, lambda: real_goal_array, lambda: tf.cond( (i) / step_size < self.sequence_length / step_size, lambda: tf.cond( i > 0, lambda: real_goal_array.write( (i) / step_size, real_sub_goal), lambda: real_goal_array), lambda: real_goal_array)) x_tp1 = tf.cond(i > 0, lambda: ta_emb_x.read(i - 1), lambda: x_t) return i+1, x_tp1, g_predictions, h_t_Worker, input_x, h_t_manager,\ tf.cond(((i)%step_size)>0,lambda:real_sub_goal,lambda :tf.constant(0.0,shape=[self.batch_size,self.goal_out_size])) ,\ tf.cond(((i) % step_size) > 0, lambda: real_goal, lambda: real_sub_goal),\ feature_array,real_goal_array,sub_feature,all_sub_features,all_sub_goals _, _, self.g_predictions, _, _, _, _, _, self.feature_array, self.real_goal_array, self.sub_feature, self.all_sub_features, self.all_sub_goals = control_flow_ops.while_loop( cond=lambda i, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12: i < self.sequence_length + 1, body=preTrain, loop_vars=(tf.constant(0, dtype=tf.int32), tf.nn.embedding_lookup(self.g_embeddings, self.start_token), g_predictions, self.h0_worker, self.x, self.h0_manager, tf.zeros([self.batch_size, self.goal_out_size]), self.goal_init, feature_array, 
real_goal_array, sub_feature, all_sub_features, all_sub_goals), parallel_iterations=1) self.sub_feature = self.sub_feature.stack( ) # seq_length x batch_size x num_filter self.sub_feature = tf.transpose(self.sub_feature, perm=[1, 0, 2]) self.real_goal_array = self.real_goal_array.stack() self.real_goal_array = tf.transpose(self.real_goal_array, perm=[1, 0, 2]) print self.real_goal_array.shape print self.sub_feature.shape self.pretrain_goal_loss = -tf.reduce_sum(1 - tf.losses.cosine_distance( tf.nn.l2_normalize(self.sub_feature, 2), tf.nn.l2_normalize(self.real_goal_array, 2), 2)) / ( self.sequence_length * self.batch_size / self.step_size) with tf.name_scope("Manager_PreTrain_update"): pretrain_manager_opt = tf.train.AdamOptimizer(self.learning_rate) self.pretrain_manager_grad, _ = tf.clip_by_global_norm( tf.gradients(self.pretrain_goal_loss, self.manager_params), self.grad_clip) self.pretrain_manager_updates = pretrain_manager_opt.apply_gradients( zip(self.pretrain_manager_grad, self.manager_params)) # self.real_goal_array = self.real_goal_array.stack() self.g_predictions = tf.transpose( self.g_predictions.stack(), perm=[1, 0, 2]) # batch_size x seq_length x vocab_size self.cross_entropy = tf.reduce_sum(self.g_predictions * tf.log( tf.clip_by_value(self.g_predictions, 1e-20, 1.0))) / ( self.batch_size * self.sequence_length * self.vocab_size) self.pretrain_worker_loss = -tf.reduce_sum( tf.one_hot(tf.to_int32(tf.reshape( self.x, [-1])), self.vocab_size, 1.0, 0.0) * tf.log( tf.clip_by_value( tf.reshape(self.g_predictions, [-1, self.vocab_size]), 1e-20, 1.0))) / (self.sequence_length * self.batch_size) with tf.name_scope("Worker_PreTrain_update"): # training updates self.worker_pre_ledger = privacy_ledger.PrivacyLedger( population_size=self.population_size, selection_probability=(self.batch_size / self.population_size)) pretrain_worker_opt = dp_optimizer.DPAdamGaussianOptimizer( l2_norm_clip=self.l2_norm_clip, noise_multiplier=self.noise_multiplier, 
num_microbatches=self.num_microbatches, ledger=self.worker_pre_ledger, learning_rate=self.learning_rate) self.pretrain_worker_grad, _ = tf.clip_by_global_norm( tf.gradients(self.pretrain_worker_loss, self.worker_params), self.grad_clip) self.pretrain_worker_updates = pretrain_worker_opt.apply_gradients( zip(self.pretrain_worker_grad, self.worker_params)) self.goal_loss = -tf.reduce_sum( tf.multiply( self.reward, 1 - tf.losses.cosine_distance( tf.nn.l2_normalize(self.sub_feature, 2), tf.nn.l2_normalize(self.real_goal_array, 2), 2))) / ( self.sequence_length * self.batch_size / self.step_size) with tf.name_scope("Manager_update"): manager_opt = tf.train.AdamOptimizer(self.learning_rate) self.manager_grad, _ = tf.clip_by_global_norm( tf.gradients(self.goal_loss, self.manager_params), self.grad_clip) self.manager_updates = manager_opt.apply_gradients( zip(self.manager_grad, self.manager_params)) self.all_sub_features = self.all_sub_features.stack() self.all_sub_features = tf.transpose(self.all_sub_features, perm=[1, 0, 2]) self.all_sub_goals = self.all_sub_goals.stack() self.all_sub_goals = tf.transpose(self.all_sub_goals, perm=[1, 0, 2]) # self.all_sub_features = tf.nn.l2_normalize(self.all_sub_features, 2) self.Worker_Reward = 1 - tf.losses.cosine_distance( tf.nn.l2_normalize(self.all_sub_features, 2), tf.nn.l2_normalize(self.all_sub_goals, 2), 2) # print self.Worker_Reward.shape self.worker_loss = -tf.reduce_sum( tf.multiply( self.Worker_Reward, tf.one_hot(tf.to_int32(tf.reshape( self.x, [-1])), self.vocab_size, 1.0, 0.0) * tf.log( tf.clip_by_value( tf.reshape(self.g_predictions, [-1, self.vocab_size]), 1e-20, 1.0)))) / (self.sequence_length * self.batch_size) with tf.name_scope("Worker_update"): # training updates worker_opt = tf.train.AdamOptimizer(self.learning_rate) self.worker_grad, _ = tf.clip_by_global_norm( tf.gradients(self.worker_loss, self.worker_params), self.grad_clip) self.worker_updates = worker_opt.apply_gradients( zip(self.worker_grad, 
self.worker_params))
D_loss = D_loss_real + D_loss_fake vector_G_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=D_fake_logits, labels=tf.ones([batch_size, 1, 1, 1])) G_loss = tf.reduce_mean(vector_G_loss) # trainable variables for each network T_vars = tf.trainable_variables() D_vars = [var for var in T_vars if var.name.startswith('discriminator')] G_vars = [var for var in T_vars if var.name.startswith('generator')] # In[11]: ledger = privacy_ledger.PrivacyLedger(population_size=55000, selection_probability=(batch_size / 55000), max_samples=1e6, max_queries=1e6) G_optimizer = dp_optimizer.DPAdamGaussianOptimizer( l2_norm_clip=l2_norm_clip, noise_multiplier=noise_multiplier, num_microbatches=num_microbatches, learning_rate=lr, beta1=0.5, ledger=ledger) # In[12]: # optimizer for each network with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
def generate_estimator_spec(logits, features, labels, mode):
    """Build a tf.estimator.EstimatorSpec for PREDICT, TRAIN, or EVAL mode.

    Args:
        logits: Unnormalized class scores, shape [batch, num_classes].
        features: Input features dict (unused here; part of the Estimator
            model_fn contract).
        labels: Integer class labels for sparse cross-entropy.
        mode: A tf.estimator.ModeKeys value.

    Returns:
        A tf.estimator.EstimatorSpec for the given mode. (Returns None for
        any mode other than PREDICT/TRAIN/EVAL, matching original behavior.)

    Raises:
        ValueError: If FLAGS.optim is not one of 'sgd', 'adam', 'adagrad'.
    """
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': tf.nn.softmax(logits),
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Calculate loss as a vector (to support microbatches in DP-SGD).
    vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    # Mean of loss across minibatch (for reporting through tf.Estimator).
    scalar_loss = tf.reduce_mean(vector_loss)

    # Configure the training op (for TRAIN mode).
    if mode == tf.estimator.ModeKeys.TRAIN:
        if FLAGS.dp:
            # NOTE(review): if this module runs under Python 2 without
            # `from __future__ import division`, FLAGS.batch_size / 60000
            # is integer division and yields 0 — confirm interpreter/imports.
            ledger = privacy_ledger.PrivacyLedger(
                population_size=60000,
                selection_probability=(FLAGS.batch_size / 60000))

            # Use DP version of GradientDescentOptimizer. Other optimizers are
            # available in dp_optimizer. Most optimizers inheriting from
            # tf.train.Optimizer should be wrappable in differentially private
            # counterparts by calling dp_optimizer.optimizer_from_args().
            if FLAGS.optim == 'sgd':
                optimizer_func = dp_optimizer.DPGradientDescentGaussianOptimizer
            elif FLAGS.optim == 'adam':
                optimizer_func = dp_optimizer.DPAdamGaussianOptimizer
            elif FLAGS.optim == 'adagrad':
                optimizer_func = dp_optimizer.DPAdagradGaussianOptimizer
            else:
                raise ValueError("optimizer function not supported")

            optimizer = optimizer_func(
                l2_norm_clip=FLAGS.l2_norm_clip,
                noise_multiplier=FLAGS.noise_multiplier,
                num_microbatches=FLAGS.microbatches,
                ledger=ledger,
                learning_rate=FLAGS.learning_rate)
            training_hooks = [EpsilonPrintingTrainingHook(ledger)]
            # DP optimizers need the per-example (vector) loss so each
            # microbatch gradient can be clipped and noised independently.
            opt_loss = vector_loss
        else:
            if FLAGS.optim == 'sgd':
                optimizer_func = GradientDescentOptimizer
            elif FLAGS.optim == 'adam':
                optimizer_func = AdamOptimizer
            elif FLAGS.optim == 'adagrad':
                optimizer_func = AdagradOptimizer
            else:
                raise ValueError("optimizer function not supported")
            # BUG FIX: the original always instantiated
            # GradientDescentOptimizer here, silently ignoring the
            # optimizer_func selected from FLAGS.optim above.
            optimizer = optimizer_func(learning_rate=FLAGS.learning_rate)
            training_hooks = []
            opt_loss = scalar_loss

        global_step = tf.train.get_global_step()
        train_op = optimizer.minimize(loss=opt_loss, global_step=global_step)
        # In the following, we pass the mean of the loss (scalar_loss) rather
        # than the vector_loss because tf.estimator requires a scalar loss.
        # This is only used for evaluation and debugging by tf.estimator. The
        # actual loss being minimized is opt_loss defined above and passed to
        # optimizer.minimize().
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=scalar_loss,
                                          train_op=train_op,
                                          training_hooks=training_hooks)

    # Add evaluation metrics (for EVAL mode).
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy':
                tf.metrics.accuracy(labels=labels,
                                    predictions=tf.argmax(input=logits,
                                                          axis=1)),
            'crossentropy':
                tf.metrics.mean(scalar_loss)
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=scalar_loss,
                                          eval_metric_ops=eval_metric_ops)