def test_saving_restoring_checkpoint(self):
  logdir = _test_dir(self.get_temp_dir(), "test_saving_restoring_checkpoint")
  with ops.Graph().as_default():
    gstep = training_util.create_global_step()
    do_step = state_ops.assign_add(gstep, 1)
    v0 = variables.Variable(10.0, name="v0")
    v1 = variables.Variable(20.0, name="v1")

    target_values = [[0.0], [1.0], [2.0]]
    keys = array_ops.placeholder(dtypes.int64)
    values = constant_op.constant(target_values, dtypes.float32)

    table = de.Variable(
        key_dtype=dtypes.int64,
        value_dtype=dtypes.float32,
        initializer=-1.0,
        name="m100",
        dim=1,
    )
    upsert_op = table.upsert(keys, values)
    lookup_op = table.lookup(keys)
    size_op = table.size()

    with monitored_session.MonitoredTrainingSession(
        config=default_config, is_chief=True, checkpoint_dir=logdir) as sess:
      self.assertEqual(0, sess.run(gstep))
      self.assertEqual(1, sess.run(do_step))
      self.assertEqual(2, sess.run(do_step))

      # Check that the parameter nodes have been initialized.
      self.assertEqual(10.0, sess.run(v0))
      self.assertEqual(20.0, sess.run(v1))
      self.assertAllEqual(0, sess.run(size_op))
      sess.run(upsert_op, feed_dict={keys: [0, 1, 2]})
      self.assertAllEqual(3, sess.run(size_op))
      self.device_check(table)

    # A restart will find the checkpoint and recover automatically.
    with monitored_session.MonitoredTrainingSession(
        config=default_config, is_chief=True, checkpoint_dir=logdir) as sess:
      self.assertEqual(2, sess.run(gstep))
      self.assertAllEqual(3, sess.run(table.size()))
      self.assertAllEqual(
          target_values, sess.run(lookup_op, feed_dict={keys: [0, 1, 2]}))
      self.device_check(table)
def _test_logits(self, mode, hidden_units, logits_dimension, inputs,
                 expected_logits):
  """Tests that the expected logits are passed to mock head."""
  with ops.Graph().as_default():
    training_util.create_global_step()
    head = _mock_head(
        self,
        hidden_units=hidden_units,
        logits_dimension=logits_dimension,
        expected_logits=expected_logits)
    estimator_spec = dnn._dnn_model_fn(
        features={'age': constant_op.constant(inputs)},
        labels=constant_op.constant([[1]]),
        mode=mode,
        head=head,
        hidden_units=hidden_units,
        feature_columns=[
            feature_column.numeric_column(
                'age', shape=np.array(inputs).shape[1:])
        ],
        optimizer=_mock_optimizer(self, hidden_units))
    with monitored_session.MonitoredTrainingSession(
        checkpoint_dir=self._model_dir) as sess:
      if mode == model_fn.ModeKeys.TRAIN:
        sess.run(estimator_spec.train_op)
      elif mode == model_fn.ModeKeys.EVAL:
        sess.run(estimator_spec.loss)
      elif mode == model_fn.ModeKeys.PREDICT:
        sess.run(estimator_spec.predictions)
      else:
        self.fail('Invalid mode: {}'.format(mode))
def create_checkpoint(rnn_weights, rnn_biases, logits_weights, logits_biases,
                      global_step, model_dir):
  """Create checkpoint file with provided model weights.

  Args:
    rnn_weights: Iterable of values of weights for the RNN cell.
    rnn_biases: Iterable of values of biases for the RNN cell.
    logits_weights: Iterable of values for matrix connecting RNN output to
      logits.
    logits_biases: Iterable of values for logits bias term.
    global_step: Initial global step to save in checkpoint.
    model_dir: Directory into which checkpoint is saved.
  """
  model_weights = {}
  model_weights[CELL_WEIGHTS_NAME] = rnn_weights
  model_weights[CELL_BIAS_NAME] = rnn_biases
  model_weights[LOGITS_WEIGHTS_NAME] = logits_weights
  model_weights[LOGITS_BIAS_NAME] = logits_biases

  with ops.Graph().as_default():
    # Create model variables.
    for k, v in six.iteritems(model_weights):
      variables_lib.Variable(v, name=k, dtype=dtypes.float32)

    # Create non-model variables.
    global_step_var = training_util.create_global_step()
    assign_op = global_step_var.assign(global_step)

    # Initialize vars and save checkpoint.
    with monitored_session.MonitoredTrainingSession(
        checkpoint_dir=model_dir) as sess:
      sess.run(assign_op)
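# Minimal usage sketch for create_checkpoint() above (not part of the original
# source). The weight/bias values and the tempfile directory are illustrative
# assumptions; real tests pick values matched to the expected model shapes.
import tempfile

create_checkpoint(
    rnn_weights=[[.1, -.2], [.3, -.4]],   # hypothetical 2x2 cell weights
    rnn_biases=[.5, .5],
    logits_weights=[[-1.], [1.]],
    logits_biases=[0.3],
    global_step=100,
    model_dir=tempfile.mkdtemp())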
def _test_logits(self, mode, rnn_units, logits_dimension, features_fn,
                 sequence_feature_columns, context_feature_columns,
                 expected_logits):
  """Tests that the expected logits are calculated."""
  with ops.Graph().as_default():
    # Global step needed for MonitoredSession, which is in turn used to
    # explicitly set variable weights through a checkpoint.
    training_util.create_global_step()
    # Use a variable scope here with 'rnn', emulating the rnn model_fn, so
    # the checkpoint naming is shared.
    with variable_scope.variable_scope('rnn'):
      input_layer_partitioner = (
          partitioned_variables.min_max_variable_partitioner(
              max_partitions=0, min_slice_size=64 << 20))
      logit_fn = rnn._rnn_logit_fn_builder(
          output_units=logits_dimension,
          rnn_cell_fn=rnn._make_rnn_cell_fn(rnn_units),
          sequence_feature_columns=sequence_feature_columns,
          context_feature_columns=context_feature_columns,
          input_layer_partitioner=input_layer_partitioner)
      # Features are constructed within this function, otherwise the Tensors
      # containing the features would be defined outside this graph.
      logits = logit_fn(features=features_fn(), mode=mode)
      with monitored_session.MonitoredTrainingSession(
          checkpoint_dir=self._model_dir) as sess:
        self.assertAllClose(expected_logits, sess.run(logits), atol=1e-4)
def _test_logits_helper(self, mode):
  """Tests that the expected logits are passed to mock head."""
  with ops.Graph().as_default():
    training_util.get_or_create_global_step()
    generator_inputs = {'x': array_ops.zeros([5, 4])}
    real_data = (None if mode == model_fn_lib.ModeKeys.PREDICT else
                 array_ops.zeros([5, 4]))
    generator_scope_name = 'generator'
    head = mock_head(
        self,
        expected_generator_inputs=generator_inputs,
        expected_real_data=real_data,
        generator_scope_name=generator_scope_name)
    estimator_spec = estimator._gan_model_fn(
        features=generator_inputs,
        labels=real_data,
        mode=mode,
        generator_fn=generator_fn,
        discriminator_fn=discriminator_fn,
        generator_scope_name=generator_scope_name,
        head=head)
    with monitored_session.MonitoredTrainingSession(
        checkpoint_dir=self._model_dir) as sess:
      if mode == model_fn_lib.ModeKeys.TRAIN:
        sess.run(estimator_spec.train_op)
      elif mode == model_fn_lib.ModeKeys.EVAL:
        sess.run(estimator_spec.loss)
      elif mode == model_fn_lib.ModeKeys.PREDICT:
        sess.run(estimator_spec.predictions)
      else:
        self.fail('Invalid mode: {}'.format(mode))
def test_checkpoint_overwrite_warm_start(self):
  extra_run_step = 2
  ws_ckpt_dir = tempfile.mkdtemp(
      prefix=os.path.join(self.get_temp_dir(), "warm_start"))
  final_ckpt_dir = tempfile.mkdtemp(
      prefix=os.path.join(self.get_temp_dir(), "final"))
  for run_id, num_shards, k_dtype, d_dtype, init_mode, dim, run_step \
      in _next_run_step_config():
    error_msg = "Cond:{},{},{},{},{},{}".format(num_shards, k_dtype, d_dtype,
                                                init_mode, dim, run_step)
    with ops.Graph().as_default() as g:
      with self.session(graph=g,
                        use_gpu=test_util.is_gpu_available(),
                        config=default_config) as sess:
        training_util.create_global_step()
        graph = TestGraph(k_dtype, d_dtype, dim, num_shards, 'var', 'devar',
                          run_id)
        self.evaluate(variables.global_variables_initializer())
        sess.run([graph.devar_init_op])
        prev_x = sess.run([graph.x])[0]
        for _ in range(run_step):
          sess.run([graph.var_opt_op, graph.devar_opt_op])
        saver_lib.Saver().save(sess, os.path.join(ws_ckpt_dir, "model"))
        prev_ws_var_loss, prev_ws_devar_loss = sess.run(
            [graph.var_loss, graph.devar_loss])
        self.assertAllCloseAccordingToType(prev_ws_var_loss,
                                           prev_ws_devar_loss,
                                           msg=error_msg)
        for _ in range(extra_run_step):
          sess.run([graph.var_opt_op, graph.devar_opt_op])
        saver_lib.Saver().save(sess, os.path.join(final_ckpt_dir, "model"))
        prev_final_var_loss, prev_final_devar_loss = sess.run(
            [graph.var_loss, graph.devar_loss])
        self.assertAllCloseAccordingToType(prev_final_var_loss,
                                           prev_final_devar_loss,
                                           msg=error_msg)

    with ops.Graph().as_default():
      training_util.create_global_step()
      graph = TestGraph(k_dtype, d_dtype, dim, num_shards, 'var', 'devar',
                        run_id, prev_x)
      ws_util.warm_start(ws_ckpt_dir, vars_to_warm_start=['.*'])
      with monitored_session.MonitoredTrainingSession(
          config=default_config, is_chief=True,
          checkpoint_dir=final_ckpt_dir) as sess:
        var_loss, devar_loss = sess.run([graph.var_loss, graph.devar_loss])
        self.assertAllCloseAccordingToType(var_loss,
                                           prev_final_var_loss,
                                           msg=error_msg)
        self.assertAllCloseAccordingToType(devar_loss,
                                           prev_final_devar_loss,
                                           msg=error_msg)
def common_minimize_trainable(self, base_opt, test_opt, name):
  tf.config.set_soft_device_placement(True)
  hvd.init()
  base_opt = de.DynamicEmbeddingOptimizer(base_opt, synchronous=True)
  for dtype, run_step, dim in itertools.product([dtypes.float32], [1], [10]):
    x = tf.random.uniform(shape=[32, dim])
    y = tf.zeros([32, 1])

    global_step = training_util.create_global_step()

    base_weight = tf.compat.v1.get_variable(name="base_weights",
                                            initializer=tf.ones([10, 1]))
    base_logits = tf.nn.relu(math_ops.matmul(x, base_weight))
    base_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y,
                                                        logits=base_logits)
    base_opt_op = base_opt.minimize(base_loss, global_step,
                                    var_list=[base_weight])

    test_weight = tf.compat.v1.get_variable(name="test_weights",
                                            initializer=tf.ones([10, 1]))
    test_logits = tf.nn.relu(math_ops.matmul(x, test_weight))
    test_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y,
                                                        logits=test_logits)
    grads_and_vars = test_opt.compute_gradients(test_loss,
                                                var_list=[test_weight])
    var_list = []
    aggregated_grad = []
    for grad, var in grads_and_vars:
      var_list.append(var)
      aggregated_grad.append(hvd.allreduce(grad, op=hvd.Sum))
    aggregated_grads_and_vars = zip(aggregated_grad, var_list)
    test_opt_op = test_opt.apply_gradients(aggregated_grads_and_vars,
                                           global_step)

    with monitored_session.MonitoredTrainingSession(
        is_chief=True, config=default_config) as sess:
      for _ in range(run_step):
        sess.run(base_opt_op)
        sess.run(test_opt_op)

      self.assertAllCloseAccordingToType(
          sess.run(base_weight),
          sess.run(test_weight),
          msg="Cond:{},{},{}".format(dtype, run_step, dim),
      )
def test_multi_feature_column_multi_dim_logits(self):
  """Tests multiple feature columns and multi-dimensional logits.

  All numbers are the same as test_multi_dim_input_multi_dim_logits. The only
  difference is that the input consists of two 1D feature columns, instead of
  one 2D feature column.
  """
  base_global_step = 100
  _create_checkpoint((
      ([[.6, .5], [-.6, -.5]], [.1, -.1]),
      ([[1., .8], [-.8, -1.]], [.2, -.2]),
      ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),
  ), base_global_step, self._model_dir)
  hidden_units = (2, 2)
  logits_dimension = 3
  inputs = ([[10.]], [[8.]])
  expected_logits = [[-0.48, 0.48, 0.39]]

  for mode in [
      model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
      model_fn.ModeKeys.PREDICT
  ]:
    with ops.Graph().as_default():
      training_util.create_global_step()
      head = _mock_head(
          self,
          hidden_units=hidden_units,
          logits_dimension=logits_dimension,
          expected_logits=expected_logits)
      estimator_spec = dnn._dnn_model_fn(
          features={
              'age': constant_op.constant(inputs[0]),
              'height': constant_op.constant(inputs[1])
          },
          labels=constant_op.constant([[1]]),
          mode=mode,
          head=head,
          hidden_units=hidden_units,
          feature_columns=[
              feature_column.numeric_column('age'),
              feature_column.numeric_column('height')
          ],
          optimizer=_mock_optimizer(self, hidden_units))
      with monitored_session.MonitoredTrainingSession(
          checkpoint_dir=self._model_dir) as sess:
        if mode == model_fn.ModeKeys.TRAIN:
          sess.run(estimator_spec.train_op)
        elif mode == model_fn.ModeKeys.EVAL:
          sess.run(estimator_spec.loss)
        elif mode == model_fn.ModeKeys.PREDICT:
          sess.run(estimator_spec.predictions)
        else:
          self.fail('Invalid mode: {}'.format(mode))
def test_reads_before_increments(self):
  with ops.Graph().as_default():
    training_util.create_global_step()
    read_tensor = training_util._get_or_create_global_step_read()
    inc_op = training_util._increment_global_step(1)
    inc_three_op = training_util._increment_global_step(3)
    with monitored_session.MonitoredTrainingSession() as sess:
      read_value, _ = sess.run([read_tensor, inc_op])
      self.assertEqual(0, read_value)
      read_value, _ = sess.run([read_tensor, inc_three_op])
      self.assertEqual(1, read_value)
      read_value = sess.run(read_tensor)
      self.assertEqual(4, read_value)
def test_inference_numberic_correctness(self):
  train_pred = None
  infer_pred = None
  dim = 8
  initializer = init_ops.random_normal_initializer(0.0, 0.001)
  raw_init_vals = np.random.rand(100, dim)

  for fn in [de.enable_train_mode, de.enable_inference_mode]:
    with ops.Graph().as_default():
      fn()

      init_ids = constant_op.constant(list(range(100)), dtype=dtypes.int64)
      init_vals = constant_op.constant(raw_init_vals, dtype=dtypes.float32)
      with variable_scope.variable_scope("modelmode",
                                         reuse=variable_scope.AUTO_REUSE):
        embeddings = de.get_variable('ModelModeTest-numberic',
                                     key_dtype=dtypes.int64,
                                     value_dtype=dtypes.float32,
                                     devices=_get_devices() * 2,
                                     initializer=initializer,
                                     dim=dim)

        w = variables.Variable(1.0, name="w")
        _ = training_util.create_global_step()
      init_op = embeddings.upsert(init_ids, init_vals)

      ids = constant_op.constant([0, 1, 2, 3, 4], dtype=dtypes.int64)
      test_var, trainable = de.embedding_lookup([embeddings],
                                                ids,
                                                return_trainable=True)
      pred = math_ops.add(test_var, 1) * w
      loss = pred * pred
      opt = de.DynamicEmbeddingOptimizer(adagrad.AdagradOptimizer(0.1))
      opt.minimize(loss)

      with monitored_session.MonitoredTrainingSession(
          is_chief=True, config=default_config) as sess:
        if de.get_model_mode() == de.ModelMode.TRAIN:
          sess.run(init_op)
          train_pred = sess.run(pred)
        elif de.get_model_mode() == de.ModelMode.INFERENCE:
          sess.run(init_op)
          infer_pred = sess.run(pred)
    de.enable_train_mode()
    ops.reset_default_graph()
  self.assertAllEqual(train_pred, infer_pred)
def testTrainingLoop(self):
  random_seed.set_random_seed(1)

  # Model
  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("vs", use_resource=True):
      x = array_ops.placeholder(np.float32, [4, 1, 4], name="a")
      l = array_ops.placeholder(np.float32, [4, 1, 1], name="b")

      y = layers.dense(x, 1, activation=nn.sigmoid)

      loss = losses.log_loss(l, y)
      train_op = gradient_descent.GradientDescentOptimizer(0.1) \
          .minimize(loss)

      init = variables.global_variables_initializer()

  # Test data
  image_data = [[[1, 1, 1, 1]], [[2, 2, 2, 2]], [[3, 3, 3, 3]],
                [[4, 4, 4, 4]]]
  label_data = [[[1]], [[2]], [[3]], [[4]]]

  # Run training.
  with ms.MonitoredTrainingSession(is_chief=True,
                                   chief_only_hooks=None,
                                   save_summaries_steps=None,
                                   save_summaries_secs=None) as sess:
    sess.run(init)

    previous_loss = float("inf")
    for _ in range(5):
      measured_loss, _ = sess.run([loss, train_op],
                                  feed_dict={
                                      x: image_data,
                                      l: label_data
                                  })
      self.assertTrue(measured_loss < previous_loss)
      previous_loss = measured_loss
def _train_model(self, checkpoint_dir, num_steps):
  """Trains a simple classification model.

  Note that the data has been configured such that after around 300 steps,
  the model has memorized the dataset (e.g. we can expect 100% accuracy).

  Args:
    checkpoint_dir: The directory where the checkpoint is written to.
    num_steps: The number of steps to train for.
  """
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

    tf_predictions = logistic_classifier(tf_inputs)
    loss_op = losses.log_loss(labels=tf_labels, predictions=tf_predictions)

    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
    train_op = optimizer.minimize(loss_op,
                                  training.get_or_create_global_step())

    with monitored_session.MonitoredTrainingSession(
        checkpoint_dir=checkpoint_dir,
        hooks=[basic_session_run_hooks.StopAtStepHook(num_steps)]) as session:
      loss = None
      while not session.should_stop():
        _, loss = session.run([train_op, loss_op])

      if num_steps >= 300:
        assert loss < .015
def common_minimize_trainable(self, base_opt, test_opt, name):
  from tensorflow.python.framework.errors_impl import NotFoundError

  # TODO(rhdong): Recover the testing, if the horovod import error is fixed
  # on macOS+TF2.7+.
  try:
    import horovod.tensorflow as hvd
  except NotFoundError:
    self.skipTest(
        "Skip the test for horovod import error with Tensorflow-2.7.0 on MacOS-12."
    )

  tf.config.set_soft_device_placement(True)
  hvd.init()
  base_opt = de.DynamicEmbeddingOptimizer(base_opt, synchronous=True)
  for dtype, run_step, dim in itertools.product([dtypes.float32], [1], [10]):
    x = tf.random.uniform(shape=[32, dim])
    y = tf.zeros([32, 1])

    global_step = training_util.create_global_step()

    base_weight = tf.compat.v1.get_variable(name="base_weights",
                                            initializer=tf.ones([10, 1]))
    base_logits = tf.nn.relu(math_ops.matmul(x, base_weight))
    base_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y,
                                                        logits=base_logits)
    base_opt_op = base_opt.minimize(base_loss, global_step,
                                    var_list=[base_weight])

    test_weight = tf.compat.v1.get_variable(name="test_weights",
                                            initializer=tf.ones([10, 1]))
    test_logits = tf.nn.relu(math_ops.matmul(x, test_weight))
    test_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y,
                                                        logits=test_logits)
    grads_and_vars = test_opt.compute_gradients(test_loss,
                                                var_list=[test_weight])
    var_list = []
    aggregated_grad = []
    for grad, var in grads_and_vars:
      var_list.append(var)
      aggregated_grad.append(hvd.allreduce(grad, op=hvd.Sum))
    aggregated_grads_and_vars = zip(aggregated_grad, var_list)
    test_opt_op = test_opt.apply_gradients(aggregated_grads_and_vars,
                                           global_step)

    with monitored_session.MonitoredTrainingSession(
        is_chief=True, config=default_config) as sess:
      for _ in range(run_step):
        sess.run(base_opt_op)
        sess.run(test_opt_op)

      self.assertAllCloseAccordingToType(
          sess.run(base_weight),
          sess.run(test_weight),
          msg="Cond:{},{},{}".format(dtype, run_step, dim),
      )
def _train_model(self, env, first_update, update_frequency, hooks):
  all_hooks = []
  self._graph = ops.Graph()
  with self._graph.as_default() as g, g.device(self._device_fn):
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step = training.get_or_create_global_step(g)
    global_episode = get_or_create_global_episode(g)
    global_timestep = get_or_create_global_timestep(g)
    update_episode_op = tf.assign_add(global_episode, 1)
    update_timestep_op = tf.assign_add(global_timestep, 1)
    no_run_hooks = tf.no_op(name='no_run_hooks')
    with ops.device('/cpu:0'):
      features, labels = self._prepare_input_fn(Modes.TRAIN, env)
    estimator_spec = self._call_model_fn(features, labels, Modes.TRAIN)
    ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
    all_hooks.extend([
        plx_hooks.NanTensorHook(estimator_spec.loss),
        plx_hooks.StepLoggingTensorHook(
            {
                'loss': estimator_spec.loss,
                'step': global_step,
                'timestep': global_timestep,
                'global_episode': global_episode,
                'max_reward': labels['max_reward'],
                'min_reward': labels['min_reward'],
                'total_reward': labels['total_reward'],
            },
            every_n_iter=100)
    ])
    all_hooks.extend(hooks)
    all_hooks.extend(estimator_spec.training_hooks)

    scaffold = estimator_spec.scaffold or monitored_session.Scaffold()
    if not (scaffold.saver or ops.get_collection(ops.GraphKeys.SAVERS)):
      ops.add_to_collection(
          ops.GraphKeys.SAVERS,  # TODO remove non restorable vars
          saver.Saver(
              sharded=True,  # TODO `var_list`
              max_to_keep=self._config.keep_checkpoint_max,
              defer_build=True))

    chief_hooks = [
        plx_hooks.EpisodeLoggingTensorHook(
            {
                'loss': estimator_spec.loss,
                'step': global_step,
                'global_timestep': global_timestep,
                'global_episode': global_episode,
                'max_reward': labels['max_reward'],
                'min_reward': labels['min_reward'],
                'total_reward': labels['total_reward'],
            },
            every_n_episodes=1),  # TODO: save every episode?
        plx_hooks.EpisodeCounterHook(output_dir=self.model_dir)
    ]
    if self._config.save_checkpoints_secs or self._config.save_checkpoints_steps:
      saver_hook_exists = any([
          isinstance(h, plx_hooks.EpisodeCheckpointSaverHook)
          for h in (all_hooks + chief_hooks +
                    list(estimator_spec.training_chief_hooks))
      ])
      if not saver_hook_exists:
        chief_hooks += [
            plx_hooks.EpisodeCheckpointSaverHook(
                self._model_dir,
                save_episodes=1,  # TODO: save every episode?
                scaffold=scaffold)
        ]
    if self._config.save_summary_steps:
      saver_hook_exists = any([
          isinstance(h, plx_hooks.EpisodeSummarySaverHook)
          for h in (all_hooks + chief_hooks +
                    list(estimator_spec.training_chief_hooks))
      ])
      if not saver_hook_exists:
        chief_hooks += [
            plx_hooks.EpisodeSummarySaverHook(
                scaffold=scaffold,
                save_episodes=1,  # TODO: save every episode?
                output_dir=self._model_dir,
            )
        ]
    with monitored_session.MonitoredTrainingSession(
        master=self._config.master,
        is_chief=self._config.is_chief,
        checkpoint_dir=self._model_dir,
        scaffold=scaffold,
        hooks=all_hooks,
        chief_only_hooks=chief_hooks +
        list(estimator_spec.training_chief_hooks),
        save_checkpoint_secs=0,  # Saving checkpoint is handled by a hook.
        save_summaries_steps=0,  # Saving summaries is handled by a hook.
        config=self._session_config) as mon_sess:
      loss = None
      while not mon_sess.should_stop():
        loss = self.run_episode(
            env=env,
            sess=mon_sess,
            features=features,
            labels=labels,
            no_run_hooks=no_run_hooks,
            global_step=global_step,
            update_episode_op=update_episode_op,
            update_timestep_op=update_timestep_op,
            first_update=first_update,
            update_frequency=update_frequency,
            estimator_spec=estimator_spec)
    summary_io.SummaryWriterCache.clear()
    return loss
def common_minimize_trainable(self, base_opt, test_opt, name):
  base_opt = de.DynamicEmbeddingOptimizer(base_opt)
  test_opt = de.DynamicEmbeddingOptimizer(test_opt)
  id = 0
  for (num_shards, k_dtype, d_dtype, initial_mode, dim,
       run_step) in itertools.product([3], [dtypes.int64], [dtypes.float32],
                                      ["constant"], [1, 10], [10]):
    with ops.Graph().as_default():
      id += 1
      raw_init_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
      raw_init_vals = [
          [x] * dim
          for x in [0.0, 0.1, 0.3, 0.8, 0.16, 0.25, 0.36, 0.49, 0.64, 0.81]
      ]
      raw_ids = constant_op.constant([1, 3, 3, 9], dtype=k_dtype)
      sp_ids = sparse_tensor.SparseTensor(
          indices=[[0, 0], [0, 1], [1, 0], [2, 1]],
          values=raw_ids,
          dense_shape=[3, 2],
      )
      x = constant_op.constant([[_x * dim] for _x in [[0.4], [0.5], [0.6]]],
                               dtype=d_dtype)
      x = array_ops.reshape(x, shape=(3 * dim, 1))

      # base var prepare
      base_var = variables.Variable(
          np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
          dtype=d_dtype,
          shape=[len(raw_init_ids), dim],
      )

      # test var prepare
      embeddings = de.get_variable(
          "t1030-" + name + str(id),
          key_dtype=k_dtype,
          value_dtype=d_dtype,
          devices=_get_devices() * num_shards,
          initializer=1.0,
          dim=dim,
      )
      init_ids = constant_op.constant(raw_init_ids, dtype=k_dtype)
      init_vals = constant_op.constant(raw_init_vals, dtype=d_dtype)
      init_op = embeddings.upsert(init_ids, init_vals)

      # base branch
      base_embedding = embedding_ops.embedding_lookup_sparse(base_var,
                                                             sp_ids,
                                                             None,
                                                             combiner="sum")
      base_embedding = array_ops.reshape(base_embedding, shape=[1, 3 * dim])
      pred0 = math_ops.matmul(base_embedding, x)
      loss0 = pred0 * pred0
      base_opt_op = base_opt.minimize(loss0, var_list=[base_var])

      # test branch
      test_var, trainable = de.embedding_lookup_sparse(
          embeddings,
          sp_ids,
          sp_weights=None,
          combiner="sum",
          return_trainable=True,
      )
      pred1 = math_ops.matmul(array_ops.reshape(test_var, shape=[1, 3 * dim]),
                              x)
      loss1 = pred1 * pred1
      gstep = training_util.create_global_step()
      test_opt_op = test_opt.minimize(loss1,
                                      var_list=[trainable],
                                      global_step=gstep)
      table_var = array_ops.reshape(embeddings.lookup(init_ids),
                                    shape=[10, dim])

      with monitored_session.MonitoredTrainingSession(
          is_chief=True, config=default_config) as sess:
        sess.run(init_op)
        self.assertAllCloseAccordingToType(
            np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
            sess.run(base_var),
        )

        # run base
        for _ in range(run_step):
          sess.run(base_opt_op)
          sess.run(test_opt_op)

        # Validate global_step
        self.assertEqual(run_step, sess.run(gstep))

        # Validate updated params
        self.assertAllCloseAccordingToType(
            sess.run(base_var),
            sess.run(table_var),
            msg="Cond:{},{},{},{},{}".format(num_shards, k_dtype, d_dtype,
                                             dim, run_step),
        )
        self.device_check(embeddings)
def _train_model(self, input_fn, hooks):
  all_hooks = []
  self._graph = ops.Graph()
  with self._graph.as_default() as g, g.device(self._device_fn):
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step = training.get_or_create_global_step(g)
    features, labels = input_fn()
    estimator_spec = self._call_model_fn(features, labels, ModeKeys.TRAIN)
    all_hooks.extend([
        plx_hooks.NanTensorHook(estimator_spec.loss),
        plx_hooks.LoggingTensorHook(
            {
                'loss': estimator_spec.loss,
                'step': global_step
            },
            every_n_iter=100)
    ])
    all_hooks.extend(hooks)
    all_hooks.extend(estimator_spec.training_hooks)

    scaffold = estimator_spec.scaffold or monitored_session.Scaffold()
    if not (scaffold.saver or ops.get_collection(ops.GraphKeys.SAVERS)):
      ops.add_to_collection(
          ops.GraphKeys.SAVERS,  # TODO remove non restorable vars
          saver.Saver(
              sharded=True,  # TODO `var_list`
              max_to_keep=self._config.keep_checkpoint_max,
              defer_build=True))

    chief_hooks = []
    if self._config.save_checkpoints_secs or self._config.save_checkpoints_steps:
      saver_hook_exists = any([
          isinstance(h, plx_hooks.CheckpointSaverHook)
          for h in (all_hooks + estimator_spec.training_hooks + chief_hooks +
                    estimator_spec.training_chief_hooks)
      ])
      if not saver_hook_exists:
        chief_hooks = [
            plx_hooks.CheckpointSaverHook(
                self._model_dir,
                save_secs=self._config.save_checkpoints_secs,
                save_steps=self._config.save_checkpoints_steps,
                scaffold=scaffold)
        ]
    with monitored_session.MonitoredTrainingSession(
        master=self._config.master,
        is_chief=self._config.is_chief,
        checkpoint_dir=self._model_dir,
        scaffold=scaffold,
        hooks=all_hooks + estimator_spec.training_hooks,
        chief_only_hooks=chief_hooks + estimator_spec.training_chief_hooks,
        save_checkpoint_secs=0,  # Saving is handled by a hook.
        save_summaries_steps=self._config.save_summary_steps,
        config=self._session_config) as mon_sess:
      loss = None
      while not mon_sess.should_stop():
        _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
    summary_io.SummaryWriterCache.clear()
    return loss
    gradient_penalty_weight=1.0)
l1_loss = tf.norm(gan_model.real_data - gan_model.generated_data, ord=1)
gan_loss = tfgan.losses.combine_adversarial_loss(
    gan_loss, gan_model, l1_loss, weight_factor=FLAGS.weight_factor)

train_ops = tfgan.gan_train_ops(
    gan_model,
    gan_loss,
    generator_optimizer=tf.train.AdamOptimizer(
        learning_rate=0.001, beta1=0.85, beta2=0.999, epsilon=1e-5),
    discriminator_optimizer=tf.train.AdamOptimizer(
        learning_rate=0.000001, beta1=0.85, beta2=0.999, epsilon=1e-5))

# train_ops.global_step_inc_op = tf.train.get_global_step().assign_add(1)
# store_output_and_check_loss(gan_loss, gan_model.generated_data,
#                             gan_model.real_data, num_of_samples=3,
#                             prefix='gen', logdir=log_folder)
global_step_tensor = tf.Variable(1, trainable=False, name='global_step')
global_step = tf.train.get_or_create_global_step()
train_step_fn = tfgan.get_sequential_train_steps(
    train_steps=tf.contrib.gan.GANTrainSteps(10, 10))

with monitored_session.MonitoredTrainingSession(
    checkpoint_dir=log_folder) as session:
  loss = None
  for y in xrange(1, 20):
    for x in xrange(0, 500):
      cur_loss, _ = train_step_fn(session, train_ops, global_step,
                                  train_step_kwargs={})
      gen_loss_np = session.run(gan_loss.generator_loss)
      dis_loss_np = session.run(gan_loss.discriminator_loss)
    if gen_loss_np < 170:
      store_output_and_check_loss(
          session,
          gan_loss,
          gan_model.generated_data,
          prefix='final_l_' + str(round(gen_loss_np)) + '_' +
          str(NUMBER_OF_NOTES) + '_gen_',
          play=False,
          num_of_samples=30)
    print('iteration:' + str(y * x))
    print('Generator loss: %f' % gen_loss_np)
    print('Discriminator loss: %f' % dis_loss_np)
def _train_model(self, input_fn, hooks):
  all_hooks = []
  with ops.Graph().as_default() as g, g.device(self._device_fn):
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step = training.get_or_create_global_step(g)
    features, labels = self._get_features_and_labels_from_input_fn(
        input_fn, Modes.TRAIN)
    estimator_spec = self._call_model_fn(features, labels, Modes.TRAIN)
    ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
    all_hooks.extend([
        plx_hooks.NanTensorHook(estimator_spec.loss),
        plx_hooks.StepLoggingTensorHook(
            {
                'loss': estimator_spec.loss,
                'step': global_step
            },
            every_n_iter=100)
    ])
    all_hooks.extend(hooks)
    all_hooks.extend(estimator_spec.training_hooks)

    scaffold = estimator_spec.scaffold
    if not (scaffold.saver or ops.get_collection(ops.GraphKeys.SAVERS)):
      ops.add_to_collection(
          ops.GraphKeys.SAVERS,  # TODO remove non restorable vars
          saver.Saver(
              sharded=True,
              max_to_keep=self._config.keep_checkpoint_max,
              keep_checkpoint_every_n_hours=(
                  self._config.keep_checkpoint_every_n_hours),
              defer_build=True,
              save_relative_paths=True))

    chief_hooks = []
    if self._config.save_checkpoints_secs or self._config.save_checkpoints_steps:
      saver_hook_exists = any([
          isinstance(h, plx_hooks.StepCheckpointSaverHook)
          for h in (all_hooks + chief_hooks +
                    list(estimator_spec.training_chief_hooks))
      ])
      if not saver_hook_exists:
        chief_hooks += [
            plx_hooks.StepCheckpointSaverHook(
                self._model_dir,
                save_secs=self._config.save_checkpoints_secs,
                save_steps=self._config.save_checkpoints_steps,
                scaffold=scaffold)
        ]
    if self._config.save_summary_steps:
      saver_hook_exists = any([
          isinstance(h, plx_hooks.StepSummarySaverHook)
          for h in (all_hooks + chief_hooks +
                    list(estimator_spec.training_chief_hooks))
      ])
      if not saver_hook_exists:
        chief_hooks += [
            plx_hooks.StepSummarySaverHook(
                scaffold=scaffold,
                save_steps=self._config.save_summary_steps,
                output_dir=self._model_dir,
            )
        ]
    with monitored_session.MonitoredTrainingSession(
        master=self._config.master,
        is_chief=self._config.is_chief,
        checkpoint_dir=self._model_dir,
        scaffold=scaffold,
        hooks=all_hooks,
        chief_only_hooks=chief_hooks +
        list(estimator_spec.training_chief_hooks),
        save_checkpoint_secs=0,  # Saving checkpoint is handled by a hook.
        save_summaries_steps=0,  # Saving summaries is handled by a hook.
        config=self._session_config) as mon_sess:
      loss = None
      while not mon_sess.should_stop():
        _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
    return loss
def train(train_op,
          logdir,
          master='',
          is_chief=True,
          scaffold=None,
          hooks=None,
          chief_only_hooks=None,
          save_checkpoint_secs=600,
          save_summaries_steps=100,
          config=None):
  """Runs the training loop.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where the graph and checkpoints are saved.
    master: The URL of the master.
    is_chief: Specifies whether or not the training is being run by the
      primary replica during replica training.
    scaffold: A tf.train.Scaffold instance.
    hooks: List of `tf.train.SessionRunHook` callbacks which are run inside
      the training loop.
    chief_only_hooks: List of `tf.train.SessionRunHook` instances which are
      run inside the training loop for the chief trainer only.
    save_checkpoint_secs: The frequency, in seconds, that a checkpoint is
      saved using a default checkpoint saver. If `save_checkpoint_secs` is set
      to `None`, then the default checkpoint saver isn't used.
    save_summaries_steps: The frequency, in number of global steps, that the
      summaries are written to disk using a default summary saver. If
      `save_summaries_steps` is set to `None`, then the default summary saver
      isn't used.
    config: An instance of `tf.ConfigProto`.

  Returns:
    The value of the loss function after training.

  Raises:
    ValueError: if `logdir` is `None` and either `save_checkpoint_secs` or
      `save_summaries_steps` are `None`.
  """
  if logdir is None and is_chief:
    if save_summaries_steps:
      raise ValueError(
          'logdir cannot be None when save_summaries_steps is not None')
    if save_checkpoint_secs:
      raise ValueError(
          'logdir cannot be None when save_checkpoint_secs is not None')

  with monitored_session.MonitoredTrainingSession(
      master=master,
      is_chief=is_chief,
      checkpoint_dir=logdir,
      scaffold=scaffold,
      hooks=hooks,
      chief_only_hooks=chief_only_hooks,
      save_checkpoint_secs=save_checkpoint_secs,
      save_summaries_steps=save_summaries_steps,
      config=config) as session:
    loss = None
    while not session.should_stop():
      loss = session.run(train_op)
  return loss
def train(train_op,
          logdir,
          master='',
          is_chief=True,
          scaffold=None,
          hooks=None,
          chief_only_hooks=None,
          save_checkpoint_secs=600,
          save_summaries_steps=100,
          config=None,
          max_wait_secs=7200,
          run_metadata=None):
  """Runs the training loop.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where the graph and checkpoints are saved.
    master: The URL of the master.
    is_chief: Specifies whether or not the training is being run by the
      primary replica during replica training.
    scaffold: A tf.train.Scaffold instance.
    hooks: List of `tf.train.SessionRunHook` callbacks which are run inside
      the training loop.
    chief_only_hooks: List of `tf.train.SessionRunHook` instances which are
      run inside the training loop for the chief trainer only.
    save_checkpoint_secs: The frequency, in seconds, that a checkpoint is
      saved using a default checkpoint saver. If `save_checkpoint_secs` is set
      to `None`, then the default checkpoint saver isn't used.
    save_summaries_steps: The frequency, in number of global steps, that the
      summaries are written to disk using a default summary saver. If
      `save_summaries_steps` is set to `None`, then the default summary saver
      isn't used.
    config: An instance of `tf.ConfigProto`.
    max_wait_secs: Maximum time workers should wait for the session to become
      available. This should be kept relatively short to help detect incorrect
      code, but sometimes may need to be increased if the chief takes a while
      to start up.
    run_metadata: A [`RunMetadata`] protocol buffer.

  Returns:
    The value of the loss function after training.

  Raises:
    ValueError: if `logdir` is `None` and either `save_checkpoint_secs` or
      `save_summaries_steps` are `None`.
  """
  if logdir is None and is_chief:
    if save_summaries_steps:
      raise ValueError(
          'logdir cannot be None when save_summaries_steps is not None')
    if save_checkpoint_secs:
      raise ValueError(
          'logdir cannot be None when save_checkpoint_secs is not None')

  with monitored_session.MonitoredTrainingSession(
      master=master,
      is_chief=is_chief,
      checkpoint_dir=logdir,
      scaffold=scaffold,
      hooks=hooks,
      chief_only_hooks=chief_only_hooks,
      save_checkpoint_secs=save_checkpoint_secs,
      save_summaries_steps=save_summaries_steps,
      config=config,
      max_wait_secs=max_wait_secs) as session:
    loss = None
    while not session.should_stop():
      loss = session.run(train_op, run_metadata=run_metadata)
  return loss
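# Minimal usage sketch for the train() wrapper above (not part of the original
# source). The toy loss, the '/tmp/train_demo' directory, and the stopping
# hook are illustrative assumptions; only the train() call itself follows the
# documented contract (a train_op that returns the loss when executed).
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

w = tf.Variable(5.0)
loss = tf.square(w - 3.0)
global_step = tf.train.get_or_create_global_step()
opt = tf.train.GradientDescentOptimizer(0.1)
# Tie the minimize step to an identity on the loss so running train_op both
# applies the gradients and returns the loss value, as train() expects.
with tf.control_dependencies([opt.minimize(loss, global_step=global_step)]):
  train_op = tf.identity(loss)

final_loss = train(
    train_op,
    logdir='/tmp/train_demo',  # hypothetical checkpoint directory
    hooks=[tf.train.StopAtStepHook(last_step=100)])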
def common_minimize_trainable(self, base_opt, test_opt, name):
  if test_util.is_gpu_available():
    keys_type_list = [dtypes.int64]
  else:
    keys_type_list = [dtypes.int64, dtypes.string]
  deo.enable_train_mode()
  for run_id, num_shards, k_dtype, d_dtype, initial_mode, dim, run_step \
      in _next_run_step_config(keys_type_list):
    with ops.Graph().as_default():
      raw_init_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
      if k_dtype == dtypes.string:
        raw_init_ids = [str(i) for i in raw_init_ids]
      raw_init_vals = [
          [x] * dim
          for x in [0.0, 0.1, 0.3, 0.8, 0.16, 0.25, 0.36, 0.49, 0.64, 0.81]
      ]
      raw_ids_py = [1, 3, 3, 9]
      raw_ids_nn = constant_op.constant(raw_ids_py, dtype=dtypes.int64)
      raw_ids_de = raw_ids_nn
      if k_dtype == dtypes.string:
        raw_ids_de = constant_op.constant([str(i) for i in raw_ids_py],
                                          dtype=k_dtype)

      sp_ids_nn = sparse_tensor.SparseTensor(
          indices=[[0, 0], [0, 1], [1, 0], [2, 1]],
          values=raw_ids_nn,
          dense_shape=[3, 2])
      sp_ids_de = sparse_tensor.SparseTensor(
          indices=[[0, 0], [0, 1], [1, 0], [2, 1]],
          values=raw_ids_de,
          dense_shape=[3, 2])

      x = constant_op.constant([[_x * dim] for _x in [[0.4], [0.5], [0.6]]],
                               dtype=d_dtype)
      x = array_ops.reshape(x, shape=(3 * dim, 1))

      # base var prepare
      base_var = variables.Variable(
          np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
          dtype=d_dtype,
          shape=[len(raw_init_ids), dim])

      # test var prepare
      embeddings = deo.get_variable('t1030-' + name + str(run_id),
                                    key_dtype=k_dtype,
                                    value_dtype=d_dtype,
                                    devices=_get_devices() * num_shards,
                                    initializer=1.,
                                    dim=dim)
      init_ids = constant_op.constant(raw_init_ids, dtype=k_dtype)
      init_vals = constant_op.constant(raw_init_vals, dtype=d_dtype)
      init_op = embeddings.upsert(init_ids, init_vals)

      # base branch
      base_embedding = embedding_ops.embedding_lookup_sparse(base_var,
                                                             sp_ids_nn,
                                                             None,
                                                             combiner='sum')
      base_embedding = array_ops.reshape(base_embedding, shape=[1, 3 * dim])
      pred0 = math_ops.matmul(base_embedding, x)
      loss0 = pred0 * pred0
      base_opt_op = base_opt.minimize(loss0, var_list=[base_var])

      # test branch
      test_var, trainable = deo.embedding_lookup_sparse(
          embeddings,
          sp_ids_de,
          sp_weights=None,
          combiner="sum",
          return_trainable=True)
      pred1 = math_ops.matmul(array_ops.reshape(test_var, shape=[1, 3 * dim]),
                              x)
      loss1 = pred1 * pred1
      gstep = training_util.create_global_step()
      test_opt_op = test_opt.minimize(loss1,
                                      var_list=[trainable],
                                      global_step=gstep)
      table_var = array_ops.reshape(embeddings.lookup(init_ids),
                                    shape=[10, dim])

      with monitored_session.MonitoredTrainingSession(
          is_chief=True, config=default_config) as sess:
        sess.run(init_op)
        self.assertAllCloseAccordingToType(
            np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
            sess.run(base_var))

        # run base
        for _ in range(run_step):
          sess.run(base_opt_op)
          sess.run(test_opt_op)

        # Validate global_step
        self.assertEqual(run_step, sess.run(gstep))

        # Validate updated params
        self.assertAllCloseAccordingToType(
            sess.run(base_var),
            sess.run(table_var),
            msg="Cond:{},{},{},{},{}".format(num_shards, k_dtype, d_dtype,
                                             dim, run_step))
        self.device_check(embeddings)