    def test_saving_restoring_checkpoint(self):

        logdir = _test_dir(self.get_temp_dir(),
                           "test_saving_restoring_checkpoint")
        with ops.Graph().as_default():
            gstep = training_util.create_global_step()
            do_step = state_ops.assign_add(gstep, 1)

            v0 = variables.Variable(10.0, name="v0")
            v1 = variables.Variable(20.0, name="v1")

            target_values = [[0.0], [1.0], [2.0]]
            keys = array_ops.placeholder(dtypes.int64)
            values = constant_op.constant(target_values, dtypes.float32)

            table = de.Variable(
                key_dtype=dtypes.int64,
                value_dtype=dtypes.float32,
                initializer=-1.0,
                name="m100",
                dim=1,
            )
            upsert_op = table.upsert(keys, values)
            lookup_op = table.lookup(keys)
            size_op = table.size()
            with monitored_session.MonitoredTrainingSession(
                    config=default_config, is_chief=True,
                    checkpoint_dir=logdir) as sess:
                self.assertEqual(0, sess.run(gstep))
                self.assertEqual(1, sess.run(do_step))
                self.assertEqual(2, sess.run(do_step))

                # Check that the parameter nodes have been initialized.
                self.assertEqual(10.0, sess.run(v0))
                self.assertEqual(20.0, sess.run(v1))
                self.assertAllEqual(0, sess.run(size_op))
                sess.run(upsert_op, feed_dict={keys: [0, 1, 2]})
                self.assertAllEqual(3, sess.run(size_op))
                self.device_check(table)

            # A restart will find the checkpoint and recover automatically.
            with monitored_session.MonitoredTrainingSession(
                    config=default_config, is_chief=True,
                    checkpoint_dir=logdir) as sess:
                self.assertEqual(2, sess.run(gstep))
                self.assertAllEqual(3, sess.run(table.size()))
                self.assertAllEqual(
                    target_values,
                    sess.run(lookup_op, feed_dict={keys: [0, 1, 2]}))

                self.device_check(table)
Example #2
 def _test_logits(self, mode, hidden_units, logits_dimension, inputs,
                  expected_logits):
     """Tests that the expected logits are passed to mock head."""
     with ops.Graph().as_default():
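         # MonitoredTrainingSession (used below) requires a global step in the graph.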
         training_util.create_global_step()
         head = _mock_head(self,
                           hidden_units=hidden_units,
                           logits_dimension=logits_dimension,
                           expected_logits=expected_logits)
         estimator_spec = dnn._dnn_model_fn(
             features={'age': constant_op.constant(inputs)},
             labels=constant_op.constant([[1]]),
             mode=mode,
             head=head,
             hidden_units=hidden_units,
             feature_columns=[
                 feature_column.numeric_column(
                     'age', shape=np.array(inputs).shape[1:])
             ],
             optimizer=_mock_optimizer(self, hidden_units))
         with monitored_session.MonitoredTrainingSession(
                 checkpoint_dir=self._model_dir) as sess:
             if mode == model_fn.ModeKeys.TRAIN:
                 sess.run(estimator_spec.train_op)
             elif mode == model_fn.ModeKeys.EVAL:
                 sess.run(estimator_spec.loss)
             elif mode == model_fn.ModeKeys.PREDICT:
                 sess.run(estimator_spec.predictions)
             else:
                 self.fail('Invalid mode: {}'.format(mode))
Example #3
def create_checkpoint(rnn_weights, rnn_biases, logits_weights, logits_biases,
                      global_step, model_dir):
    """Create checkpoint file with provided model weights.

  Args:
    rnn_weights: Iterable of values of weights for the RNN cell.
    rnn_biases: Iterable of values of biases for the RNN cell.
    logits_weights: Iterable of values for matrix connecting RNN output to
      logits.
    logits_biases: Iterable of values for logits bias term.
    global_step: Initial global step to save in checkpoint.
    model_dir: Directory into which checkpoint is saved.
  """
    model_weights = {}
    model_weights[CELL_WEIGHTS_NAME] = rnn_weights
    model_weights[CELL_BIAS_NAME] = rnn_biases
    model_weights[LOGITS_WEIGHTS_NAME] = logits_weights
    model_weights[LOGITS_BIAS_NAME] = logits_biases

    with ops.Graph().as_default():
        # Create model variables.
        for k, v in six.iteritems(model_weights):
            variables_lib.Variable(v, name=k, dtype=dtypes.float32)

        # Create non-model variables.
        global_step_var = training_util.create_global_step()
        assign_op = global_step_var.assign(global_step)

        # Initialize vars and save checkpoint.
        with monitored_session.MonitoredTrainingSession(
                checkpoint_dir=model_dir) as sess:
            sess.run(assign_op)
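
For context, a minimal sketch of how this helper might be invoked; the weight values, shapes, and the `model_dir` variable are illustrative only and not taken from the original tests.

# Hypothetical call (shapes chosen for a 1-unit cell over 1-D inputs):
create_checkpoint(
    rnn_weights=[[.1], [.2]],
    rnn_biases=[.3],
    logits_weights=[[-1.]],
    logits_biases=[.5],
    global_step=100,
    model_dir=model_dir)  # e.g. a tempfile.mkdtemp() directory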
Example #4
 def _test_logits(self, mode, rnn_units, logits_dimension, features_fn,
                  sequence_feature_columns, context_feature_columns,
                  expected_logits):
     """Tests that the expected logits are calculated."""
     with ops.Graph().as_default():
         # Global step needed for MonitoredSession, which is in turn used to
         # explicitly set variable weights through a checkpoint.
         training_util.create_global_step()
         # Use a variable scope here with 'rnn', emulating the rnn model_fn, so
         # the checkpoint naming is shared.
         with variable_scope.variable_scope('rnn'):
             input_layer_partitioner = (
                 partitioned_variables.min_max_variable_partitioner(
                     max_partitions=0, min_slice_size=64 << 20))
             logit_fn = rnn._rnn_logit_fn_builder(
                 output_units=logits_dimension,
                 rnn_cell_fn=rnn._make_rnn_cell_fn(rnn_units),
                 sequence_feature_columns=sequence_feature_columns,
                 context_feature_columns=context_feature_columns,
                 input_layer_partitioner=input_layer_partitioner)
             # Features are constructed within this function, otherwise the Tensors
             # containing the features would be defined outside this graph.
             logits = logit_fn(features=features_fn(), mode=mode)
             with monitored_session.MonitoredTrainingSession(
                     checkpoint_dir=self._model_dir) as sess:
                 self.assertAllClose(expected_logits,
                                     sess.run(logits),
                                     atol=1e-4)
 def _test_logits_helper(self, mode):
     """Tests that the expected logits are passed to mock head."""
     with ops.Graph().as_default():
         training_util.get_or_create_global_step()
         generator_inputs = {'x': array_ops.zeros([5, 4])}
         real_data = (None if mode == model_fn_lib.ModeKeys.PREDICT else
                      array_ops.zeros([5, 4]))
         generator_scope_name = 'generator'
         head = mock_head(self,
                          expected_generator_inputs=generator_inputs,
                          expected_real_data=real_data,
                          generator_scope_name=generator_scope_name)
         estimator_spec = estimator._gan_model_fn(
             features=generator_inputs,
             labels=real_data,
             mode=mode,
             generator_fn=generator_fn,
             discriminator_fn=discriminator_fn,
             generator_scope_name=generator_scope_name,
             head=head)
         with monitored_session.MonitoredTrainingSession(
                 checkpoint_dir=self._model_dir) as sess:
             if mode == model_fn_lib.ModeKeys.TRAIN:
                 sess.run(estimator_spec.train_op)
             elif mode == model_fn_lib.ModeKeys.EVAL:
                 sess.run(estimator_spec.loss)
             elif mode == model_fn_lib.ModeKeys.PREDICT:
                 sess.run(estimator_spec.predictions)
             else:
                 self.fail('Invalid mode: {}'.format(mode))
    def test_checkpoint_overwrite_warm_start(self):
        extra_run_step = 2
        ws_ckpt_dir = tempfile.mkdtemp(
            prefix=os.path.join(self.get_temp_dir(), "warm_start"))
        final_ckpt_dir = tempfile.mkdtemp(
            prefix=os.path.join(self.get_temp_dir(), "final"))
        for run_id, num_shards, k_dtype, d_dtype, init_mode, dim, run_step \
            in _next_run_step_config():
            error_msg = "Cond:{},{},{},{},{},{}".format(
                num_shards, k_dtype, d_dtype, init_mode, dim, run_step)
            with ops.Graph().as_default() as g:
                with self.session(graph=g,
                                  use_gpu=test_util.is_gpu_available(),
                                  config=default_config) as sess:
                    training_util.create_global_step()
                    graph = TestGraph(k_dtype, d_dtype, dim, num_shards, 'var',
                                      'devar', run_id)
                    self.evaluate(variables.global_variables_initializer())
                    sess.run([graph.devar_init_op])
                    prev_x = sess.run([graph.x])[0]
                    for _ in range(run_step):
                        sess.run([graph.var_opt_op, graph.devar_opt_op])
                    saver_lib.Saver().save(sess,
                                           os.path.join(ws_ckpt_dir, "model"))
                    prev_ws_var_loss, prev_ws_devar_loss = sess.run(
                        [graph.var_loss, graph.devar_loss])
                    self.assertAllCloseAccordingToType(prev_ws_var_loss,
                                                       prev_ws_devar_loss,
                                                       msg=error_msg)
                    for _ in range(extra_run_step):
                        sess.run([graph.var_opt_op, graph.devar_opt_op])
                    saver_lib.Saver().save(
                        sess, os.path.join(final_ckpt_dir, "model"))
                    prev_final_var_loss, prev_final_devar_loss = sess.run(
                        [graph.var_loss, graph.devar_loss])
                    self.assertAllCloseAccordingToType(prev_final_var_loss,
                                                       prev_final_devar_loss,
                                                       msg=error_msg)

            with ops.Graph().as_default():
                training_util.create_global_step()
                graph = TestGraph(k_dtype, d_dtype, dim, num_shards, 'var',
                                  'devar', run_id, prev_x)
                ws_util.warm_start(ws_ckpt_dir, vars_to_warm_start=['.*'])
                with monitored_session.MonitoredTrainingSession(
                        config=default_config,
                        is_chief=True,
                        checkpoint_dir=final_ckpt_dir) as sess:
                    var_loss, devar_loss = sess.run(
                        [graph.var_loss, graph.devar_loss])
                    self.assertAllCloseAccordingToType(var_loss,
                                                       prev_final_var_loss,
                                                       msg=error_msg)
                    self.assertAllCloseAccordingToType(devar_loss,
                                                       prev_final_devar_loss,
                                                       msg=error_msg)
    def common_minimize_trainable(self, base_opt, test_opt, name):
        tf.config.set_soft_device_placement(True)
        hvd.init()
        base_opt = de.DynamicEmbeddingOptimizer(base_opt, synchronous=True)
        for dtype, run_step, dim in itertools.product([dtypes.float32], [1],
                                                      [10]):
            x = tf.random.uniform(shape=[32, dim])
            y = tf.zeros([32, 1])

            global_step = training_util.create_global_step()

            base_weight = tf.compat.v1.get_variable(name="base_weights",
                                                    initializer=tf.ones(
                                                        [10, 1]))

            base_logits = tf.nn.relu(math_ops.matmul(x, base_weight))
            base_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=y, logits=base_logits)

            base_opt_op = base_opt.minimize(base_loss,
                                            global_step,
                                            var_list=[base_weight])

            test_weight = tf.compat.v1.get_variable(name="test_weights",
                                                    initializer=tf.ones(
                                                        [10, 1]))

            test_logits = tf.nn.relu(math_ops.matmul(x, test_weight))
            test_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=y, logits=test_logits)

            grads_and_vars = test_opt.compute_gradients(test_loss,
                                                        var_list=[test_weight])
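            # Sum the dense gradients across all Horovod workers before applying
            # them, so every worker applies the same aggregated update.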
            var_list = []
            aggregated_grad = []
            for grad, var in grads_and_vars:
                var_list.append(var)
                aggregated_grad.append(hvd.allreduce(grad, op=hvd.Sum))
            aggregated_grads_and_vars = zip(aggregated_grad, var_list)
            test_opt_op = test_opt.apply_gradients(aggregated_grads_and_vars,
                                                   global_step)

            with monitored_session.MonitoredTrainingSession(
                    is_chief=True, config=default_config) as sess:

                for _ in range(run_step):
                    sess.run(base_opt_op)
                    sess.run(test_opt_op)

                self.assertAllCloseAccordingToType(
                    sess.run(base_weight),
                    sess.run(test_weight),
                    msg="Cond:{},{},{}".format(dtype, run_step, dim),
                )
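
Note: the all-reduce above only aggregates across workers when the test is launched on multiple processes, e.g. with `horovodrun -np 2 python <test_file>` (assuming a standard Horovod installation); on a single process the summed gradient equals the local one.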
Example #8
    def test_multi_feature_column_multi_dim_logits(self):
        """Tests multiple feature columns and multi-dimensional logits.

    All numbers are the same as test_multi_dim_input_multi_dim_logits. The only
    difference is that the input consists of two 1D feature columns, instead of
    one 2D feature column.
    """
        base_global_step = 100
        _create_checkpoint((
            ([[.6, .5], [-.6, -.5]], [.1, -.1]),
            ([[1., .8], [-.8, -1.]], [.2, -.2]),
            ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),
        ), base_global_step, self._model_dir)
        hidden_units = (2, 2)
        logits_dimension = 3
        inputs = ([[10.]], [[8.]])
        expected_logits = [[-0.48, 0.48, 0.39]]

        for mode in [
                model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
                model_fn.ModeKeys.PREDICT
        ]:
            with ops.Graph().as_default():
                training_util.create_global_step()
                head = _mock_head(self,
                                  hidden_units=hidden_units,
                                  logits_dimension=logits_dimension,
                                  expected_logits=expected_logits)
                estimator_spec = dnn._dnn_model_fn(
                    features={
                        'age': constant_op.constant(inputs[0]),
                        'height': constant_op.constant(inputs[1])
                    },
                    labels=constant_op.constant([[1]]),
                    mode=mode,
                    head=head,
                    hidden_units=hidden_units,
                    feature_columns=[
                        feature_column.numeric_column('age'),
                        feature_column.numeric_column('height')
                    ],
                    optimizer=_mock_optimizer(self, hidden_units))
                with monitored_session.MonitoredTrainingSession(
                        checkpoint_dir=self._model_dir) as sess:
                    if mode == model_fn.ModeKeys.TRAIN:
                        sess.run(estimator_spec.train_op)
                    elif mode == model_fn.ModeKeys.EVAL:
                        sess.run(estimator_spec.loss)
                    elif mode == model_fn.ModeKeys.PREDICT:
                        sess.run(estimator_spec.predictions)
                    else:
                        self.fail('Invalid mode: {}'.format(mode))
Example #9
 def test_reads_before_increments(self):
     with ops.Graph().as_default():
         training_util.create_global_step()
         read_tensor = training_util._get_or_create_global_step_read()
         inc_op = training_util._increment_global_step(1)
         inc_three_op = training_util._increment_global_step(3)
         with monitored_session.MonitoredTrainingSession() as sess:
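             # The read tensor reflects the global step value before any increments
             # issued in the same run call.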
             read_value, _ = sess.run([read_tensor, inc_op])
             self.assertEqual(0, read_value)
             read_value, _ = sess.run([read_tensor, inc_three_op])
             self.assertEqual(1, read_value)
             read_value = sess.run(read_tensor)
             self.assertEqual(4, read_value)
  def test_inference_numberic_correctness(self):
    train_pred = None
    infer_pred = None
    dim = 8
    initializer = init_ops.random_normal_initializer(0.0, 0.001)
    raw_init_vals = np.random.rand(100, dim)

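    # Build the same lookup graph twice, once in training mode and once in
    # inference mode; the final assertion checks both produce identical predictions.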
    for fn in [de.enable_train_mode, de.enable_inference_mode]:
      with ops.Graph().as_default():
        fn()

        init_ids = constant_op.constant(list(range(100)), dtype=dtypes.int64)
        init_vals = constant_op.constant(raw_init_vals, dtype=dtypes.float32)
        with variable_scope.variable_scope("modelmode",
                                           reuse=variable_scope.AUTO_REUSE):
          embeddings = de.get_variable('ModelModeTest-numberic',
                                       key_dtype=dtypes.int64,
                                       value_dtype=dtypes.float32,
                                       devices=_get_devices() * 2,
                                       initializer=initializer,
                                       dim=dim)

          w = variables.Variable(1.0, name="w")
          _ = training_util.create_global_step()
        init_op = embeddings.upsert(init_ids, init_vals)

        ids = constant_op.constant([0, 1, 2, 3, 4], dtype=dtypes.int64)
        test_var, trainable = de.embedding_lookup([embeddings],
                                                  ids,
                                                  return_trainable=True)
        pred = math_ops.add(test_var, 1) * w
        loss = pred * pred
        opt = de.DynamicEmbeddingOptimizer(adagrad.AdagradOptimizer(0.1))
        opt.minimize(loss)

        with monitored_session.MonitoredTrainingSession(
            is_chief=True, config=default_config) as sess:
          if de.get_model_mode() == de.ModelMode.TRAIN:
            sess.run(init_op)
            train_pred = sess.run(pred)
          elif de.get_model_mode() == de.ModelMode.INFERENCE:
            sess.run(init_op)
            infer_pred = sess.run(pred)
      de.enable_train_mode()
      ops.reset_default_graph()
    self.assertAllEqual(train_pred, infer_pred)
Example #11
    def testTrainingLoop(self):
        random_seed.set_random_seed(1)

        # Model
        with ops.device("/device:IPU:0"):
            with variable_scope.variable_scope("vs", use_resource=True):
                x = array_ops.placeholder(np.float32, [4, 1, 4], name="a")
                l = array_ops.placeholder(np.float32, [4, 1, 1], name="b")

                y = layers.dense(x, 1, activation=nn.sigmoid)

                loss = losses.log_loss(l, y)
                train_op = gradient_descent.GradientDescentOptimizer(0.1) \
                                           .minimize(loss)

                init = variables.global_variables_initializer()

        # Test data
        image_data = [[[1, 1, 1, 1]], [[2, 2, 2, 2]], [[3, 3, 3, 3]],
                      [[4, 4, 4, 4]]]
        label_data = [[[1]], [[2]], [[3]], [[4]]]

        # Run training.
        with ms.MonitoredTrainingSession(is_chief=True,
                                         chief_only_hooks=None,
                                         save_summaries_steps=None,
                                         save_summaries_secs=None) as sess:
            sess.run(init)
            previous_loss = float("inf")
            for _ in range(5):
                measured_loss, _ = sess.run([loss, train_op],
                                            feed_dict={
                                                x: image_data,
                                                l: label_data
                                            })
                self.assertTrue(measured_loss < previous_loss)
                previous_loss = measured_loss
Example #12
    def _train_model(self, checkpoint_dir, num_steps):
        """Trains a simple classification model.

    Note that the data has been configured such that after around 300 steps,
    the model has memorized the dataset (i.e. we can expect 100% accuracy).

    Args:
      checkpoint_dir: The directory where the checkpoint is written to.
      num_steps: The number of steps to train for.
    """
        with ops.Graph().as_default():
            random_seed.set_random_seed(0)
            tf_inputs = constant_op.constant(self._inputs,
                                             dtype=dtypes.float32)
            tf_labels = constant_op.constant(self._labels,
                                             dtype=dtypes.float32)

            tf_predictions = logistic_classifier(tf_inputs)
            loss_op = losses.log_loss(labels=tf_labels,
                                      predictions=tf_predictions)

            optimizer = gradient_descent.GradientDescentOptimizer(
                learning_rate=1.0)
            train_op = optimizer.minimize(loss_op,
                                          training.get_or_create_global_step())

            with monitored_session.MonitoredTrainingSession(
                    checkpoint_dir=checkpoint_dir,
                    hooks=[basic_session_run_hooks.StopAtStepHook(num_steps)
                           ]) as session:
                loss = None
                while not session.should_stop():
                    _, loss = session.run([train_op, loss_op])

                if num_steps >= 300:
                    assert loss < .015
Example #13
    def common_minimize_trainable(self, base_opt, test_opt, name):
        from tensorflow.python.framework.errors_impl import NotFoundError

        # TODO(rhdong): Re-enable this test once the horovod import error is fixed on macOS with TF 2.7+.
        try:
            import horovod.tensorflow as hvd
        except NotFoundError:
            self.skipTest(
                "Skip the test for horovod import error with Tensorflow-2.7.0 on MacOS-12."
            )

        tf.config.set_soft_device_placement(True)
        hvd.init()
        base_opt = de.DynamicEmbeddingOptimizer(base_opt, synchronous=True)
        for dtype, run_step, dim in itertools.product([dtypes.float32], [1],
                                                      [10]):
            x = tf.random.uniform(shape=[32, dim])
            y = tf.zeros([32, 1])

            global_step = training_util.create_global_step()

            base_weight = tf.compat.v1.get_variable(name="base_weights",
                                                    initializer=tf.ones(
                                                        [10, 1]))

            base_logits = tf.nn.relu(math_ops.matmul(x, base_weight))
            base_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=y, logits=base_logits)

            base_opt_op = base_opt.minimize(base_loss,
                                            global_step,
                                            var_list=[base_weight])

            test_weight = tf.compat.v1.get_variable(name="test_weights",
                                                    initializer=tf.ones(
                                                        [10, 1]))

            test_logits = tf.nn.relu(math_ops.matmul(x, test_weight))
            test_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=y, logits=test_logits)

            grads_and_vars = test_opt.compute_gradients(test_loss,
                                                        var_list=[test_weight])
            var_list = []
            aggregated_grad = []
            for grad, var in grads_and_vars:
                var_list.append(var)
                aggregated_grad.append(hvd.allreduce(grad, op=hvd.Sum))
            aggregated_grads_and_vars = zip(aggregated_grad, var_list)
            test_opt_op = test_opt.apply_gradients(aggregated_grads_and_vars,
                                                   global_step)

            with monitored_session.MonitoredTrainingSession(
                    is_chief=True, config=default_config) as sess:

                for _ in range(run_step):
                    sess.run(base_opt_op)
                    sess.run(test_opt_op)

                self.assertAllCloseAccordingToType(
                    sess.run(base_weight),
                    sess.run(test_weight),
                    msg="Cond:{},{},{}".format(dtype, run_step, dim),
                )
Example #14
    def _train_model(self, env, first_update, update_frequency, hooks):
        all_hooks = []
        self._graph = ops.Graph()
        with self._graph.as_default() as g, g.device(self._device_fn):
            random_seed.set_random_seed(self._config.tf_random_seed)
            global_step = training.get_or_create_global_step(g)
            global_episode = get_or_create_global_episode(g)
            global_timestep = get_or_create_global_timestep(g)
            update_episode_op = tf.assign_add(global_episode, 1)
            update_timestep_op = tf.assign_add(global_timestep, 1)
            no_run_hooks = tf.no_op(name='no_run_hooks')
            with ops.device('/cpu:0'):
                features, labels = self._prepare_input_fn(Modes.TRAIN, env)
            estimator_spec = self._call_model_fn(features, labels, Modes.TRAIN)
            ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
            all_hooks.extend([
                plx_hooks.NanTensorHook(estimator_spec.loss),
                plx_hooks.StepLoggingTensorHook(
                    {
                        'loss': estimator_spec.loss,
                        'step': global_step,
                        'timestep': global_timestep,
                        'global_episode': global_episode,
                        'max_reward': labels['max_reward'],
                        'min_reward': labels['min_reward'],
                        'total_reward': labels['total_reward'],
                    },
                    every_n_iter=100)
            ])
            all_hooks.extend(hooks)
            all_hooks.extend(estimator_spec.training_hooks)

            scaffold = estimator_spec.scaffold or monitored_session.Scaffold()
            if not (scaffold.saver
                    or ops.get_collection(ops.GraphKeys.SAVERS)):
                ops.add_to_collection(
                    ops.GraphKeys.SAVERS,  # TODO remove non restorable vars
                    saver.Saver(
                        sharded=True,  # TODO `var_list`
                        max_to_keep=self._config.keep_checkpoint_max,
                        defer_build=True))

            chief_hooks = [
                plx_hooks.EpisodeLoggingTensorHook(
                    {
                        'loss': estimator_spec.loss,
                        'step': global_step,
                        'global_timestep': global_timestep,
                        'global_episode': global_episode,
                        'max_reward': labels['max_reward'],
                        'min_reward': labels['min_reward'],
                        'total_reward': labels['total_reward'],
                    },
                    every_n_episodes=1),  # TODO: save every episode?
                plx_hooks.EpisodeCounterHook(output_dir=self.model_dir)
            ]
            if self._config.save_checkpoints_secs or self._config.save_checkpoints_steps:
                saver_hook_exists = any([
                    isinstance(h, plx_hooks.EpisodeCheckpointSaverHook)
                    for h in (all_hooks + chief_hooks +
                              list(estimator_spec.training_chief_hooks))
                ])
                if not saver_hook_exists:
                    chief_hooks += [
                        plx_hooks.EpisodeCheckpointSaverHook(
                            self._model_dir,
                            save_episodes=1,  # TODO: save every episode?
                            scaffold=scaffold)
                    ]
            if self._config.save_summary_steps:
                saver_hook_exists = any([
                    isinstance(h, plx_hooks.EpisodeSummarySaverHook)
                    for h in (all_hooks + chief_hooks +
                              list(estimator_spec.training_chief_hooks))
                ])
                if not saver_hook_exists:
                    chief_hooks += [
                        plx_hooks.EpisodeSummarySaverHook(
                            scaffold=scaffold,
                            save_episodes=1,  # TODO: save every episode?
                            output_dir=self._model_dir,
                        )
                    ]
            with monitored_session.MonitoredTrainingSession(
                    master=self._config.master,
                    is_chief=self._config.is_chief,
                    checkpoint_dir=self._model_dir,
                    scaffold=scaffold,
                    hooks=all_hooks,
                    chief_only_hooks=chief_hooks +
                    list(estimator_spec.training_chief_hooks),
                    save_checkpoint_secs=0,  # Saving checkpoint is handled by a hook.
                    save_summaries_steps=0,  # Saving summaries is handled by a hook.
                    config=self._session_config) as mon_sess:
                loss = None
                while not mon_sess.should_stop():
                    loss = self.run_episode(
                        env=env,
                        sess=mon_sess,
                        features=features,
                        labels=labels,
                        no_run_hooks=no_run_hooks,
                        global_step=global_step,
                        update_episode_op=update_episode_op,
                        update_timestep_op=update_timestep_op,
                        first_update=first_update,
                        update_frequency=update_frequency,
                        estimator_spec=estimator_spec)
            summary_io.SummaryWriterCache.clear()
            return loss
    def common_minimize_trainable(self, base_opt, test_opt, name):
        base_opt = de.DynamicEmbeddingOptimizer(base_opt)
        test_opt = de.DynamicEmbeddingOptimizer(test_opt)
        id = 0
        for (
                num_shards,
                k_dtype,
                d_dtype,
                initial_mode,
                dim,
                run_step,
        ) in itertools.product(
            [3],
            [dtypes.int64],
            [
                dtypes.float32,
            ],
            [
                "constant",
            ],
            [1, 10],
            [10],
        ):
            with ops.Graph().as_default():
                id += 1
                raw_init_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
                raw_init_vals = [
                    [
                        x,
                    ] * dim for x in
                    [0.0, 0.1, 0.3, 0.8, 0.16, 0.25, 0.36, 0.49, 0.64, 0.81]
                ]
                raw_ids = constant_op.constant([1, 3, 3, 9], dtype=k_dtype)
                sp_ids = sparse_tensor.SparseTensor(
                    indices=[
                        [0, 0],
                        [0, 1],
                        [1, 0],
                        [2, 1],
                    ],
                    values=raw_ids,
                    dense_shape=[3, 2],
                )
                x = constant_op.constant([[_x * dim]
                                          for _x in [[0.4], [0.5], [0.6]]],
                                         dtype=d_dtype)
                x = array_ops.reshape(x, shape=(3 * dim, 1))
                # base var prepare
                base_var = variables.Variable(
                    np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
                    dtype=d_dtype,
                    shape=[len(raw_init_ids), dim],
                )

                # test var prepare
                embeddings = de.get_variable(
                    "t1030-" + name + str(id),
                    key_dtype=k_dtype,
                    value_dtype=d_dtype,
                    devices=_get_devices() * num_shards,
                    initializer=1.0,
                    dim=dim,
                )

                init_ids = constant_op.constant(raw_init_ids, dtype=k_dtype)
                init_vals = constant_op.constant(raw_init_vals, dtype=d_dtype)
                init_op = embeddings.upsert(init_ids, init_vals)

                # base branch
                base_embedding = embedding_ops.embedding_lookup_sparse(
                    base_var, sp_ids, None, combiner="sum")
                base_embedding = array_ops.reshape(base_embedding,
                                                   shape=[1, 3 * dim])
                pred0 = math_ops.matmul(base_embedding, x)
                loss0 = pred0 * pred0

                base_opt_op = base_opt.minimize(loss0, var_list=[base_var])

                # test branch
                test_var, trainable = de.embedding_lookup_sparse(
                    embeddings,
                    sp_ids,
                    sp_weights=None,
                    combiner="sum",
                    return_trainable=True,
                )

                pred1 = math_ops.matmul(
                    array_ops.reshape(test_var, shape=[1, 3 * dim]), x)
                loss1 = pred1 * pred1

                gstep = training_util.create_global_step()
                test_opt_op = test_opt.minimize(loss1,
                                                var_list=[trainable],
                                                global_step=gstep)

                table_var = array_ops.reshape(embeddings.lookup(init_ids),
                                              shape=[10, dim])

                with monitored_session.MonitoredTrainingSession(
                        is_chief=True, config=default_config) as sess:
                    sess.run(init_op)
                    self.assertAllCloseAccordingToType(
                        np.array(raw_init_vals).reshape(
                            [len(raw_init_ids), dim]),
                        sess.run(base_var),
                    )

                    # run base
                    for _ in range(run_step):
                        sess.run(base_opt_op)
                        sess.run(test_opt_op)

                    # Validate global_step
                    self.assertEqual(run_step, sess.run(gstep))

                    # Validate updated params
                    self.assertAllCloseAccordingToType(
                        sess.run(base_var),
                        sess.run(table_var),
                        msg="Cond:{},{},{},{},{}".format(
                            num_shards, k_dtype, d_dtype, dim, run_step),
                    )
                    self.device_check(embeddings)
Example #16
    def _train_model(self, input_fn, hooks):
        all_hooks = []
        self._graph = ops.Graph()
        with self._graph.as_default() as g, g.device(self._device_fn):
            random_seed.set_random_seed(self._config.tf_random_seed)
            global_step = training.get_or_create_global_step(g)
            features, labels = input_fn()
            estimator_spec = self._call_model_fn(features, labels,
                                                 ModeKeys.TRAIN)
            all_hooks.extend([
                plx_hooks.NanTensorHook(estimator_spec.loss),
                plx_hooks.LoggingTensorHook(
                    {
                        'loss': estimator_spec.loss,
                        'step': global_step
                    },
                    every_n_iter=100)
            ])
            all_hooks.extend(hooks)
            all_hooks.extend(estimator_spec.training_hooks)

            scaffold = estimator_spec.scaffold or monitored_session.Scaffold()
            if not (scaffold.saver
                    or ops.get_collection(ops.GraphKeys.SAVERS)):
                ops.add_to_collection(
                    ops.GraphKeys.SAVERS,  # TODO remove non restorable vars
                    saver.Saver(
                        sharded=True,  # TODO `var_list`
                        max_to_keep=self._config.keep_checkpoint_max,
                        defer_build=True))

            chief_hooks = []
            if self._config.save_checkpoints_secs or self._config.save_checkpoints_steps:
                saver_hook_exists = any([
                    isinstance(h, plx_hooks.CheckpointSaverHook)
                    for h in (all_hooks + estimator_spec.training_hooks +
                              chief_hooks +
                              estimator_spec.training_chief_hooks)
                ])
                if not saver_hook_exists:
                    chief_hooks = [
                        plx_hooks.CheckpointSaverHook(
                            self._model_dir,
                            save_secs=self._config.save_checkpoints_secs,
                            save_steps=self._config.save_checkpoints_steps,
                            scaffold=scaffold)
                    ]
            with monitored_session.MonitoredTrainingSession(
                    master=self._config.master,
                    is_chief=self._config.is_chief,
                    checkpoint_dir=self._model_dir,
                    scaffold=scaffold,
                    hooks=all_hooks + estimator_spec.training_hooks,
                    chief_only_hooks=chief_hooks +
                    estimator_spec.training_chief_hooks,
                    save_checkpoint_secs=0,  # Saving is handled by a hook.
                    save_summaries_steps=self._config.save_summary_steps,
                    config=self._session_config) as mon_sess:
                loss = None
                while not mon_sess.should_stop():
                    _, loss = mon_sess.run(
                        [estimator_spec.train_op, estimator_spec.loss])
            summary_io.SummaryWriterCache.clear()
            return loss
Example #17
    gradient_penalty_weight=1.0)

l1_loss = tf.norm(gan_model.real_data - gan_model.generated_data, ord=1)

gan_loss = tfgan.losses.combine_adversarial_loss(
    gan_loss, gan_model, l1_loss, weight_factor=FLAGS.weight_factor)

train_ops = tfgan.gan_train_ops(
    gan_model,
    gan_loss,
    generator_optimizer=tf.train.AdamOptimizer(
        learning_rate=0.001, beta1=0.85, beta2=0.999, epsilon=1e-5),
    discriminator_optimizer=tf.train.AdamOptimizer(
        learning_rate=0.000001, beta1=0.85, beta2=0.999, epsilon=1e-5))
#train_ops.global_step_inc_op = tf.train.get_global_step().assign_add(1)


#store_output_and_check_loss(gan_loss, gan_model.generated_data, gan_model.real_data, num_of_samples=3, prefix='gen',logdir=log_folder)

global_step_tensor = tf.Variable(1, trainable=False, name='global_step')
global_step = tf.train.get_or_create_global_step()
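# Each call to train_step_fn runs 10 generator and 10 discriminator updates,
# per the GANTrainSteps(10, 10) configuration below.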
train_step_fn = tfgan.get_sequential_train_steps(
    train_steps=tf.contrib.gan.GANTrainSteps(10, 10))
with monitored_session.MonitoredTrainingSession(checkpoint_dir=log_folder) as session:
    loss = None
    for y in xrange(1,20):
        for x in xrange(0,500):
            cur_loss, _ = train_step_fn(session, train_ops, global_step, train_step_kwargs={})

            gen_loss_np = session.run(gan_loss.generator_loss)
            dis_loss_np = session.run(gan_loss.discriminator_loss)

            if gen_loss_np < 170:
                store_output_and_check_loss(
                    session, gan_loss, gan_model.generated_data,
                    prefix='final_l_' + str(round(gen_loss_np)) + '_' +
                    str(NUMBER_OF_NOTES) + '_gen_',
                    play=False, num_of_samples=30)
            print('iteration:' + str(y * x))
            print('Generator loss: %f' % gen_loss_np)
            print('Discriminator loss: %f' % dis_loss_np)

Example #18
    def _train_model(self, input_fn, hooks):
        all_hooks = []
        with ops.Graph().as_default() as g, g.device(self._device_fn):
            random_seed.set_random_seed(self._config.tf_random_seed)
            global_step = training.get_or_create_global_step(g)
            features, labels = self._get_features_and_labels_from_input_fn(
                input_fn, Modes.TRAIN)
            estimator_spec = self._call_model_fn(features, labels, Modes.TRAIN)
            ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
            all_hooks.extend([
                plx_hooks.NanTensorHook(estimator_spec.loss),
                plx_hooks.StepLoggingTensorHook(
                    {
                        'loss': estimator_spec.loss,
                        'step': global_step
                    },
                    every_n_iter=100)
            ])
            all_hooks.extend(hooks)
            all_hooks.extend(estimator_spec.training_hooks)

            scaffold = estimator_spec.scaffold
            if not (scaffold.saver
                    or ops.get_collection(ops.GraphKeys.SAVERS)):
                ops.add_to_collection(
                    ops.GraphKeys.SAVERS,  # TODO remove non restorable vars
                    saver.Saver(
                        sharded=True,
                        max_to_keep=self._config.keep_checkpoint_max,
                        keep_checkpoint_every_n_hours=(
                            self._config.keep_checkpoint_every_n_hours),
                        defer_build=True,
                        save_relative_paths=True))

            chief_hooks = []
            if self._config.save_checkpoints_secs or self._config.save_checkpoints_steps:
                saver_hook_exists = any([
                    isinstance(h, plx_hooks.StepCheckpointSaverHook)
                    for h in (all_hooks + chief_hooks +
                              list(estimator_spec.training_chief_hooks))
                ])
                if not saver_hook_exists:
                    chief_hooks += [
                        plx_hooks.StepCheckpointSaverHook(
                            self._model_dir,
                            save_secs=self._config.save_checkpoints_secs,
                            save_steps=self._config.save_checkpoints_steps,
                            scaffold=scaffold)
                    ]
            if self._config.save_summary_steps:
                saver_hook_exists = any([
                    isinstance(h, plx_hooks.StepSummarySaverHook)
                    for h in (all_hooks + chief_hooks +
                              list(estimator_spec.training_chief_hooks))
                ])
                if not saver_hook_exists:
                    chief_hooks += [
                        plx_hooks.StepSummarySaverHook(
                            scaffold=scaffold,
                            save_steps=self._config.save_summary_steps,
                            output_dir=self._model_dir,
                        )
                    ]

            with monitored_session.MonitoredTrainingSession(
                    master=self._config.master,
                    is_chief=self._config.is_chief,
                    checkpoint_dir=self._model_dir,
                    scaffold=scaffold,
                    hooks=all_hooks,
                    chief_only_hooks=chief_hooks +
                    list(estimator_spec.training_chief_hooks),
                    save_checkpoint_secs=0,  # Saving checkpoint is handled by a hook.
                    save_summaries_steps=0,  # Saving summaries is handled by a hook.
                    config=self._session_config) as mon_sess:
                loss = None
                while not mon_sess.should_stop():
                    _, loss = mon_sess.run(
                        [estimator_spec.train_op, estimator_spec.loss])
            return loss
Example #19
def train(train_op,
          logdir,
          master='',
          is_chief=True,
          scaffold=None,
          hooks=None,
          chief_only_hooks=None,
          save_checkpoint_secs=600,
          save_summaries_steps=100,
          config=None):
    """Runs the training loop.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where the graph and checkpoints are saved.
    master: The URL of the master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    scaffold: A `tf.train.Scaffold` instance.
    hooks: List of `tf.train.SessionRunHook` callbacks which are run inside the
      training loop.
    chief_only_hooks: List of `tf.train.SessionRunHook` instances which are run
      inside the training loop for the chief trainer only.
    save_checkpoint_secs: The frequency, in seconds, that a checkpoint is saved
      using a default checkpoint saver. If `save_checkpoint_secs` is set to
      `None`, then the default checkpoint saver isn't used.
    save_summaries_steps: The frequency, in number of global steps, that the
      summaries are written to disk using a default summary saver. If
      `save_summaries_steps` is set to `None`, then the default summary saver
      isn't used.
    config: An instance of `tf.ConfigProto`.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `logdir` is `None` and either `save_checkpoint_secs` or
    `save_summaries_steps` is not `None`.
  """
    if logdir is None and is_chief:
        if save_summaries_steps:
            raise ValueError(
                'logdir cannot be None when save_summaries_steps is not None')

        if save_checkpoint_secs:
            raise ValueError(
                'logdir cannot be None when save_checkpoint_secs is not None')

    with monitored_session.MonitoredTrainingSession(
            master=master,
            is_chief=is_chief,
            checkpoint_dir=logdir,
            scaffold=scaffold,
            hooks=hooks,
            chief_only_hooks=chief_only_hooks,
            save_checkpoint_secs=save_checkpoint_secs,
            save_summaries_steps=save_summaries_steps,
            config=config) as session:
        loss = None
        while not session.should_stop():
            loss = session.run(train_op)
    return loss
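
A minimal usage sketch of this loop, assuming the companion `create_train_op` helper from the same `tf.contrib.training` module; the variable, learning rate, log directory, and step limit are illustrative.

import tensorflow as tf

w = tf.get_variable('w', initializer=5.0)
loss = tf.square(w)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
# create_train_op wraps the minimize step so running it also returns the loss.
train_op = tf.contrib.training.create_train_op(loss, optimizer)
final_loss = train(
    train_op,
    logdir='/tmp/train_demo',
    hooks=[tf.train.StopAtStepHook(last_step=10)])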
Example #20
def train(train_op,
          logdir,
          master='',
          is_chief=True,
          scaffold=None,
          hooks=None,
          chief_only_hooks=None,
          save_checkpoint_secs=600,
          save_summaries_steps=100,
          config=None,
          max_wait_secs=7200,
          run_metadata=None):
    """Runs the training loop.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where the graph and checkpoints are saved.
    master: The URL of the master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    scaffold: A `tf.train.Scaffold` instance.
    hooks: List of `tf.train.SessionRunHook` callbacks which are run inside the
      training loop.
    chief_only_hooks: List of `tf.train.SessionRunHook` instances which are run
      inside the training loop for the chief trainer only.
    save_checkpoint_secs: The frequency, in seconds, that a checkpoint is saved
      using a default checkpoint saver. If `save_checkpoint_secs` is set to
      `None`, then the default checkpoint saver isn't used.
    save_summaries_steps: The frequency, in number of global steps, that the
      summaries are written to disk using a default summary saver. If
      `save_summaries_steps` is set to `None`, then the default summary saver
      isn't used.
    config: An instance of `tf.ConfigProto`.
    max_wait_secs: Maximum time workers should wait for the session to
      become available. This should be kept relatively short to help detect
      incorrect code, but sometimes may need to be increased if the chief takes
      a while to start up.
    run_metadata: A `RunMetadata` protocol buffer.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `logdir` is `None` and either `save_checkpoint_secs` or
    `save_summaries_steps` is not `None`.
  """
    if logdir is None and is_chief:
        if save_summaries_steps:
            raise ValueError(
                'logdir cannot be None when save_summaries_steps is not None')

        if save_checkpoint_secs:
            raise ValueError(
                'logdir cannot be None when save_checkpoint_secs is not None')

    with monitored_session.MonitoredTrainingSession(
            master=master,
            is_chief=is_chief,
            checkpoint_dir=logdir,
            scaffold=scaffold,
            hooks=hooks,
            chief_only_hooks=chief_only_hooks,
            save_checkpoint_secs=save_checkpoint_secs,
            save_summaries_steps=save_summaries_steps,
            config=config,
            max_wait_secs=max_wait_secs) as session:
        loss = None
        while not session.should_stop():
            loss = session.run(train_op, run_metadata=run_metadata)
    return loss
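
Compared with the previous variant, this version additionally exposes `max_wait_secs` for workers waiting on the chief's session and threads a `run_metadata` protocol buffer through each `session.run` call.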
Example #21
    def common_minimize_trainable(self, base_opt, test_opt, name):
        if test_util.is_gpu_available():
            keys_type_list = [dtypes.int64]
        else:
            keys_type_list = [dtypes.int64, dtypes.string]
        deo.enable_train_mode()
        for run_id, num_shards, k_dtype, d_dtype, initial_mode, dim, run_step \
            in _next_run_step_config(keys_type_list):
            with ops.Graph().as_default():
                raw_init_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
                if k_dtype == dtypes.string:
                    raw_init_ids = [str(i) for i in raw_init_ids]
                raw_init_vals = [
                    [
                        x,
                    ] * dim for x in
                    [0.0, 0.1, 0.3, 0.8, 0.16, 0.25, 0.36, 0.49, 0.64, 0.81]
                ]
                raw_ids_py = [1, 3, 3, 9]
                raw_ids_nn = constant_op.constant(raw_ids_py,
                                                  dtype=dtypes.int64)
                raw_ids_de = raw_ids_nn
                if k_dtype == dtypes.string:
                    raw_ids_de = constant_op.constant(
                        [str(i) for i in raw_ids_py], dtype=k_dtype)
                sp_ids_nn = sparse_tensor.SparseTensor(indices=[
                    [0, 0],
                    [0, 1],
                    [1, 0],
                    [2, 1],
                ],
                                                       values=raw_ids_nn,
                                                       dense_shape=[3, 2])
                sp_ids_de = sparse_tensor.SparseTensor(indices=[
                    [0, 0],
                    [0, 1],
                    [1, 0],
                    [2, 1],
                ],
                                                       values=raw_ids_de,
                                                       dense_shape=[3, 2])
                x = constant_op.constant([[_x * dim]
                                          for _x in [[0.4], [0.5], [0.6]]],
                                         dtype=d_dtype)
                x = array_ops.reshape(x, shape=(3 * dim, 1))
                # base var prepare
                base_var = variables.Variable(np.array(raw_init_vals).reshape(
                    [len(raw_init_ids), dim]),
                                              dtype=d_dtype,
                                              shape=[len(raw_init_ids), dim])

                # test var prepare
                embeddings = deo.get_variable('t1030-' + name + str(run_id),
                                              key_dtype=k_dtype,
                                              value_dtype=d_dtype,
                                              devices=_get_devices() *
                                              num_shards,
                                              initializer=1.,
                                              dim=dim)

                init_ids = constant_op.constant(raw_init_ids, dtype=k_dtype)
                init_vals = constant_op.constant(raw_init_vals, dtype=d_dtype)
                init_op = embeddings.upsert(init_ids, init_vals)

                # base branch
                base_embedding = embedding_ops.embedding_lookup_sparse(
                    base_var, sp_ids_nn, None, combiner='sum')
                base_embedding = array_ops.reshape(base_embedding,
                                                   shape=[1, 3 * dim])
                pred0 = math_ops.matmul(base_embedding, x)
                loss0 = pred0 * pred0

                base_opt_op = base_opt.minimize(loss0, var_list=[base_var])

                # test branch
                test_var, trainable = deo.embedding_lookup_sparse(
                    embeddings,
                    sp_ids_de,
                    sp_weights=None,
                    combiner="sum",
                    return_trainable=True)

                pred1 = math_ops.matmul(
                    array_ops.reshape(test_var, shape=[1, 3 * dim]), x)
                loss1 = pred1 * pred1

                gstep = training_util.create_global_step()
                test_opt_op = test_opt.minimize(loss1,
                                                var_list=[trainable],
                                                global_step=gstep)

                table_var = array_ops.reshape(embeddings.lookup(init_ids),
                                              shape=[10, dim])

                with monitored_session.MonitoredTrainingSession(
                        is_chief=True, config=default_config) as sess:
                    sess.run(init_op)
                    self.assertAllCloseAccordingToType(
                        np.array(raw_init_vals).reshape(
                            [len(raw_init_ids), dim]), sess.run(base_var))

                    # run base
                    for _ in range(run_step):
                        sess.run(base_opt_op)
                        sess.run(test_opt_op)

                    # Validate global_step
                    self.assertEqual(run_step, sess.run(gstep))

                    # Validate updated params
                    self.assertAllCloseAccordingToType(
                        sess.run(base_var),
                        sess.run(table_var),
                        msg="Cond:{},{},{},{},{}".format(
                            num_shards, k_dtype, d_dtype, dim, run_step))
                    self.device_check(embeddings)