Example #1
    def _train_model(self, input_fn, hooks, saving_listeners):
        worker_hooks = []
        with ops.Graph().as_default() as g, g.device(self._device_fn):
            random_seed.set_random_seed(self._config.tf_random_seed)
            global_step_tensor = self._create_and_assert_global_step(g)
            global_step_read_tensor = training_util._get_or_create_global_step_read(
            )  # pylint: disable=protected-access
            with ops.control_dependencies([global_step_read_tensor]):
                features, labels = self._get_features_and_labels_from_input_fn(
                    input_fn, model_fn_lib.ModeKeys.TRAIN)
            estimator_spec = self._call_model_fn(features, labels,
                                                 model_fn_lib.ModeKeys.TRAIN,
                                                 self.config)
            # Check if the user created a loss summary, and add one if they didn't.
            # We assume here that the summary is called 'loss'. If it is not, we will
            # make another one with the name 'loss' to ensure it shows up in the right
            # graph in TensorBoard.
            if not any([
                    x.op.name == 'loss'
                    for x in ops.get_collection(ops.GraphKeys.SUMMARIES)
            ]):
                summary.scalar('loss', estimator_spec.loss)
            ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
            worker_hooks.extend(hooks)
            worker_hooks.extend([
                training.NanTensorHook(estimator_spec.loss),
                training.LoggingTensorHook(
                    {
                        'loss': estimator_spec.loss,
                        'step': global_step_tensor
                    },
                    every_n_iter=100)
            ])
            worker_hooks.extend(estimator_spec.training_hooks)

            if not (estimator_spec.scaffold.saver
                    or ops.get_collection(ops.GraphKeys.SAVERS)):
                ops.add_to_collection(
                    ops.GraphKeys.SAVERS,
                    training.Saver(
                        sharded=True,
                        max_to_keep=self._config.keep_checkpoint_max,
                        keep_checkpoint_every_n_hours=(
                            self._config.keep_checkpoint_every_n_hours),
                        defer_build=True,
                        save_relative_paths=True))

            chief_hooks = []
            all_hooks = worker_hooks + list(
                estimator_spec.training_chief_hooks)
            saver_hooks = [
                h for h in all_hooks
                if isinstance(h, training.CheckpointSaverHook)
            ]
            if (self._config.save_checkpoints_secs
                    or self._config.save_checkpoints_steps):
                if not saver_hooks:
                    chief_hooks = [
                        training.CheckpointSaverHook(
                            self._model_dir,
                            save_secs=self._config.save_checkpoints_secs,
                            save_steps=self._config.save_checkpoints_steps,
                            scaffold=estimator_spec.scaffold)
                    ]
                    saver_hooks = [chief_hooks[0]]
            if saving_listeners:
                if not saver_hooks:
                    raise ValueError(
                        'There should be a CheckpointSaverHook to use saving_listeners. '
                        'Please set one of the RunConfig.save_checkpoints_steps or '
                        'RunConfig.save_checkpoints_secs.')
                else:
                    # Exactly one CheckpointSaverHook is expected. If there are
                    # multiple, we pick the first one and attach the listeners to it.
                    saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
            with training.MonitoredTrainingSession(
                    master=self._config.master,
                    is_chief=self._config.is_chief,
                    checkpoint_dir=self._model_dir,
                    scaffold=estimator_spec.scaffold,
                    hooks=worker_hooks,
                    chief_only_hooks=(
                        tuple(chief_hooks) +
                        tuple(estimator_spec.training_chief_hooks)),
                    save_checkpoint_secs=0,  # Saving is handled by a hook.
                    save_summaries_steps=self._config.save_summary_steps,
                    config=self._session_config,
                    log_step_count_steps=self._config.log_step_count_steps
            ) as mon_sess:
                loss = None
                while not mon_sess.should_stop():
                    _, loss = mon_sess.run(
                        [estimator_spec.train_op, estimator_spec.loss])
            return loss
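
A minimal, self-contained sketch of the same hook-driven training loop using public TF 1.x APIs (the constant loss, the train_op, and the StopAtStepHook are illustrative stand-ins, not the Estimator internals):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # only needed when running under TF 2.x

with tf.Graph().as_default():
    step = tf.train.get_or_create_global_step()
    loss = tf.constant(1.0)            # stand-in for estimator_spec.loss
    train_op = tf.assign_add(step, 1)  # stand-in for estimator_spec.train_op
    hooks = [
        tf.train.NanTensorHook(loss),  # request a stop if the loss becomes NaN
        tf.train.LoggingTensorHook({'loss': loss, 'step': step},
                                   every_n_iter=100),
        tf.train.StopAtStepHook(last_step=500),
    ]
    with tf.train.MonitoredTrainingSession(hooks=hooks) as mon_sess:
        while not mon_sess.should_stop():
            _, loss_value = mon_sess.run([train_op, loss])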
Example #2
    def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
        d, master_target, config = self._get_test_object(
            task_type, task_id, num_gpus)
        with ops.Graph().as_default(), \
             self.cached_session(config=config,
                                 target=master_target) as sess, \
             d.scope():
            l = core.Dense(1,
                           use_bias=False,
                           name='gpu_%d' % d.extended._num_gpus_per_worker)

            def loss_fn(x):
                y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
                return y * y

            # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
            # multiple graphs (b/111216820).
            def grad_fn(x):
                loss = loss_fn(x)
                var_list = (variables.trainable_variables() +
                            ops.get_collection(
                                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
                grads = gradients.gradients(loss, var_list)
                ret = list(zip(grads, var_list))
                return ret

            def update(v, g):
                return v.assign_sub(0.05 * g, use_locking=True)

            one = constant_op.constant([[1.]])

            def step():
                """Perform one optimization step."""
                # Run forward & backward to get gradients, variables list.
                g_v = d.extended.call_for_each_replica(grad_fn, args=[one])
                # Update the variables using the gradients and the update() function.
                before_list = []
                after_list = []
                for g, v in g_v:
                    fetched = d.extended.read_var(v)
                    before_list.append(fetched)
                    with ops.control_dependencies([fetched]):
                        # TODO(yuefengz): support non-Mirrored variable as destinations.
                        g = d.extended.reduce_to(reduce_util.ReduceOp.SUM,
                                                 g,
                                                 destinations=v)
                        with ops.control_dependencies(
                                d.extended.update(v,
                                                  update,
                                                  args=(g, ),
                                                  group=False)):
                            after_list.append(d.extended.read_var(v))
                return before_list, after_list

            before_out, after_out = step()

            if context.num_gpus() < d.extended._num_gpus_per_worker:
                return True

            sess.run(variables.global_variables_initializer())

            for i in range(10):
                b, a = sess.run((before_out, after_out))
                if i == 0:
                    before, = b
                after, = a

            error_before = abs(before - 1)
            error_after = abs(after - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
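
Distribution machinery aside, the update itself is plain SGD. A single-device sketch of the same step under TF 1.x, with the Dense layer replaced by an explicit weight variable:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # only needed when running under TF 2.x

with tf.Graph().as_default(), tf.Session() as sess:
    w = tf.Variable([[0.0]])                    # stand-in for the Dense kernel
    x = tf.constant([[1.0]])
    y = tf.reshape(tf.matmul(x, w), []) - 1.0   # same loss_fn as above
    loss = y * y
    grad, = tf.gradients(loss, [w])
    train_op = w.assign_sub(0.05 * grad)        # same update() as above
    sess.run(tf.global_variables_initializer())
    error_before = abs(sess.run(y))
    for _ in range(10):
        sess.run(train_op)
    assert abs(sess.run(y)) < error_before      # error should go down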
Example #3
 def testInvalidShape(self):
   init = init_ops.identity_initializer()
   with self.session(graph=ops.Graph(), use_gpu=True):
     self.assertRaises(ValueError, init, shape=[5, 7, 7])
     self.assertRaises(ValueError, init, shape=[5])
     self.assertRaises(ValueError, init, shape=[])
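
For contrast, a sketch of the valid case via the public TF 1.x alias of this initializer, which accepts only 2-D shapes:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # only needed when running under TF 2.x

with tf.Graph().as_default(), tf.Session() as sess:
    init = tf.initializers.identity()
    eye = init(shape=[3, 3])   # OK: a 2-D shape
    print(sess.run(eye))       # 3x3 identity matrix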
Example #4
    def _testScopedImport(self, test_dir, exported_filenames):
        graph = ops.Graph()
        # Create all the missing inputs.
        with graph.as_default():
            new_image = constant_op.constant(1.2,
                                             dtypes.float32,
                                             shape=[100, 28],
                                             name="images")

        with self.assertRaisesRegexp(ValueError,
                                     "Graph contains unbound inputs"):
            meta_graph.import_scoped_meta_graph(os.path.join(
                test_dir, exported_filenames[0]),
                                                graph=graph,
                                                import_scope="new_hidden1")

        with self.assertRaisesRegexp(ValueError,
                                     "Graph contains unbound inputs"):
            meta_graph.import_scoped_meta_graph(
                os.path.join(test_dir, exported_filenames[0]),
                graph=graph,
                input_map={"image:0": new_image},
                import_scope="new_hidden1")

        # Verifies we can import the original "hidden1" into "new_hidden1".
        var_list = meta_graph.import_scoped_meta_graph(
            os.path.join(test_dir, exported_filenames[0]),
            graph=graph,
            input_map={"$unbound_inputs_images": new_image},
            import_scope="new_hidden1")

        self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))
        new_var_names = [v.name for _, v in var_list.items()]
        self.assertEqual(["new_hidden1/biases:0", "new_hidden1/weights:0"],
                         sorted(new_var_names))

        # Verifies we can import the original "hidden2" into "new_hidden2".
        hidden1 = array_ops.identity(
            graph.as_graph_element("new_hidden1/Relu:0"), name="hidden1/Relu")
        var_list = meta_graph.import_scoped_meta_graph(
            os.path.join(test_dir, exported_filenames[1]),
            graph=graph,
            input_map={"$unbound_inputs_hidden1/Relu": hidden1},
            import_scope="new_hidden2",
            unbound_inputs_col_name=None)

        self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))
        new_var_names = [v.name for _, v in var_list.items()]
        self.assertEqual(["new_hidden2/biases:0", "new_hidden2/weights:0"],
                         sorted(new_var_names))

        # Verifies we can import the original "softmax_linear" into
        # "new_softmax_linear".
        hidden2 = array_ops.identity(
            graph.as_graph_element("new_hidden2/Relu:0"), name="hidden2/Relu")
        var_list = meta_graph.import_scoped_meta_graph(
            os.path.join(test_dir, exported_filenames[2]),
            graph=graph,
            input_map={"$unbound_inputs_hidden2/Relu": hidden2},
            import_scope="new_softmax_linear",
            unbound_inputs_col_name=None)
        self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))
        new_var_names = [v.name for _, v in var_list.items()]
        self.assertEqual(
            ["new_softmax_linear/biases:0", "new_softmax_linear/weights:0"],
            sorted(new_var_names))

        # Exports the scoped meta graphs again.
        new_meta_graph1, var_list = meta_graph.export_scoped_meta_graph(
            graph=graph, export_scope="new_hidden1")
        self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))

        new_meta_graph2, var_list = meta_graph.export_scoped_meta_graph(
            graph=graph,
            export_scope="new_hidden2",
            unbound_inputs_col_name=None)
        self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))

        new_meta_graph3, var_list = meta_graph.export_scoped_meta_graph(
            graph=graph,
            export_scope="new_softmax_linear",
            unbound_inputs_col_name=None)
        self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))

        return [new_meta_graph1, new_meta_graph2, new_meta_graph3]
Example #5
 def testShapeInferenceUnknownShape(self):
     with ops.Graph().as_default():
         indices = array_ops.placeholder(dtypes.int64)
         shape = array_ops.placeholder(dtypes.int64)
         output = sparse_ops.sparse_to_dense(indices, shape, 1, 0)
         self.assertIsNone(output.get_shape().ndims)
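
When the output shape is a static constant rather than a placeholder, shape inference succeeds. A small sketch using the deprecated but public TF 1.x op:

import tensorflow.compat.v1 as tf

with tf.Graph().as_default():
    indices = tf.constant([[0, 0], [1, 2]], dtype=tf.int64)
    dense = tf.sparse_to_dense(indices, [3, 4], 1, 0)
    print(dense.get_shape())   # (3, 4): inferred from the constant shape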
Example #6
    def _test_device_assignment_local(self,
                                      d,
                                      compute_device='CPU',
                                      variable_device='CPU',
                                      num_gpus=0):
        with ops.Graph().as_default(), \
             self.cached_session(target=self._default_target,
                                 config=self._sess_config) as sess, \
             d.scope():

            def model_fn():
                if 'CPU' in compute_device:
                    replica_compute_device = '/device:CPU:0'
                else:
                    replica_id = _get_replica_id_integer()
                    replica_compute_device = ('/device:GPU:%d' % replica_id)
                replica_compute_device = device_util.canonicalize(
                    replica_compute_device)

                if 'CPU' in variable_device:
                    replica_variable_device = '/device:CPU:0'
                else:
                    replica_id = _get_replica_id_integer()
                    replica_variable_device = ('/device:GPU:%d' % replica_id)
                replica_variable_device = device_util.canonicalize(
                    replica_variable_device)

                a = constant_op.constant(1.0)
                b = constant_op.constant(2.0)
                c = a + b
                self.assertEqual(a.device, replica_compute_device)
                self.assertEqual(b.device, replica_compute_device)
                self.assertEqual(c.device, replica_compute_device)

                # The device scope is ignored for variables but not for normal ops.
                with ops.device('/device:GPU:2'):
                    x = variable_scope.get_variable(
                        'x',
                        initializer=10.0,
                        aggregation=variable_scope.VariableAggregation.SUM)
                    x_add = x.assign_add(c)
                    e = a + c
                self.assertEqual(device_util.canonicalize(x.device),
                                 replica_variable_device)
                self.assertEqual(x_add.device, x.device)
                self.assertEqual(e.device,
                                 device_util.canonicalize('/device:GPU:2'))

                # The colocate_vars_with can override the distribution's device.
                with d.colocate_vars_with(x):
                    y = variable_scope.get_variable(
                        'y',
                        initializer=20.0,
                        aggregation=variable_scope.VariableAggregation.SUM)
                # We add an identity here to avoid complaints about summing
                # non-distributed values.
                y_add = y.assign_add(array_ops.identity(x_add))
                self.assertEqual(device_util.canonicalize(y.device),
                                 replica_variable_device)
                self.assertEqual(y_add.device, y.device)
                self.assertEqual(y.device, x.device)

                z = variable_scope.get_variable(
                    'z',
                    initializer=10.0,
                    aggregation=variable_scope.VariableAggregation.SUM)
                self.assertEqual(device_util.canonicalize(z.device),
                                 replica_variable_device)

                with ops.control_dependencies([y_add]):
                    # We add an identity here to avoid complaints about summing
                    # non-distributed values.
                    z_add = z.assign_add(array_ops.identity(y))
                with ops.control_dependencies([z_add]):
                    f = z + c
                self.assertEqual(f.device, replica_compute_device)

                # The device scope would merge with the default worker device.
                with ops.device('/CPU:1'):
                    g = e + 1.0
                self.assertEqual(g.device,
                                 device_util.canonicalize('/device:CPU:1'))

                # The ops.colocate_with will be ignored when defining a variable but not
                # for a normal tensor.
                with ops.colocate_with(x):
                    u = variable_scope.get_variable('u', initializer=30.0)
                    h = f + 1.0
                self.assertEqual(device_util.canonicalize(u.device),
                                 replica_variable_device)
                self.assertEqual(device_util.canonicalize(x.device),
                                 device_util.canonicalize(h.device))
                return y_add, z_add, f

            y, z, f = d.call_for_each_replica(model_fn)
            self.assertNotEqual(y, None)
            self.assertNotEqual(z, None)
            self.assertNotEqual(f, None)

            if context.num_gpus() >= 1 and num_gpus <= 1:
                variables.global_variables_initializer().run()
                y_val, z_val, f_val = sess.run([y, z, f])
                self.assertEqual(y_val, 33.0)
                self.assertEqual(z_val, 43.0)
                self.assertEqual(f_val, 46.0)
Example #7
    def _test_minimize_loss_graph(self,
                                  task_type,
                                  task_id,
                                  num_gpus,
                                  use_core_strategy=False):
        d, master_target, sess_config = self._get_test_objects(
            task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
        if task_type:
            # Multi-worker
            assert hasattr(d.extended,
                           '_cluster_spec') and d.extended._cluster_spec
            num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
            if CHIEF in d.extended._cluster_spec.as_dict():
                num_workers += 1
        else:
            # local
            num_workers = 1

        with ops.Graph().as_default(), \
             self.cached_session(target=master_target,
                                 config=sess_config) as sess, \
             d.scope():
            l = core.Dense(1, use_bias=False)

            def loss_fn(x):
                y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
                return y * y

            # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
            # multiple graphs (b/111216820).
            def grad_fn(x):
                loss = loss_fn(x)
                var_list = (variables.trainable_variables() +
                            ops.get_collection(
                                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
                grads = gradients.gradients(loss, var_list)
                ret = list(zip(grads, var_list))
                return ret

            def update(v, g):
                return v.assign_sub(0.05 * g, use_locking=True)

            one = d.broadcast(constant_op.constant([[1.]]))

            def step():
                """Perform one optimization step."""
                # Run forward & backward to get gradients, variables list.
                g_v = d.call_for_each_replica(grad_fn, args=(one, ))
                # Update the variables using the gradients and the update() function.
                before_list = []
                after_list = []
                for g, v in g_v:
                    fetched = d.extended.read_var(v)
                    before_list.append(fetched)
                    with ops.control_dependencies([fetched]):
                        # TODO(yuefengz): support non-Mirrored variable as destinations.
                        g = d.extended.reduce_to(reduce_util.ReduceOp.SUM,
                                                 g,
                                                 destinations=v)
                        with ops.control_dependencies(
                                d.update(v, update, g, grouped=False)):
                            after_list.append(d.extended.read_var(v))
                return before_list, after_list

            before_out, after_out = step()

            if context.num_gpus() < d.extended._num_gpus_per_worker:
                return True

            if (not task_type or multi_worker_util.is_chief(
                    d.extended._cluster_spec, task_type, task_id)):
                variables.global_variables_initializer().run()

            # Workers wait for the chief worker to finish initializing variables.
            self._init_condition.acquire()
            self._init_reached += 1
            while self._init_reached != num_workers:
                self._init_condition.wait()
            self._init_condition.notify_all()
            self._init_condition.release()

            for i in range(10):
                b, a = sess.run((before_out, after_out))
                if i == 0:
                    before, = b
                after, = a

            error_before = abs(before - 1)
            error_after = abs(after - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
            return error_after < error_before
Example #8
    def testPartitionedVariableAssignments(self):
        with ops.Graph().as_default(), self.cached_session():
            v0 = variables.Variable(initial_value=[0.0])
            v1 = variables.Variable(initial_value=[1.0])
            v2 = variables.Variable(initial_value=[20.0])
            v3 = variables.Variable(initial_value=[30.0])
            v0._set_save_slice_info(
                variables.Variable.SaveSliceInfo(v0.name, [2], [0], [1]))
            v1._set_save_slice_info(
                variables.Variable.SaveSliceInfo(v1.name, [2], [1], [1]))
            v2._set_save_slice_info(
                variables.Variable.SaveSliceInfo(v2.name, [2], [0], [1]))
            v3._set_save_slice_info(
                variables.Variable.SaveSliceInfo(v3.name, [2], [1], [1]))

            partitions = [2]

            # Pass variable_list as [v1, v0] to ensure they are properly
            # re-sorted to [v0, v1] based on their slice info offsets.
            pv_0 = variables.PartitionedVariable(name="two_vars",
                                                 shape=[2],
                                                 dtype=v0.dtype,
                                                 variable_list=[v0, v1],
                                                 partitions=partitions)

            pv_1 = variables.PartitionedVariable(name="two_vars",
                                                 shape=[2],
                                                 dtype=v0.dtype,
                                                 variable_list=[v2, v3],
                                                 partitions=partitions)

            deltas_a = constant_op.constant([1.0, 2.0])
            deltas_b = constant_op.constant([3.0, 4.0])
            ones = array_ops.ones([2])
            plus_delta = pv_0.assign_add(deltas_a)
            minus_delta = pv_0.assign_sub(deltas_b)
            assign_ones = pv_0.assign(ones)

            c_0 = constant_op.constant([2.0])
            c_1 = constant_op.constant([3.0])
            assign_list = pv_1.assign([c_0, c_1])
            assign_part_value = pv_1.assign_add(assign_ones)
            assign_part_var = pv_1.assign_sub(pv_0)
            variables.global_variables_initializer().run()

            self.assertEqual([1.0], plus_delta[0].eval())
            self.assertEqual([1.0], self.evaluate(v0))
            self.assertEqual([3.0], plus_delta[1].eval())
            self.assertEqual([3.0], self.evaluate(v1))

            self.assertEqual([-2.0], minus_delta[0].eval())
            self.assertEqual([-2.0], self.evaluate(v0))
            self.assertEqual([-1.0], minus_delta[1].eval())
            self.assertEqual([-1.0], self.evaluate(v1))

            self.assertEqual([1.0], assign_ones[0].eval())
            self.assertEqual([1.0], self.evaluate(v0))
            self.assertEqual([1.0], assign_ones[1].eval())
            self.assertEqual([1.0], self.evaluate(v1))

            self.assertEqual([2.0], assign_list[0].eval())
            self.assertEqual([2.0], self.evaluate(v2))
            self.assertEqual([3.0], assign_list[1].eval())
            self.assertEqual([3.0], self.evaluate(v3))

            self.assertEqual([3.0], assign_part_value[0].eval())
            self.assertEqual([3.0], self.evaluate(v2))
            self.assertEqual([4.0], assign_part_value[1].eval())
            self.assertEqual([4.0], self.evaluate(v3))

            self.assertEqual([2.0], assign_part_var[0].eval())
            self.assertEqual([2.0], self.evaluate(v2))
            self.assertEqual([3.0], assign_part_var[1].eval())
            self.assertEqual([3.0], self.evaluate(v3))
Example #9
def _export_mode(mode, has_saved_vars, builder, model, custom_objects,
                 checkpoint_path, input_signature):
    """Exports a model, and optionally saves new vars from the clone model.

  Args:
    mode: A `tf.estimator.ModeKeys` string.
    has_saved_vars: A `boolean` indicating whether the SavedModel has already
      exported variables.
    builder: A `SavedModelBuilder` object.
    model: A `tf.keras.Model` object.
    custom_objects: A dictionary mapping string names to custom classes
      or functions.
    checkpoint_path: String path to checkpoint.
    input_signature: Nested TensorSpec containing the expected inputs. Can be
      `None`, in which case the signature will be inferred from the model.

  Raises:
    ValueError: If the train/eval mode is being exported, but the model does
      not have an optimizer.
  """
    from tensorflow.python.keras import models as models_lib  # pylint: disable=g-import-not-at-top
    compile_clone = (mode != mode_keys.ModeKeys.PREDICT)
    if compile_clone and not model.optimizer:
        raise ValueError(
            'Model does not have an optimizer. Cannot export mode %s' % mode)

    model_graph = ops.get_default_graph()
    with ops.Graph().as_default() as g, K.learning_phase_scope(
            mode == mode_keys.ModeKeys.TRAIN):

        if input_signature is None:
            input_tensors = None
        else:
            input_tensors = nest.map_structure(create_placeholder,
                                               input_signature)

        # Clone the model into blank graph. This will create placeholders for inputs
        # and targets.
        clone = models_lib.clone_and_build_model(model,
                                                 input_tensors=input_tensors,
                                                 custom_objects=custom_objects,
                                                 compile_clone=compile_clone)

        # Make sure that iterations variable is added to the global step collection,
        # to ensure that, when the SavedModel graph is loaded, the iterations
        # variable is returned by `tf.compat.v1.train.get_global_step()`. This is
        # required for compatibility with the SavedModelEstimator.
        if compile_clone:
            g.add_to_collection(ops.GraphKeys.GLOBAL_STEP,
                                clone.optimizer.iterations)

        # Extract update and train ops from train/test/predict functions.
        train_op = None
        if mode == mode_keys.ModeKeys.TRAIN:
            clone._make_train_function()
            train_op = clone.train_function.updates_op
        elif mode == mode_keys.ModeKeys.TEST:
            clone._make_test_function()
        else:
            clone._make_predict_function()
        g.get_collection_ref(ops.GraphKeys.UPDATE_OPS).extend(
            clone.state_updates)

        with session.Session().as_default():
            clone_var_list = _get_var_list(clone)
            if has_saved_vars:
                # Confirm all variables in the clone have an entry in the checkpoint.
                status = clone.load_weights(checkpoint_path)
                status.assert_existing_objects_matched()
            else:
                # Confirm that variables between the clone and model match up exactly,
                # not counting optimizer objects. Optimizer objects are ignored because
                # if the model has not trained, the slot variables will not have been
                # created yet.
                # TODO(b/113179535): Replace with trackable equivalence.
                _assert_same_non_optimizer_objects(model, model_graph, clone,
                                                   g)

                # TODO(b/113178242): Use value transfer for trackable objects.
                clone.load_weights(checkpoint_path)

                # Add graph and variables to SavedModel.
                # TODO(b/113134168): Switch to add_meta_graph_and_variables.
                clone.save_weights(checkpoint_path,
                                   save_format='tf',
                                   overwrite=True)
                builder._has_saved_variables = True

            # Add graph to the SavedModel builder.
            builder.add_meta_graph(
                model_utils.EXPORT_TAG_MAP[mode],
                signature_def_map=_create_signature_def_map(clone, mode),
                saver=saver_lib.Saver(
                    clone_var_list,
                    # Allow saving Models with no variables. This is somewhat odd, but
                    # it's not necessarily a bug.
                    allow_empty=True),
                init_op=variables.local_variables_initializer(),
                train_op=train_op)
        return None
Example #10
 def testZeroSizeVarInitialized(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
         v = variables.Variable(array_ops.zeros([0, 2]), name="v")
         uninited = variables.report_uninitialized_variables()
         v.initializer.run()  # not strictly necessary
         self.assertEqual(0, self.evaluate(uninited).size)
Example #11
 def testNoVars(self):
     with ops.Graph().as_default():
         self.assertEqual(None, variables.assert_variables_initialized())
Example #12
 def testNoVars(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
         uninited = variables.report_uninitialized_variables()
         self.assertEqual(0, self.evaluate(uninited).size)
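
For contrast, a sketch of the non-empty case with the public TF 1.x API:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # only needed when running under TF 2.x

with tf.Graph().as_default(), tf.Session() as sess:
    v = tf.Variable([1.0], name='v')
    uninited = tf.report_uninitialized_variables()
    print(sess.run(uninited))        # [b'v']: v is not initialized yet
    sess.run(tf.global_variables_initializer())
    print(sess.run(uninited).size)   # 0: nothing left uninitialized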
Example #13
def _query_tpu_system_metadata(master_address, cluster_def=None,
                               query_topology=False):
  """Automatically detects the TPU system metadata in the system."""
  tpu_core_count = 0
  devices = []
  device_dict = collections.defaultdict(list)

  # TODO(b/120564445): Replace with standard library for retries.
  retry_count = 1
  while True:
    logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
                 master_address)
    try:
      with ops.Graph().as_default():
        with session_lib.Session(
            master_address,
            config=get_session_config_with_timeout(
                _PINGING_MASTER_TIMEOUT_IN_MS,
                cluster_def)) as sess:
          devices = sess.list_devices()
          for device in devices:
            match = _TPU_DEVICE_REG.match(device.name)
            if match:
              host_id = match.group(1)
              core_id = match.group(2)
              device_dict[host_id].append(core_id)
              tpu_core_count += 1
          break
    except errors.DeadlineExceededError:
      msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
             'not be ready (still scheduling) or the Tensorflow master address '
             'is incorrect: got (%s).' %
             (master_address))

      # TODO(xiejw): For local or grpc master we might not need retry logic
      # here.
      if retry_count <= _RETRY_TIMES:
        logging.warning('%s', msg)
        logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
        retry_count += 1
      else:
        raise ValueError(msg)

  num_of_cores_per_host = 0
  if tpu_core_count:
    num_cores_per_host_set = set(
        [len(core_ids) for core_ids in device_dict.values()])
    if len(num_cores_per_host_set) != 1:
      raise RuntimeError(
          'The number of TPU cores on each host is not the same. This should not happen. '
          'devices: {}'.format(devices))
    num_of_cores_per_host = num_cores_per_host_set.pop()

  topology = None
  if query_topology:
    if not tpu_core_count:
      raise RuntimeError(
          'Cannot find any TPU cores in the system (master address {}). '
          'This usually means the master address is incorrect or the '
          'TPU worker has some problems. Available devices: {}'.format(
              master_address, devices))

    topology = _obtain_topology(master_address, cluster_def)

  metadata = _TPUSystemMetadata(
      num_cores=tpu_core_count,
      num_hosts=len(device_dict),
      num_of_cores_per_host=num_of_cores_per_host,
      topology=topology,
      devices=devices)

  if tpu_core_count:
    logging.info('Found TPU system:')
    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
    logging.info('*** Num TPU Cores Per Worker: %d',
                 metadata.num_of_cores_per_host)
    for device in metadata.devices:
      logging.info('*** Available Device: %s', device)
  else:
    logging.info('Failed to find TPU: %s', metadata)
  return metadata
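
The retry logic above is an instance of the usual bounded-retry loop (the TODO notes it should move to a standard library). A generic, library-free sketch of the same pattern; query_fn and the exception type are illustrative stand-ins, not the TPU client:

import time

def query_with_retries(query_fn, retries=3, delay_s=1.0):
    for attempt in range(1, retries + 1):
        try:
            return query_fn()
        except TimeoutError:  # stand-in for errors.DeadlineExceededError
            if attempt == retries:
                raise ValueError('Master not reachable after %d attempts.' % retries)
            time.sleep(delay_s)  # back off before the next attempt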
Example #14
    def _evaluate_model(self,
                        input_fn,
                        hooks=None,
                        checkpoint_path=None,
                        name=''):
        """Evaluates the model using the training.evaluation library."""
        # Check that model has been trained (if nothing has been set explicitly).
        if not checkpoint_path:
            latest_path = saver.latest_checkpoint(self._model_dir)
            if not latest_path:
                raise ValueError(
                    'Could not find trained model in model_dir: {}.'.format(
                        self._model_dir))
            checkpoint_path = latest_path

        # Setup output directory.
        eval_dir = os.path.join(self._model_dir,
                                'eval' if not name else 'eval_' + name)

        with ops.Graph().as_default() as g:
            random_seed.set_random_seed(self._config.tf_random_seed)
            global_step_tensor = self._create_and_assert_global_step(g)
            features, labels = self._get_features_and_labels_from_input_fn(
                input_fn, model_fn_lib.ModeKeys.EVAL)
            estimator_spec = self._call_model_fn(features, labels,
                                                 model_fn_lib.ModeKeys.EVAL,
                                                 self.config)

            if model_fn_lib.LOSS_METRIC_KEY in estimator_spec.eval_metric_ops:
                raise ValueError(
                    'Metric with name "%s" is not allowed, because Estimator '
                    % (model_fn_lib.LOSS_METRIC_KEY) +
                    'already defines a default metric with the same name.')
            estimator_spec.eval_metric_ops[
                model_fn_lib.LOSS_METRIC_KEY] = metrics_lib.mean(
                    estimator_spec.loss)

            update_op, eval_dict = _extract_metric_update_ops(
                estimator_spec.eval_metric_ops)

            if ops.GraphKeys.GLOBAL_STEP in eval_dict:
                raise ValueError(
                    'Metric with name `global_step` is not allowed, because Estimator '
                    'already defines a default metric with the same name.')
            eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor

            all_hooks = list(hooks or [])
            all_hooks.extend(list(estimator_spec.evaluation_hooks or []))

            eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
                checkpoint_path=checkpoint_path,
                master=self._config.evaluation_master,
                scaffold=estimator_spec.scaffold,
                eval_ops=update_op,
                final_ops=eval_dict,
                hooks=all_hooks,
                config=self._session_config)

            _write_dict_to_summary(
                output_dir=eval_dir,
                dictionary=eval_results,
                current_global_step=eval_results[ops.GraphKeys.GLOBAL_STEP])

        return eval_results
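
The injected average-loss metric follows the standard (value, update_op) contract of TF 1.x metrics. A sketch with the public API:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # only needed when running under TF 2.x

with tf.Graph().as_default(), tf.Session() as sess:
    batch_loss = tf.placeholder(tf.float32)
    mean_loss, update_op = tf.metrics.mean(batch_loss)
    sess.run(tf.local_variables_initializer())  # metric state lives in local vars
    for value in [1.0, 3.0]:
        sess.run(update_op, feed_dict={batch_loss: value})
    print(sess.run(mean_loss))  # 2.0: the running mean across both batches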
Example #15
def define_function(func, input_types):
    """Creates a `FunctionDef` for a python function.

  `func` is a Python function that receives zero or more tensors and returns at
  least one tensor.  It should add ops to the default graph the usual way by
  calling TensorFlow functions such as `tf.constant()`, `tf.matmul()`, etc.

  `input_types` is a dictionary of strings to `tf.DType` objects.  Keys are
  the names of arguments to `func`.  The values indicate the type of tensor
  expected by the function.

  The returned `FunctionDef` protocol buffer is also added to the
  default graph library.  After it has been added you can add calls to
  the function by passing it to `tf.call_function()`, together with a
  list of tensors to use as inputs for the function.

  Notes:

  *  `func` is called once, with `placeholder` tensors of the types specified in
     `input_types` as arguments.
  *  Values returned by `func` must be tensors and they are recorded as being
     the output of the function def.
  *  While `func` is called, an empty graph is temporarily pushed as the
     default graph.  All ops added by `func` to that graph are part of the body
     of the returned function def.

  An example; see also the [How To on functions](link_needed).

  ```python
  # A function that receives two tensors x, y and returns their
  # sum and difference.
  def my_func(x, y):
    return x + y, x - y

  # Create a FunctionDef for 'my_func'. (This does not change the default
  # graph.)
  my_func_def = tf.define_function(my_func, {'x': tf.float32, 'y': tf.float32})

  # Build the graph, calling the function.
  a = tf.constant([1.0])
  b = tf.constant([2.0])
  c, d = tf.call_function(my_func_def, a, b, name='mycall')
  ```

  Args:
    func: a Python function.
    input_types: dict.  Keys are the names of the arguments of `func`, values
      are their expected `tf.DType`.

  Returns:
    A FunctionDef protocol buffer.

  Raises:
    ValueError: if the arguments are invalid.

  """
    # TODO(touts): Lift the limitation that func can only receive Tensor args.
    if inspect.isfunction(func):
        func_name = func.__name__
    elif inspect.ismethod(func):
        func_name = func.__self__.__name__ + "." + func.__name__
    else:
        raise ValueError("Argument must be a function")
    argspec = inspect.getargspec(func)
    if argspec.varargs or argspec.keywords or argspec.defaults:
        raise ValueError("Only functions with plain arglists are supported.")
    if inspect.isfunction(func):
        if len(argspec.args) != len(input_types):
            raise ValueError(
                "The function must have the same number of arguments "
                "as the number of specified input types.")
        args = argspec.args
    elif inspect.ismethod(func):
        if len(argspec.args) != 1 + len(input_types):
            raise ValueError(
                "The class function must have the same number of arguments "
                "as the number of specified input types.")
        args = argspec.args[1:]  # 1st argument is the "class" type.

    # Create the func_def object.
    temp_graph = ops.Graph()
    with temp_graph.as_default():
        # List of placeholders for the function_def.
        inputs = []
        # Arglist to call 'func'
        kwargs = {}
        for argname in args:
            if argname not in input_types:
                raise ValueError("Missing type for argument: " + argname)
            argholder = array_ops.placeholder(input_types[argname],
                                              name=argname)
            inputs.append(argholder)
            kwargs[argname] = argholder
        # Call func and gather the output tensors.
        outputs = func(**kwargs)
        if not outputs:
            raise ValueError("Function must return at least one tensor")
        # Convenience: if func only returned one value, make it a tuple.
        if not isinstance(outputs, (list, tuple)):
            outputs = (outputs, )
    # Build the FunctionDef
    func_def = graph_to_function_def(temp_graph, func_name, inputs, outputs)
    g = ops.get_default_graph()
    g._add_function(func_def)  # pylint: disable=protected-access
    return func_def
Example #16
    def _testGradient(self, inputs, weights, expected_jacobians_wrt_input,
                      lattice_sizes, is_hypercube):
        """Compute the grad_wrt_input and compare it with expected_grad_wrt_input.

    Args:
      inputs: a 2D array (or numpy array) contains the test inputs. Its shape
        should be num_examples x input_size.
      weights: a 2D array (or numpy array) contains the test weights. Its
        shape should be num_examples x weight_size.
      expected_jacobians_wrt_input: 3D array (or numpy array) contains a transposed
        jacobian matrix that contains dweight/dinput with shape (num_examples,
        weight_size, input_size).
        In other words, expected_jacobians_wrt_input[num][ii][jj] ==
          dweight[num][jj]/dinput[num][ii], where num means the current example.
      lattice_sizes: A list of lattice_sizes.
      is_hypercube: If true, hypercube gradient is tested, otherwise simplex
        gradient is tested.

    Returns: None

    Raises: Fails if computed jacobian_wrt_inputs != expected_jacobian_wrt_input.
    """

        # Number of test examples in inputs.
        num_examples = len(inputs)
        weight_size = len(weights[0])

        # Define the grad_wrt_input_tensor.
        with ops.Graph().as_default():
            input_tensor = constant_op.constant(inputs, dtype=dtypes.float32)
            weight_tensor = constant_op.constant(weights, dtype=dtypes.float32)
            grad_wrt_weight_tensor = array_ops.placeholder(
                dtype=dtypes.float32, shape=(num_examples, weight_size))

            if is_hypercube:
                grad_wrt_input_tensor = lattice_ops.hypercube_gradient(
                    input_tensor, weight_tensor, grad_wrt_weight_tensor,
                    lattice_sizes)
            else:
                grad_wrt_input_tensor = lattice_ops.simplex_gradient(
                    input_tensor, weight_tensor, grad_wrt_weight_tensor,
                    lattice_sizes)

            # Compute the Jacobian.
            with self.test_session(use_gpu=False):
                tf_logging.info("input = %s " % inputs)
                tf_logging.info("weight = %s " % weights)
                # num_examples x weight_size x input_size tensor.
                jacobians_wrt_input = []
                # Compute dweight[cnt] / dinput.
                for cnt in range(weight_size):
                    grad_wrt_weight = [0.] * weight_size
                    grad_wrt_weight[cnt] = 1.0
                    grad_wrt_weights = [
                        grad_wrt_weight for _ in range(num_examples)
                    ]
                    tf_logging.info("grad_wrt_weights = %s " %
                                    grad_wrt_weights)
                    # num_examples x input_size matrix.
                    grad_weight_wrt_inputs = grad_wrt_input_tensor.eval(
                        feed_dict={grad_wrt_weight_tensor: grad_wrt_weights})
                    tf_logging.info("grad_wrt_inputs = %s " %
                                    grad_weight_wrt_inputs)
                    jacobians_wrt_input.append(grad_weight_wrt_inputs)
            tf_logging.info("jacobian_wrt_inputs = %s " % jacobians_wrt_input)
            tf_logging.info("expected_jacobian_wrt_inputs = %s" %
                            expected_jacobians_wrt_input)
            self.assertAllClose(jacobians_wrt_input,
                                expected_jacobians_wrt_input)
Example #17
    def _test_device_assignment_distributed(self,
                                            task_type,
                                            task_id,
                                            num_gpus,
                                            use_core_strategy=False):
        worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id)
        d, _, sess_config = self._get_test_objects(
            task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
        with ops.Graph().as_default(), \
             self.cached_session(target=self._default_target,
                                 config=sess_config) as sess, \
             d.scope():

            # Define a variable outside the call_for_each_replica scope.
            n = variable_scope.get_variable('n', initializer=10.0)
            self.assertEqual(n.device, '/job:ps/task:0')

            def model_fn():
                if num_gpus == 0:
                    last_part_device = 'device:CPU:0'
                else:
                    replica_id = _get_replica_id_integer()
                    last_part_device = ('device:GPU:%d' % replica_id)

                a = constant_op.constant(1.0)
                b = constant_op.constant(2.0)
                c = a + b
                self.assertEqual(a.device,
                                 worker_device + '/' + last_part_device)
                self.assertEqual(b.device,
                                 worker_device + '/' + last_part_device)
                self.assertEqual(c.device,
                                 worker_device + '/' + last_part_device)

                # The device scope is ignored for variables but not for normal ops.
                with ops.device('/job:worker/task:0'):
                    x = variable_scope.get_variable(
                        'x',
                        initializer=10.0,
                        aggregation=variable_scope.VariableAggregation.SUM)
                    x_add = x.assign_add(c)
                    e = a + c
                # The variable x is on task 1 since the device_function has already
                # been called once before the model_fn.
                self.assertEqual(x.device, '/job:ps/task:1')
                self.assertEqual(x_add.device, x.device)
                self.assertEqual(
                    e.device,
                    '/job:worker/replica:0/task:0/%s' % last_part_device)

                # The colocate_vars_with can override the distribution's device.
                with d.colocate_vars_with(x):
                    y = variable_scope.get_variable(
                        'y',
                        initializer=20.0,
                        aggregation=variable_scope.VariableAggregation.SUM)
                # We add an identity here to avoid complaints about summing
                # non-distributed values.
                y_add = y.assign_add(array_ops.identity(x_add))
                self.assertEqual(y.device, '/job:ps/task:1')
                self.assertEqual(y_add.device, y.device)
                self.assertEqual(y.device, x.device)

                z = variable_scope.get_variable(
                    'z',
                    initializer=10.0,
                    aggregation=variable_scope.VariableAggregation.SUM)
                self.assertEqual(z.device, '/job:ps/task:0')
                self.assertNotEqual(z.device, x.device)

                with ops.control_dependencies([y_add]):
                    # We add an identity here to avoid complaints about summing
                    # non-distributed values.
                    z_add = z.assign_add(array_ops.identity(y))
                with ops.control_dependencies([z_add]):
                    f = z + c
                self.assertEqual(f.device,
                                 worker_device + '/' + last_part_device)

                # The device scope would merge with the default worker device.
                with ops.device('/CPU:1'):
                    g = e + 1.0
                self.assertEqual(g.device, worker_device + '/device:CPU:1')

                # The ops.colocate_with will be ignored when defining a variable but not
                # for a normal tensor.
                with ops.colocate_with(x):
                    u = variable_scope.get_variable('u', initializer=30.0)
                    v = variable_scope.get_variable('v', initializer=30.0)
                    h = f + 1.0
                self.assertIn('/job:ps/', u.device)
                self.assertIn('/job:ps/', v.device)
                # u and v are on different parameter servers.
                self.assertTrue(u.device != x.device or v.device != x.device)
                self.assertTrue(u.device == x.device or v.device == x.device)
                # Here h is not on one worker. Note that h.device is canonical while
                # x.device is not.
                self.assertIn('/job:ps/', h.device)
                return y_add, z_add, f

            y, z, f = d.call_for_each_replica(model_fn)
            self.assertNotEqual(y, None)
            self.assertNotEqual(z, None)
            self.assertNotEqual(f, None)

            if context.num_gpus() >= 1 and num_gpus <= 1:
                variables.global_variables_initializer().run()
                y_val, z_val, f_val = sess.run([y, z, f])
                self.assertEqual(y_val, 33.0)
                self.assertEqual(z_val, 43.0)
                self.assertEqual(f_val, 46.0)
Example #18
 def testGraphPassedToGraph_isForbiddenForThineOwnSafety(self):
   with self.assertRaises(TypeError):
     summary_ops.graph(ops.Graph())
   with self.assertRaises(TypeError):
     summary_ops.graph('')
Example #19
    def _test_simple_increment(self,
                               task_type,
                               task_id,
                               num_gpus,
                               use_core_strategy=False):
        d, master_target, sess_config = self._get_test_objects(
            task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
        if d.extended._cluster_spec:
            num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
            if 'chief' in d.extended._cluster_spec.as_dict():
                num_workers += 1
        else:
            num_workers = 1
        with ops.Graph().as_default(), \
             self.cached_session(target=master_target,
                                 config=sess_config) as sess, \
             d.scope():

            def model_fn():
                x = variable_scope.get_variable(
                    'x',
                    initializer=10.0,
                    aggregation=variable_scope.VariableAggregation.SUM)
                y = variable_scope.get_variable(
                    'y',
                    initializer=20.0,
                    aggregation=variable_scope.VariableAggregation.SUM)
                z = variable_scope.get_variable(
                    'z',
                    initializer=30.0,
                    aggregation=variable_scope.VariableAggregation.
                    ONLY_FIRST_REPLICA)

                # We explicitly make a constant tensor here to avoid complaints about
                # summing non-distributed values.
                one = constant_op.constant(1.0)
                x_add = x.assign_add(one, use_locking=True)
                y_add = y.assign_add(one, use_locking=True)
                z_add = z.assign_add(one, use_locking=True)

                train_op = control_flow_ops.group(x_add, y_add, z_add)
                return x, y, z, train_op

            x, y, z, train_op = d.call_for_each_replica(model_fn)
            train_op = d.group(train_op)

            if context.num_gpus() < d.extended._num_gpus_per_worker:
                return True

            if task_id == 0:
                variables.global_variables_initializer().run()

            # Workers wait for the chief worker to finish initializing variables.
            self._init_condition.acquire()
            self._init_reached += 1
            while self._init_reached != num_workers:
                self._init_condition.wait()
            self._init_condition.notify_all()
            self._init_condition.release()

            sess.run(train_op)

            # Wait for other workers to finish training.
            self._finish_condition.acquire()
            self._finish_reached += 1
            while self._finish_reached != num_workers:
                self._finish_condition.wait()
            self._finish_condition.notify_all()
            self._finish_condition.release()

            x_val, y_val, z_val = sess.run([x, y, z])
            self.assertEqual(x_val,
                             10.0 + 1.0 * num_workers * d.num_replicas_in_sync)
            self.assertEqual(y_val,
                             20.0 + 1.0 * num_workers * d.num_replicas_in_sync)
            self.assertEqual(z_val, 30.0 + 1.0 * num_workers)
            return (x_val == 10.0 + 1.0 * num_workers * d.num_replicas_in_sync
                    and y_val
                    == 20.0 + 1.0 * num_workers * d.num_replicas_in_sync
                    and z_val == 30.0 + 1.0 * num_workers)
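
The condition-variable dance above is a hand-rolled barrier (Python 3's threading.Barrier offers the same behavior). A minimal equivalent sketch:

import threading

class SimpleBarrier(object):
    """Blocks each caller until `parties` threads have arrived."""

    def __init__(self, parties):
        self._cond = threading.Condition()
        self._parties = parties
        self._reached = 0

    def wait(self):
        with self._cond:
            self._reached += 1
            if self._reached == self._parties:
                self._cond.notify_all()  # last arrival releases everyone
            else:
                while self._reached != self._parties:
                    self._cond.wait()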
Example #20
 def testCond(self):
     with ops.Graph().as_default():
         pred = array_ops.placeholder_with_default(True, shape=())
         x = control_flow_ops.cond(pred, lambda: constant_op.constant(1),
                                   lambda: constant_op.constant(2))
         self.assertIsNone(smart_cond.smart_constant_value(x))
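
smart_constant_value returns None exactly when the predicate is not known at graph-construction time; tf.get_static_value (available in recent TF releases) behaves the same way on plain tensors. A sketch, assuming TF 1.x:

import tensorflow.compat.v1 as tf

with tf.Graph().as_default():
    a = tf.constant(True)
    b = tf.placeholder_with_default(True, shape=())
    print(tf.get_static_value(a))   # True: fixed at graph time
    print(tf.get_static_value(b))   # None: a feed could override the default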
Example #21
    def _testScopedExport(self, test_dir, exported_filenames):
        graph = ops.Graph()
        with graph.as_default():
            # Creates an inference graph.
            # Hidden 1
            colocate_constraint = constant_op.constant(1.2, name="constraint")
            images = constant_op.constant(1.2,
                                          dtypes.float32,
                                          shape=[100, 28],
                                          name="images")
            with ops.name_scope("hidden1"):
                with graph.colocate_with(colocate_constraint.op):
                    weights1 = variables.Variable(random_ops.truncated_normal(
                        [28, 128], stddev=1.0 / math.sqrt(float(28))),
                                                  name="weights")
                # The use of control_flow_ops.cond here is purely to add test
                # coverage for the save and restore of the control flow context
                # (which doesn't make any sense here from a machine learning
                # perspective). A typical biases variable is a plain Variable
                # without the condition.
                biases1 = variables.Variable(control_flow_ops.cond(
                    math_ops.less(random.random(),
                                  0.5), lambda: array_ops.ones([128]),
                    lambda: array_ops.zeros([128])),
                                             name="biases")
                hidden1 = nn_ops.relu(
                    math_ops.matmul(images, weights1) + biases1)

            # Hidden 2
            with ops.name_scope("hidden2"):
                weights2 = variables.Variable(random_ops.truncated_normal(
                    [128, 32], stddev=1.0 / math.sqrt(float(128))),
                                              name="weights")

                # The use of control_flow_ops.while_loop here is purely to add test
                # coverage for the save and restore of the control flow context
                # (which doesn't make any sense here from a machine learning
                # perspective). A typical biases variable is a plain Variable
                # without the loop.
                def loop_cond(it, _):
                    return it < 2

                def loop_body(it, biases2):
                    biases2 += constant_op.constant(0.1, shape=[32])
                    return it + 1, biases2

                _, biases2 = control_flow_ops.while_loop(
                    loop_cond, loop_body, [
                        constant_op.constant(0),
                        variables.Variable(array_ops.zeros([32]),
                                           name="biases")
                    ])
                hidden2 = nn_ops.relu(
                    math_ops.matmul(hidden1, weights2) + biases2)
            # Linear
            with ops.name_scope("softmax_linear"):
                weights3 = variables.Variable(random_ops.truncated_normal(
                    [32, 10], stddev=1.0 / math.sqrt(float(32))),
                                              name="weights")
                biases3 = variables.Variable(array_ops.zeros([10]),
                                             name="biases")
                logits = math_ops.matmul(hidden2, weights3) + biases3
                ops.add_to_collection("logits", logits)

            # Exports each sub-graph.
            # Exports the first one with unbound_inputs_col_name set to default.
            orig_meta_graph1, var_list = meta_graph.export_scoped_meta_graph(
                filename=os.path.join(test_dir, exported_filenames[0]),
                graph=ops.get_default_graph(),
                export_scope="hidden1")
            self.assertEqual(["biases:0", "weights:0"],
                             sorted(var_list.keys()))
            var_names = [v.name for _, v in var_list.items()]
            self.assertEqual(["hidden1/biases:0", "hidden1/weights:0"],
                             sorted(var_names))

            # Exports the rest with no unbound_inputs_col_name.
            orig_meta_graph2, _ = meta_graph.export_scoped_meta_graph(
                filename=os.path.join(test_dir, exported_filenames[1]),
                graph=ops.get_default_graph(),
                export_scope="hidden2",
                unbound_inputs_col_name=None)
            orig_meta_graph3, _ = meta_graph.export_scoped_meta_graph(
                filename=os.path.join(test_dir, exported_filenames[2]),
                graph=ops.get_default_graph(),
                export_scope="softmax_linear",
                unbound_inputs_col_name=None)

        return [orig_meta_graph1, orig_meta_graph2, orig_meta_graph3]
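A hedged sketch of the matching import side (not shown in this excerpt): the counterpart of export_scoped_meta_graph is meta_graph.import_scoped_meta_graph, which recreates the sub-graph under a fresh scope. The "$unbound_inputs_" remapping below reflects the default unbound_inputs_col_name handling and is an assumption here.

    # Sketch: re-import the first exported meta graph under a new scope,
    # remapping the unbound `images` input to a freshly created tensor.
    with ops.Graph().as_default():
        new_images = constant_op.constant(1.2, dtypes.float32,
                                          shape=[100, 28], name="images")
        imported_vars = meta_graph.import_scoped_meta_graph(
            os.path.join(test_dir, exported_filenames[0]),
            input_map={"$unbound_inputs_images": new_images},
            import_scope="new_hidden1")
        # Returns the recreated variables keyed by their in-scope names,
        # e.g. "biases:0" and "weights:0".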
 def testMissingArg2(self):
     with ops.Graph().as_default():
         with session.Session():
             x = constant_op.constant(1)
             with self.assertRaises(TypeError):
                 smart_cond.smart_cond(True, lambda: x)
Example #23
 def testStopGradient(self):
     with ops.Graph().as_default():
         inp = constant(1.0, shape=[100, 32], name="in")
         out = array_ops.stop_gradient(inp)
         igrad = gradients.gradients(out, inp)[0]
     assert igrad is None
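For contrast, a minimal sketch (same modules assumed as above): an identity op, unlike stop_gradient, does propagate a gradient.

    # Sketch only: identity keeps the input on the gradient path.
    with ops.Graph().as_default():
        inp = constant(1.0, shape=[100, 32], name="in")
        out = array_ops.identity(inp)
        igrad = gradients.gradients(out, inp)[0]
    assert igrad is not None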
Example #24
    def _setup_training(self):
        """Sets up graph, model and trainer."""
        # Create config if not given.
        if self._config is None:
            self._config = RunConfig(verbose=self.verbose)
        # Create new graph.
        self._graph = ops.Graph()
        self._graph.add_to_collection("IS_TRAINING", True)
        with self._graph.as_default():
            random_seed.set_random_seed(self._config.tf_random_seed)
            self._global_step = variables.Variable(0,
                                                   name="global_step",
                                                   trainable=False)

            # Setting up inputs and outputs.
            self._inp, self._out = self._data_feeder.input_builder()

            # If class weights are provided, add them to the graph.
            # Different loss functions can use this tensor by name.
            if self.class_weight:
                self._class_weight_node = constant_op.constant(
                    self.class_weight, name='class_weight')

            # Add histograms for X and y if they are floats.
            if self._data_feeder.input_dtype in (np.float32, np.float64):
                logging_ops.histogram_summary("X", self._inp)
            if self._data_feeder.output_dtype in (np.float32, np.float64):
                logging_ops.histogram_summary("y", self._out)

            # Create model's graph.
            self._model_predictions, self._model_loss = self.model_fn(
                self._inp, self._out)

            # Create trainer and augment graph with gradients and optimizer.
            # Additionally creates initialization ops.
            learning_rate = self.learning_rate
            optimizer = self.optimizer
            if callable(learning_rate):
                learning_rate = learning_rate(self._global_step)
            if callable(optimizer):
                optimizer = optimizer(learning_rate)
            self._train = optimizers.optimize_loss(
                self._model_loss,
                self._global_step,
                learning_rate=learning_rate,
                optimizer=optimizer,
                clip_gradients=self.clip_gradients)

            # Update ops during training, e.g. batch_norm_ops
            self._train = control_flow_ops.group(
                self._train, *ops.get_collection('update_ops'))

            # Merge all summaries into single tensor.
            self._summaries = logging_ops.merge_all_summaries()

            # Get all initializers for all trainable variables.
            self._initializers = variables.initialize_all_variables()

            # Create model's saver capturing all the nodes created up until now.
            self._saver = train.Saver(
                max_to_keep=self._config.keep_checkpoint_max,
                keep_checkpoint_every_n_hours=(
                    self._config.keep_checkpoint_every_n_hours))

            # Enable the monitor to create a validation feed dict with the
            # appropriate tf placeholders.
            self._monitor.create_val_feed_dict(self._inp, self._out)

            # Create session to run model with.
            self._session = session.Session(self._config.tf_master,
                                            config=self._config.tf_config)

            # Run parameter initializers.
            self._session.run(self._initializers)
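Because both learning_rate and optimizer above may be callables, here is a usage sketch (the helper names are illustrative, not part of the original code):

    # Hypothetical callables for the learning_rate/optimizer hooks above.
    def decayed_lr(global_step):
        # A tf.train-style exponential decay schedule.
        return train.exponential_decay(0.1, global_step,
                                       decay_steps=1000, decay_rate=0.96)

    def sgd(learning_rate):
        return train.GradientDescentOptimizer(learning_rate)

    estimator.learning_rate = decayed_lr  # called with the global step
    estimator.optimizer = sgd             # called with the learning rate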
Example #25
 def testGraphMode(self):
     graph = ops.Graph()
     with graph.as_default(), context.graph_mode():
         array_ops.placeholder(dtypes.int32)
     self.assertLen(graph.get_operations(), 1)
Example #26
    def _restore(self, path):
        """Restores this estimator from given path.

        Note: this will rebuild the graph and initialize all parameters,
        ignoring the provided model.

        Args:
            path: Path to checkpoints and other information.
        """
        # Currently Saver requires absolute path to work correctly.
        path = os.path.abspath(path)

        self._graph = ops.Graph()
        with self._graph.as_default():
            endpoints_filename = os.path.join(path, 'endpoints')
            if not os.path.exists(endpoints_filename):
                raise ValueError("Restore folder doesn't contain endpoints.")
            with gfile.Open(endpoints_filename) as foutputs:
                endpoints = foutputs.read().split('\n')
            graph_filename = os.path.join(path, 'graph.pbtxt')
            if not os.path.exists(graph_filename):
                raise ValueError(
                    "Restore folder doesn't contain graph definition.")
            with gfile.Open(graph_filename) as fgraph:
                graph_def = graph_pb2.GraphDef()
                text_format.Merge(fgraph.read(), graph_def)
                (self._inp, self._out, self._model_predictions,
                 self._model_loss) = importer.import_graph_def(
                     graph_def, name='', return_elements=endpoints)
            saver_filename = os.path.join(path, 'saver.pbtxt')
            if not os.path.exists(saver_filename):
                raise ValueError(
                    "Restore folder doesn't contain saver definition.")
            with gfile.Open(saver_filename) as fsaver:
                saver_def = train.SaverDef()
                text_format.Merge(fsaver.read(), saver_def)
                self._saver = train.Saver(saver_def=saver_def)

            # Restore trainer
            self._global_step = self._graph.get_tensor_by_name('global_step:0')
            self._train = self._graph.get_operation_by_name('train')

            # Restore summaries.
            self._summaries = self._graph.get_operation_by_name(
                'MergeSummary/MergeSummary')

            # Restore session.
            if not isinstance(self._config, RunConfig):
                self._config = RunConfig(verbose=self.verbose)
            self._session = session.Session(self._config.tf_master,
                                            config=self._config.tf_config)
            checkpoint_path = train.latest_checkpoint(path)
            if checkpoint_path is None:
                raise ValueError(
                    "Missing checkpoint files in %s. Please make sure the "
                    "'checkpoint' file that describes the latest checkpoints "
                    "and the checkpoint files themselves are present. If you "
                    "have moved the folder, you need to manually update the "
                    "paths in the 'checkpoint' file." % path)
            self._saver.restore(self._session, checkpoint_path)
        # Set to be initialized.
        self._initialized = True
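_restore expects an 'endpoints' file of newline-separated tensor names; a hedged sketch of the matching save side (not shown in this excerpt, so this is an assumption):

    # Hypothetical save-side counterpart: write the endpoint tensor names
    # that _restore later passes to import_graph_def(return_elements=...).
    endpoints = '\n'.join([
        self._inp.name, self._out.name,
        self._model_predictions.name, self._model_loss.name,
    ])
    with gfile.Open(os.path.join(path, 'endpoints'), mode='w') as f:
        f.write(endpoints)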
Example #27
    def _test_mixed_precision(self, task_type, task_id, num_gpus):
        """Tests mixed precision works with the CollectiveAllReduceStrategy.

    This tests:
      1. Variables are in float32, by running with a small enough learning rate
         that if the variables are float16, their values wouldn't change when
         gradients are applied.
      2. The loss scale is doubled if there are no NaNs.
      3. The loss scale is halved if the first worker has a NaN, even if the
         other workers do not have NaNs.

    Args:
      task_type: A string, such as "worker", indicating the type of the replica.
      task_id: Zero-indexed ID of the task.
      num_gpus: The number of GPUs to use.
    """
        d, master_target, config = self._get_test_object(
            task_type, task_id, num_gpus)
        # Should be set to mixed_float16 by caller.
        self.assertEqual(policy.global_policy().name, 'mixed_float16')

        with ops.Graph().as_default(), \
             self.cached_session(config=config,
                                 target=master_target) as sess:
            # The loss on the first worker is multiplied by this value. Allows
            # testing the first worker having NaN loss and gradients while keeping the
            # other workers' losses and gradients finite.
            loss_multiplier_for_first_worker = variables.Variable(
                1., dtype='float16', trainable=False)
            with d.scope():
                model = keras.Sequential([
                    mp_test_util.AddLayer(assert_type=dtypes.float16,
                                          input_shape=(1, )),
                ])
                loss_scale = loss_scale_module.DynamicLossScale(
                    2**10, increment_period=1)

                def model_fn():
                    """Simple model to test mixed precision."""
                    x = np.ones((1, 1))
                    loss = model(x, training=True)

                    if ((task_type == 'worker' and task_id == 0)
                            or task_type is task_id is None):
                        loss *= loss_multiplier_for_first_worker
                    # The learning rate is small enough that, if it were applied
                    # to a float16 variable, the variable would not change. So
                    # this verifies the update is applied to the float32
                    # variable rather than to a float16 copy.
                    optimizer = gradient_descent.GradientDescentOptimizer(
                        2**-14)
                    optimizer = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
                        optimizer, loss_scale)
                    train_op = optimizer.minimize(
                        loss, training_util.get_or_create_global_step())
                    return train_op

                train_op = d.extended.call_for_each_replica(model_fn)
                train_op = d.group(d.experimental_local_results(train_op))

            sess.run(variables.global_variables_initializer())
            sess.run(train_op)

            (var, ) = model.trainable_weights
            # The variable starts at 1. Each replica's update is 2 ** -14 (a
            # gradient of 1 times the learning rate), and every replica's
            # update is subtracted from the variable.
            expected = 1 - d.num_replicas_in_sync * 2**-14
            self.assertEqual(sess.run(var), expected)
            # Loss scale should double, as all gradients are finite.
            self.assertEqual(sess.run(loss_scale()), 2**11)

            # Set the first worker to have NaN loss and gradients.
            sess.run(loss_multiplier_for_first_worker.assign(float('NaN')))
            sess.run(train_op)
            # Variable should not change, since the first worker had NaN
            # gradients.
            self.assertEqual(sess.run(var), expected)
            # Loss scale should halve due to the NaN.
            self.assertEqual(sess.run(loss_scale()), 2**10)
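The loss-scale arithmetic these assertions rely on, in isolation (a sketch assuming the same loss_scale_module import):

    # Sketch of the dynamic loss-scale bookkeeping asserted above.
    ls = loss_scale_module.DynamicLossScale(2**10, increment_period=1)
    # Finite gradients for `increment_period` steps: scale doubles, 2**10 -> 2**11.
    # Any NaN/Inf gradient: scale halves, 2**11 -> 2**10.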
 def testConstructionNonSharded(self):
     with ops.Graph().as_default():
         p = variables.Variable(
             array_ops.zeros(shape=[100, 100], dtype=dtypes.float32))
         ids = constant_op.constant([0, 1, 1, 7], dtype=dtypes.int32)
         embedding_ops.embedding_lookup([p], ids)
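For contrast, a sketch of the sharded construction (same imports assumed): embedding_lookup also accepts a list of shards, partitioned with the default 'mod' strategy.

    # Sketch only: the 100-row table split across two 50-row shards.
    with ops.Graph().as_default():
        shards = [
            variables.Variable(array_ops.zeros([50, 100], dtype=dtypes.float32))
            for _ in range(2)
        ]
        ids = constant_op.constant([0, 1, 1, 7], dtype=dtypes.int32)
        embedding_ops.embedding_lookup(shards, ids)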
Example #29
 def testNonSquare(self):
   init = init_ops.identity_initializer()
   shape = (10, 5)
   with self.session(graph=ops.Graph(), use_gpu=True):
     self.assertAllClose(init(shape).eval(), np.eye(*shape))
Example #30
    def export_savedmodel(self,
                          export_dir_base,
                          serving_input_receiver_fn,
                          assets_extra=None,
                          as_text=False,
                          checkpoint_path=None):
        """Exports inference graph as a SavedModel into given dir.

    This method builds a new graph by first calling the
    serving_input_receiver_fn to obtain feature `Tensor`s, and then calling
    this `Estimator`'s model_fn to generate the model graph based on those
    features. It restores the given checkpoint (or, lacking that, the most
    recent checkpoint) into this graph in a fresh session.  Finally it creates
    a timestamped export directory below the given export_dir_base, and writes
    a `SavedModel` into it containing a single `MetaGraphDef` saved from this
    session.

    The exported `MetaGraphDef` will provide one `SignatureDef` for each
    element of the export_outputs dict returned from the model_fn, named using
    the same keys.  One of these keys is always
    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, indicating which
    signature will be served when a serving request does not specify one.
    For each signature, the outputs are provided by the corresponding
    `ExportOutput`s, and the inputs are always the input receivers provided by
    the serving_input_receiver_fn.

    Extra assets may be written into the SavedModel via the assets_extra
    argument.  This should be a dict, where each key gives a destination path
    (including the filename) relative to the assets.extra directory.  The
    corresponding value gives the full path of the source file to be copied.
    For example, the simple case of copying a single file without renaming it
    is specified as `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.

    Args:
      export_dir_base: A string containing a directory in which to create
        timestamped subdirectories containing exported SavedModels.
      serving_input_receiver_fn: A function that takes no argument and
        returns a `ServingInputReceiver`.
      assets_extra: A dict specifying how to populate the assets.extra directory
        within the exported SavedModel, or `None` if no extra assets are needed.
      as_text: whether to write the SavedModel proto in text format.
      checkpoint_path: The checkpoint path to export.  If `None` (the default),
        the most recent checkpoint found within the model directory is chosen.

    Returns:
      The string path to the exported directory.

    Raises:
      ValueError: if no serving_input_receiver_fn is provided, no export_outputs
          are provided, or no checkpoint can be found.
    """
        if serving_input_receiver_fn is None:
            raise ValueError('serving_input_receiver_fn must be defined.')

        with ops.Graph().as_default() as g:
            self._create_and_assert_global_step(g)
            random_seed.set_random_seed(self._config.tf_random_seed)
            serving_input_receiver = serving_input_receiver_fn()

            # Call the model_fn and collect the export_outputs.
            estimator_spec = self._call_model_fn(
                features=serving_input_receiver.features,
                labels=None,
                mode=model_fn_lib.ModeKeys.PREDICT,
                config=self.config)

            # Build the SignatureDefs from receivers and all outputs
            signature_def_map = build_all_signature_defs(
                serving_input_receiver.receiver_tensors,
                estimator_spec.export_outputs,
                serving_input_receiver.receiver_tensors_alternatives)

            if not checkpoint_path:
                # Locate the latest checkpoint
                checkpoint_path = saver.latest_checkpoint(self._model_dir)
            if not checkpoint_path:
                raise ValueError("Couldn't find trained model at %s." %
                                 self._model_dir)

            export_dir = get_timestamped_export_dir(export_dir_base)
            temp_export_dir = get_temp_export_dir(export_dir)

            # TODO(soergel): Consider whether MonitoredSession makes sense here
            with tf_session.Session() as session:

                saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
                    sharded=True)
                saver_for_restore.restore(session, checkpoint_path)

                # TODO(b/36111876): replace legacy_init_op with main_op mechanism
                # pylint: disable=protected-access
                local_init_op = (
                    estimator_spec.scaffold.local_init_op
                    or monitored_session.Scaffold._default_local_init_op())
                # pylint: enable=protected-access

                # Perform the export
                builder = saved_model_builder.SavedModelBuilder(
                    temp_export_dir)
                builder.add_meta_graph_and_variables(
                    session, [tag_constants.SERVING],
                    signature_def_map=signature_def_map,
                    assets_collection=ops.get_collection(
                        ops.GraphKeys.ASSET_FILEPATHS),
                    legacy_init_op=local_init_op)
                builder.save(as_text)

            # Add the extra assets
            if assets_extra:
                assets_extra_path = os.path.join(
                    compat.as_bytes(temp_export_dir),
                    compat.as_bytes('assets.extra'))
                for dest_relative, source in assets_extra.items():
                    dest_absolute = os.path.join(
                        compat.as_bytes(assets_extra_path),
                        compat.as_bytes(dest_relative))
                    dest_path = os.path.dirname(dest_absolute)
                    gfile.MakeDirs(dest_path)
                    gfile.Copy(source, dest_absolute)

            gfile.Rename(temp_export_dir, export_dir)
            return export_dir
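A hedged call-site sketch for export_savedmodel (the receiver function below is illustrative, and ServingInputReceiver is assumed to come from the estimator export module):

    # Hypothetical usage; names here are illustrative only.
    def my_serving_input_receiver_fn():
        features = {'x': array_ops.placeholder(dtypes.float32, [None, 28],
                                               name='x')}
        return export.ServingInputReceiver(features, features)

    export_dir = estimator.export_savedmodel('/tmp/exports',
                                             my_serving_input_receiver_fn)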