Example #1
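# Module-level imports assumed by this snippet (the import paths follow the TensorForce
# package layout this code was taken from and may differ slightly between versions):
from copy import deepcopy
import os

import numpy as np
import tensorflow as tf

from tensorforce import util, TensorForceError
from tensorforce.core.optimizers import Optimizer, GlobalOptimizer
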
class Model(object):
    """
    Base class for all (TensorFlow-based) models.
    """
    def __init__(self, states_spec, actions_spec, device, session_config,
                 scope, saver_spec, summary_spec, distributed_spec, optimizer,
                 discount, normalize_rewards, variable_noise, **kwargs):
        # States and actions specifications
        self.states_spec = states_spec
        self.actions_spec = actions_spec

        # TensorFlow device and scope
        self.device = device
        self.session_config = session_config
        self.scope = scope

        # Saver/summary/distributed specifications
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec

        # TensorFlow summaries
        if summary_spec is None:
            self.summary_labels = set()
        else:
            self.summary_labels = set(summary_spec.get('labels', ()))

        # Optimizer
        self.optimizer = optimizer

        # Discount factor
        self.discount = discount

        # Reward normalization
        assert isinstance(normalize_rewards, bool)
        self.normalize_rewards = normalize_rewards

        # Variable noise
        assert variable_noise is None or variable_noise > 0.0
        self.variable_noise = variable_noise

        # Setup TensorFlow graph and session
        self.setup()

    def setup(self):
        """
        Sets up the TensorFlow model graph and initializes the TensorFlow session.
        """
        default_graph = None
        if self.distributed_spec is None:
            self.global_model = None
            self.graph = tf.Graph()
            default_graph = self.graph.as_default()
            default_graph.__enter__()

        elif self.distributed_spec.get('parameter_server'):
            if self.distributed_spec.get('replica_model'):
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.global_model = None
            self.graph = tf.Graph()
            default_graph = self.graph.as_default()
            default_graph.__enter__()

        elif self.distributed_spec.get('replica_model'):
            self.device = tf.train.replica_device_setter(
                worker_device=self.device,
                cluster=self.distributed_spec['cluster_spec'])
            self.global_model = None
            # Replica model is part of its parent model's graph, hence no new graph here.
            self.graph = tf.get_default_graph()

        else:
            graph = tf.Graph()
            default_graph = graph.as_default()
            default_graph.__enter__()
            # Global model.
            self.global_model = deepcopy(self)
            self.global_model.distributed_spec['replica_model'] = True
            self.global_model.setup()
            self.graph = graph

        with tf.device(device_name_or_function=self.device):
            # Episode
            collection = self.graph.get_collection(name='episode')
            if len(collection) == 0:
                self.episode = tf.get_variable(name='episode',
                                               dtype=tf.int32,
                                               initializer=0,
                                               trainable=False)
                self.graph.add_to_collection(name='episode',
                                             value=self.episode)
            else:
                assert len(collection) == 1
                self.episode = collection[0]

            # Timestep
            collection = self.graph.get_collection(name='timestep')
            if len(collection) == 0:
                self.timestep = tf.get_variable(name='timestep',
                                                dtype=tf.int32,
                                                initializer=0,
                                                trainable=False)
                self.graph.add_to_collection(name='timestep',
                                             value=self.timestep)
                self.graph.add_to_collection(name=tf.GraphKeys.GLOBAL_STEP,
                                             value=self.timestep)
            else:
                assert len(collection) == 1
                self.timestep = collection[0]

            # Variables and summaries
            self.variables = dict()
            self.all_variables = dict()
            self.registered_variables = set()
            self.summaries = list()

            # Custom variable getter passed to tf.make_template in initialize(): records every
            # variable created inside the templated functions (unless marked 'registered'), keeps
            # trainable non-optimization variables separately, and optionally adds histogram
            # summaries for them.
            def custom_getter(getter,
                              name,
                              registered=False,
                              second=False,
                              **kwargs):
                if registered:
                    self.registered_variables.add(name)
                elif name in self.registered_variables:
                    registered = True
                variable = getter(name=name,
                                  **kwargs)  # Top-level, hence no 'registered'
                if not registered:
                    self.all_variables[name] = variable
                    if kwargs.get(
                            'trainable',
                            True) and not name.startswith('optimization'):
                        self.variables[name] = variable
                        if 'variables' in self.summary_labels:
                            summary = tf.summary.histogram(name=name,
                                                           values=variable)
                            self.summaries.append(summary)
                return variable

            # Create placeholders, tf functions, internals, etc
            self.initialize(custom_getter=custom_getter)

            # Input tensors
            states = self.get_states(states=self.state_inputs)
            internals = [
                tf.identity(input=internal)
                for internal in self.internal_inputs
            ]
            actions = self.get_actions(actions=self.action_inputs)
            terminal = tf.identity(input=self.terminal_input)
            reward = self.get_reward(states=states,
                                     internals=internals,
                                     terminal=terminal,
                                     reward=self.reward_input)

            # Stop gradients for input preprocessing
            states = {
                name: tf.stop_gradient(input=state)
                for name, state in states.items()
            }
            actions = {
                name: tf.stop_gradient(input=action)
                for name, action in actions.items()
            }
            reward = tf.stop_gradient(input=reward)

            # Optimizer
            if self.optimizer is None:
                pass
            elif self.distributed_spec is not None and \
                    not self.distributed_spec.get('parameter_server') and \
                    not self.distributed_spec.get('replica_model'):
                # If not internal global model
                self.optimizer = GlobalOptimizer(optimizer=self.optimizer)
            else:
                self.optimizer = Optimizer.from_spec(spec=self.optimizer)

            # Create output fetch operations
            self.create_output_operations(
                states=states,
                internals=internals,
                actions=actions,
                terminal=terminal,
                reward=reward,
                update=self.update_input,
                deterministic=self.deterministic_input)

            if 'inputs' in self.summary_labels:
                for name, state in states.items():
                    summary = tf.summary.histogram(
                        name=(self.scope + '/inputs/states/' + name),
                        values=state)
                    self.summaries.append(summary)
                for name, action in actions.items():
                    summary = tf.summary.histogram(
                        name=(self.scope + '/inputs/actions/' + name),
                        values=action)
                    self.summaries.append(summary)
                summary = tf.summary.histogram(name=(self.scope +
                                                     '/inputs/reward'),
                                               values=reward)
                self.summaries.append(summary)

        if self.distributed_spec is not None:
            if self.distributed_spec.get('replica_model'):
                # If internal global model
                return

            elif self.distributed_spec.get('parameter_server'):
                server = tf.train.Server(
                    server_or_cluster_def=self.distributed_spec['cluster_spec'],
                    job_name='ps',
                    task_index=self.distributed_spec['task_index'],
                    protocol=self.distributed_spec.get('protocol'),
                    config=None,
                    start=True)
                # Param server does nothing actively
                server.join()
                return

        # Global and local variables initialize operations
        if self.distributed_spec is None:
            global_variables = self.get_variables(include_non_trainable=True)
            init_op = tf.variables_initializer(var_list=global_variables)
            ready_op = tf.report_uninitialized_variables(
                var_list=global_variables)
            ready_for_local_init_op = None
            local_init_op = None
        else:
            global_variables = self.global_model.get_variables(
                include_non_trainable=True)
            local_variables = self.get_variables(include_non_trainable=True)
            init_op = tf.variables_initializer(var_list=global_variables)
            ready_op = tf.report_uninitialized_variables(
                var_list=(global_variables + local_variables))
            ready_for_local_init_op = tf.report_uninitialized_variables(
                var_list=global_variables)
            local_init_op = tf.group(*(local_var.assign(value=global_var)
                                       for local_var, global_var in zip(
                                           local_variables, global_variables)))

        def init_fn(scaffold, session):
            if self.saver_spec is not None and self.saver_spec.get(
                    'load', True):
                directory = self.saver_spec['directory']
                file = self.saver_spec.get('file')
                if file is None:
                    file = tf.train.latest_checkpoint(
                        checkpoint_dir=directory,
                        latest_filename=None  # Corresponds to argument of saver.save() in Model.save().
                    )
                elif not os.path.isfile(file):
                    file = os.path.join(directory, file)
                if file is not None:
                    scaffold.saver.restore(sess=session, save_path=file)

        # Summary operation
        summaries = self.get_summaries()
        if len(summaries) > 0:
            summary_op = tf.summary.merge(inputs=summaries)
        else:
            summary_op = None

        # TensorFlow saver object
        saver = tf.train.Saver(
            var_list=global_variables,  # should be given?
            reshape=False,
            sharded=False,  # should be true?
            max_to_keep=5,
            keep_checkpoint_every_n_hours=10000.0,
            name=None,
            restore_sequentially=False,
            saver_def=None,
            builder=None,
            defer_build=False,
            allow_empty=True,
            write_version=tf.train.SaverDef.V2,
            pad_step_number=False,
            save_relative_paths=True
            #filename=None
        )

        # TensorFlow scaffold object
        self.scaffold = tf.train.Scaffold(
            init_op=init_op,
            init_feed_dict=None,
            init_fn=init_fn,
            ready_op=ready_op,
            ready_for_local_init_op=ready_for_local_init_op,
            local_init_op=local_init_op,
            summary_op=summary_op,
            saver=saver,
            copy_from_scaffold=None)

        hooks = list()

        # Checkpoint saver hook
        if self.saver_spec is not None and (
                self.distributed_spec is None
                or self.distributed_spec['task_index'] == 0):
            self.saver_directory = self.saver_spec['directory']
            hooks.append(
                tf.train.CheckpointSaverHook(
                    checkpoint_dir=self.saver_directory,
                    save_secs=self.saver_spec.get(
                        'seconds',
                        None if 'steps' in self.saver_spec else 600),
                    save_steps=self.saver_spec.get(
                        'steps'),  # Either one or the other has to be set.
                    saver=None,  # None since given via 'scaffold' argument.
                    checkpoint_basename=self.saver_spec.get(
                        'basename', 'model.ckpt'),
                    scaffold=self.scaffold,
                    listeners=None))
        else:
            self.saver_directory = None

        # Summary saver hook
        if self.summary_spec is None:
            self.summary_writer_hook = None
        else:
            # TensorFlow summary writer object
            summary_writer = tf.summary.FileWriter(
                logdir=self.summary_spec['directory'],
                graph=self.graph,
                max_queue=10,
                flush_secs=120,
                filename_suffix=None)
            self.summary_writer_hook = util.UpdateSummarySaverHook(
                update_input=self.update_input,
                save_steps=self.summary_spec.get(
                    'steps'),  # Either one or the other has to be set.
                save_secs=self.summary_spec.get(
                    'seconds', None if 'steps' in self.summary_spec else 120),
                output_dir=None,  # None since given via 'summary_writer' argument.
                summary_writer=summary_writer,
                scaffold=self.scaffold,
                summary_op=None  # None since given via 'scaffold' argument.
            )
            hooks.append(self.summary_writer_hook)

        # Stop at step hook
        # hooks.append(tf.train.StopAtStepHook(
        #     num_steps=???,  # This makes more sense, if load and continue training.
        #     last_step=None  # Either one or the other has to be set.
        # ))

        # # Step counter hook
        # hooks.append(tf.train.StepCounterHook(
        #     every_n_steps=counter_config.get('steps', 100),  # Either one or the other has to be set.
        #     every_n_secs=counter_config.get('secs'),  # Either one or the other has to be set.
        #     output_dir=None,  # None since given via 'summary_writer' argument.
        #     summary_writer=summary_writer
        # ))

        # Other available hooks:
        # tf.train.FinalOpsHook(final_ops, final_ops_feed_dict=None)
        # tf.train.GlobalStepWaiterHook(wait_until_step)
        # tf.train.LoggingTensorHook(tensors, every_n_iter=None, every_n_secs=None)
        # tf.train.NanTensorHook(loss_tensor, fail_on_nan_loss=True)
        # tf.train.ProfilerHook(save_steps=None, save_secs=None, output_dir='', show_dataflow=True, show_memory=False)

        if self.distributed_spec is None:
            # TensorFlow non-distributed monitored session object
            self.monitored_session = tf.train.SingularMonitoredSession(
                hooks=hooks,
                scaffold=self.scaffold,
                master='',  # Default value.
                config=self.session_config,  # always the same?
                checkpoint_dir=None)

        else:
            server = tf.train.Server(
                server_or_cluster_def=self.distributed_spec['cluster_spec'],
                job_name='worker',
                task_index=self.distributed_spec['task_index'],
                protocol=self.distributed_spec.get('protocol'),
                config=self.session_config,
                start=True)

            if self.distributed_spec['task_index'] == 0:
                # TensorFlow chief session creator object
                session_creator = tf.train.ChiefSessionCreator(
                    scaffold=self.scaffold,
                    master=server.target,
                    config=self.session_config,
                    checkpoint_dir=None,
                    checkpoint_filename_with_path=None)
            else:
                # TensorFlow worker session creator object
                session_creator = tf.train.WorkerSessionCreator(
                    scaffold=self.scaffold,
                    master=server.target,
                    config=self.session_config,
                )

            # TensorFlow monitored session object
            self.monitored_session = tf.train.MonitoredSession(
                session_creator=session_creator,
                hooks=hooks,
                stop_grace_period_secs=120  # Default value.
            )

        if default_graph:
            default_graph.__exit__(None, None, None)
        self.graph.finalize()
        self.monitored_session.__enter__()
        self.session = self.monitored_session._tf_sess()

        # tf.ConfigProto(device_filters=['/job:ps', '/job:worker/task:{}/cpu:0'.format(self.task_index)])
        # config=tf.ConfigProto(device_filters=["/job:ps"])
        # config=tf.ConfigProto(
        #     inter_op_parallelism_threads=2,
        #     log_device_placement=True
        # )

    def close(self):
        if self.saver_directory is not None:
            self.save(append_timestep=True)
        self.monitored_session.close()

    def initialize(self, custom_getter):
        """
        Creates the TensorFlow placeholders and functions for this model. Moreover, it adds the
        internal state placeholders and initialization values to the model.

        Args:
            custom_getter: The `custom_getter_` object to use for `tf.make_template` when creating TensorFlow functions.
        """

        # States
        self.state_inputs = dict()
        for name, state in self.states_spec.items():
            self.state_inputs[name] = tf.placeholder(
                dtype=util.tf_dtype(state['type']),
                shape=(None, ) + tuple(state['shape']),
                name=name)

        # Actions
        self.action_inputs = dict()
        for name, action in self.actions_spec.items():
            self.action_inputs[name] = tf.placeholder(
                dtype=util.tf_dtype(action['type']),
                shape=(None, ) + tuple(action['shape']),
                name=name)

        # Terminal
        self.terminal_input = tf.placeholder(dtype=tf.bool,
                                             shape=(None, ),
                                             name='terminal')

        # Reward
        self.reward_input = tf.placeholder(dtype=tf.float32,
                                           shape=(None, ),
                                           name='reward')

        # Internal states
        self.internal_inputs = list()
        self.internal_inits = list()

        # Deterministic action flag
        self.deterministic_input = tf.placeholder(dtype=tf.bool,
                                                  shape=(),
                                                  name='deterministic')

        # Update flag
        self.update_input = tf.placeholder(dtype=tf.bool,
                                           shape=(),
                                           name='update')

        # TensorFlow functions
        self.fn_discounted_cumulative_reward = tf.make_template(
            name_=(self.scope + '/discounted-cumulative-reward'),
            func_=self.tf_discounted_cumulative_reward,
            custom_getter_=custom_getter)
        self.fn_actions_and_internals = tf.make_template(
            name_=(self.scope + '/actions-and-internals'),
            func_=self.tf_actions_and_internals,
            custom_getter_=custom_getter)
        self.fn_loss_per_instance = tf.make_template(
            name_=(self.scope + '/loss-per-instance'),
            func_=self.tf_loss_per_instance,
            custom_getter_=custom_getter)
        self.fn_regularization_losses = tf.make_template(
            name_=(self.scope + '/regularization-losses'),
            func_=self.tf_regularization_losses,
            custom_getter_=custom_getter)
        self.fn_loss = tf.make_template(name_=(self.scope + '/loss'),
                                        func_=self.tf_loss,
                                        custom_getter_=custom_getter)
        self.fn_optimization = tf.make_template(name_=(self.scope +
                                                       '/optimization'),
                                                func_=self.tf_optimization,
                                                custom_getter_=custom_getter)
        # self.fn_summarization = tf.make_template(
        #     name_='summarization',
        #     func_=self.tf_summarization,
        #     custom_getter_=custom_getter
        # )

    def get_states(self, states):
        # TODO: preprocessing could go here?
        return {
            name: tf.identity(input=state)
            for name, state in states.items()
        }

    def get_actions(self, actions):
        # TODO: preprocessing could go here?
        return {
            name: tf.identity(input=action)
            for name, action in actions.items()
        }

    def get_reward(self, states, internals, terminal, reward):
        if self.normalize_rewards:
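            # Note: normalizes by the batch variance (floored at util.epsilon), not by the standard deviation.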
            mean, variance = tf.nn.moments(x=reward, axes=0)
            return (reward - mean) / tf.maximum(x=variance, y=util.epsilon)
        else:
            return tf.identity(input=reward)

    def tf_discounted_cumulative_reward(self,
                                        terminal,
                                        reward,
                                        discount,
                                        final_reward=0.0):
        """
        Creates the TensorFlow operations for calculating the discounted cumulative rewards
        for a given sequence of rewards.

        Args:
            terminal: Terminal boolean tensor.
            reward: Reward tensor.
            discount: Discount factor.
            final_reward: Last reward value in the sequence.

        Returns:
            Discounted cumulative reward tensor.
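
        Example: with discount 0.9, rewards (1.0, 1.0, 1.0) and no terminals, the result is
        (1.0 + 0.9 * (1.0 + 0.9 * 1.0), 1.0 + 0.9 * 1.0, 1.0) = (2.71, 1.9, 1.0).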
        """

        # TODO: n-step cumulative reward (particularly for envs without terminal)

        def cumulate(cumulative, reward_and_terminal):
            rew, term = reward_and_terminal
            return tf.where(condition=term,
                            x=rew,
                            y=(rew + cumulative * discount))

        # Reverse, since the reward accumulation is computed right-to-left while tf.scan only works left-to-right
        reward = tf.reverse(tensor=reward, axis=(0, ))
        terminal = tf.reverse(tensor=terminal, axis=(0, ))

        reward = tf.scan(fn=cumulate,
                         elems=(reward, terminal),
                         initializer=final_reward)

        return tf.reverse(tensor=reward, axis=(0, ))

    def tf_actions_and_internals(self, states, internals, update,
                                 deterministic):
        """
        Creates the TensorFlow operations for retrieving the actions (and posterior internal states)
        in reaction to the given input states (and prior internal states).

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            update: Boolean tensor indicating whether this call happens during an update.
            deterministic: Boolean tensor indicating whether actions should be chosen
                deterministically.

        Returns:
            Actions and list of posterior internal state tensors.
        """
        raise NotImplementedError

    def tf_loss_per_instance(self, states, internals, actions, terminal,
                             reward, update):
        """
        Creates the TensorFlow operations for calculating the loss per batch instance
        of the given input states and actions.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            actions: Dict of action tensors.
            terminal: Terminal boolean tensor.
            reward: Reward tensor.
            update: Boolean tensor indicating whether this call happens during an update.

        Returns:
            Loss tensor.
        """
        raise NotImplementedError

    def tf_regularization_losses(self, states, internals, update):
        """
        Creates the TensorFlow operations for calculating the regularization losses for the given input states.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            update: Boolean tensor indicating whether this call happens during an update.

        Returns:
            Dict of regularization loss tensors.
        """
        return dict()

    def tf_loss(self, states, internals, actions, terminal, reward, update):
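        """
        Creates the TensorFlow operations for calculating the full loss, i.e. the mean loss per
        instance plus any regularization losses, and optionally adds the corresponding scalar
        summaries.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            actions: Dict of action tensors.
            terminal: Terminal boolean tensor.
            reward: Reward tensor.
            update: Boolean tensor indicating whether this call happens during an update.

        Returns:
            0-dim loss tensor.
        """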
        # Mean loss per instance
        loss_per_instance = self.fn_loss_per_instance(states=states,
                                                      internals=internals,
                                                      actions=actions,
                                                      terminal=terminal,
                                                      reward=reward,
                                                      update=update)
        loss = tf.reduce_mean(input_tensor=loss_per_instance, axis=0)

        # Loss without regularization summary
        if 'losses' in self.summary_labels:
            summary = tf.summary.scalar(name='loss-without-regularization',
                                        tensor=loss)
            self.summaries.append(summary)

        # Regularization losses
        losses = self.fn_regularization_losses(states=states,
                                               internals=internals,
                                               update=update)
        if len(losses) > 0:
            loss += tf.add_n(inputs=list(losses.values()))
            if 'regularization' in self.summary_labels:
                for name, loss_val in losses.items():
                    summary = tf.summary.scalar(name="regularization/" + name,
                                                tensor=loss_val)
                    self.summaries.append(summary)

        # Total loss summary
        if 'losses' in self.summary_labels or 'total-loss' in self.summary_labels:
            summary = tf.summary.scalar(name='total-loss', tensor=loss)
            self.summaries.append(summary)

        return loss

    def get_optimizer_kwargs(self, states, internals, actions, terminal,
                             reward, update):
        """
        Returns the optimizer arguments including the time, the list of variables to optimize,
        and various argument-free functions (in particular `fn_loss` returning the combined
        0-dim batch loss tensor) which the optimizer might require to perform an update step.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            actions: Dict of action tensors.
            terminal: Terminal boolean tensor.
            reward: Reward tensor.
            update: Boolean tensor indicating whether this call happens during an update.

        Returns:
            Dict of optimizer arguments (time, variables, fn_loss, and optionally global_variables).
        """
        kwargs = dict()
        kwargs['time'] = self.timestep
        kwargs['variables'] = self.get_variables()
        kwargs['fn_loss'] = (lambda: self.fn_loss(states=states,
                                                  internals=internals,
                                                  actions=actions,
                                                  terminal=terminal,
                                                  reward=reward,
                                                  update=update))
        if self.global_model is not None:
            kwargs['global_variables'] = self.global_model.get_variables()
        return kwargs

    def tf_optimization(self, states, internals, actions, terminal, reward,
                        update):
        """
        Creates the TensorFlow operations for performing an optimization update step based
        on the given input states and actions batch.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            actions: Dict of action tensors.
            terminal: Terminal boolean tensor.
            reward: Reward tensor.
            update: Boolean tensor indicating whether this call happens during an update.

        Returns:
            The optimization operation.
        """
        if self.optimizer is None:
            return tf.no_op()
        else:
            optimizer_kwargs = self.get_optimizer_kwargs(states=states,
                                                         internals=internals,
                                                         actions=actions,
                                                         terminal=terminal,
                                                         reward=reward,
                                                         update=update)
            return self.optimizer.minimize(**optimizer_kwargs)

    def create_output_operations(self, states, internals, actions, terminal,
                                 reward, update, deterministic):
        """
        Calls all the relevant TensorFlow functions for this model and hence creates all the
        TensorFlow operations involved.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            actions: Dict of action tensors.
            terminal: Terminal boolean tensor.
            reward: Reward tensor.
            update: Boolean tensor indicating whether this call happens during an update.
            deterministic: Boolean tensor indicating whether actions should be chosen
                deterministically.
        """

        # Create graph by calling the functions corresponding to model.act() / model.update(), to initialize variables.
        # TODO: Could call reset here, but would have to move other methods below reset.
        self.fn_actions_and_internals(states=states,
                                      internals=internals,
                                      update=update,
                                      deterministic=deterministic)
        self.fn_loss_per_instance(states=states,
                                  internals=internals,
                                  actions=actions,
                                  terminal=terminal,
                                  reward=reward,
                                  update=update)

        # Tensor fetched for model.act()
        operations = list()
        if self.variable_noise is not None and self.variable_noise > 0.0:
            # Add variable noise
            noise_deltas = list()
            for variable in self.get_variables():
                noise_delta = tf.random_normal(shape=util.shape(variable),
                                               mean=0.0,
                                               stddev=self.variable_noise)
                noise_deltas.append(noise_delta)
                operations.append(variable.assign_add(delta=noise_delta))

        # Retrieve actions and internals
        with tf.control_dependencies(control_inputs=operations):
            self.actions_internals_timestep = self.fn_actions_and_internals(
                states=states,
                internals=internals,
                update=update,
                deterministic=deterministic)

        # Increment timestep
        increment_timestep = tf.shape(input=next(iter(states.values())))[0]
        increment_timestep = self.timestep.assign_add(delta=increment_timestep)
        operations = [increment_timestep]

        # Subtract variable noise
        if self.variable_noise is not None and self.variable_noise > 0.0:
            for variable, noise_delta in zip(self.get_variables(),
                                             noise_deltas):
                operations.append(variable.assign_sub(delta=noise_delta))

        with tf.control_dependencies(control_inputs=operations):
            # Trivial operation to enforce control dependency
            self.actions_internals_timestep += (self.timestep + 0, )

        # Tensor fetched for model.observe()
        increment_episode = self.episode.assign_add(
            delta=tf.count_nonzero(input_tensor=terminal, dtype=tf.int32))
        with tf.control_dependencies(control_inputs=(increment_episode, )):
            self.increment_episode = self.episode + 0
        # TODO: add up rewards per episode and add summary_label 'episode-reward'

        # Tensor(s) fetched for model.update()
        self.optimization = self.fn_optimization(states=states,
                                                 internals=internals,
                                                 actions=actions,
                                                 terminal=terminal,
                                                 reward=reward,
                                                 update=update)
        self.loss_per_instance = self.fn_loss_per_instance(states=states,
                                                           internals=internals,
                                                           actions=actions,
                                                           terminal=terminal,
                                                           reward=reward,
                                                           update=update)

    def get_variables(self, include_non_trainable=False):
        """
        Returns the TensorFlow variables used by the model.

        Returns:
            List of variables.
        """

        if include_non_trainable:
            # optimizer variables and timestep/episode only included if 'include_non_trainable' set
            model_variables = [
                self.all_variables[key] for key in sorted(self.all_variables)
            ]

            if self.optimizer is None:
                return model_variables + [self.timestep, self.episode]

            else:
                optimizer_variables = self.optimizer.get_variables()
                return model_variables + optimizer_variables + [
                    self.timestep, self.episode
                ]

        else:
            return [self.variables[key] for key in sorted(self.variables)]

    def get_summaries(self):
        """
        Returns the TensorFlow summaries reported by the model.

        Returns:
            List of summaries.
        """
        return self.summaries

    def reset(self):
        """
        Resets the model to its initial state on episode start.

        Returns:
            Current episode and timestep counters, and a list containing the internal state
            initializations.
        """
        episode, timestep = self.monitored_session.run(fetches=(self.episode,
                                                                self.timestep))
        return episode, timestep, list(self.internal_inits)

    def act(self, states, internals, deterministic=False):
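        """
        Fetches actions (and posterior internal states) for the given states from the TensorFlow
        session, incrementing the timestep counter.

        Args:
            states: Dict of state values (each batched or unbatched).
            internals: List of prior internal state values.
            deterministic: If true, actions are chosen deterministically.

        Returns:
            Actions, list of posterior internal states, and the current timestep.
        """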
        fetches = list(self.actions_internals_timestep)

        name = next(iter(self.states_spec))
        batched = (np.asarray(states[name]).ndim != len(
            self.states_spec[name]['shape']))
        if batched:
            feed_dict = {
                state_input: states[name]
                for name, state_input in self.state_inputs.items()
            }
            feed_dict.update({
                internal_input: internals[n]
                for n, internal_input in enumerate(self.internal_inputs)
            })
        else:
            feed_dict = {
                state_input: (states[name], )
                for name, state_input in self.state_inputs.items()
            }
            feed_dict.update({
                internal_input: (internals[n], )
                for n, internal_input in enumerate(self.internal_inputs)
            })

        feed_dict[self.deterministic_input] = deterministic
        feed_dict[self.update_input] = False

        actions, internals, timestep = self.monitored_session.run(
            fetches=fetches, feed_dict=feed_dict)

        if not batched:
            actions = {name: action[0] for name, action in actions.items()}
            internals = [internal[0] for internal in internals]

        return actions, internals, timestep

    def observe(self, terminal, reward):
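        """
        Feeds the observed terminal flag(s) and reward(s) to the TensorFlow session and
        increments the episode counter for each terminal.

        Args:
            terminal: Terminal boolean value(s), batched or unbatched.
            reward: Reward value(s), batched or unbatched.

        Returns:
            Current episode counter.
        """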
        fetches = self.increment_episode

        terminal = np.asarray(terminal)
        batched = (terminal.ndim == 1)
        if batched:
            feed_dict = {
                self.terminal_input: terminal,
                self.reward_input: reward,
            }
        else:
            feed_dict = {
                self.terminal_input: (terminal, ),
                self.reward_input: (reward, )
            }

        feed_dict[self.update_input] = False

        episode = self.monitored_session.run(fetches=fetches,
                                             feed_dict=feed_dict)

        return episode

    def update(self,
               states,
               internals,
               actions,
               terminal,
               reward,
               return_loss_per_instance=False):
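        """
        Runs the optimization fetch for the given batch of experiences.

        Args:
            states: Dict of state values (each batched or unbatched).
            internals: List of internal state values.
            actions: Dict of action values.
            terminal: Terminal boolean value(s).
            reward: Reward value(s).
            return_loss_per_instance: If true, additionally fetches and returns the loss per
                batch instance.

        Returns:
            The per-instance loss if return_loss_per_instance is true, otherwise None.
        """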
        fetches = [self.optimization]

        # Optionally fetch loss per instance
        if return_loss_per_instance:
            fetches.append(self.loss_per_instance)

        terminal = np.asarray(terminal)
        batched = (terminal.ndim == 1)
        if batched:
            feed_dict = {
                state_input: states[name]
                for name, state_input in self.state_inputs.items()
            }
            feed_dict.update({
                internal_input: internals[n]
                for n, internal_input in enumerate(self.internal_inputs)
            })
            feed_dict.update({
                action_input: actions[name]
                for name, action_input in self.action_inputs.items()
            })
            feed_dict[self.terminal_input] = terminal
            feed_dict[self.reward_input] = reward
        else:
            feed_dict = {
                state_input: (states[name], )
                for name, state_input in self.state_inputs.items()
            }
            feed_dict.update({
                internal_input: (internals[n], )
                for n, internal_input in enumerate(self.internal_inputs)
            })
            feed_dict.update({
                action_input: (actions[name], )
                for name, action_input in self.action_inputs.items()
            })
            feed_dict[self.terminal_input] = (terminal, )
            feed_dict[self.reward_input] = (reward, )

        feed_dict[self.deterministic_input] = True
        feed_dict[self.update_input] = True

        fetched = self.monitored_session.run(fetches=fetches,
                                             feed_dict=feed_dict)

        if return_loss_per_instance:
            return fetched[1]

    def save(self, directory=None, append_timestep=True):
        """
        Saves the TensorFlow model. If no checkpoint directory is given, the model's default
        saver directory is used. Optionally appends the current timestep to prevent overwriting
        previous checkpoint files; turn this off to be able to load the model again from the same
        path passed here.

        Args:
            directory: Optional checkpoint directory.
            append_timestep: Appends the current timestep to the checkpoint file if true.

        Returns:
            Checkpoint path where the model was saved.
        """
        if self.summary_writer_hook is not None:
            self.summary_writer_hook._summary_writer.flush()

        return self.scaffold.saver.save(
            sess=self.session,
            save_path=(self.saver_directory
                       if directory is None else directory),
            global_step=(self.timestep if append_timestep else None),
            # latest_filename=None,  # Defaults to 'checkpoint'.
            meta_graph_suffix='meta',
            write_meta_graph=True,
            write_state=True)

    def restore(self, directory=None, file=None):
        """
        Restores the TensorFlow model. If no checkpoint file is given, the latest checkpoint is
        restored. If no checkpoint directory is given, the model's default saver directory is
        used (unless file specifies the entire path).

        Args:
            directory: Optional checkpoint directory.
            file: Optional checkpoint file, or path if directory not given.
        """
        if file is None:
            file = tf.train.latest_checkpoint(
                checkpoint_dir=(self.saver_directory
                                if directory is None else directory),
                # latest_filename=None  # Corresponds to argument of saver.save() in Model.save().
            )
        elif directory is None:
            file = os.path.join(self.saver_directory, file)
        elif not os.path.isfile(file):
            file = os.path.join(directory, file)

        # if not os.path.isfile(file):
        #     raise TensorForceError("Invalid model directory/file.")

        self.scaffold.saver.restore(sess=self.session, save_path=file)
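
The Model class above is abstract: tf_actions_and_internals() and tf_loss_per_instance() raise
NotImplementedError and must be supplied by a subclass. The following sketch, which is not part
of the original source and assumes TensorFlow 1.x plus the imports listed above, shows how a
trivial hypothetical subclass could be defined and driven through reset(), act(), observe() and
update(); the ConstantModel class, the state/action specifications and all constructor values
are illustrative only.

class ConstantModel(Model):
    """Hypothetical toy subclass: always emits zero-valued actions and uses a dummy loss."""

    def tf_actions_and_internals(self, states, internals, update, deterministic):
        # One zero-valued action per batch instance; this toy model keeps no internal states.
        batch_size = tf.shape(input=next(iter(states.values())))[0]
        actions = {
            name: tf.zeros(shape=[batch_size] + list(spec['shape']),
                           dtype=util.tf_dtype(spec['type']))
            for name, spec in self.actions_spec.items()
        }
        return actions, internals

    def tf_loss_per_instance(self, states, internals, actions, terminal, reward, update):
        # Dummy per-instance loss: the negative reward.
        return -reward


# Illustrative specifications and constructor values.
model = ConstantModel(
    states_spec={'state': {'type': 'float', 'shape': (4,)}},
    actions_spec={'action': {'type': 'float', 'shape': ()}},
    device=None, session_config=None, scope='constant-model',
    saver_spec=None, summary_spec=None, distributed_spec=None,
    optimizer=None, discount=0.99, normalize_rewards=False, variable_noise=None)

episode, timestep, internals = model.reset()
actions, internals, timestep = model.act(states={'state': [0.1, 0.2, 0.3, 0.4]},
                                         internals=internals)
episode = model.observe(terminal=False, reward=1.0)
loss = model.update(states={'state': [0.1, 0.2, 0.3, 0.4]}, internals=internals,
                    actions={'action': 0.0}, terminal=False, reward=1.0,
                    return_loss_per_instance=True)
model.close()
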
Example #2
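# A variant of the same constructor in which all hyperparameters are read from a single config
# object and the session is managed via tf.train.Supervisor instead of a monitored session.
# It assumes the same TensorFlow and TensorForce imports as Example #1.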
    def __init__(self, states_spec, actions_spec, config, **kwargs):

        # States and actions specifications
        self.states_spec = states_spec
        self.actions_spec = actions_spec

        # Discount factor
        self.discount = config.discount

        # Reward normalization
        assert isinstance(config.normalize_rewards, bool)
        self.normalize_rewards = config.normalize_rewards

        # Variable noise
        assert config.variable_noise is None or config.variable_noise > 0.0
        self.variable_noise = config.variable_noise

        # TensorFlow summaries
        self.summary_labels = set(config.summary_labels or ())

        # Variables and summaries
        self.variables = dict()
        self.all_variables = dict()
        self.summaries = list()

        if not config.local_model or not config.replica_model:
            # If not local_model mode or not internal global model
            self.default_graph = tf.Graph().as_default()
            self.graph = self.default_graph.__enter__()

        if config.cluster_spec is None:
            if config.parameter_server or config.replica_model or config.local_model:
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.device = config.device
            self.global_model = None

        elif config.parameter_server:
            if config.replica_model or config.local_model:
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.device = config.device
            self.global_model = None

        elif config.replica_model:
            self.device = tf.train.replica_device_setter(
                worker_device=config.device, cluster=config.cluster_spec)
            self.global_model = None

        elif config.local_model:
            if config.replica_model:
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.device = config.device

            global_config = config.copy()
            global_config.set(key='replica_model', value=True)

            self.global_model = self.__class__(states_spec=states_spec,
                                               actions_spec=actions_spec,
                                               config=global_config,
                                               **kwargs)

        else:
            raise TensorForceError(
                "Invalid config value for distributed mode.")

        with tf.device(device_name_or_function=self.device):

            # Timestep and episode
            # TODO: various modes !!!
            if self.global_model is None:
                # TODO: Variables seem to re-initialize in the beginning every time a runner starts
                self.timestep = tf.get_variable(name='timestep',
                                                dtype=tf.int32,
                                                initializer=0,
                                                trainable=False)
                self.episode = tf.get_variable(name='episode',
                                               dtype=tf.int32,
                                               initializer=0,
                                               trainable=False)
            else:
                self.timestep = self.global_model.timestep
                self.episode = self.global_model.episode

            with tf.name_scope(name=config.scope):

                def custom_getter(getter, name, registered=False, **kwargs):
                    variable = getter(
                        name=name,
                        **kwargs)  # Top-level, hence no 'registered'
                    if not registered:
                        self.all_variables[name] = variable
                        if kwargs.get(
                                'trainable',
                                True) and not name.startswith('optimization'):
                            self.variables[name] = variable
                        if 'variables' in self.summary_labels:
                            summary = tf.summary.histogram(name=name,
                                                           values=variable)
                            self.summaries.append(summary)
                    return variable

                # Create placeholders, tf functions, internals, etc
                self.initialize(custom_getter=custom_getter)

                # Input tensors
                states = self.get_states(states=self.state_inputs)
                internals = [
                    tf.identity(input=internal)
                    for internal in self.internal_inputs
                ]
                actions = self.get_actions(actions=self.action_inputs)
                terminal = tf.identity(input=self.terminal_input)
                reward = self.get_reward(states=states,
                                         internals=internals,
                                         terminal=terminal,
                                         reward=self.reward_input)

                # Stop gradients for input preprocessing
                states = {
                    name: tf.stop_gradient(input=state)
                    for name, state in states.items()
                }
                actions = {
                    name: tf.stop_gradient(input=action)
                    for name, action in actions.items()
                }
                reward = tf.stop_gradient(input=reward)

                # Optimizer
                if config.optimizer is None:
                    self.optimizer = None
                elif config.local_model and not config.replica_model:
                    # If local_model mode and not internal global model
                    self.optimizer = GlobalOptimizer(
                        optimizer=config.optimizer)
                else:
                    self.optimizer = Optimizer.from_spec(spec=config.optimizer)

                # Create output fetch operations
                self.create_output_operations(states=states,
                                              internals=internals,
                                              actions=actions,
                                              terminal=terminal,
                                              reward=reward,
                                              deterministic=self.deterministic)

        if config.local_model and config.replica_model:
            # If local_model mode and internal global model
            return

        # Local and global initialize operations
        if config.local_model:
            init_op = tf.variables_initializer(
                var_list=self.global_model.get_variables(
                    include_non_trainable=True))
            local_init_op = tf.variables_initializer(
                var_list=self.get_variables(include_non_trainable=True))

        else:
            init_op = tf.variables_initializer(var_list=self.get_variables(
                include_non_trainable=True))
            local_init_op = None

        # Summary operation
        if len(self.get_summaries()) > 0:
            summary_op = tf.summary.merge(inputs=self.get_summaries())
        else:
            summary_op = None

        # TODO: MonitoredSession or so?
        self.supervisor = tf.train.Supervisor(
            is_chief=(config.task_index == 0),
            init_op=init_op,
            local_init_op=local_init_op,
            logdir=config.model_directory,
            summary_op=summary_op,
            global_step=self.timestep,
            save_summaries_secs=config.summary_frequency,
            save_model_secs=config.save_frequency
            # checkpoint_basename='model.ckpt'
            # session_manager=None
        )

        # tf.ConfigProto(device_filters=['/job:ps', '/job:worker/task:{}/cpu:0'.format(self.task_index)])
        if config.parameter_server:
            self.server = tf.train.Server(
                server_or_cluster_def=config.cluster_spec,
                job_name='ps',
                task_index=config.task_index,
                # config=tf.ConfigProto(device_filters=["/job:ps"])
                # config=tf.ConfigProto(
                #     inter_op_parallelism_threads=2,
                #     log_device_placement=True
                # )
            )

            # Param server does nothing actively
            self.server.join()

        elif config.cluster_spec is not None:
            self.server = tf.train.Server(
                server_or_cluster_def=config.cluster_spec,
                job_name='worker',
                task_index=config.task_index,
                # config=tf.ConfigProto(device_filters=["/job:ps"])
                # config=tf.ConfigProto(
                #     inter_op_parallelism_threads=2,
                #     log_device_placement=True
                # )
            )

            self.managed_session = self.supervisor.managed_session(
                master=self.server.target,
                start_standard_services=(config.model_directory is not None))
            self.session = self.managed_session.__enter__()

        else:
            self.managed_session = self.supervisor.managed_session(
                start_standard_services=(config.model_directory is not None))
            self.session = self.managed_session.__enter__()
Exemple #3
0
    def setup(self):
        """
        Sets up the TensorFlow model graph and initializes the TensorFlow session.
        """
        default_graph = None
        if self.distributed_spec is None:
            self.global_model = None
            self.graph = tf.Graph()
            default_graph = self.graph.as_default()
            default_graph.__enter__()

        elif self.distributed_spec.get('parameter_server'):
            if self.distributed_spec.get('replica_model'):
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.global_model = None
            self.graph = tf.Graph()
            default_graph = self.graph.as_default()
            default_graph.__enter__()

        elif self.distributed_spec.get('replica_model'):
            self.device = tf.train.replica_device_setter(
                worker_device=self.device,
                cluster=self.distributed_spec['cluster_spec'])
            self.global_model = None
            # Replica model is part of its parent model's graph, hence no new graph here.
            self.graph = tf.get_default_graph()

        else:
            graph = tf.Graph()
            default_graph = graph.as_default()
            default_graph.__enter__()
            # Global model.
            self.global_model = deepcopy(self)
            self.global_model.distributed_spec['replica_model'] = True
            self.global_model.setup()
            self.graph = graph

        with tf.device(device_name_or_function=self.device):
            # Episode
            collection = self.graph.get_collection(name='episode')
            if len(collection) == 0:
                self.episode = tf.get_variable(name='episode',
                                               dtype=tf.int32,
                                               initializer=0,
                                               trainable=False)
                self.graph.add_to_collection(name='episode',
                                             value=self.episode)
            else:
                assert len(collection) == 1
                self.episode = collection[0]

            # Timestep
            collection = self.graph.get_collection(name='timestep')
            if len(collection) == 0:
                self.timestep = tf.get_variable(name='timestep',
                                                dtype=tf.int32,
                                                initializer=0,
                                                trainable=False)
                self.graph.add_to_collection(name='timestep',
                                             value=self.timestep)
                self.graph.add_to_collection(name=tf.GraphKeys.GLOBAL_STEP,
                                             value=self.timestep)
            else:
                assert len(collection) == 1
                self.timestep = collection[0]

            # Variables and summaries
            self.variables = dict()
            self.all_variables = dict()
            self.registered_variables = set()
            self.summaries = list()

            def custom_getter(getter,
                              name,
                              registered=False,
                              second=False,
                              **kwargs):
                if registered:
                    self.registered_variables.add(name)
                elif name in self.registered_variables:
                    registered = True
                variable = getter(name=name,
                                  **kwargs)  # Top-level, hence no 'registered'
                if not registered:
                    self.all_variables[name] = variable
                    if kwargs.get(
                            'trainable',
                            True) and not name.startswith('optimization'):
                        self.variables[name] = variable
                        if 'variables' in self.summary_labels:
                            summary = tf.summary.histogram(name=name,
                                                           values=variable)
                            self.summaries.append(summary)
                return variable

            # Create placeholders, tf functions, internals, etc
            self.initialize(custom_getter=custom_getter)

            # Input tensors
            states = self.get_states(states=self.state_inputs)
            internals = [
                tf.identity(input=internal)
                for internal in self.internal_inputs
            ]
            actions = self.get_actions(actions=self.action_inputs)
            terminal = tf.identity(input=self.terminal_input)
            reward = self.get_reward(states=states,
                                     internals=internals,
                                     terminal=terminal,
                                     reward=self.reward_input)

            # Stop gradients for input preprocessing
            states = {
                name: tf.stop_gradient(input=state)
                for name, state in states.items()
            }
            actions = {
                name: tf.stop_gradient(input=action)
                for name, action in actions.items()
            }
            reward = tf.stop_gradient(input=reward)

            # Optimizer
            if self.optimizer is None:
                pass
            elif self.distributed_spec is not None and \
                    not self.distributed_spec.get('parameter_server') and \
                    not self.distributed_spec.get('replica_model'):
                # If not internal global model
                self.optimizer = GlobalOptimizer(optimizer=self.optimizer)
            else:
                self.optimizer = Optimizer.from_spec(spec=self.optimizer)

            # Create output fetch operations
            self.create_output_operations(
                states=states,
                internals=internals,
                actions=actions,
                terminal=terminal,
                reward=reward,
                update=self.update_input,
                deterministic=self.deterministic_input)

            if 'inputs' in self.summary_labels:
                for name, state in states.items():
                    summary = tf.summary.histogram(
                        name=(self.scope + '/inputs/states/' + name),
                        values=state)
                    self.summaries.append(summary)
                for name, action in actions.items():
                    summary = tf.summary.histogram(
                        name=(self.scope + '/inputs/actions/' + name),
                        values=action)
                    self.summaries.append(summary)
                summary = tf.summary.histogram(name=(self.scope +
                                                     '/inputs/reward'),
                                               values=reward)
                self.summaries.append(summary)

        if self.distributed_spec is not None:
            if self.distributed_spec.get('replica_model'):
                # If internal global model
                return

            elif self.distributed_spec.get('parameter_server'):
                server = tf.train.Server(
                    server_or_cluster_def=self.distributed_spec['cluster_spec'],
                    job_name='ps',
                    task_index=self.distributed_spec['task_index'],
                    protocol=self.distributed_spec.get('protocol'),
                    config=None,
                    start=True)
                # Param server does nothing actively
                server.join()
                return

        # Global and local variables initialize operations
        if self.distributed_spec is None:
            global_variables = self.get_variables(include_non_trainable=True)
            init_op = tf.variables_initializer(var_list=global_variables)
            ready_op = tf.report_uninitialized_variables(
                var_list=global_variables)
            ready_for_local_init_op = None
            local_init_op = None
        else:
            global_variables = self.global_model.get_variables(
                include_non_trainable=True)
            local_variables = self.get_variables(include_non_trainable=True)
            init_op = tf.variables_initializer(var_list=global_variables)
            ready_op = tf.report_uninitialized_variables(
                var_list=(global_variables + local_variables))
            ready_for_local_init_op = tf.report_uninitialized_variables(
                var_list=global_variables)
            local_init_op = tf.group(*(local_var.assign(value=global_var)
                                       for local_var, global_var in zip(
                                           local_variables, global_variables)))

        def init_fn(scaffold, session):
            if self.saver_spec is not None and self.saver_spec.get(
                    'load', True):
                directory = self.saver_spec['directory']
                file = self.saver_spec.get('file')
                if file is None:
                    file = tf.train.latest_checkpoint(
                        checkpoint_dir=directory,
                        latest_filename=None  # Corresponds to argument of saver.save() in Model.save().
                    )
                elif not os.path.isfile(file):
                    file = os.path.join(directory, file)
                if file is not None:
                    scaffold.saver.restore(sess=session, save_path=file)

        # Summary operation
        summaries = self.get_summaries()
        if len(summaries) > 0:
            summary_op = tf.summary.merge(inputs=summaries)
        else:
            summary_op = None

        # TensorFlow saver object
        saver = tf.train.Saver(
            var_list=global_variables,  # should be given?
            reshape=False,
            sharded=False,  # should be true?
            max_to_keep=5,
            keep_checkpoint_every_n_hours=10000.0,
            name=None,
            restore_sequentially=False,
            saver_def=None,
            builder=None,
            defer_build=False,
            allow_empty=True,
            write_version=tf.train.SaverDef.V2,
            pad_step_number=False,
            save_relative_paths=True
            #filename=None
        )

        # TensorFlow scaffold object
        self.scaffold = tf.train.Scaffold(
            init_op=init_op,
            init_feed_dict=None,
            init_fn=init_fn,
            ready_op=ready_op,
            ready_for_local_init_op=ready_for_local_init_op,
            local_init_op=local_init_op,
            summary_op=summary_op,
            saver=saver,
            copy_from_scaffold=None)

        hooks = list()

        # Checkpoint saver hook
        if self.saver_spec is not None and (
                self.distributed_spec is None
                or self.distributed_spec['task_index'] == 0):
            self.saver_directory = self.saver_spec['directory']
            hooks.append(
                tf.train.CheckpointSaverHook(
                    checkpoint_dir=self.saver_directory,
                    save_secs=self.saver_spec.get(
                        'seconds',
                        None if 'steps' in self.saver_spec else 600),
                    save_steps=self.saver_spec.get(
                        'steps'),  # Either one or the other has to be set.
                    saver=None,  # None since given via 'scaffold' argument.
                    checkpoint_basename=self.saver_spec.get(
                        'basename', 'model.ckpt'),
                    scaffold=self.scaffold,
                    listeners=None))
        else:
            self.saver_directory = None

        # Summary saver hook
        if self.summary_spec is None:
            self.summary_writer_hook = None
        else:
            # TensorFlow summary writer object
            summary_writer = tf.summary.FileWriter(
                logdir=self.summary_spec['directory'],
                graph=self.graph,
                max_queue=10,
                flush_secs=120,
                filename_suffix=None)
            self.summary_writer_hook = util.UpdateSummarySaverHook(
                update_input=self.update_input,
                save_steps=self.summary_spec.get(
                    'steps'),  # Either one or the other has to be set.
                save_secs=self.summary_spec.get(
                    'seconds', None if 'steps' in self.summary_spec else 120),
                output_dir=None,  # None since given via 'summary_writer' argument.
                summary_writer=summary_writer,
                scaffold=self.scaffold,
                summary_op=None  # None since given via 'scaffold' argument.
            )
            hooks.append(self.summary_writer_hook)

        # Stop at step hook
        # hooks.append(tf.train.StopAtStepHook(
        #     num_steps=???,  # This makes more sense, if load and continue training.
        #     last_step=None  # Either one or the other has to be set.
        # ))

        # # Step counter hook
        # hooks.append(tf.train.StepCounterHook(
        #     every_n_steps=counter_config.get('steps', 100),  # Either one or the other has to be set.
        #     every_n_secs=counter_config.get('secs'),  # Either one or the other has to be set.
        #     output_dir=None,  # None since given via 'summary_writer' argument.
        #     summary_writer=summary_writer
        # ))

        # Other available hooks:
        # tf.train.FinalOpsHook(final_ops, final_ops_feed_dict=None)
        # tf.train.GlobalStepWaiterHook(wait_until_step)
        # tf.train.LoggingTensorHook(tensors, every_n_iter=None, every_n_secs=None)
        # tf.train.NanTensorHook(loss_tensor, fail_on_nan_loss=True)
        # tf.train.ProfilerHook(save_steps=None, save_secs=None, output_dir='', show_dataflow=True, show_memory=False)

        if self.distributed_spec is None:
            # TensorFlow non-distributed monitored session object
            self.monitored_session = tf.train.SingularMonitoredSession(
                hooks=hooks,
                scaffold=self.scaffold,
                master='',  # Default value.
                config=self.session_config,  # always the same?
                checkpoint_dir=None)

        else:
            server = tf.train.Server(
                server_or_cluster_def=self.distributed_spec['cluster_spec'],
                job_name='worker',
                task_index=self.distributed_spec['task_index'],
                protocol=self.distributed_spec.get('protocol'),
                config=self.session_config,
                start=True)

            if self.distributed_spec['task_index'] == 0:
                # TensorFlow chief session creator object
                session_creator = tf.train.ChiefSessionCreator(
                    scaffold=self.scaffold,
                    master=server.target,
                    config=self.session_config,
                    checkpoint_dir=None,
                    checkpoint_filename_with_path=None)
            else:
                # TensorFlow worker session creator object
                session_creator = tf.train.WorkerSessionCreator(
                    scaffold=self.scaffold,
                    master=server.target,
                    config=self.session_config,
                )

            # TensorFlow monitored session object
            self.monitored_session = tf.train.MonitoredSession(
                session_creator=session_creator,
                hooks=hooks,
                stop_grace_period_secs=120  # Default value.
            )

        if default_graph:
            default_graph.__exit__(None, None, None)
        self.graph.finalize()
        self.monitored_session.__enter__()
        self.session = self.monitored_session._tf_sess()
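
For reference, a minimal sketch of the spec dictionaries that the setup() above reads; every key mirrors a lookup in the code, while the concrete values and paths are hypothetical:

saver_spec = {
    'directory': '/tmp/model',      # checkpoint directory (hypothetical path)
    'load': True,                   # restore the latest checkpoint on startup
    'seconds': 600,                 # save interval; alternatively specify 'steps'
    'basename': 'model.ckpt',
}
summary_spec = {
    'directory': '/tmp/summaries',  # TensorBoard log directory (hypothetical path)
    'labels': ['losses', 'inputs', 'variables'],
    'seconds': 120,                 # summary save interval; alternatively 'steps'
}
distributed_spec = None             # single-process mode; a cluster run would instead provide
                                    # 'cluster_spec', 'task_index', 'parameter_server', 'protocol', ...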
Exemple #4
0
class Model(object):
    """
    Base class for all (TensorFlow-based) models.
    """
    def __init__(self, states_spec, actions_spec, config, **kwargs):

        # States and actions specifications
        self.states_spec = states_spec
        self.actions_spec = actions_spec

        # Discount factor
        self.discount = config.discount

        # Reward normalization
        assert isinstance(config.normalize_rewards, bool)
        self.normalize_rewards = config.normalize_rewards

        # Variable noise
        assert config.variable_noise is None or config.variable_noise > 0.0
        self.variable_noise = config.variable_noise

        # TensorFlow summaries
        self.summary_labels = set(config.summary_labels or ())

        # Variables and summaries
        self.variables = dict()
        self.all_variables = dict()
        self.summaries = list()

        if not config.local_model or not config.replica_model:
            # If not local_model mode or not internal global model
            self.default_graph = tf.Graph().as_default()
            self.graph = self.default_graph.__enter__()

        if config.cluster_spec is None:
            if config.parameter_server or config.replica_model or config.local_model:
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.device = config.device
            self.global_model = None

        elif config.parameter_server:
            if config.replica_model or config.local_model:
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.device = config.device
            self.global_model = None

        elif config.replica_model:
            self.device = tf.train.replica_device_setter(
                worker_device=config.device, cluster=config.cluster_spec)
            self.global_model = None

        elif config.local_model:
            if config.replica_model:
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.device = config.device

            global_config = config.copy()
            global_config.set(key='replica_model', value=True)

            self.global_model = self.__class__(states_spec=states_spec,
                                               actions_spec=actions_spec,
                                               config=global_config,
                                               **kwargs)

        else:
            raise TensorForceError(
                "Invalid config value for distributed mode.")

        with tf.device(device_name_or_function=self.device):

            # Timestep and episode
            # TODO: various modes !!!
            if self.global_model is None:
                # TODO: Variables seem to re-initialize in the beginning every time a runner starts
                self.timestep = tf.get_variable(name='timestep',
                                                dtype=tf.int32,
                                                initializer=0,
                                                trainable=False)
                self.episode = tf.get_variable(name='episode',
                                               dtype=tf.int32,
                                               initializer=0,
                                               trainable=False)
            else:
                self.timestep = self.global_model.timestep
                self.episode = self.global_model.episode

            with tf.name_scope(name=config.scope):

                def custom_getter(getter, name, registered=False, **kwargs):
                    variable = getter(
                        name=name,
                        **kwargs)  # Top-level, hence no 'registered'
                    if not registered:
                        self.all_variables[name] = variable
                        if kwargs.get(
                                'trainable',
                                True) and not name.startswith('optimization'):
                            self.variables[name] = variable
                        if 'variables' in self.summary_labels:
                            summary = tf.summary.histogram(name=name,
                                                           values=variable)
                            self.summaries.append(summary)
                    return variable

                # Create placeholders, tf functions, internals, etc.
                self.initialize(custom_getter=custom_getter)

                # Input tensors
                states = self.get_states(states=self.state_inputs)
                internals = [
                    tf.identity(input=internal)
                    for internal in self.internal_inputs
                ]
                actions = self.get_actions(actions=self.action_inputs)
                terminal = tf.identity(input=self.terminal_input)
                reward = self.get_reward(states=states,
                                         internals=internals,
                                         terminal=terminal,
                                         reward=self.reward_input)

                # Stop gradients for input preprocessing
                states = {
                    name: tf.stop_gradient(input=state)
                    for name, state in states.items()
                }
                actions = {
                    name: tf.stop_gradient(input=action)
                    for name, action in actions.items()
                }
                reward = tf.stop_gradient(input=reward)

                # Optimizer
                if config.optimizer is None:
                    self.optimizer = None
                elif config.local_model and not config.replica_model:
                    # If local_model mode and not internal global model
                    self.optimizer = GlobalOptimizer(
                        optimizer=config.optimizer)
                else:
                    self.optimizer = Optimizer.from_spec(spec=config.optimizer)

                # Create output fetch operations
                self.create_output_operations(states=states,
                                              internals=internals,
                                              actions=actions,
                                              terminal=terminal,
                                              reward=reward,
                                              deterministic=self.deterministic)

        if config.local_model and config.replica_model:
            # If local_model mode and internal global model
            return

        # Local and global initialize operations
        if config.local_model:
            init_op = tf.variables_initializer(
                var_list=self.global_model.get_variables(
                    include_non_trainable=True))
            local_init_op = tf.variables_initializer(
                var_list=self.get_variables(include_non_trainable=True))

        else:
            init_op = tf.variables_initializer(var_list=self.get_variables(
                include_non_trainable=True))
            local_init_op = None

        # Summary operation
        if len(self.get_summaries()) > 0:
            summary_op = tf.summary.merge(inputs=self.get_summaries())
        else:
            summary_op = None

        # TODO: MonitoredSession or so?
        self.supervisor = tf.train.Supervisor(
            is_chief=(config.task_index == 0),
            init_op=init_op,
            local_init_op=local_init_op,
            logdir=config.model_directory,
            summary_op=summary_op,
            global_step=self.timestep,
            save_summaries_secs=config.summary_frequency,
            save_model_secs=config.save_frequency
            # checkpoint_basename='model.ckpt'
            # session_manager=None
        )

        # tf.ConfigProto(device_filters=['/job:ps', '/job:worker/task:{}/cpu:0'.format(self.task_index)])
        if config.parameter_server:
            self.server = tf.train.Server(
                server_or_cluster_def=config.cluster_spec,
                job_name='ps',
                task_index=config.task_index,
                # config=tf.ConfigProto(device_filters=["/job:ps"])
                # config=tf.ConfigProto(
                #     inter_op_parallelism_threads=2,
                #     log_device_placement=True
                # )
            )

            # Param server does nothing actively
            self.server.join()

        elif config.cluster_spec is not None:
            self.server = tf.train.Server(
                server_or_cluster_def=config.cluster_spec,
                job_name='worker',
                task_index=config.task_index,
                # config=tf.ConfigProto(device_filters=["/job:ps"])
                # config=tf.ConfigProto(
                #     inter_op_parallelism_threads=2,
                #     log_device_placement=True
                # )
            )

            self.managed_session = self.supervisor.managed_session(
                master=self.server.target,
                start_standard_services=(config.model_directory is not None))
            self.session = self.managed_session.__enter__()

        else:
            self.managed_session = self.supervisor.managed_session(
                start_standard_services=(config.model_directory is not None))
            self.session = self.managed_session.__enter__()
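
    # Sketch of the config attributes read by the __init__ above (names gathered from the code;
    # any concrete values would be supplied by the caller and are hypothetical here):
    #   device, scope, discount, normalize_rewards, variable_noise, summary_labels, optimizer,
    #   cluster_spec, parameter_server, replica_model, local_model, task_index,
    #   model_directory, summary_frequency, save_frequency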

    def close(self):
        self.managed_session.__exit__(None, None, None)
        self.supervisor.stop()
        self.default_graph.__exit__(None, None, None)

    def initialize(self, custom_getter):
        """
        Creates the TensorFlow placeholders and functions for this model. It also adds the internal state
        placeholders and initialization values to the model.

        Args:
            custom_getter: The `custom_getter_` object to use for `tf.make_template` when creating TensorFlow functions.
        """
        # States
        self.state_inputs = dict()
        for name, state in self.states_spec.items():
            self.state_inputs[name] = tf.placeholder(
                dtype=util.tf_dtype(state['type']),
                shape=(None, ) + tuple(state['shape']),
                name=name)

        # Actions
        self.action_inputs = dict()
        for name, action in self.actions_spec.items():
            self.action_inputs[name] = tf.placeholder(
                dtype=util.tf_dtype(action['type']),
                shape=(None, ) + tuple(action['shape']),
                name=name)

        # Terminal
        self.terminal_input = tf.placeholder(dtype=tf.bool,
                                             shape=(None, ),
                                             name='terminal')

        # Reward
        self.reward_input = tf.placeholder(dtype=tf.float32,
                                           shape=(None, ),
                                           name='reward')

        # Internal states
        self.internal_inputs = list()
        self.internal_inits = list()

        # Deterministic action flag
        self.deterministic = tf.placeholder(dtype=tf.bool,
                                            shape=(),
                                            name='deterministic')

        # TensorFlow functions
        self.fn_discounted_cumulative_reward = tf.make_template(
            name_='discounted-cumulative-reward',
            func_=self.tf_discounted_cumulative_reward,
            custom_getter_=custom_getter)
        self.fn_actions_and_internals = tf.make_template(
            name_='actions-and-internals',
            func_=self.tf_actions_and_internals,
            custom_getter_=custom_getter)
        self.fn_loss_per_instance = tf.make_template(
            name_='loss-per-instance',
            func_=self.tf_loss_per_instance,
            custom_getter_=custom_getter)
        self.fn_regularization_losses = tf.make_template(
            name_='regularization-losses',
            func_=self.tf_regularization_losses,
            custom_getter_=custom_getter)
        self.fn_loss = tf.make_template(name_='loss',
                                        func_=self.tf_loss,
                                        custom_getter_=custom_getter)
        self.fn_optimization = tf.make_template(name_='optimization',
                                                func_=self.tf_optimization,
                                                custom_getter_=custom_getter)
        # self.fn_summarization = tf.make_template(
        #     name_='summarization',
        #     func_=self.tf_summarization,
        #     custom_getter_=custom_getter
        # )

    def get_states(self, states):
        # TODO: preprocessing could go here?
        return {
            name: tf.identity(input=state)
            for name, state in states.items()
        }

    def get_actions(self, actions):
        # TODO: preprocessing could go here?
        return {
            name: tf.identity(input=action)
            for name, action in actions.items()
        }

    def get_reward(self, states, internals, terminal, reward):
        if self.normalize_rewards:
            mean, variance = tf.nn.moments(x=reward, axes=0)
            return (reward - mean) / tf.maximum(x=variance, y=util.epsilon)
        else:
            return tf.identity(input=reward)

    def tf_discounted_cumulative_reward(self,
                                        terminal,
                                        reward,
                                        discount,
                                        final_reward=0.0):
        """
        Creates the TensorFlow operations for calculating the discounted cumulative rewards
        for a given sequence of rewards.

        Args:
            terminal: Terminal boolean tensor.
            reward: Reward tensor.
            discount: Discount factor.
            final_reward: Last reward value in the sequence.

        Returns:
            Discounted cumulative reward tensor.
        """

        # TODO: n-step cumulative reward (particularly for envs without terminal)

        def cumulate(cumulative, reward_and_terminal):
            rew, term = reward_and_terminal
            return tf.where(condition=term,
                            x=rew,
                            y=(rew + cumulative * discount))

        # Reverse since reward accumulation is calculated right-to-left, but tf.scan only works left-to-right
        reward = tf.reverse(tensor=reward, axis=(0, ))
        terminal = tf.reverse(tensor=terminal, axis=(0, ))

        reward = tf.scan(fn=cumulate,
                         elems=(reward, terminal),
                         initializer=final_reward)

        return tf.reverse(tensor=reward, axis=(0, ))
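
    # Illustrative trace of the accumulation above (hypothetical numbers), assuming discount=0.9:
    #   reward   = [1.0, 0.0, 2.0],  terminal = [False, False, True]
    #   reversed: [2.0, 0.0, 1.0],              [True, False, False]
    #   scan:     [2.0, 0.0 + 0.9 * 2.0, 1.0 + 0.9 * 1.8] = [2.0, 1.8, 2.62]
    #   reversed back -> discounted cumulative reward = [2.62, 1.8, 2.0]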

    def tf_actions_and_internals(self, states, internals, deterministic):
        """
        Creates the TensorFlow operations for retrieving the actions (and posterior internal states)
        in reaction to the given input states (and prior internal states).

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            deterministic: If true, the action is chosen deterministically.

        Returns:
            Actions and list of posterior internal state tensors.
        """
        raise NotImplementedError

    def tf_loss_per_instance(self, states, internals, actions, terminal,
                             reward):
        """
        Creates the TensorFlow operations for calculating the loss per batch instance
        of the given input states and actions.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            actions: Dict of action tensors.
            terminal: Terminal boolean tensor.
            reward: Reward tensor.

        Returns:
            Loss tensor.
        """
        raise NotImplementedError

    def tf_regularization_losses(self, states, internals):
        """
        Creates the TensorFlow operations for calculating the regularization losses for the given input states.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.

        Returns:
            Dict of regularization loss tensors.
        """
        return dict()

    def tf_loss(self, states, internals, actions, terminal, reward):
        # Mean loss per instance
        loss_per_instance = self.fn_loss_per_instance(states=states,
                                                      internals=internals,
                                                      actions=actions,
                                                      terminal=terminal,
                                                      reward=reward)
        loss = tf.reduce_mean(input_tensor=loss_per_instance, axis=0)

        # Loss without regularization summary
        if 'losses' in self.summary_labels:
            summary = tf.summary.scalar(name='loss-without-regularization',
                                        tensor=loss)
            self.summaries.append(summary)

        # Regularization losses
        losses = self.fn_regularization_losses(states=states,
                                               internals=internals)
        if len(losses) > 0:
            loss += tf.add_n(inputs=list(losses.values()))

        # Total loss summary
        if 'losses' in self.summary_labels or 'total-loss' in self.summary_labels:
            summary = tf.summary.scalar(name='total-loss', tensor=loss)
            self.summaries.append(summary)

        return loss

    def get_optimizer_kwargs(self, states, internals, actions, terminal,
                             reward):
        """
        Returns the optimizer arguments including the time, the list of variables to optimize,
        and various argument-free functions (in particular `fn_loss` returning the combined
        0-dim batch loss tensor) which the optimizer might require to perform an update step.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            actions: Dict of action tensors.
            terminal: Terminal boolean tensor.
            reward: Reward tensor.

        Returns:
            Dict of optimizer arguments (keyword arguments for the optimizer's minimize()).
        """
        kwargs = dict()
        kwargs['time'] = self.timestep
        kwargs['variables'] = self.get_variables()
        kwargs['fn_loss'] = (lambda: self.fn_loss(states=states,
                                                  internals=internals,
                                                  actions=actions,
                                                  terminal=terminal,
                                                  reward=reward))
        if self.global_model is not None:
            kwargs['global_variables'] = self.global_model.get_variables()
        return kwargs
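
    # Sketch of the dict returned above (which entries are used is up to the optimizer implementation):
    #   {
    #       'time': self.timestep,                      # int32 step variable
    #       'variables': self.get_variables(),          # trainable variables to optimize
    #       'fn_loss': <argument-free callable returning the 0-dim loss tensor>,
    #       'global_variables': ...,                    # only set when a global model exists
    #   }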

    def tf_optimization(self, states, internals, actions, terminal, reward):
        """
        Creates the TensorFlow operations for performing an optimization update step based
        on the given input states and actions batch.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            actions: Dict of action tensors.
            terminal: Terminal boolean tensor.
            reward: Reward tensor.

        Returns:
            The optimization operation.
        """
        if self.optimizer is None:
            return tf.no_op()
        else:
            optimizer_kwargs = self.get_optimizer_kwargs(states=states,
                                                         internals=internals,
                                                         actions=actions,
                                                         terminal=terminal,
                                                         reward=reward)
            return self.optimizer.minimize(**optimizer_kwargs)

    def create_output_operations(self, states, internals, actions, terminal,
                                 reward, deterministic):
        """
        Calls all the relevant TensorFlow functions for this model and hence creates all the
        TensorFlow operations involved.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            actions: Dict of action tensors.
            terminal: Terminal boolean tensor.
            reward: Reward tensor.
            deterministic: If true, the action is chosen deterministically.
        """

        # Create graph by calling the functions corresponding to model.act() / model.update(), to initialize variables.
        # TODO: Could call reset here, but would have to move other methods below reset.
        self.fn_actions_and_internals(states=states,
                                      internals=internals,
                                      deterministic=deterministic)
        self.fn_loss_per_instance(states=states,
                                  internals=internals,
                                  actions=actions,
                                  terminal=terminal,
                                  reward=reward)

        # Tensor fetched for model.act()
        operations = list()
        if self.variable_noise is not None and self.variable_noise > 0.0:
            # Add variable noise
            noise_deltas = list()
            for variable in self.get_variables():
                noise_delta = tf.random_normal(shape=util.shape(variable),
                                               mean=0.0,
                                               stddev=self.variable_noise)
                noise_deltas.append(noise_delta)
                operations.append(variable.assign_add(delta=noise_delta))

        # Retrieve actions and internals
        with tf.control_dependencies(control_inputs=operations):
            self.actions_internals_timestep = self.fn_actions_and_internals(
                states=states,
                internals=internals,
                deterministic=deterministic)

        # Increment timestep
        increment_timestep = tf.shape(input=next(iter(states.values())))[0]
        increment_timestep = self.timestep.assign_add(delta=increment_timestep)
        operations = [increment_timestep]

        # Subtract variable noise
        if self.variable_noise is not None and self.variable_noise > 0.0:
            for variable, noise_delta in zip(self.get_variables(),
                                             noise_deltas):
                operations.append(variable.assign_sub(delta=noise_delta))

        with tf.control_dependencies(control_inputs=operations):
            # Trivial operation to enforce control dependency
            self.actions_internals_timestep += (self.timestep + 0, )

        # Tensor fetched for model.observe()
        increment_episode = tf.count_nonzero(input_tensor=terminal,
                                             dtype=tf.int32)
        increment_episode = self.episode.assign_add(delta=increment_episode)
        # TODO: add up rewards per episode and add summary_label 'episode-reward'
        with tf.control_dependencies(control_inputs=(increment_episode, )):
            self.episode_increment = tf.no_op()

        # Tensor(s) fetched for model.update()
        self.optimization = self.fn_optimization(states=states,
                                                 internals=internals,
                                                 actions=actions,
                                                 terminal=terminal,
                                                 reward=reward)
        self.loss_per_instance = self.fn_loss_per_instance(states=states,
                                                           internals=internals,
                                                           actions=actions,
                                                           terminal=terminal,
                                                           reward=reward)

    def get_variables(self, include_non_trainable=False):
        """
        Returns the TensorFlow variables used by the model.

        Returns:
            List of variables.
        """

        if include_non_trainable:
            # optimizer variables and timestep/episode only included if 'include_non_trainable' set
            model_variables = [
                self.all_variables[key] for key in sorted(self.all_variables)
            ]

            if self.optimizer is None:
                return model_variables + [self.timestep, self.episode]

            else:
                optimizer_variables = self.optimizer.get_variables()
                return model_variables + optimizer_variables + [
                    self.timestep, self.episode
                ]

        else:
            return [self.variables[key] for key in sorted(self.variables)]

    def get_summaries(self):
        """
        Returns the TensorFlow summaries reported by the model.

        Returns:
            List of summaries.
        """
        return self.summaries

    def reset(self):
        """
        Resets the model to its initial state.

        Returns:
            A list containing the internal states initializations.
        """
        return list(self.internal_inits)

    def act(self, states, internals, deterministic=False):
        name = next(iter(self.states_spec))
        batched = (states[name].ndim != len(self.states_spec[name]['shape']))

        fetches = list(self.actions_internals_timestep)

        if batched:
            feed_dict = {
                state_input: states[name]
                for name, state_input in self.state_inputs.items()
            }
            feed_dict.update({
                internal_input: internals[n]
                for n, internal_input in enumerate(self.internal_inputs)
            })
        else:
            feed_dict = {
                state_input: (states[name], )
                for name, state_input in self.state_inputs.items()
            }
            feed_dict.update({
                internal_input: (internals[n], )
                for n, internal_input in enumerate(self.internal_inputs)
            })

        feed_dict[self.deterministic] = deterministic

        actions, internals, timestep = self.session.run(fetches=fetches,
                                                        feed_dict=feed_dict)

        if not batched:
            actions = {name: action[0] for name, action in actions.items()}
            internals = [internal[0] for internal in internals]

        return actions, internals, timestep
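
    # Hypothetical usage sketch for act() (the state name 'state' and the observation variable are
    # illustrative, not part of this class):
    #   internals = model.reset()
    #   actions, internals, timestep = model.act(
    #       states={'state': observation},  # unbatched: shape matches states_spec['state']['shape']
    #       internals=internals,
    #       deterministic=True)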

    def observe(self, terminal, reward, batched=False):
        fetches = [self.episode_increment, self.episode]

        if batched:
            feed_dict = {
                self.terminal_input: terminal,
                self.reward_input: reward
            }
        else:
            feed_dict = {
                self.terminal_input: (terminal, ),
                self.reward_input: (reward, )
            }

        _, episode = self.session.run(fetches=fetches, feed_dict=feed_dict)

        return episode

    def update(self, batch, return_loss_per_instance=False):
        fetches = [self.optimization]

        # Optionally fetch loss per instance
        if return_loss_per_instance:
            fetches.append(self.loss_per_instance)

        feed_dict = {
            state_input: batch['states'][name]
            for name, state_input in self.state_inputs.items()
        }
        feed_dict.update({
            internal_input: batch['internals'][n]
            for n, internal_input in enumerate(self.internal_inputs)
        })
        feed_dict.update({
            action_input: batch['actions'][name]
            for name, action_input in self.action_inputs.items()
        })
        feed_dict[self.terminal_input] = batch['terminal']
        feed_dict[self.reward_input] = batch['reward']

        # if self.distributed:
        #     fetches.extend(self.increment_global_episode for terminal in batch['terminals'] if terminal)

        fetched = self.session.run(fetches=fetches, feed_dict=feed_dict)

        if return_loss_per_instance:
            return fetched[1]
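
    # Hypothetical layout of the batch dict expected by update() (field names mirror the feed_dict above):
    #   batch = {
    #       'states': {name: array of shape (batch_size,) + state shape},
    #       'internals': [one array per internal input],
    #       'actions': {name: array of shape (batch_size,) + action shape},
    #       'terminal': boolean array of shape (batch_size,),
    #       'reward': float array of shape (batch_size,),
    #   }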

    def load_model(self, path):
        """
        Imports the model from the given path using tf.train.Saver.

        Args:
            path: Path to the checkpoint file.
        """
        self.saver.restore(sess=self.session, save_path=path)

    def save_model(self, path, use_global_step=True):
        """
        Exports the model using a tf.train.Saver. Optionally appends the current timestep so that the
        previous checkpoint file is not overwritten. Set use_global_step to False to be able to load
        the model from the exact path it was saved to when restarting the program.

        Args:
            path: Model export directory.
            use_global_step: Whether to append the current timestep to the checkpoint path.
        """
        if use_global_step:
            self.saver.save(sess=self.session,
                            save_path=path,
                            global_step=self.timestep)  # TODO: global_step?
        else:
            self.saver.save(sess=self.session, save_path=path)

        if self.summary_writer is not None:
            self.summary_writer.flush()
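
    # Hypothetical usage sketch for save_model() / load_model() (paths are illustrative):
    #   model.save_model(path='/tmp/checkpoints/model')        # writes e.g. model-<timestep>.*
    #   model.load_model(path='/tmp/checkpoints/model-1000')   # restore from an exact checkpoint path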