Example #1
    def __init__(self, *preprocessors, **kwargs):
        """
        Args:
            preprocessors (PreprocessorLayer): The PreprocessorLayers to add to the Stack and connect to each other.

        Keyword Args:
            fold_time_rank (bool): Whether to fold the time rank for the `preprocess` API-method stack.
            unfold_time_rank (bool): Whether to unfold the time rank for the `preprocess` API-method stack.

        Raises:
            RLGraphError: If a sub-component is not a PreprocessLayer object.
        """
        self.fold_time_rank = kwargs.get("fold_time_rank", False)
        self.unfold_time_rank = kwargs.get("unfold_time_rank", False)
        # Link sub-Components' `call` methods together to yield PreprocessorStack's `preprocess` method.
        # NOTE: Do not include `reset` here as it is defined explicitly below.
        kwargs["api_methods"] = [
            dict(api="preprocess",
                 component_api="call",
                 fold_time_rank=self.fold_time_rank,
                 unfold_time_rank=self.unfold_time_rank)
        ]
        default_dict(kwargs,
                     dict(scope=kwargs.pop("scope", "preprocessor-stack")))
        super(PreprocessorStack, self).__init__(*preprocessors, **kwargs)
Example #2
    def __init__(self, *preprocessors, **kwargs):
        """
        Args:
            preprocessors (PreprocessorLayer): The PreprocessorLayers to add to the Stack and connect to each other.

        Raises:
            RLGraphError: If a sub-component is not a PreprocessLayer object.
        """
        # Link sub-Components' `apply` methods together to yield PreprocessorStack's `preprocess` method.
        # NOTE: Do not include `reset` here as it is defined explicitly below.
        kwargs["api_methods"] = {("preprocess", "apply")}
        default_dict(kwargs,
                     dict(scope=kwargs.pop("scope", "preprocessor-stack")))
        super(PreprocessorStack, self).__init__(*preprocessors, **kwargs)
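
These examples all lean on RLgraph's `default_dict` helper. As a rough sketch of the semantics implied by the call sites in this listing (not the actual RLgraph implementation): missing keys of the first dict are filled in from the second, keys the caller already set are never overwritten, and a None input behaves like an empty dict.

# Illustrative sketch only -- the real helper lives in rlgraph.utils.util.
def default_dict(original, defaults):
    # Treat a None original as an empty dict so callers can pass optional specs straight through.
    original = {} if original is None else original
    for key, value in defaults.items():
        # Only fill in keys the caller did not set explicitly.
        original.setdefault(key, value)
    return original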
Example #3
    def test_impala_actor_compilation(self):
        """
        Tests IMPALA agent compilation (actor).
        """
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("Deepmind Lab not installed: Will skip this test.")
            return

        agent_config = config_from_path(
            "configs/impala_agent_for_deepmind_lab_env.json")
        env_spec = dict(level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        dummy_env = DeepmindLabEnv.from_spec(env_spec)
        actor_agent = IMPALAAgent.from_spec(
            agent_config,
            type="actor",
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space,
            internal_states_space=Tuple(FloatBox(shape=(256, )),
                                        FloatBox(shape=(256, )),
                                        add_batch_rank=False),
            environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
            # Disable monitoring so that session-creation does not hang in docker.
            execution_spec=dict(disable_monitoring=True))
        # Start Specifiable Server with Env manually (monitoring is disabled).
        actor_agent.environment_stepper.environment_server.start_server()
        print("Compiled IMPALA type=actor agent.")
        actor_agent.environment_stepper.environment_server.stop_server()
        actor_agent.terminate()
Example #4
def parse_update_spec(update_spec):
    """
    Parses update/learning parameters and inserts default values where necessary.

    Args:
        update_spec (Optional[dict]): Update/Learning spec dict.

    Returns:
        dict: The sanitized update_spec dict.
    """
    # If no spec given.
    default_spec = dict(
        # Whether to perform calls to `Agent.update()` at all.
        do_updates=True,
        # The unit in which we measure frequency: one of "timesteps", "episodes", "sec".
        # unit="timesteps", # TODO: not supporting any other than timesteps
        # The number of 'units' to wait before we do any updating at all.
        steps_before_update=0,
        # The frequency with which we update (given in `unit`).
        update_interval=4,
        # The number of consecutive `Agent.update()` calls per update.
        update_steps=1,
        # The batch size with which to update (e.g. when pulling records from a memory).
        batch_size=64,
        sync_interval=128)
    update_spec = default_dict(update_spec, default_spec)

    return update_spec
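
A quick usage sketch for `parse_update_spec` above (hypothetical call site): a partial spec keeps the caller's values and falls back to the defaults for everything else.

# Sketch: only batch_size is overridden; the remaining keys come from default_spec.
spec = parse_update_spec(dict(batch_size=32))
assert spec["batch_size"] == 32
assert spec["update_interval"] == 4 and spec["do_updates"] is True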
Example #5
def parse_observe_spec(observe_spec):
    """
    Parses parameters for `Agent.observe()` calls and inserts default values where necessary.

    Args:
        observe_spec (Optional[dict]): Observe spec dict.

    Returns:
        dict: The sanitized observe_spec dict.
    """
    # If no spec given.
    default_spec = dict(
        # Do we buffer observations in python before sending them through the graph?
        buffer_enabled=True,
        # Fill buffer with n records before sending them through the graph.
        buffer_size=100,  # only if buffer_enabled=True
        # Set to > 1 if we want to post-process buffered values for n-step learning.
        n_step=1,  # values > 1 are only allowed if buffer_enabled is True and buffer_size >> n.
    )
    observe_spec = default_dict(observe_spec, default_spec)

    if observe_spec["n_step"] > 1:
        if observe_spec["buffer_enabled"] is False:
            raise RLGraphError(
                "Cannot setup observations with n-step (n={}), while buffering is switched "
                "off".format(observe_spec["n_step"]))
        elif observe_spec["buffer_size"] < 3 * observe_spec["n_step"]:
            raise RLGraphError(
                "Buffer must be at least 3x as large as n-step (n={}, min-buffer={})!"
                .format(observe_spec["n_step"], 3 * observe_spec["n_step"]))

    return observe_spec
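
The n-step checks in `parse_observe_spec` can be exercised directly; a hedged sketch of a call site (RLGraphError is the same error class the function raises above):

# Sketch: requesting n-step observations without buffering is rejected.
try:
    parse_observe_spec(dict(n_step=2, buffer_enabled=False))
except RLGraphError as e:
    print("Rejected as expected:", e)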
Example #6
def parse_saver_spec(saver_spec):
    """
    Parses the saver spec. Returns None if the input is None; otherwise fills in
    default parameters where necessary.

    Args:
        saver_spec (Union[None, dict]): Saver parameters.

    Returns:
        Union(dict, None): Saver spec or None.
    """

    if saver_spec is None:
        return None
    default_spec = dict(
        # The directory in which to store model checkpoint files.
        directory=os.path.expanduser(
            "~/rlgraph_checkpoints/"),  # default=home dir
        # The base file name for a saved checkpoint.
        checkpoint_basename="model.ckpt",
        # How many files to maximally store for one graph.
        max_checkpoints=5,
        # Every how many seconds do we save? None if saving frequency should be step based.
        save_secs=600,
        # Every how many steps do we save? None if saving frequency should be time (seconds) based.
        save_steps=None)
    return default_dict(saver_spec, default_spec)
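
Usage sketch for `parse_saver_spec` (hypothetical call site): a None spec is passed through unchanged, while a partial dict is completed with the defaults shown above.

# Sketch of the two branches above.
assert parse_saver_spec(None) is None
spec = parse_saver_spec(dict(max_checkpoints=10))
assert spec["max_checkpoints"] == 10 and spec["checkpoint_basename"] == "model.ckpt"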
Example #7
    def test_impala_learner_compilation(self):
        """
        Tests IMPALA agent compilation (learner).
        """
        # NOTE: This test is currently disabled and returns immediately.
        return
        if get_backend() == "pytorch":
            return
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("Deepmind Lab not installed: Will skip this test.")
            return

        agent_config = config_from_path(
            "configs/impala_agent_for_deepmind_lab_env.json")
        env_spec = dict(level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        dummy_env = DeepmindLabEnv.from_spec(env_spec)
        learner_agent = IMPALAAgent.from_spec(
            agent_config,
            type="learner",
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space,
            internal_states_space=IMPALAAgent.default_internal_states_space,
            environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
            # Setup distributed tf.
            execution_spec=dict(
                mode="distributed",
                #gpu_spec=dict(
                #    gpus_enabled=True,
                #    max_usable_gpus=1,
                #    num_gpus=1
                #),
                distributed_spec=dict(job="learner",
                                      task_index=0,
                                      cluster_spec=self.impala_cluster_spec),
                session_config=dict(type="monitored-training-session",
                                    allow_soft_placement=True,
                                    log_device_placement=True,
                                    auto_start=False),
                disable_monitoring=True,
                enable_timeline=True,
            ))
        print(
            "Compiled IMPALA type=learner agent without starting the session (would block waiting for actor)."
        )

        ## Take one batch from the filled up queue and run an update_from_memory with the learner.
        #update_steps = 10
        #time_start = time.perf_counter()
        #for _ in range(update_steps):
        #    agent.call_api_method("update_from_memory")
        #time_total = time.perf_counter() - time_start
        #print("Done learning {}xbatch-of-{} in {}sec ({} updates/sec).".format(
        #    update_steps, agent.update_spec["batch_size"], time_total , update_steps / time_total)
        #)

        learner_agent.terminate()
Example #8
    def __init__(self, preprocessors, **kwargs):
        """
        Args:
            preprocessors (dict): A (possibly nested) dict mapping keys of the input space to PreprocessorStack
                specs; one PreprocessorStack is created per flattened key.

        Raises:
            RLGraphError: If a sub-component is not a PreprocessLayer object.
        """
        # Create one separate PreprocessorStack per given key.
        # All other keys in an input will be passed through un-preprocessed.
        self.flattened_preprocessors = flatten_op(preprocessors)
        for i, (flat_key, spec) in enumerate(self.flattened_preprocessors.items()):
            self.flattened_preprocessors[flat_key] = PreprocessorStack.from_spec(
                spec, scope="preprocessor-stack-{}".format(i)
            )

        # NOTE: No automatic API-methods. Define them all ourselves.
        kwargs["api_methods"] = {}
        default_dict(kwargs, dict(scope=kwargs.pop("scope", "dict-preprocessor-stack")))
        super(DictPreprocessorStack, self).__init__(*list(self.flattened_preprocessors.values()), **kwargs)
Example #9
    def __init__(self,
                 level_id,
                 observations="RGB_INTERLEAVED",
                 actions=None,
                 frameskip=4,
                 config=None,
                 renderer="software",
                 seed=None,
                 level_cache=None):
        """
        Args:
            level_id (str): Specifier of the level to play, e.g. 'seekavoid_arena_01'.
            observations (Union[str,List[str]]): String specifier(s) for the observation(s) to be used with the
                given level. Will be converted into either a (single) BoxSpace or a Tuple (of BoxSpaces).
                See deepmind's documentation for all available observations.
            actions (Optional[List[dict]]): The RLgraph action spec (currently, only IntBox (shape=()) RLgraph action
                spaces are supported) that will be translated from and to the deepmind Lab actions.
                List slots correspond to the single int-actions, list items are dicts with:
                key=deepmind Lab partial action name e.g. LOOK_LEFT_RIGHT_PIXELS_PER_FRAME.
                value=the value for that deepmind Lab partial action e.g. -100.
            frameskip (Optional[Union[Tuple[int,int],int]]): How many frames should be skipped per action (the action
                is repeated and the rewards accumulated over the skipped frames). E.g. (2,5) -> uniformly sample
                from {2,3,4}. Default: 4.
            config (Optional[dict]): The `config` parameter to be passed into the Lab's constructor.
                Supports 'width', 'height', 'fps', and other useful parameters.
                Values must be given as string values. e.g. dict(width='96')
            renderer (str): The `renderer` parameter to be passed into the Lab's constructor.
            seed (Optional[int]): An optional seed to use to initialize a numpy random state object, which is then used
                to seed all occurring resets in a deterministic fashion.
            level_cache (Optional[object]): An optional custom level caching object to help increase performance
                when playing many repeating levels. Will be passed as is into the Lab's constructor.
        """
        # Create the wrapped deepmind lab level object.
        self.level_id = level_id
        observations = force_list(observations)
        config = default_dict(config, dict(width='96', height='72',
                                           fps='60'))  # Default config.
        self.level = deepmind_lab.Lab(self.level_id,
                                      observations,
                                      config=config,
                                      renderer=renderer,
                                      level_cache=level_cache)

        # Dict mapping a discrete action (int) - we don't support continuous actions yet - into a
        # deepmind Lab action vector.
        self.action_list, action_space = self.define_actions(actions)
        observation_space = self.define_observations(observations)
        super(DeepmindLabEnv, self).__init__(observation_space, action_space)

        self.frameskip = frameskip
        self.random_state = np.random.RandomState(
            seed=seed or int(time.time()))
        self.reset()
Example #10
def parse_summary_spec(summary_spec):
    """
    Expands summary spec with default values where necessary.

    Args:
        summary_spec (dict): Summary options.

    Returns:
        dict: Summary spec updated with default values.
    """
    default_spec = dict(
        # The directory in which to store the summary files.
        directory=os.path.expanduser(
            "~/rlgraph_summaries/"),  # default=home dir
        # A regexp pattern that a summary op (including its global scope) has to match in order for it to
        # be included in the graph's summaries.
        summary_regexp="",
        # Every how many seconds do we save a summary? None if saving frequency should be step based.
        save_secs=120,
        # Every how many steps do we save a summary? None if saving frequency should be time (seconds) based.
        save_steps=None)
    return default_dict(summary_spec, default_spec)
Example #11
# Core.
from rlgraph.components.component import Component
# Component child-classes.
from rlgraph.components.distributions import *
from rlgraph.components.explorations import Exploration, EpsilonExploration
from rlgraph.components.layers import *
from rlgraph.components.loss_functions import *
from rlgraph.components.memories import *
from rlgraph.components.neural_networks import *
from rlgraph.components.optimizers import *
from rlgraph.components.policies import *
from rlgraph.components.common import *

from rlgraph.utils.util import default_dict
Component.__lookup_classes__ = dict()

# Add all specific sub-classes to this one.
default_dict(Component.__lookup_classes__, Distribution.__lookup_classes__)
default_dict(Component.__lookup_classes__, Layer.__lookup_classes__)
default_dict(Component.__lookup_classes__, Stack.__lookup_classes__)
default_dict(Component.__lookup_classes__, LossFunction.__lookup_classes__)
default_dict(Component.__lookup_classes__, Memory.__lookup_classes__)
default_dict(Component.__lookup_classes__, NeuralNetwork.__lookup_classes__)
default_dict(Component.__lookup_classes__, Optimizer.__lookup_classes__)
default_dict(Component.__lookup_classes__, Policy.__lookup_classes__)


__all__ = ["Component"] + \
          list(set(map(lambda x: x.__name__, Component.__lookup_classes__.values())))
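
The merge pattern in this package `__init__` can be illustrated with a toy registry, assuming the fill-missing-keys semantics of `default_dict` sketched earlier; keys registered by an earlier merge are never clobbered by a later one.

# Toy illustration only -- the class names are made up, not RLgraph registry entries.
registry = dict()
default_dict(registry, {"dense": "DenseLayer"})
default_dict(registry, {"dense": "SomeOtherDense", "conv2d": "Conv2DLayer"})
assert registry == {"dense": "DenseLayer", "conv2d": "Conv2DLayer"}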

Example #12
    def execute_timesteps(self, num_timesteps, max_timesteps_per_episode=0, update_spec=None, use_exploration=True,
                          frameskip=None, reset=True):
        """
        Args:
            num_timesteps (Optional[int]): The maximum number of timesteps to run.
            use_exploration (Optional[bool]): Indicates whether to utilize exploration (epsilon or noise based)
                when picking actions. Default: True.
            max_timesteps_per_episode (Optional[int]): Can be used to limit the number of timesteps per episode.
                Use None or 0 for no limit. Default: None.
            update_spec (Optional[dict]): Update parameters. If None, the worker only performs rollouts.
                Matches the structure of an Agent's update_spec dict and will be "defaulted" by that dict.
                See `input_parsing/parse_update_spec.py` for more details.
            frameskip (Optional[int]): How often actions are repeated after retrieving them from the agent.
                Rewards are accumulated over the number of skips. Use None for the Worker's default value.
            reset (bool): Whether to reset the environment and all the Worker's internal counters.
                Default: True.

        Returns:
            dict: Execution statistics.
        """
        # Are we updating or just acting/observing?
        update_spec = default_dict(update_spec, self.agent.update_spec)
        self.set_update_schedule(update_spec)

        num_timesteps = num_timesteps or 0
        max_timesteps_per_episode = max_timesteps_per_episode or 0

        # Stats.
        timesteps_executed = 0
        episodes_executed = 0

        start = time.perf_counter()
        if reset is True:
            self.env_frames = 0
            #self.finished_episode_rewards = list()
            self.finished_episode_steps = list()

            #self.episode_returns = 0
            self.episode_timesteps = 0

            # TODO: Fix for vectorized Envs.
            self.agent.call_api_method("reset")

        # Only run everything for at most num_timesteps (if defined).
        while not (0 < num_timesteps <= timesteps_executed):
            # TODO right now everything comes back as single-env.
            out = self.agent.call_api_method(("perform_n_steps_and_insert_into_fifo", None, [0]))
            timesteps_executed += self.agent.worker_sample_size

            # Accumulate the reward over n env-steps (equals one action pick). n=self.frameskip.
            #rewards = out[2]
            terminals = out[3][1:]

            self.env_frames += self.frameskip * self.agent.worker_sample_size

            # Only render once per action.
            #if self.render:
            #    self.vector_env.environments[0].render()

            #for i in range_(self.num_environments):
            #    #self.episode_timesteps[i] += self.agent.worker_sample_size

            for j, terminal in enumerate(terminals):  # TODO: <- [i]
                self.episode_timesteps += 1

                if 0 < max_timesteps_per_episode <= self.episode_timesteps:
                    terminal = True

                if terminal:
                    episodes_executed += 1
                    self.finished_episode_steps.append(self.episode_timesteps)
                    self.logger.info("Finished episode: actions={}.".format(self.episode_timesteps))
                    self.episode_timesteps = 0

            num_timesteps_reached = (0 < num_timesteps <= timesteps_executed)

            if num_timesteps_reached:
                break

        total_time = (time.perf_counter() - start) or 1e-10

        # Return values for current episode(s) if None have been completed.
        #if len(self.finished_episode_rewards) == 0:
        #    #mean_episode_runtime = 0
        #    mean_episode_reward = np.mean(self.episode_returns)
        #    max_episode_reward = np.max(self.episode_returns)
        #    final_episode_reward = self.episode_returns[0]
        #else:
        #    #mean_episode_runtime = np.mean(self.finished_episode_durations)
        #    mean_episode_reward = np.mean(self.finished_episode_rewards)
        #    max_episode_reward = np.max(self.finished_episode_rewards)
        #    final_episode_reward = self.finished_episode_rewards[-1]

        results = dict(
            runtime=total_time,
            # Agent act/observe throughput.
            timesteps_executed=timesteps_executed,
            ops_per_second=(timesteps_executed / total_time),
            # Env frames including action repeats.
            env_frames=self.env_frames,
            env_frames_per_second=(self.env_frames / total_time),
            episodes_executed=episodes_executed,
            episodes_per_minute=(episodes_executed/(total_time / 60)),
            #mean_episode_runtime=mean_episode_runtime,
            #mean_episode_reward=mean_episode_reward,
            #max_episode_reward=max_episode_reward,
            #final_episode_reward=final_episode_reward
        )

        # Total time of run.
        self.logger.info("Finished execution in {} s".format(total_time))
        # Total (RL) timesteps (actions) done (and timesteps/sec).
        self.logger.info("Time steps (actions) executed: {} ({} ops/s)".
                         format(results['timesteps_executed'], results['ops_per_second']))
        # Total env-timesteps done (including action repeats) (and env-timesteps/sec).
        self.logger.info("Env frames executed (incl. action repeats): {} ({} frames/s)".
                         format(results['env_frames'], results['env_frames_per_second']))
        # Total episodes done (and episodes/min).
        self.logger.info("Episodes finished: {} ({} episodes/min)".
                         format(results['episodes_executed'], results['episodes_per_minute']))
        #self.logger.info("Mean episode runtime: {}s".format(results['mean_episode_runtime']))
        #self.logger.info("Mean episode reward: {}".format(results['mean_episode_reward']))
        #self.logger.info("Max. episode reward: {}".format(results['max_episode_reward']))
        #self.logger.info("Final episode reward: {}".format(results['final_episode_reward']))

        return results
Example #13
    def __init__(self,
                 discount=0.99,
                 fifo_queue_spec=None,
                 architecture="large",
                 environment_spec=None,
                 feed_previous_action_through_nn=True,
                 feed_previous_reward_through_nn=True,
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 worker_sample_size=100,
                 **kwargs):
        """
        Args:
            discount (float): The discount factor gamma.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
            feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: True.
            feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: True.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

        Keyword Args:
            type (str): One of "single", "actor" or "learner". Default: "single".
        """
        type_ = kwargs.pop("type", "single")
        assert type_ in ["single", "actor", "learner"]
        self.type = type_
        self.worker_sample_size = worker_sample_size

        # Network-spec by default is a "large architecture" IMPALA network.
        self.network_spec = kwargs.pop("network_spec", dict(
            type="rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork".format(
                "Large" if architecture == "large" else "Small")))
        if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
                "IMPALANetwork" in self.network_spec["type"]:
            self.network_spec = default_dict(
                self.network_spec,
                dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1))

        # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
        self.exploration_spec = kwargs.pop("exploration_spec", None)
        optimizer_spec = kwargs.pop("optimizer_spec", None)
        observe_spec = kwargs.pop("observe_spec", None)

        self.feed_previous_action_through_nn = feed_previous_action_through_nn
        self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

        # Run everything in a single process.
        if self.type == "single":
            environment_spec = environment_spec or self.default_environment_spec
            update_spec = kwargs.pop("update_spec", None)
        # Actors won't need to learn (no optimizer needed in graph).
        elif self.type == "actor":
            optimizer_spec = None
            update_spec = kwargs.pop("update_spec", dict(do_updates=False))
            environment_spec = environment_spec or self.default_environment_spec
        # Learners won't need to explore (act) or observe (insert into Queue).
        else:
            observe_spec = None
            update_spec = kwargs.pop("update_spec", None)
            environment_spec = None

        # Add previous-action/reward preprocessors to env-specific preprocessor spec.
        # TODO: remove this empty hard-coded preprocessor.
        self.preprocessing_spec = kwargs.pop(
            "preprocessing_spec",
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    # Flatten actions.
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=kwargs.get(
                                 "action_space").num_categories)
                    ],
                    # Bump reward and convert to float32, so that it can be concatenated by the Concat layer.
                    previous_reward=[dict(type="reshape", new_shape=(1, ))])))

        # Limit communication in distributed mode between each actor and the learner (never between actors).
        execution_spec = kwargs.pop("execution_spec", None)
        if execution_spec is not None and execution_spec.get("mode") == "distributed":
            default_dict(
                execution_spec["session_config"],
                dict(type="monitored-training-session",
                     allow_soft_placement=True,
                     device_filters=["/job:learner/task:0"] + ([
                         "/job:actor/task:{}".format(
                             execution_spec["distributed_spec"]["task_index"])
                     ] if self.type == "actor" else ["/job:learner/task:0"])))
            # If Actor, make non-chief in either case (even if task idx == 0).
            if self.type == "actor":
                execution_spec["distributed_spec"]["is_chief"] = False
                # Hard-set device to the CPU for actors.
                execution_spec["device_strategy"] = "custom"
                execution_spec["default_device"] = "/job:{}/task:{}/cpu".format(
                    self.type, execution_spec["distributed_spec"]["task_index"])

        self.policy_spec = kwargs.pop("policy_spec", dict())
        # TODO: Create some auto-setting based on LSTM inside the NN.
        default_dict(
            self.policy_spec,
            dict(type="shared-value-function-policy",
                 deterministic=False,
                 reuse_variable_scope="shared-policy",
                 action_space=kwargs.get("action_space")))

        # Now that we fixed the Agent's spec, call the super constructor.
        super(IMPALAAgent,
              self).__init__(discount=discount,
                             preprocessing_spec=self.preprocessing_spec,
                             network_spec=self.network_spec,
                             policy_spec=self.policy_spec,
                             exploration_spec=self.exploration_spec,
                             optimizer_spec=optimizer_spec,
                             observe_spec=observe_spec,
                             update_spec=update_spec,
                             execution_spec=execution_spec,
                             name=kwargs.pop(
                                 "name", "impala-{}-agent".format(self.type)),
                             **kwargs)
        # Always use 1st learner as the parameter server for all policy variables.
        if self.execution_spec["mode"] == "distributed" and self.execution_spec[
                "distributed_spec"]["cluster_spec"]:
            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu")))

        # Check whether we have an RNN.
        self.has_rnn = self.policy.neural_network.has_rnn()
        # Check, whether we are running with GPU.
        self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
            self.execution_spec["gpu_spec"]["num_gpus"] > 0

        # Some FIFO-queue specs.
        self.fifo_queue_keys = ["terminals", "states"] + \
                               (["actions"] if not self.feed_previous_action_through_nn else []) + \
                               (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
                               ["action_probs"] + \
                               (["initial_internal_states"] if self.has_rnn else [])
        # Define FIFO record space.
        # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only contain
        # num-steps items.
        self.fifo_record_space = Dict(
            {
                "terminals": bool,
                "action_probs": FloatBox(shape=(self.action_space.num_categories,)),
            },
            add_batch_rank=False,
            add_time_rank=self.worker_sample_size)
        self.fifo_record_space["states"] = self.state_space.with_time_rank(
            self.worker_sample_size + 1)
        # Add previous action/reward to the state, or give them their own record channels?
        if self.feed_previous_action_through_nn:
            self.fifo_record_space["states"]["previous_action"] = \
                self.action_space.with_time_rank(self.worker_sample_size + 1)
        else:
            self.fifo_record_space["actions"] = self.action_space.with_time_rank(self.worker_sample_size)
        if self.feed_previous_reward_through_nn:
            self.fifo_record_space["states"]["previous_reward"] = FloatBox(
                add_time_rank=self.worker_sample_size + 1)
        else:
            self.fifo_record_space["rewards"] = FloatBox(add_time_rank=self.worker_sample_size)

        if self.has_rnn:
            self.fifo_record_space["initial_internal_states"] = \
                self.internal_states_space.with_time_rank(False)

        # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
        self.fifo_queue = FIFOQueue.from_spec(
            fifo_queue_spec or dict(capacity=1),
            reuse_variable_scope="shared-fifo-queue",
            only_insert_single_records=True,
            record_space=self.fifo_record_space,
            device="/job:learner/task:0/cpu"
            if self.execution_spec["mode"] == "distributed"
            and self.execution_spec["distributed_spec"]["cluster_spec"] else
            None)

        # Remove `states` key from input_spaces: not needed.
        del self.input_spaces["states"]

        # Add all our sub-components to the core.
        if self.type == "single":
            pass

        elif self.type == "actor":
            # No learning, no loss function.
            self.loss_function = None
            # A Dict Splitter to split things from the EnvStepper.
            self.env_output_splitter = ContainerSplitter(
                tuple_length=4, scope="env-output-splitter")

            self.states_dict_splitter = None

            # Slice some data from the EnvStepper (e.g only first internal states are needed).
            self.internal_states_slicer = Slice(scope="internal-states-slicer",
                                                squeeze=True)
            # Merge back to insert into FIFO.
            self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

            # Dummy Flattener to calculate action-probs space.
            dummy_flattener = ReShape(
                flatten=True,
                flatten_categories=self.action_space.num_categories)
            self.environment_stepper = EnvironmentStepper(
                environment_spec=environment_spec,
                actor_component_spec=ActorComponent(self.preprocessor,
                                                    self.policy,
                                                    self.exploration),
                state_space=self.state_space.with_batch_rank(),
                reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_previous_action_to_state=True,
                add_previous_reward_to_state=True,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(
                    self.action_space))
            sub_components = [
                self.environment_stepper, self.env_output_splitter,
                self.internal_states_slicer, self.fifo_input_merger,
                self.fifo_queue
            ]
        # Learner.
        else:
            self.environment_stepper = None

            # A Dict splitter to split up items from the queue.
            self.fifo_input_merger = None
            self.fifo_output_splitter = ContainerSplitter(
                *self.fifo_queue_keys, scope="fifo-output-splitter")
            self.states_dict_splitter = ContainerSplitter(
                *list(self.fifo_record_space["states"].keys()),
                scope="states-dict-splitter")
            self.internal_states_slicer = None

            self.transposer = Transpose(
                scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
            self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

            # Create an IMPALALossFunction with some parameters.
            self.loss_function = IMPALALossFunction(
                discount=self.discount,
                weight_pg=weight_pg,
                weight_baseline=weight_baseline,
                weight_entropy=weight_entropy,
                slice_actions=self.feed_previous_action_through_nn,
                slice_rewards=self.feed_previous_reward_through_nn,
                device="/job:learner/task:0/gpu")

            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu",
                                 ops="/job:learner/task:0/gpu")))
            for component in [
                    self.staging_area, self.preprocessor, self.optimizer
            ]:
                component.propagate_sub_component_properties(
                    dict(device="/job:learner/task:0/gpu"))

            sub_components = [
                self.fifo_output_splitter, self.fifo_queue,
                self.states_dict_splitter, self.transposer, self.staging_area,
                self.preprocessor, self.policy, self.loss_function,
                self.optimizer
            ]

        if self.type != "single":
            # Add all the agent's sub-components to the root.
            self.root_component.add_components(*sub_components)

            # Define the Agent's (root Component's) API.
            self.define_graph_api(*sub_components)

        if self.type != "single" and self.auto_build:
            if self.type == "learner":
                build_options = dict(
                    build_device_context="/job:learner/task:0/cpu",
                    pin_global_variable_device="/job:learner/task:0/cpu")
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=build_options)
            else:
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=None)

            self.graph_built = True

            if self.has_gpu:
                # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage.
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op
                                                                  ))

                # TODO remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
            if self.type == "actor":
                self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                    out_op_columns[0].op_records[0].op
Example #14
    def _execute(self,
                 num_timesteps=None,
                 num_episodes=None,
                 max_timesteps_per_episode=None,
                 use_exploration=True,
                 update_spec=None,
                 frameskip=None,
                 reset=True):
        """
        Actual implementation underlying `execute_timesteps` and `execute_episodes`.

        Args:
            num_timesteps (Optional[int]): The maximum number of timesteps to run. At least one of `num_timesteps` or
                `num_episodes` must be provided.
            num_episodes (Optional[int]): The maximum number of episodes to run. At least one of `num_timesteps` or
                `num_episodes` must be provided.
            use_exploration (Optional[bool]): Indicates whether to utilize exploration (epsilon or noise based)
                when picking actions. Default: True.
            max_timesteps_per_episode (Optional[int]): Can be used to limit the number of timesteps per episode.
                Use None or 0 for no limit. Default: None.
            update_spec (Optional[dict]): Update parameters. If None, the worker only performs rollouts.
                Matches the structure of an Agent's update_spec dict and will be "defaulted" by that dict.
                See `input_parsing/parse_update_spec.py` for more details.
            frameskip (Optional[int]): How often actions are repeated after retrieving them from the agent.
                Rewards are accumulated over the number of skips. Use None for the Worker's default value.
            reset (bool): Whether to reset the environment and all the Worker's internal counters.
                Default: True.

        Returns:
            dict: Execution statistics.
        """
        assert num_timesteps is not None or num_episodes is not None,\
            "ERROR: One of `num_timesteps` or `num_episodes` must be provided!"
        # Are we updating or just acting/observing?
        update_spec = default_dict(update_spec, self.agent.update_spec)
        self.set_update_schedule(update_spec)

        num_timesteps = num_timesteps or 0
        num_episodes = num_episodes or 0
        max_timesteps_per_episode = [
            max_timesteps_per_episode or 0
            for _ in range_(self.num_environments)
        ]
        frameskip = frameskip or self.frameskip

        # Stats.
        timesteps_executed = 0
        episodes_executed = 0

        start = time.perf_counter()
        episode_terminals = self.episode_terminals
        if reset is True:
            self.env_frames = 0
            self.finished_episode_rewards = [
                [] for _ in range_(self.num_environments)
            ]
            self.finished_episode_durations = [
                [] for _ in range_(self.num_environments)
            ]
            self.finished_episode_timesteps = [
                [] for _ in range_(self.num_environments)
            ]

            for i, env_id in enumerate(self.env_ids):
                self.episode_returns[i] = 0
                self.episode_timesteps[i] = 0
                self.episode_terminals[i] = False
                self.episode_starts[i] = time.perf_counter()
                if self.worker_executes_preprocessing:
                    self.state_is_preprocessed[env_id] = False

            self.env_states = self.vector_env.reset_all()
            self.agent.reset()
        elif self.env_states[0] is None:
            raise RLGraphError(
                "Runner must be reset at the very beginning. Environment is in invalid state."
            )

        # Only run everything for at most num_timesteps (if defined).
        env_states = self.env_states
        while not (0 < num_timesteps <= timesteps_executed):
            if self.render:
                # This renders the first underlying environment.
                self.vector_env.render()

            if self.worker_executes_preprocessing:
                for i, env_id in enumerate(self.env_ids):
                    state = self.agent.state_space.force_batch(env_states[i])
                    if self.preprocessors[env_id] is not None:
                        if self.state_is_preprocessed[env_id] is False:
                            self.preprocessed_states_buffer[i] = \
                                self.preprocessors[env_id].preprocess(state)
                            self.state_is_preprocessed[env_id] = True
                    else:
                        self.preprocessed_states_buffer[i] = env_states[i]
                # TODO extra returns when worker is not applying preprocessing.
                actions = self.agent.get_action(
                    states=self.preprocessed_states_buffer,
                    use_exploration=use_exploration,
                    apply_preprocessing=self.apply_preprocessing)
                preprocessed_states = np.array(self.preprocessed_states_buffer)
            else:
                preprocessed_states, actions = self.agent.get_action(
                    states=np.array(env_states),
                    use_exploration=use_exploration,
                    apply_preprocessing=True,
                    extra_returns="preprocessed_states")

            # Accumulate the reward over n env-steps (equals one action pick). n=self.frameskip.
            env_rewards = [0 for _ in range_(self.num_environments)]
            next_states = None
            for _ in range_(frameskip):
                next_states, step_rewards, episode_terminals, infos = self.vector_env.step(
                    actions=actions)

                self.env_frames += self.num_environments
                for i, step_reward in enumerate(step_rewards):
                    env_rewards[i] += step_reward
                if np.any(episode_terminals):
                    break

            # Only render once per action.
            if self.render:
                self.vector_env.environments[0].render()

            for i, env_id in enumerate(self.env_ids):
                self.episode_returns[i] += env_rewards[i]
                self.episode_timesteps[i] += 1

                if 0 < max_timesteps_per_episode[i] <= self.episode_timesteps[i]:
                    episode_terminals[i] = True
                if self.worker_executes_preprocessing:
                    self.state_is_preprocessed[env_id] = False
                # Do accounting for finished episodes.
                if episode_terminals[i]:
                    episodes_executed += 1
                    episode_duration = time.perf_counter() - self.episode_starts[i]
                    self.finished_episode_rewards[i].append(
                        self.episode_returns[i])
                    self.finished_episode_durations[i].append(episode_duration)
                    self.finished_episode_timesteps[i].append(
                        self.episode_timesteps[i])

                    self.log_finished_episode(
                        reward=self.episode_returns[i],
                        duration=episode_duration,
                        timesteps=self.episode_timesteps[i],
                        env_num=i)

                    # Reset this environment and its preprocessor stack.
                    env_states[i] = self.vector_env.reset(i)
                    if self.worker_executes_preprocessing and self.preprocessors[env_id] is not None:
                        self.preprocessors[env_id].reset()
                        # This re-fills the sequence with the reset state.
                        state = self.agent.state_space.force_batch(
                            env_states[i])
                        # Preprocess and add to buffer.
                        self.preprocessed_states_buffer[i] = np.array(
                            self.preprocessors[env_id].preprocess(state))
                        self.state_is_preprocessed[env_id] = True

                    self.episode_returns[i] = 0
                    self.episode_timesteps[i] = 0
                    self.episode_starts[i] = time.perf_counter()
                else:
                    # Otherwise, advance the current state to the next state.
                    env_states[i] = next_states[i]

                if self.worker_executes_preprocessing and self.preprocessors[env_id] is not None:
                    next_state = self.agent.state_space.force_batch(
                        env_states[i])
                    next_states[i] = np.array(
                        self.preprocessors[env_id].preprocess(next_state))
                # TODO: If worker does not execute preprocessing, next state is not preprocessed here.
                # Observe per environment.
                self.agent.observe(preprocessed_states=preprocessed_states[i],
                                   actions=actions[i],
                                   internals=[],
                                   rewards=env_rewards[i],
                                   next_states=next_states[i],
                                   terminals=episode_terminals[i],
                                   env_id=self.env_ids[i])
            self.update_if_necessary()
            timesteps_executed += self.num_environments
            num_timesteps_reached = (0 < num_timesteps <= timesteps_executed)

            if 0 < num_episodes <= episodes_executed or num_timesteps_reached:
                break

        total_time = (time.perf_counter() - start) or 1e-10

        # Return values for current episode(s) if none have been completed.
        if episodes_executed == 0:
            mean_episode_runtime = 0
            mean_episode_reward = np.mean(self.episode_returns)
            max_episode_reward = np.max(self.episode_returns)
            final_episode_reward = self.episode_returns[0]
        else:
            all_finished_durations = []
            all_finished_rewards = []
            for i in range_(self.num_environments):
                all_finished_rewards.extend(self.finished_episode_rewards[i])
                all_finished_durations.extend(
                    self.finished_episode_durations[i])
            mean_episode_runtime = np.mean(all_finished_durations)
            mean_episode_reward = np.mean(all_finished_rewards)
            max_episode_reward = np.max(all_finished_rewards)
            final_episode_reward = all_finished_rewards[-1]

        self.episode_terminals = episode_terminals
        self.env_states = env_states
        results = dict(
            runtime=total_time,
            # Agent act/observe throughput.
            timesteps_executed=timesteps_executed,
            ops_per_second=(timesteps_executed / total_time),
            # Env frames including action repeats.
            env_frames=self.env_frames,
            env_frames_per_second=(self.env_frames / total_time),
            episodes_executed=episodes_executed,
            episodes_per_minute=(episodes_executed / (total_time / 60)),
            mean_episode_runtime=mean_episode_runtime,
            mean_episode_reward=mean_episode_reward,
            max_episode_reward=max_episode_reward,
            final_episode_reward=final_episode_reward)

        # Total time of run.
        self.logger.info("Finished execution in {} s".format(total_time))
        # Total (RL) timesteps (actions) done (and timesteps/sec).
        self.logger.info("Time steps (actions) executed: {} ({} ops/s)".format(
            results['timesteps_executed'], results['ops_per_second']))
        # Total env-timesteps done (including action repeats) (and env-timesteps/sec).
        self.logger.info(
            "Env frames executed (incl. action repeats): {} ({} frames/s)".
            format(results['env_frames'], results['env_frames_per_second']))
        # Total episodes done (and episodes/min).
        self.logger.info("Episodes finished: {} ({} episodes/min)".format(
            results['episodes_executed'], results['episodes_per_minute']))
        self.logger.info("Mean episode runtime: {}s".format(
            results['mean_episode_runtime']))
        self.logger.info("Mean episode reward: {}".format(
            results['mean_episode_reward']))
        self.logger.info("Max. episode reward: {}".format(
            results['max_episode_reward']))
        self.logger.info("Final episode reward: {}".format(
            results['final_episode_reward']))

        return results
Example #15
    def from_spec(cls, spec=None, **kwargs):
        """
        Uses the given spec to create an object.
        If `spec` is a dict, an optional "type" key can be used as a "constructor hint" to specify a certain class
        of the object.
        If `spec` is not a dict, `spec`'s value is used directly as the "constructor hint".

        The rest of `spec` (if it's a dict) will be used as kwargs for the (to-be-determined) constructor.
        Additional keys in **kwargs will always have precedence (overwrite keys in `spec` (if a dict)).
        Also, if the spec-dict or **kwargs contains the special key "_args", it will be popped from the dict
        and used as *args list to be passed separately to the constructor.

        The following constructor hints are valid:
        - None: Use `cls` as constructor.
        - An already instantiated object: Will be returned as is; no constructor call.
        - A string or an object that is a key in `cls`'s `__lookup_classes__` dict: The value in `__lookup_classes__`
            for that key will be used as the constructor.
        - A python callable: Use that as constructor.
        - A string: Either a json filename or the name of a python module+class (e.g. "rlgraph.components.Component"),
            which will be imported and used as the constructor.

        Args:
            spec (Optional[dict]): The specification dict.

        Keyword Args:
            kwargs (any): Optional possibility to pass the c'tor arguments in here and use spec as the type-only info.
                Then we can call this like: from_spec([type]?, [**kwargs for ctor])
                If `spec` is already a dict, then `kwargs` will be merged with spec (overwriting keys in `spec`) after
                "type" has been popped out of `spec`.
                If a constructor of a Specifiable needs an *args list of items, the special key `_args` can be passed
                inside `kwargs` with a list type value (e.g. kwargs={"_args": [arg1, arg2, arg3]}).

        Returns:
            The object generated from the spec.
        """
        # specifiable_type is already a created object of this class -> Take it as is.
        if isinstance(spec, cls):
            return spec

        # `specifiable_type`: Indicator for the Specifiable's constructor.
        # `ctor_args`: *args arguments for the constructor.
        # `ctor_kwargs`: **kwargs arguments for the constructor.
        # Copy so caller can reuse safely.
        spec = deepcopy(spec)
        if isinstance(spec, dict):
            if "type" in spec:
                specifiable_type = spec.pop("type", None)
            else:
                specifiable_type = None
            ctor_kwargs = spec
            ctor_kwargs.update(kwargs)  # give kwargs priority
        else:
            specifiable_type = spec
            ctor_kwargs = kwargs
        # Special `_args` field in kwargs for *args-utilizing constructors.
        ctor_args = ctor_kwargs.pop("_args", [])

        # Figure out the actual constructor (class) from `type_`.
        # None: Try __default_constructor__ (if no args/kwargs), only then constructor of cls (using args/kwargs).
        if specifiable_type is None:
            # We have a default constructor that was defined directly by cls (not by its children).
            if cls.__default_constructor__ is not None and ctor_args == [] and \
                    (not hasattr(cls.__bases__[0], "__default_constructor__") or
                     cls.__bases__[0].__default_constructor__ is None or
                     cls.__bases__[0].__default_constructor__ is not cls.__default_constructor__
                    ):
                constructor = cls.__default_constructor__
                # Default partial's keywords into ctor_kwargs.
                if isinstance(constructor, partial):
                    kwargs = default_dict(ctor_kwargs, constructor.keywords)
                    constructor = partial(constructor.func, **kwargs)
                    ctor_kwargs = {} # erase to avoid duplicate kwarg error
            # Try our luck with this class itself.
            else:
                constructor = cls
        # Try the __lookup_classes__ of this class.
        else:
            constructor = cls.lookup_class(specifiable_type)

            # Found in cls.__lookup_classes__.
            if constructor is not None:
                pass
            # Python callable.
            elif callable(specifiable_type):
                constructor = specifiable_type
            # A string: Filename or a python module+class.
            elif isinstance(specifiable_type, str):
                if re.search(r'\.(yaml|yml|json)$', specifiable_type):
                    return cls.from_file(specifiable_type, *ctor_args, **ctor_kwargs)
                elif specifiable_type.find('.') != -1:
                    module_name, function_name = specifiable_type.rsplit(".", 1)
                    module = importlib.import_module(module_name)
                    constructor = getattr(module, function_name)
                else:
                    raise RLGraphError(
                        "ERROR: String specifier ({}) in from_spec must be a filename, a module+class, or a key "
                        "into {}.__lookup_classes__!".format(specifiable_type, cls.__name__)
                    )

        if not constructor:
            raise RLGraphError("Invalid type: {}".format(specifiable_type))

        # Create object with inferred constructor.
        specifiable_object = constructor(*ctor_args, **ctor_kwargs)
        assert isinstance(specifiable_object, constructor.func if isinstance(constructor, partial) else constructor)

        return specifiable_object
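
A minimal sketch of how a spec dict like the one handled above turns into constructor arguments. Everything here (the `resolve_spec` helper, the `MyLayer` class, and the "my-layer" key) is hypothetical and only illustrates the "type"/"_args" convention; it is not RLGraph's actual registry logic.

from copy import deepcopy

def resolve_spec(spec, lookup, **kwargs):
    """Pop "type" and "_args" from a copied spec and build the object."""
    spec = deepcopy(spec)                  # Copy so the caller can reuse the spec.
    ctor_class = lookup[spec.pop("type")]  # Resolve the constructor via its type key.
    spec.update(kwargs)                    # Explicit kwargs take priority over spec values.
    ctor_args = spec.pop("_args", [])      # Special *args list, if any.
    return ctor_class(*ctor_args, **spec)

class MyLayer(object):  # Hypothetical example class.
    def __init__(self, units, activation="relu"):
        self.units, self.activation = units, activation

layer = resolve_spec({"type": "my-layer", "_args": [256]}, lookup={"my-layer": MyLayer})
assert layer.units == 256 and layer.activation == "relu"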
Example #16
0
def parse_execution_spec(execution_spec):
    """
    Parses execution parameters and inserts default values where necessary.

    Args:
        execution_spec (Optional[dict]): Execution spec dict. Must specify an execution mode
            "single" or "distributed". If mode "distributed", must specify a "distributed_spec"
            containing:
             - a "cluster_spec" key mapping to a ClusterSpec object,
             - a "job" string for the job name,
             - an integer "task_index".

    Returns:
        dict: The sanitized execution_spec dict.
    """
    # TODO: These defaults are TensorFlow-specific.
    # If no spec given.
    if get_backend() == "tf":
        default_spec = dict(
            mode="single",
            distributed_spec=None,
            # Use a monitored session enabling summaries and hooks by default.
            disable_monitoring=False,
            # Gpu settings.
            gpu_spec=dict(
                # Are GPUs allowed to be used if they are detected?
                gpus_enabled=False,
                # If yes, how many GPUs are to be used?
                max_usable_gpus=0,
                # If True, use `max_usable_gpus` fake-GPUs (CPU) iff no GPUs are available.
                fake_gpus_if_necessary=False,
                # Specify specific CUDA devices to be used, e.g. gpu 0 and 2 = [0, 2].
                # If None, we use CUDA devices [0, max_usable_gpus - 1]
                cuda_devices=None,
                # Fraction of the overall amount of memory that each visible GPU should be allocated.
                per_process_gpu_memory_fraction=None,
                # If True, do not allocate all GPU memory up front (relevant on shared resources).
                allow_memory_growth=False),
            # Device placement settings.
            device_strategy="default",
            default_device=None,
            device_map={},
            session_config=None,
            # Random seed for the tf graph.
            seed=None,
            # Enable the tf profiler?
            enable_profiler=False,
            # How often do we print out profiler information?
            profiler_frequency=1000,
            # Enable writing a timeline file?
            enable_timeline=False,
            # How often do we write out a timeline file?
            timeline_frequency=1,
        )
        execution_spec = default_dict(execution_spec, default_spec)

        # Sub specifications:

        # Distributed specifications.
        if execution_spec.get("mode") == "distributed":
            default_distributed = dict(job="ps",
                                       task_index=0,
                                       cluster_spec=dict(
                                           ps=["localhost:22222"],
                                           worker=["localhost:22223"]),
                                       protocol=None)
            execution_spec["distributed_spec"] = default_dict(
                execution_spec.get("distributed_spec"), default_distributed)

        # Session config.
        default_session_config = dict(type="monitored-training-session",
                                      allow_soft_placement=True,
                                      log_device_placement=False)
        execution_spec["session_config"] = default_dict(
            execution_spec.get("session_config"), default_session_config)
    elif get_backend() == "pytorch":
        # No session configs, different GPU options.
        default_spec = dict(
            mode="single",
            distributed_spec=None,
            # Use a monitored session enabling summaries and hooks by default.
            disable_monitoring=False,
            # Gpu settings.
            gpu_spec=dict(
                # Are GPUs allowed to be used if they are detected?
                gpus_enabled=False,
                # If yes, how many GPUs are to be used?
                max_usable_gpus=0,
                # Specify specific CUDA devices to be used, e.g. gpu 0 and 2 = [0, 2].
                # If None, we use CUDA devices [0, max_usable_gpus - 1]
                cuda_devices=None),
            # Device placement settings.
            device_strategy="default",
            default_device=None,
            device_map={},
            # TODO potentially set to nproc?
            torch_num_threads=1,
            OMP_NUM_THREADS=1)
        execution_spec = default_dict(execution_spec, default_spec)

    return execution_spec
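
The defaulting above relies on `default_dict(target, defaults)` inserting only the keys that are missing from `target` (and tolerating a None target). Below is a minimal stand-in with assumed-equivalent semantics, shown only to make the completion of a partial execution spec concrete; it is not the library's implementation.

def default_dict(target, defaults):
    """Assumed-equivalent stand-in: fill `target` with `defaults` wherever a key is absent."""
    if target is None:
        target = {}
    for key, value in defaults.items():
        if key not in target:
            target[key] = value
    return target

partial_spec = dict(mode="distributed", seed=10)
full_spec = default_dict(partial_spec, dict(mode="single", seed=None, enable_profiler=False))
assert full_spec["mode"] == "distributed"     # User-provided value wins.
assert full_spec["enable_profiler"] is False  # Missing key is filled from the defaults.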
Example #17
0
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from rlgraph.utils.util import default_dict

# Basics.
from rlgraph.components.layers.layer import Layer
# Preprocessing Layers.
from rlgraph.components.layers.preprocessing import *
# NN-Layers.
from rlgraph.components.layers.nn import *
# String Layers.
from rlgraph.components.layers.strings import *

# The Layers (Layers are also Stacks).
Layer.__lookup_classes__ = dict(nnlayer=NNLayer,
                                preprocesslayer=PreprocessLayer)
# Add all specific Layer sub-classes to this one.
default_dict(Layer.__lookup_classes__, NNLayer.__lookup_classes__)
default_dict(Layer.__lookup_classes__, PreprocessLayer.__lookup_classes__)
default_dict(Layer.__lookup_classes__, StringLayer.__lookup_classes__)


__all__ = ["Layer"] + \
          list(set(map(lambda x: x.__name__, Layer.__lookup_classes__.values())))
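
A brief, hedged illustration of what the merged lookup table enables: once the sub-registries are folded into `Layer.__lookup_classes__`, a single string key is enough to resolve a concrete layer class via `from_spec`. The "dense" key and the `DenseLayer` stand-in below are assumptions for illustration; the real keys depend on each sub-registry's contents.

from rlgraph.utils.util import default_dict

class DenseLayer(object):  # Hypothetical stand-in for a concrete NN layer.
    def __init__(self, units):
        self.units = units

lookup_classes = {}                                  # Stands in for Layer.__lookup_classes__.
default_dict(lookup_classes, {"dense": DenseLayer})  # Merge one sub-registry into it.
layer_cls = lookup_classes["dense"]                  # What a {"type": "dense"} spec would resolve to.
assert layer_cls(units=64).units == 64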
Example #18
0
    def _execute(self,
                 num_timesteps=None,
                 num_episodes=None,
                 max_timesteps_per_episode=None,
                 use_exploration=True,
                 update_spec=None,
                 frameskip=None,
                 reset=True):
        """
        Actual implementation underlying `execute_timesteps` and `execute_episodes`.

        Args:
            num_timesteps (Optional[int]): The maximum number of timesteps to run. At least one of `num_timesteps` or
                `num_episodes` must be provided.
            num_episodes (Optional[int]): The maximum number of episodes to run. At least one of `num_timesteps` or
                `num_episodes` must be provided.
            use_exploration (Optional[bool]): Indicates whether to utilize exploration (epsilon or noise based)
                when picking actions. Default: True.
            max_timesteps_per_episode (Optional[int]): Can be used to limit the number of timesteps per episode.
                Use None or 0 for no limit. Default: None.
            update_spec (Optional[dict]): Update parameters. If None, the worker only performs rollouts.
                Matches the structure of an Agent's update_spec dict and will be "defaulted" by that dict.
                See `input_parsing/parse_update_spec.py` for more details.
            frameskip (Optional[int]): How often actions are repeated after retrieving them from the agent.
                Rewards are accumulated over the number of skips. Use None for the Worker's default value.
            reset (bool): Whether to reset the environment and all the Worker's internal counters.
                Default: True.

        Returns:
            dict: Execution statistics.
        """
        assert num_timesteps is not None or num_episodes is not None,\
            "ERROR: One of `num_timesteps` or `num_episodes` must be provided!"
        # Are we updating or just acting/observing?
        update_spec = default_dict(update_spec, self.agent.update_spec)
        self.set_update_schedule(update_spec)

        num_timesteps = num_timesteps or 0
        num_episodes = num_episodes or 0
        max_timesteps_per_episode = [
            max_timesteps_per_episode or 0
            for _ in range_(self.num_environments)
        ]
        frameskip = frameskip or self.frameskip

        # Stats.
        timesteps_executed = 0
        episodes_executed = 0

        start = time.perf_counter()
        episode_terminals = self.episode_terminals
        if reset is True:
            self.env_frames = 0
            self.episodes_since_update = 0
            self.finished_episode_rewards = [
                [] for _ in range_(self.num_environments)
            ]
            self.finished_episode_durations = [
                [] for _ in range_(self.num_environments)
            ]
            self.finished_episode_timesteps = [
                [] for _ in range_(self.num_environments)
            ]

            for i, env_id in enumerate(self.env_ids):
                self.episode_returns[i] = 0
                self.episode_timesteps[i] = 0
                self.episode_terminals[i] = False
                self.episode_starts[i] = time.perf_counter()
                if self.worker_executes_preprocessing:
                    self.state_is_preprocessed[env_id] = False

            self.env_states = self.vector_env.reset_all()
            self.agent.reset()
        elif self.env_states[0] is None:
            raise RLGraphError(
                "Runner must be reset at the very beginning. Environment is in invalid state."
            )

        # Only run everything for at most num_timesteps (if defined).
        env_states = self.env_states
        while not (0 < num_timesteps <= timesteps_executed):
            if self.render:
                self.vector_env.render()

            if self.worker_executes_preprocessing:
                for i, env_id in enumerate(self.env_ids):
                    state = self.agent.state_space.force_batch(env_states[i])
                    if self.preprocessors[env_id] is not None:
                        if self.state_is_preprocessed[env_id] is False:
                            self.preprocessed_states_buffer[i] = \
                                self.preprocessors[env_id].preprocess(state)
                            self.state_is_preprocessed[env_id] = True
                    else:
                        self.preprocessed_states_buffer[i] = env_states[i]
                # TODO extra returns when worker is not applying preprocessing.
                actions = self.agent.get_action(
                    states=self.preprocessed_states_buffer,
                    use_exploration=use_exploration,
                    apply_preprocessing=self.apply_preprocessing)
                preprocessed_states = np.array(self.preprocessed_states_buffer)
            else:
                actions, preprocessed_states = self.agent.get_action(
                    states=np.array(env_states),
                    use_exploration=use_exploration,
                    apply_preprocessing=True,
                    extra_returns="preprocessed_states")

            # Accumulate the reward over n env-steps (equals one action pick). n=self.frameskip.
            env_rewards = [0 for _ in range_(self.num_environments)]
            next_states = None

            # For Dict action spaces, we have to treat each key as an array with batch-rank at index 0.
            # The action-dict is then translated into a list of dicts where each dict contains the original data
            # but without the batch-rank.
            # E.g. {'A': array([0, 1]), 'B': array([2, 3])} -> [{'A': 0, 'B': 2}, {'A': 1, 'B': 3}]
            if isinstance(self.agent.action_space, Dict):
                some_key = next(iter(actions))
                assert isinstance(actions, dict) and isinstance(actions[some_key], np.ndarray),\
                    "ERROR: Cannot flip Dict-action batch with dict keys if returned value is not a dict OR " \
                    "values of returned value are not np.ndarrays!"
                # TODO: What if actions come as nested dicts (more than one level deep)?
                # TODO: Use DataOpDict/Tuple's new `map` method.
                if hasattr(actions[some_key], "__len__"):
                    env_actions = [{
                        key: value[i]
                        for key, value in actions.items()
                    } for i in range(len(actions[some_key]))]
                else:
                    # The action was not of array type.
                    env_actions = [{
                        key: value
                        for key, value in actions.items()
                    }]
            # Tuple action Spaces:
            # E.g. Tuple(array([0, 1]), array([2, 3])) -> [(0, 2), (1, 3)]
            elif isinstance(self.agent.action_space, Tuple):
                assert isinstance(actions, tuple) and isinstance(actions[0], np.ndarray),\
                    "ERROR: Cannot flip tuple-action batch if returned value is not a tuple OR " \
                    "values of returned value are not np.ndarrays!"
                # TODO: Use DataOpDict/Tuple's new `map` method.
                env_actions = [
                    tuple(value[i] for _, value in enumerate(actions))
                    for i in range(len(actions[0]))
                ]
            # No container batch-flipping necessary.
            else:
                env_actions = actions
                if self.num_environments == 1 and env_actions.shape == ():
                    env_actions = [env_actions]

            for _ in range_(frameskip):
                next_states, step_rewards, episode_terminals, _ = self.vector_env.step(
                    actions=env_actions)

                self.env_frames += self.num_environments
                for i, step_reward in enumerate(step_rewards):
                    env_rewards[i] += step_reward
                if np.any(episode_terminals):
                    break

            # Only render once per action.
            #if self.render:
            #    self.vector_env.environments[0].render()

            for i, env_id in enumerate(self.env_ids):
                self.episode_returns[i] += env_rewards[i]
                self.episode_timesteps[i] += 1

                if 0 < max_timesteps_per_episode[i] <= self.episode_timesteps[i]:
                    episode_terminals[i] = True
                if self.worker_executes_preprocessing:
                    self.state_is_preprocessed[env_id] = False
                # Do accounting for finished episodes.
                if episode_terminals[i]:
                    episodes_executed += 1
                    self.episodes_since_update += 1
                    episode_duration = time.perf_counter() - self.episode_starts[i]
                    self.finished_episode_rewards[i].append(
                        self.episode_returns[i])
                    self.finished_episode_durations[i].append(episode_duration)
                    self.finished_episode_timesteps[i].append(
                        self.episode_timesteps[i])

                    self.log_finished_episode(
                        episode_return=self.episode_returns[i],
                        duration=episode_duration,
                        timesteps=self.episode_timesteps[i],
                        env_num=i)

                    # Reset this environment and its preprocessor stack.
                    env_states[i] = self.vector_env.reset(i)
                    if self.worker_executes_preprocessing and \
                            self.preprocessors[env_id] is not None:
                        self.preprocessors[env_id].reset()
                        # This re-fills the sequence with the reset state.
                        state = self.agent.state_space.force_batch(env_states[i])
                        # Pre-process and add to the buffer.
                        self.preprocessed_states_buffer[i] = np.array(
                            self.preprocessors[env_id].preprocess(state))
                        self.state_is_preprocessed[env_id] = True

                    self.episode_returns[i] = 0
                    self.episode_timesteps[i] = 0
                    self.episode_starts[i] = time.perf_counter()
                else:
                    # Otherwise, carry the next state over as the current state.
                    env_states[i] = next_states[i]

                if self.worker_executes_preprocessing and \
                        self.preprocessors[env_id] is not None:
                    # next_state = self.agent.state_space.force_batch(env_states[i])
                    next_states[i] = np.array(
                        self.preprocessors[env_id].preprocess(env_states[i]))  # next_state
                self._observe(self.env_ids[i], preprocessed_states[i],
                              env_actions[i], env_rewards[i], next_states[i],
                              episode_terminals[i])
            self.update_if_necessary()
            timesteps_executed += self.num_environments
            num_timesteps_reached = (0 < num_timesteps <= timesteps_executed)

            if 0 < num_episodes <= episodes_executed or num_timesteps_reached:
                break

        total_time = (time.perf_counter() - start) or 1e-10

        # Return values for the current (unfinished) episode(s) if none have been completed yet.
        if episodes_executed == 0:
            mean_episode_runtime = 0
            mean_episode_reward = np.mean(self.episode_returns)
            max_episode_reward = np.max(self.episode_returns)
            final_episode_reward = self.episode_returns[0]
        else:
            all_finished_durations = []
            all_finished_rewards = []
            for i in range_(self.num_environments):
                all_finished_rewards.extend(self.finished_episode_rewards[i])
                all_finished_durations.extend(
                    self.finished_episode_durations[i])
            mean_episode_runtime = np.mean(all_finished_durations)
            mean_episode_reward = np.mean(all_finished_rewards)
            max_episode_reward = np.max(all_finished_rewards)
            final_episode_reward = all_finished_rewards[-1]

        self.episode_terminals = episode_terminals
        self.env_states = env_states
        results = dict(
            runtime=total_time,
            # Agent act/observe throughput.
            timesteps_executed=timesteps_executed,
            ops_per_second=(timesteps_executed / total_time),
            # Env frames including action repeats.
            env_frames=self.env_frames,
            env_frames_per_second=(self.env_frames / total_time),
            episodes_executed=episodes_executed,
            episodes_per_minute=(episodes_executed / (total_time / 60)),
            mean_episode_runtime=mean_episode_runtime,
            mean_episode_reward=mean_episode_reward,
            max_episode_reward=max_episode_reward,
            final_episode_reward=final_episode_reward)

        # Total time of run.
        self.logger.info("Finished execution in {} s".format(total_time))
        # Total (RL) timesteps (actions) done (and timesteps/sec).
        self.logger.info("Time steps (actions) executed: {} ({} ops/s)".format(
            results['timesteps_executed'], results['ops_per_second']))
        # Total env-timesteps done (including action repeats) (and env-timesteps/sec).
        self.logger.info(
            "Env frames executed (incl. action repeats): {} ({} frames/s)".
            format(results['env_frames'], results['env_frames_per_second']))
        # Total episodes done (and episodes/min).
        self.logger.info("Episodes finished: {} ({} episodes/min)".format(
            results['episodes_executed'], results['episodes_per_minute']))
        self.logger.info("Mean episode runtime: {}s".format(
            results['mean_episode_runtime']))
        self.logger.info("Mean episode reward: {}".format(
            results['mean_episode_reward']))
        self.logger.info("Max. episode reward: {}".format(
            results['max_episode_reward']))
        self.logger.info("Final episode reward: {}".format(
            results['final_episode_reward']))

        return results
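
The container-action batch-flipping performed above is easier to see in isolation. Below is a small NumPy-only sketch of the same transformation; the action dictionaries and tuples are illustrative and not tied to any particular action space:

import numpy as np

# Dict action space: {'A': array([0, 1]), 'B': array([2, 3])} -> [{'A': 0, 'B': 2}, {'A': 1, 'B': 3}]
actions = {"A": np.array([0, 1]), "B": np.array([2, 3])}
some_key = next(iter(actions))
env_actions = [
    {key: value[i] for key, value in actions.items()}
    for i in range(len(actions[some_key]))
]
assert env_actions == [{"A": 0, "B": 2}, {"A": 1, "B": 3}]

# Tuple action space: (array([0, 1]), array([2, 3])) -> [(0, 2), (1, 3)]
tuple_actions = (np.array([0, 1]), np.array([2, 3]))
env_tuple_actions = [
    tuple(value[i] for value in tuple_actions)
    for i in range(len(tuple_actions[0]))
]
assert env_tuple_actions == [(0, 2), (1, 3)]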