def __init__(self, *preprocessors, **kwargs):
    """
    Args:
        preprocessors (PreprocessorLayer): The PreprocessorLayers to add to the Stack and connect to each other.

    Keyword Args:
        fold_time_rank (bool): Whether to fold the time rank for the `preprocess` API-method stack.
        unfold_time_rank (bool): Whether to unfold the time rank for the `preprocess` API-method stack.

    Raises:
        RLGraphError: If a sub-component is not a PreprocessLayer object.
    """
    self.fold_time_rank = kwargs.get("fold_time_rank", False)
    self.unfold_time_rank = kwargs.get("unfold_time_rank", False)

    # Link sub-Components' `call` methods together to yield PreprocessorStack's `preprocess` method.
    # NOTE: Do not include `reset` here as it is defined explicitly below.
    kwargs["api_methods"] = [dict(
        api="preprocess", component_api="call",
        fold_time_rank=self.fold_time_rank, unfold_time_rank=self.unfold_time_rank
    )]
    default_dict(kwargs, dict(scope=kwargs.pop("scope", "preprocessor-stack")))
    super(PreprocessorStack, self).__init__(*preprocessors, **kwargs)
def __init__(self, *preprocessors, **kwargs):
    """
    Args:
        preprocessors (PreprocessorLayer): The PreprocessorLayers to add to the Stack and connect to each other.

    Raises:
        RLGraphError: If a sub-component is not a PreprocessLayer object.
    """
    # Link sub-Components' `apply` methods together to yield PreprocessorStack's `preprocess` method.
    # NOTE: Do not include `reset` here as it is defined explicitly below.
    kwargs["api_methods"] = {("preprocess", "apply")}
    default_dict(kwargs, dict(scope=kwargs.pop("scope", "preprocessor-stack")))
    super(PreprocessorStack, self).__init__(*preprocessors, **kwargs)
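# --- Example (not from the original source) -----------------------------------
# Both constructors above lean on `default_dict` (from rlgraph.utils.util) to
# fill in missing keyword arguments such as `scope`.  The stand-in helper below
# only sketches the merge-with-defaults behavior as inferred from those call
# sites (fill only missing keys, return the mutated dict); it is NOT rlgraph's
# actual implementation.
def fill_defaults(original, defaults):
    """Insert `defaults` entries into `original` only where keys are missing."""
    if original is None:
        original = {}
    for key, value in defaults.items():
        original.setdefault(key, value)
    return original


kwargs = dict(fold_time_rank=True)
fill_defaults(kwargs, dict(scope=kwargs.pop("scope", "preprocessor-stack")))
assert kwargs == dict(fold_time_rank=True, scope="preprocessor-stack")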
def test_impala_actor_compilation(self):
    """
    Tests IMPALA agent compilation (actor).
    """
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("Deepmind Lab not installed: Will skip this test.")
        return

    agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
    env_spec = dict(
        level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )
    dummy_env = DeepmindLabEnv.from_spec(env_spec)
    actor_agent = IMPALAAgent.from_spec(
        agent_config,
        type="actor",
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=False),
        environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
        # Disable monitoring; a monitored session makes session-creation hang in docker.
        execution_spec=dict(disable_monitoring=True)
    )
    # Start Specifiable Server with Env manually (monitoring is disabled).
    actor_agent.environment_stepper.environment_server.start_server()
    print("Compiled IMPALA type=actor agent.")
    actor_agent.environment_stepper.environment_server.stop_server()
    actor_agent.terminate()
def parse_update_spec(update_spec):
    """
    Parses update/learning parameters and inserts default values where necessary.

    Args:
        update_spec (Optional[dict]): Update/Learning spec dict.

    Returns:
        dict: The sanitized update_spec dict.
    """
    # If no spec given.
    default_spec = dict(
        # Whether to perform calls to `Agent.update()` at all.
        do_updates=True,
        # The unit in which we measure frequency: one of "timesteps", "episodes", "sec".
        # unit="timesteps",  # TODO: not supporting any other than timesteps
        # The number of 'units' to wait before we do any updating at all.
        steps_before_update=0,
        # The frequency with which we update (given in `unit`).
        update_interval=4,
        # The number of consecutive `Agent.update()` calls per update.
        update_steps=1,
        # The batch size with which to update (e.g. when pulling records from a memory).
        batch_size=64,
        sync_interval=128
    )
    update_spec = default_dict(update_spec, default_spec)
    return update_spec
def parse_observe_spec(observe_spec):
    """
    Parses parameters for `Agent.observe()` calls and inserts default values where necessary.

    Args:
        observe_spec (Optional[dict]): Observe spec dict.

    Returns:
        dict: The sanitized observe_spec dict.
    """
    # If no spec given.
    default_spec = dict(
        # Do we buffer observations in python before sending them through the graph?
        buffer_enabled=True,
        # Fill buffer with n records before sending them through the graph.
        buffer_size=100,  # only if buffer_enabled=True
        # Set to > 1 if we want to post-process buffered values for n-step learning.
        n_step=1,  # values > 1 are only allowed if buffer_enabled is True and buffer_size >> n.
    )
    observe_spec = default_dict(observe_spec, default_spec)

    if observe_spec["n_step"] > 1:
        if observe_spec["buffer_enabled"] is False:
            raise RLGraphError(
                "Cannot setup observations with n-step (n={}), while buffering is switched "
                "off".format(observe_spec["n_step"])
            )
        elif observe_spec["buffer_size"] < 3 * observe_spec["n_step"]:
            raise RLGraphError(
                "Buffer must be at least 3x as large as n-step (n={}, min-buffer={})!".format(
                    observe_spec["n_step"], 3 * observe_spec["n_step"]
                )
            )

    return observe_spec
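# --- Example (not from the original source) -----------------------------------
# Illustrative calls of `parse_observe_spec` as defined above: missing keys are
# filled from `default_spec`, caller-provided values win, and invalid n-step
# configurations raise RLGraphError.  Shown in-place (no import line, since the
# module path of this function is not part of this excerpt).
spec = parse_observe_spec(dict(n_step=4))
assert spec["buffer_enabled"] is True   # filled in by default
assert spec["buffer_size"] == 100       # default; also satisfies 100 >= 3 * 4
assert spec["n_step"] == 4              # caller's value wins

try:
    parse_observe_spec(dict(n_step=4, buffer_enabled=False))
except RLGraphError as e:
    print("Rejected spec:", e)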
def parse_saver_spec(saver_spec):
    """
    Parses the saver spec. Returns None if input None, otherwise provides default parameters.

    Args:
        saver_spec (Union[None, dict]): Saver parameters.

    Returns:
        Union(dict, None): Saver spec or None.
    """
    if saver_spec is None:
        return None

    default_spec = dict(
        # The directory in which to store model checkpoint files.
        directory=os.path.expanduser("~/rlgraph_checkpoints/"),  # default=home dir
        # The base file name for a saved checkpoint.
        checkpoint_basename="model.ckpt",
        # How many files to maximally store for one graph.
        max_checkpoints=5,
        # Every how many seconds do we save? None if saving frequency should be step based.
        save_secs=600,
        # Every how many steps do we save? None if saving frequency should be time (seconds) based.
        save_steps=None
    )
    return default_dict(saver_spec, default_spec)
def test_impala_learner_compilation(self):
    """
    Tests IMPALA agent compilation (learner).
    """
    # NOTE: Test is currently disabled via this early return.
    return
    if get_backend() == "pytorch":
        return
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("Deepmind Lab not installed: Will skip this test.")
        return

    agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
    env_spec = dict(
        level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )
    dummy_env = DeepmindLabEnv.from_spec(env_spec)
    learner_agent = IMPALAAgent.from_spec(
        agent_config,
        type="learner",
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        internal_states_space=IMPALAAgent.default_internal_states_space,
        environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
        # Setup distributed tf.
        execution_spec=dict(
            mode="distributed",
            #gpu_spec=dict(
            #    gpus_enabled=True,
            #    max_usable_gpus=1,
            #    num_gpus=1
            #),
            distributed_spec=dict(job="learner", task_index=0, cluster_spec=self.impala_cluster_spec),
            session_config=dict(
                type="monitored-training-session",
                allow_soft_placement=True,
                log_device_placement=True,
                auto_start=False
            ),
            disable_monitoring=True,
            enable_timeline=True,
        )
    )
    print("Compiled IMPALA type=learner agent without starting the session (would block waiting for actor).")

    ## Take one batch from the filled up queue and run an update_from_memory with the learner.
    #update_steps = 10
    #time_start = time.perf_counter()
    #for _ in range(update_steps):
    #    agent.call_api_method("update_from_memory")
    #time_total = time.perf_counter() - time_start
    #print("Done learning {}xbatch-of-{} in {}sec ({} updates/sec).".format(
    #    update_steps, agent.update_spec["batch_size"], time_total, update_steps / time_total)
    #)

    learner_agent.terminate()
def __init__(self, preprocessors, **kwargs):
    """
    Args:
        preprocessors (dict): Dict of PreprocessorStack specs, keyed by the (flattened) input keys they
            should preprocess.

    Raises:
        RLGraphError: If a sub-component is not a PreprocessLayer object.
    """
    # Create one separate PreprocessorStack per given key.
    # Any other keys in an input will be passed through un-preprocessed.
    self.flattened_preprocessors = flatten_op(preprocessors)
    for i, (flat_key, spec) in enumerate(self.flattened_preprocessors.items()):
        self.flattened_preprocessors[flat_key] = PreprocessorStack.from_spec(
            spec, scope="preprocessor-stack-{}".format(i)
        )

    # NOTE: No automatic API-methods. Define them all ourselves.
    kwargs["api_methods"] = {}
    default_dict(kwargs, dict(scope=kwargs.pop("scope", "dict-preprocessor-stack")))
    super(DictPreprocessorStack, self).__init__(*list(self.flattened_preprocessors.values()), **kwargs)
def __init__(self, level_id, observations="RGB_INTERLEAVED", actions=None, frameskip=4,
             config=None, renderer="software", seed=None, level_cache=None):
    """
    Args:
        level_id (str): Specifier of the level to play, e.g. 'seekavoid_arena_01'.
        observations (Union[str,List[str]]): String specifier(s) for the observation(s) to be used with the
            given level. Will be converted into either a (single) BoxSpace or a Tuple (of BoxSpaces).
            See deepmind's documentation for all available observations.
        actions (Optional[List[dict]]): The RLgraph action spec (currently, only IntBox (shape=()) RLgraph
            action spaces are supported) that will be translated from and to the deepmind Lab actions.
            List slots correspond to the single int-actions, list items are dicts with:
            key=deepmind Lab partial action name, e.g. LOOK_LEFT_RIGHT_PIXELS_PER_FRAME.
            value=the value for that deepmind Lab partial action, e.g. -100.
        frameskip (Union[Tuple[int,int],int]): How many frames should be skipped (with repeated action and
            accumulated reward). E.g. (2,5) -> Uniformly pull from set [2,3,4]. Default: 4.
        config (Optional[dict]): The `config` parameter to be passed into the Lab's constructor.
            Supports 'width', 'height', 'fps', and other useful parameters.
            Values must be given as string values, e.g. dict(width='96').
        renderer (str): The `renderer` parameter to be passed into the Lab's constructor.
        seed (Optional[int]): An optional seed to use to initialize a numpy random state object, which is then
            used to seed all occurring resets in a deterministic fashion.
        level_cache (Optional[object]): An optional custom level caching object to help increase performance
            when playing many repeating levels. Will be passed as-is into the Lab's constructor.
    """
    # Create the wrapped deepmind lab level object.
    self.level_id = level_id
    observations = force_list(observations)
    config = default_dict(config, dict(width='96', height='72', fps='60'))  # Default config.
    self.level = deepmind_lab.Lab(
        self.level_id, observations, config=config, renderer=renderer, level_cache=level_cache
    )

    # Dict mapping a discrete action (int) - we don't support continuous actions yet - into a
    # deepmind Lab action vector.
    self.action_list, action_space = self.define_actions(actions)
    observation_space = self.define_observations(observations)
    super(DeepmindLabEnv, self).__init__(observation_space, action_space)

    self.frameskip = frameskip
    self.random_state = np.random.RandomState(seed=seed or int(time.time()))
    self.reset()
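# --- Example (not from the original source) -----------------------------------
# The docstring above describes tuple-valued frameskips as a uniform draw,
# e.g. (2, 5) -> one of {2, 3, 4}.  The helper below is only a sketch of that
# sampling rule (the env's actual step() logic is not shown in this excerpt).
import numpy as np

def sample_frameskip(frameskip, random_state):
    """Return a concrete skip count for one action pick."""
    if isinstance(frameskip, (tuple, list)):
        low, high = frameskip
        return int(random_state.randint(low, high))  # `high` is exclusive
    return int(frameskip)

rng = np.random.RandomState(seed=42)
print(sample_frameskip(4, rng))       # -> 4
print(sample_frameskip((2, 5), rng))  # -> one of 2, 3, 4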
def parse_summary_spec(summary_spec):
    """
    Expands summary spec with default values where necessary.

    Args:
        summary_spec (dict): Summary options.

    Returns:
        dict: Summary spec updated with default values.
    """
    default_spec = dict(
        # The directory in which to store the summary files.
        directory=os.path.expanduser("~/rlgraph_summaries/"),  # default=home dir
        # A regexp pattern that a summary op (including its global scope) has to match in order for it to
        # be included in the graph's summaries.
        summary_regexp="",
        # Every how many seconds do we save a summary? None if saving frequency should be step based.
        save_secs=120,
        # Every how many steps do we save a summary? None if saving frequency should be time (seconds) based.
        save_steps=None
    )
    return default_dict(summary_spec, default_spec)
# Core.
from rlgraph.components.component import Component

# Component child-classes.
from rlgraph.components.distributions import *
from rlgraph.components.explorations import Exploration, EpsilonExploration
from rlgraph.components.layers import *
from rlgraph.components.loss_functions import *
from rlgraph.components.memories import *
from rlgraph.components.neural_networks import *
from rlgraph.components.optimizers import *
from rlgraph.components.policies import *
from rlgraph.components.common import *

from rlgraph.utils.util import default_dict

Component.__lookup_classes__ = dict()

# Add all specific sub-classes to this one.
default_dict(Component.__lookup_classes__, Distribution.__lookup_classes__)
default_dict(Component.__lookup_classes__, Layer.__lookup_classes__)
default_dict(Component.__lookup_classes__, Stack.__lookup_classes__)
default_dict(Component.__lookup_classes__, LossFunction.__lookup_classes__)
default_dict(Component.__lookup_classes__, Memory.__lookup_classes__)
default_dict(Component.__lookup_classes__, NeuralNetwork.__lookup_classes__)
default_dict(Component.__lookup_classes__, Optimizer.__lookup_classes__)
default_dict(Component.__lookup_classes__, Policy.__lookup_classes__)

__all__ = ["Component"] + \
          list(set(map(lambda x: x.__name__, Component.__lookup_classes__.values())))
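# --- Example (not from the original source) -----------------------------------
# Toy version of the registry pattern above: a parent class's
# `__lookup_classes__` dict is built as the union of its children's registries,
# so that string "type" keys in specs can later be resolved to constructors
# (see `Specifiable.from_spec` further below).  Since the target dict starts
# out empty, plain `dict.update` behaves like `default_dict` here.
class Base(object):
    __lookup_classes__ = {}

class ChildA(Base):
    pass

class ChildB(Base):
    pass

Base.__lookup_classes__ = dict()
Base.__lookup_classes__.update({"child-a": ChildA})
Base.__lookup_classes__.update({"child-b": ChildB})

assert Base.__lookup_classes__["child-a"] is ChildA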
def execute_timesteps(self, num_timesteps, max_timesteps_per_episode=0, update_spec=None, use_exploration=True, frameskip=None, reset=True): """ Args: num_timesteps (Optional[int]): The maximum number of timesteps to run. At least one of `num_timesteps` or `num_episodes` must be provided. use_exploration (Optional[bool]): Indicates whether to utilize exploration (epsilon or noise based) when picking actions. Default: True. max_timesteps_per_episode (Optional[int]): Can be used to limit the number of timesteps per episode. Use None or 0 for no limit. Default: None. update_spec (Optional[dict]): Update parameters. If None, the worker only performs rollouts. Matches the structure of an Agent's update_spec dict and will be "defaulted" by that dict. See `input_parsing/parse_update_spec.py` for more details. frameskip (Optional[int]): How often actions are repeated after retrieving them from the agent. Rewards are accumulated over the number of skips. Use None for the Worker's default value. reset (bool): Whether to reset the environment and all the Worker's internal counters. Default: True. Returns: dict: Execution statistics. """ # Are we updating or just acting/observing? update_spec = default_dict(update_spec, self.agent.update_spec) self.set_update_schedule(update_spec) num_timesteps = num_timesteps or 0 max_timesteps_per_episode = max_timesteps_per_episode or 0 # Stats. timesteps_executed = 0 episodes_executed = 0 start = time.perf_counter() if reset is True: self.env_frames = 0 #self.finished_episode_rewards = list() self.finished_episode_steps = list() #self.episode_returns = 0 self.episode_timesteps = 0 # TODO: Fix for vectorized Envs. self.agent.call_api_method("reset") # Only run everything for at most num_timesteps (if defined). while not (0 < num_timesteps <= timesteps_executed): # TODO right now everything comes back as single-env. out = self.agent.call_api_method(("perform_n_steps_and_insert_into_fifo", None, [0])) timesteps_executed += self.agent.worker_sample_size # Accumulate the reward over n env-steps (equals one action pick). n=self.frameskip. #rewards = out[2] terminals = out[3][1:] self.env_frames += self.frameskip * self.agent.worker_sample_size # Only render once per action. #if self.render: # self.vector_env.environments[0].render() #for i in range_(self.num_environments): # #self.episode_timesteps[i] += self.agent.worker_sample_size for j, terminal in enumerate(terminals): # TODO: <- [i] self.episode_timesteps += 1 if 0 < max_timesteps_per_episode <= self.episode_timesteps: terminal = True if terminal: episodes_executed += 1 self.finished_episode_steps.append(self.episode_timesteps) self.logger.info("Finished episode: actions={}.".format(self.episode_timesteps)) self.episode_timesteps = 0 num_timesteps_reached = (0 < num_timesteps <= timesteps_executed) if num_timesteps_reached: break total_time = (time.perf_counter() - start) or 1e-10 # Return values for current episode(s) if None have been completed. #if len(self.finished_episode_rewards) == 0: # #mean_episode_runtime = 0 # mean_episode_reward = np.mean(self.episode_returns) # max_episode_reward = np.max(self.episode_returns) # final_episode_reward = self.episode_returns[0] #else: # #mean_episode_runtime = np.mean(self.finished_episode_durations) # mean_episode_reward = np.mean(self.finished_episode_rewards) # max_episode_reward = np.max(self.finished_episode_rewards) # final_episode_reward = self.finished_episode_rewards[-1] results = dict( runtime=total_time, # Agent act/observe throughput. 
timesteps_executed=timesteps_executed, ops_per_second=(timesteps_executed / total_time), # Env frames including action repeats. env_frames=self.env_frames, env_frames_per_second=(self.env_frames / total_time), episodes_executed=episodes_executed, episodes_per_minute=(episodes_executed/(total_time / 60)), #mean_episode_runtime=mean_episode_runtime, #mean_episode_reward=mean_episode_reward, #max_episode_reward=max_episode_reward, #final_episode_reward=final_episode_reward ) # Total time of run. self.logger.info("Finished execution in {} s".format(total_time)) # Total (RL) timesteps (actions) done (and timesteps/sec). self.logger.info("Time steps (actions) executed: {} ({} ops/s)". format(results['timesteps_executed'], results['ops_per_second'])) # Total env-timesteps done (including action repeats) (and env-timesteps/sec). self.logger.info("Env frames executed (incl. action repeats): {} ({} frames/s)". format(results['env_frames'], results['env_frames_per_second'])) # Total episodes done (and episodes/min). self.logger.info("Episodes finished: {} ({} episodes/min)". format(results['episodes_executed'], results['episodes_per_minute'])) #self.logger.info("Mean episode runtime: {}s".format(results['mean_episode_runtime'])) #self.logger.info("Mean episode reward: {}".format(results['mean_episode_reward'])) #self.logger.info("Max. episode reward: {}".format(results['max_episode_reward'])) #self.logger.info("Final episode reward: {}".format(results['final_episode_reward'])) return results
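# --- Example (not from the original source) -----------------------------------
# Hypothetical consumer of the statistics dict returned by `execute_timesteps`
# above.  How the worker itself is constructed is elided here (class name and
# constructor arguments depend on the concrete Worker subclass and are not
# part of this excerpt).
results = worker.execute_timesteps(num_timesteps=10000, reset=True)
print("actions/s={:.1f}  env-frames/s={:.1f}  episodes={}".format(
    results["ops_per_second"],
    results["env_frames_per_second"],
    results["episodes_executed"]
))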
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None, feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True, weight_pg=None, weight_baseline=None, weight_entropy=None, worker_sample_size=100, **kwargs): """ Args: discount (float): The discount factor gamma. architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if `network_spec` is given explicitly in kwargs. Default: "large". fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm. environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent. feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict. It will be added under the key "previous_action". Default: True. feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict. It will be added under the key "previous_reward". Default: True. weight_pg (float): See IMPALALossFunction Component. weight_baseline (float): See IMPALALossFunction Component. weight_entropy (float): See IMPALALossFunction Component. worker_sample_size (int): How many steps the actor will perform in the environment each sample-run. Keyword Args: type (str): One of "single", "actor" or "learner". Default: "single". """ type_ = kwargs.pop("type", "single") assert type_ in ["single", "actor", "learner"] self.type = type_ self.worker_sample_size = worker_sample_size # Network-spec by default is a "large architecture" IMPALA network. self.network_spec = kwargs.pop( "network_spec", dict( type= "rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork" .format("Large" if architecture == "large" else "Small"))) if isinstance(self.network_spec, dict) and "type" in self.network_spec and \ "IMPALANetwork" in self.network_spec["type"]: self.network_spec = default_dict( self.network_spec, dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)) # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need. self.exploration_spec = kwargs.pop("exploration_spec", None) optimizer_spec = kwargs.pop("optimizer_spec", None) observe_spec = kwargs.pop("observe_spec", None) self.feed_previous_action_through_nn = feed_previous_action_through_nn self.feed_previous_reward_through_nn = feed_previous_reward_through_nn # Run everything in a single process. if self.type == "single": environment_spec = environment_spec or self.default_environment_spec update_spec = kwargs.pop("update_spec", None) # Actors won't need to learn (no optimizer needed in graph). elif self.type == "actor": optimizer_spec = None update_spec = kwargs.pop("update_spec", dict(do_updates=False)) environment_spec = environment_spec or self.default_environment_spec # Learners won't need to explore (act) or observe (insert into Queue). else: observe_spec = None update_spec = kwargs.pop("update_spec", None) environment_spec = None # Add previous-action/reward preprocessors to env-specific preprocessor spec. # TODO: remove this empty hard-coded preprocessor. self.preprocessing_spec = kwargs.pop( "preprocessing_spec", dict( type="dict-preprocessor-stack", preprocessors=dict( # Flatten actions. 
previous_action=[ dict(type="reshape", flatten=True, flatten_categories=kwargs.get( "action_space").num_categories) ], # Bump reward and convert to float32, so that it can be concatenated by the Concat layer. previous_reward=[dict(type="reshape", new_shape=(1, ))]))) # Limit communication in distributed mode between each actor and the learner (never between actors). execution_spec = kwargs.pop("execution_spec", None) if execution_spec is not None and execution_spec.get( "mode") == "distributed": default_dict( execution_spec["session_config"], dict(type="monitored-training-session", allow_soft_placement=True, device_filters=["/job:learner/task:0"] + ([ "/job:actor/task:{}".format( execution_spec["distributed_spec"]["task_index"]) ] if self.type == "actor" else ["/job:learner/task:0"]))) # If Actor, make non-chief in either case (even if task idx == 0). if self.type == "actor": execution_spec["distributed_spec"]["is_chief"] = False # Hard-set device to the CPU for actors. execution_spec["device_strategy"] = "custom" execution_spec[ "default_device"] = "/job:{}/task:{}/cpu".format( self.type, execution_spec["distributed_spec"]["task_index"]) self.policy_spec = kwargs.pop("policy_spec", dict()) # TODO: Create some auto-setting based on LSTM inside the NN. default_dict( self.policy_spec, dict(type="shared-value-function-policy", deterministic=False, reuse_variable_scope="shared-policy", action_space=kwargs.get("action_space"))) # Now that we fixed the Agent's spec, call the super constructor. super(IMPALAAgent, self).__init__(discount=discount, preprocessing_spec=self.preprocessing_spec, network_spec=self.network_spec, policy_spec=self.policy_spec, exploration_spec=self.exploration_spec, optimizer_spec=optimizer_spec, observe_spec=observe_spec, update_spec=update_spec, execution_spec=execution_spec, name=kwargs.pop( "name", "impala-{}-agent".format(self.type)), **kwargs) # Always use 1st learner as the parameter server for all policy variables. if self.execution_spec["mode"] == "distributed" and self.execution_spec[ "distributed_spec"]["cluster_spec"]: self.policy.propagate_sub_component_properties( dict(device=dict(variables="/job:learner/task:0/cpu"))) # Check whether we have an RNN. self.has_rnn = self.policy.neural_network.has_rnn() # Check, whether we are running with GPU. self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \ self.execution_spec["gpu_spec"]["num_gpus"] > 0 # Some FIFO-queue specs. self.fifo_queue_keys = ["terminals", "states"] + \ (["actions"] if not self.feed_previous_action_through_nn else []) + \ (["rewards"] if not self.feed_previous_reward_through_nn else []) + \ ["action_probs"] + \ (["initial_internal_states"] if self.has_rnn else []) # Define FIFO record space. # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only contain # num-steps items. self.fifo_record_space = Dict( { "terminals": bool, "action_probs": FloatBox(shape=(self.action_space.num_categories, )), }, add_batch_rank=False, add_time_rank=self.worker_sample_size) self.fifo_record_space["states"] = self.state_space.with_time_rank( self.worker_sample_size + 1) # Add action and rewards to state or do they have an extra channel? 
if self.feed_previous_action_through_nn:
    self.fifo_record_space["states"]["previous_action"] = \
        self.action_space.with_time_rank(self.worker_sample_size + 1)
else:
    self.fifo_record_space["actions"] = self.action_space.with_time_rank(self.worker_sample_size)
if self.feed_previous_reward_through_nn:
    self.fifo_record_space["states"]["previous_reward"] = FloatBox(add_time_rank=self.worker_sample_size + 1)
else:
    self.fifo_record_space["rewards"] = FloatBox(add_time_rank=self.worker_sample_size)
if self.has_rnn:
    self.fifo_record_space["initial_internal_states"] = self.internal_states_space.with_time_rank(False)

# Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
self.fifo_queue = FIFOQueue.from_spec(
    fifo_queue_spec or dict(capacity=1),
    reuse_variable_scope="shared-fifo-queue",
    only_insert_single_records=True,
    record_space=self.fifo_record_space,
    device="/job:learner/task:0/cpu" if self.execution_spec["mode"] == "distributed" and
    self.execution_spec["distributed_spec"]["cluster_spec"] else None
)

# Remove `states` key from input_spaces: not needed.
del self.input_spaces["states"]

# Add all our sub-components to the core.
if self.type == "single":
    pass

elif self.type == "actor":
    # No learning, no loss function.
    self.loss_function = None
    # A Dict Splitter to split things from the EnvStepper.
    self.env_output_splitter = ContainerSplitter(tuple_length=4, scope="env-output-splitter")
    self.states_dict_splitter = None
    # Slice some data from the EnvStepper (e.g. only first internal states are needed).
    self.internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
    # Merge back to insert into FIFO.
    self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)
    # Dummy Flattener to calculate action-probs space.
    dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)
    self.environment_stepper = EnvironmentStepper(
        environment_spec=environment_spec,
        actor_component_spec=ActorComponent(self.preprocessor, self.policy, self.exploration),
        state_space=self.state_space.with_batch_rank(),
        reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
        internal_states_space=self.internal_states_space,
        num_steps=self.worker_sample_size,
        add_previous_action_to_state=True,
        add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space)
    )
    sub_components = [
        self.environment_stepper, self.env_output_splitter,
        self.internal_states_slicer, self.fifo_input_merger, self.fifo_queue
    ]

# Learner.
else:
    self.environment_stepper = None
    # A Dict splitter to split up items from the queue.
    self.fifo_input_merger = None
    self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
    self.states_dict_splitter = ContainerSplitter(
        *list(self.fifo_record_space["states"].keys()), scope="states-dict-splitter"
    )
    self.internal_states_slicer = None
    self.transposer = Transpose(scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
    self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))
    # Create an IMPALALossFunction with some parameters.
self.loss_function = IMPALALossFunction( discount=self.discount, weight_pg=weight_pg, weight_baseline=weight_baseline, weight_entropy=weight_entropy, slice_actions=self.feed_previous_action_through_nn, slice_rewards=self.feed_previous_reward_through_nn, device="/job:learner/task:0/gpu") self.policy.propagate_sub_component_properties( dict(device=dict(variables="/job:learner/task:0/cpu", ops="/job:learner/task:0/gpu"))) for component in [ self.staging_area, self.preprocessor, self.optimizer ]: component.propagate_sub_component_properties( dict(device="/job:learner/task:0/gpu")) sub_components = [ self.fifo_output_splitter, self.fifo_queue, self.states_dict_splitter, self.transposer, self.staging_area, self.preprocessor, self.policy, self.loss_function, self.optimizer ] if self.type != "single": # Add all the agent's sub-components to the root. self.root_component.add_components(*sub_components) # Define the Agent's (root Component's) API. self.define_graph_api(*sub_components) if self.type != "single" and self.auto_build: if self.type == "learner": build_options = dict( build_device_context="/job:learner/task:0/cpu", pin_global_variable_device="/job:learner/task:0/cpu") self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer, build_options=build_options) else: self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer, build_options=None) self.graph_built = True if self.has_gpu: # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op). self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \ out_op_columns[0].op_records[0].op # Initialize the stage. self.graph_executor.monitored_session.run_step_fn( lambda step_context: step_context.session.run(self.stage_op )) # TODO remove after full refactor. self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \ out_op_columns[0].op_records[0].op if self.type == "actor": self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \ out_op_columns[0].op_records[0].op
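# --- Example (not from the original source) -----------------------------------
# Sketch of a time-ranked FIFO record space analogous to `fifo_record_space`
# above, for a 20-step rollout and a 9-category discrete action space.
# Assumes `rlgraph.spaces` exposes Dict/FloatBox as used elsewhere in this
# section; the numbers are placeholders.
from rlgraph.spaces import Dict, FloatBox

worker_sample_size = 20
record_space = Dict(
    {
        "terminals": bool,
        "action_probs": FloatBox(shape=(9,)),
    },
    add_batch_rank=False,
    add_time_rank=worker_sample_size
)
# As in the agent above, the states sub-space would carry one extra time step
# (the bootstrap state), i.e. `state_space.with_time_rank(worker_sample_size + 1)`.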
def _execute(self, num_timesteps=None, num_episodes=None, max_timesteps_per_episode=None, use_exploration=True, update_spec=None, frameskip=None, reset=True): """ Actual implementation underlying `execute_timesteps` and `execute_episodes`. Args: num_timesteps (Optional[int]): The maximum number of timesteps to run. At least one of `num_timesteps` or `num_episodes` must be provided. num_episodes (Optional[int]): The maximum number of episodes to run. At least one of `num_timesteps` or `num_episodes` must be provided. use_exploration (Optional[bool]): Indicates whether to utilize exploration (epsilon or noise based) when picking actions. Default: True. max_timesteps_per_episode (Optional[int]): Can be used to limit the number of timesteps per episode. Use None or 0 for no limit. Default: None. update_spec (Optional[dict]): Update parameters. If None, the worker only performs rollouts. Matches the structure of an Agent's update_spec dict and will be "defaulted" by that dict. See `input_parsing/parse_update_spec.py` for more details. frameskip (Optional[int]): How often actions are repeated after retrieving them from the agent. Rewards are accumulated over the number of skips. Use None for the Worker's default value. reset (bool): Whether to reset the environment and all the Worker's internal counters. Default: True. Returns: dict: Execution statistics. """ assert num_timesteps is not None or num_episodes is not None,\ "ERROR: One of `num_timesteps` or `num_episodes` must be provided!" # Are we updating or just acting/observing? update_spec = default_dict(update_spec, self.agent.update_spec) self.set_update_schedule(update_spec) num_timesteps = num_timesteps or 0 num_episodes = num_episodes or 0 max_timesteps_per_episode = [ max_timesteps_per_episode or 0 for _ in range_(self.num_environments) ] frameskip = frameskip or self.frameskip # Stats. timesteps_executed = 0 episodes_executed = 0 start = time.perf_counter() episode_terminals = self.episode_terminals if reset is True: self.env_frames = 0 self.finished_episode_rewards = [ [] for _ in range_(self.num_environments) ] self.finished_episode_durations = [ [] for _ in range_(self.num_environments) ] self.finished_episode_timesteps = [ [] for _ in range_(self.num_environments) ] for i, env_id in enumerate(self.env_ids): self.episode_returns[i] = 0 self.episode_timesteps[i] = 0 self.episode_terminals[i] = False self.episode_starts[i] = time.perf_counter() if self.worker_executes_preprocessing: self.state_is_preprocessed[env_id] = False self.env_states = self.vector_env.reset_all() self.agent.reset() elif self.env_states[0] is None: raise RLGraphError( "Runner must be reset at the very beginning. Environment is in invalid state." ) # Only run everything for at most num_timesteps (if defined). env_states = self.env_states while not (0 < num_timesteps <= timesteps_executed): if self.render: # This renders the first underlying environment. self.vector_env.render() if self.worker_executes_preprocessing: for i, env_id in enumerate(self.env_ids): state = self.agent.state_space.force_batch(env_states[i]) if self.preprocessors[env_id] is not None: if self.state_is_preprocessed[env_id] is False: self.preprocessed_states_buffer[ i] = self.preprocessors[env_id].preprocess( state) self.state_is_preprocessed[env_id] = True else: self.preprocessed_states_buffer[i] = env_states[i] # TODO extra returns when worker is not applying preprocessing. 
actions = self.agent.get_action( states=self.preprocessed_states_buffer, use_exploration=use_exploration, apply_preprocessing=self.apply_preprocessing) preprocessed_states = np.array(self.preprocessed_states_buffer) else: preprocessed_states, actions = self.agent.get_action( states=np.array(env_states), use_exploration=use_exploration, apply_preprocessing=True, extra_returns="preprocessed_states") # Accumulate the reward over n env-steps (equals one action pick). n=self.frameskip. env_rewards = [0 for _ in range_(self.num_environments)] next_states = None for _ in range_(frameskip): next_states, step_rewards, episode_terminals, infos = self.vector_env.step( actions=actions) self.env_frames += self.num_environments for i, step_reward in enumerate(step_rewards): env_rewards[i] += step_reward if np.any(episode_terminals): break # Only render once per action. if self.render: self.vector_env.environments[0].render() for i, env_id in enumerate(self.env_ids): self.episode_returns[i] += env_rewards[i] self.episode_timesteps[i] += 1 if 0 < max_timesteps_per_episode[i] <= self.episode_timesteps[ i]: episode_terminals[i] = True if self.worker_executes_preprocessing: self.state_is_preprocessed[env_id] = False # Do accounting for finished episodes. if episode_terminals[i]: episodes_executed += 1 episode_duration = time.perf_counter( ) - self.episode_starts[i] self.finished_episode_rewards[i].append( self.episode_returns[i]) self.finished_episode_durations[i].append(episode_duration) self.finished_episode_timesteps[i].append( self.episode_timesteps[i]) self.log_finished_episode( reward=self.episode_returns[i], duration=episode_duration, timesteps=self.episode_timesteps[i], env_num=i) # Reset this environment and its preprocecssor stack. env_states[i] = self.vector_env.reset(i) if self.worker_executes_preprocessing and self.preprocessors[ env_id] is not None: self.preprocessors[env_id].reset() # This re-fills the sequence with the reset state. state = self.agent.state_space.force_batch( env_states[i]) # Pre - process, add to buffer self.preprocessed_states_buffer[i] = np.array( self.preprocessors[env_id].preprocess(state)) self.state_is_preprocessed[env_id] = True self.episode_returns[i] = 0 self.episode_timesteps[i] = 0 self.episode_starts[i] = time.perf_counter() else: # Otherwise assign states to next states env_states[i] = next_states[i] if self.worker_executes_preprocessing and self.preprocessors[ env_id] is not None: next_state = self.agent.state_space.force_batch( env_states[i]) next_states[i] = np.array( self.preprocessors[env_id].preprocess(next_state)) # TODO: If worker does not execute preprocessing, next state is not preprocessed here. # Observe per environment. self.agent.observe(preprocessed_states=preprocessed_states[i], actions=actions[i], internals=[], rewards=env_rewards[i], next_states=next_states[i], terminals=episode_terminals[i], env_id=self.env_ids[i]) self.update_if_necessary() timesteps_executed += self.num_environments num_timesteps_reached = (0 < num_timesteps <= timesteps_executed) if 0 < num_episodes <= episodes_executed or num_timesteps_reached: break total_time = (time.perf_counter() - start) or 1e-10 # Return values for current episode(s) if None have been completed. 
if episodes_executed == 0: mean_episode_runtime = 0 mean_episode_reward = np.mean(self.episode_returns) max_episode_reward = np.max(self.episode_returns) final_episode_reward = self.episode_returns[0] else: all_finished_durations = [] all_finished_rewards = [] for i in range_(self.num_environments): all_finished_rewards.extend(self.finished_episode_rewards[i]) all_finished_durations.extend( self.finished_episode_durations[i]) mean_episode_runtime = np.mean(all_finished_durations) mean_episode_reward = np.mean(all_finished_rewards) max_episode_reward = np.max(all_finished_rewards) final_episode_reward = all_finished_rewards[-1] self.episode_terminals = episode_terminals self.env_states = env_states results = dict( runtime=total_time, # Agent act/observe throughput. timesteps_executed=timesteps_executed, ops_per_second=(timesteps_executed / total_time), # Env frames including action repeats. env_frames=self.env_frames, env_frames_per_second=(self.env_frames / total_time), episodes_executed=episodes_executed, episodes_per_minute=(episodes_executed / (total_time / 60)), mean_episode_runtime=mean_episode_runtime, mean_episode_reward=mean_episode_reward, max_episode_reward=max_episode_reward, final_episode_reward=final_episode_reward) # Total time of run. self.logger.info("Finished execution in {} s".format(total_time)) # Total (RL) timesteps (actions) done (and timesteps/sec). self.logger.info("Time steps (actions) executed: {} ({} ops/s)".format( results['timesteps_executed'], results['ops_per_second'])) # Total env-timesteps done (including action repeats) (and env-timesteps/sec). self.logger.info( "Env frames executed (incl. action repeats): {} ({} frames/s)". format(results['env_frames'], results['env_frames_per_second'])) # Total episodes done (and episodes/min). self.logger.info("Episodes finished: {} ({} episodes/min)".format( results['episodes_executed'], results['episodes_per_minute'])) self.logger.info("Mean episode runtime: {}s".format( results['mean_episode_runtime'])) self.logger.info("Mean episode reward: {}".format( results['mean_episode_reward'])) self.logger.info("Max. episode reward: {}".format( results['max_episode_reward'])) self.logger.info("Final episode reward: {}".format( results['final_episode_reward'])) return results
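# --- Example (not from the original source) -----------------------------------
# Stand-alone mirror of the episode-statistics fallback above: if no episode
# finished during the run, the in-progress per-environment returns are used
# for the mean/max/final figures instead of the finished-episode lists.
import numpy as np

def summarize_rewards(finished_rewards_per_env, in_progress_returns):
    all_finished = [r for env_rewards in finished_rewards_per_env for r in env_rewards]
    if len(all_finished) == 0:
        return dict(mean=float(np.mean(in_progress_returns)),
                    max=float(np.max(in_progress_returns)),
                    final=float(in_progress_returns[0]))
    return dict(mean=float(np.mean(all_finished)),
                max=float(np.max(all_finished)),
                final=float(all_finished[-1]))

print(summarize_rewards([[], []], in_progress_returns=[1.5, 2.0]))
print(summarize_rewards([[10.0], [3.0, 7.0]], in_progress_returns=[0.0, 0.0]))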
def from_spec(cls, spec=None, **kwargs): """ Uses the given spec to create an object. If `spec` is a dict, an optional "type" key can be used as a "constructor hint" to specify a certain class of the object. If `spec` is not a dict, `spec`'s value is used directly as the "constructor hint". The rest of `spec` (if it's a dict) will be used as kwargs for the (to-be-determined) constructor. Additional keys in **kwargs will always have precedence (overwrite keys in `spec` (if a dict)). Also, if the spec-dict or **kwargs contains the special key "_args", it will be popped from the dict and used as *args list to be passed separately to the constructor. The following constructor hints are valid: - None: Use `cls` as constructor. - An already instantiated object: Will be returned as is; no constructor call. - A string or an object that is a key in `cls`'s `__lookup_classes__` dict: The value in `__lookup_classes__` for that key will be used as the constructor. - A python callable: Use that as constructor. - A string: Either a json filename or the name of a python module+class (e.g. "rlgraph.components.Component") to be Will be used to Args: spec (Optional[dict]): The specification dict. Keyword Args: kwargs (any): Optional possibility to pass the c'tor arguments in here and use spec as the type-only info. Then we can call this like: from_spec([type]?, [**kwargs for ctor]) If `spec` is already a dict, then `kwargs` will be merged with spec (overwriting keys in `spec`) after "type" has been popped out of `spec`. If a constructor of a Specifiable needs an *args list of items, the special key `_args` can be passed inside `kwargs` with a list type value (e.g. kwargs={"_args": [arg1, arg2, arg3]}). Returns: The object generated from the spec. """ # specifiable_type is already a created object of this class -> Take it as is. if isinstance(spec, cls): return spec # `specifiable_type`: Indicator for the Specifiable's constructor. # `ctor_args`: *args arguments for the constructor. # `ctor_kwargs`: **kwargs arguments for the constructor. # Copy so caller can reuse safely. spec = deepcopy(spec) if isinstance(spec, dict): if "type" in spec: specifiable_type = spec.pop("type", None) else: specifiable_type = None ctor_kwargs = spec ctor_kwargs.update(kwargs) # give kwargs priority else: specifiable_type = spec ctor_kwargs = kwargs # Special `_args` field in kwargs for *args-utilizing constructors. ctor_args = ctor_kwargs.pop("_args", []) # Figure out the actual constructor (class) from `type_`. # None: Try __default__object (if no args/kwargs), only then constructor of cls (using args/kwargs). if specifiable_type is None: # We have a default constructor that was defined directly by cls (not by its children). if cls.__default_constructor__ is not None and ctor_args == [] and \ (not hasattr(cls.__bases__[0], "__default_constructor__") or cls.__bases__[0].__default_constructor__ is None or cls.__bases__[0].__default_constructor__ is not cls.__default_constructor__ ): constructor = cls.__default_constructor__ # Default partial's keywords into ctor_kwargs. if isinstance(constructor, partial): kwargs = default_dict(ctor_kwargs, constructor.keywords) constructor = partial(constructor.func, **kwargs) ctor_kwargs = {} # erase to avoid duplicate kwarg error # Try our luck with this class itself. else: constructor = cls # Try the __lookup_classes__ of this class. else: constructor = cls.lookup_class(specifiable_type) # Found in cls.__lookup_classes__. if constructor is not None: pass # Python callable. 
elif callable(specifiable_type): constructor = specifiable_type # A string: Filename or a python module+class. elif isinstance(specifiable_type, str): if re.search(r'\.(yaml|yml|json)$', specifiable_type): return cls.from_file(specifiable_type, *ctor_args, **ctor_kwargs) elif specifiable_type.find('.') != -1: module_name, function_name = specifiable_type.rsplit(".", 1) module = importlib.import_module(module_name) constructor = getattr(module, function_name) else: raise RLGraphError( "ERROR: String specifier ({}) in from_spec must be a filename, a module+class, or a key " "into {}.__lookup_classes__!".format(specifiable_type, cls.__name__) ) if not constructor: raise RLGraphError("Invalid type: {}".format(specifiable_type)) # Create object with inferred constructor. specifiable_object = constructor(*ctor_args, **ctor_kwargs) assert isinstance(specifiable_object, constructor.func if isinstance(constructor, partial) else constructor) return specifiable_object
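# --- Example (not from the original source) -----------------------------------
# Illustrative use of the `from_spec` contract documented above, with a
# hypothetical Specifiable subclass `Memory` whose lookup table maps the
# string "replay" to some ReplayMemory class (the key and class names are
# made up for this example).
spec = {"type": "replay", "capacity": 10000}

memory = Memory.from_spec(spec)
memory = Memory.from_spec({"type": "replay"}, capacity=10000)  # equivalent
memory = Memory.from_spec(spec, capacity=20000)                # kwargs win: capacity=20000

# Positional constructor arguments go in via the special "_args" key:
memory = Memory.from_spec({"type": "replay", "_args": [10000]})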
def parse_execution_spec(execution_spec): """ Parses execution parameters and inserts default values where necessary. Args: execution_spec (Optional[dict]): Execution spec dict. Must specify an execution mode "single" or "distributed". If mode "distributed", must specify a "distributed_spec" containing: - a key cluster_spec mapping to a ClusterSpec object, - a "job" for the job name, - an integer "task_index" Returns: dict: The sanitized execution_spec dict. """ # TODO these are tensorflow specific # If no spec given. if get_backend() == "tf": default_spec = dict( mode="single", distributed_spec=None, # Using a monitored session enabling summaries and hooks per default. disable_monitoring=False, # Gpu settings. gpu_spec=dict( # Are GPUs allowed to be used if they are detected? gpus_enabled=False, # If yes, how many GPUs are to be used? max_usable_gpus=0, # If True, use `max_usable_gpus` fake-GPUs (CPU) iff no GPUs are available. fake_gpus_if_necessary=False, # Specify specific CUDA devices to be used, e.g. gpu 0 and 2 = [0, 2]. # If None, we use CUDA devices [0, max_usable_gpus - 1] cuda_devices=None, # Fraction of the overall amount of memory that each visible GPU should be allocated. per_process_gpu_memory_fraction=None, # If True, not all memory will be allocated which is relevant on shared resources. allow_memory_growth=False), # Device placement settings. device_strategy="default", default_device=None, device_map={}, session_config=None, # Random seed for the tf graph. seed=None, # Enabling the tf profiler? enable_profiler=False, # With which frequency do we print out profiler information? profiler_frequency=1000, # Enabling a timeline write? enable_timeline=False, # With which frequency do we write out a timeline file? timeline_frequency=1, ) execution_spec = default_dict(execution_spec, default_spec) # Sub specifications: # Distributed specifications. if execution_spec.get("mode") == "distributed": default_distributed = dict(job="ps", task_index=0, cluster_spec=dict( ps=["localhost:22222"], worker=["localhost:22223"]), protocol=None) execution_spec["distributed_spec"] = default_dict( execution_spec.get("distributed_spec"), default_distributed) # Session config. default_session_config = dict(type="monitored-training-session", allow_soft_placement=True, log_device_placement=False) execution_spec["session_config"] = default_dict( execution_spec.get("session_config"), default_session_config) elif get_backend() == "pytorch": # No session configs, different GPU options. default_spec = dict( mode="single", distributed_spec=None, # Using a monitored session enabling summaries and hooks per default. disable_monitoring=False, # Gpu settings. gpu_spec=dict( # Are GPUs allowed to be used if they are detected? gpus_enabled=False, # If yes, how many GPUs are to be used? max_usable_gpus=0, # Specify specific CUDA devices to be used, e.g. gpu 0 and 2 = [0, 2]. # If None, we use CUDA devices [0, max_usable_gpus - 1] cuda_devices=None), # Device placement settings. device_strategy="default", default_device=None, device_map={}, # TODO potentially set to nproc? torch_num_threads=1, OMP_NUM_THREADS=1) execution_spec = default_dict(execution_spec, default_spec) return execution_spec
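# --- Example (not from the original source) -----------------------------------
# A minimal distributed execution spec run through `parse_execution_spec`
# above (TensorFlow backend assumed).  The host addresses are placeholders;
# the asserted values come from the default dicts defined in the function.
execution_spec = parse_execution_spec(dict(
    mode="distributed",
    distributed_spec=dict(
        job="worker",
        task_index=1,
        cluster_spec=dict(ps=["host-a:22222"], worker=["host-b:22223"]),
    ),
))
assert execution_spec["gpu_spec"]["gpus_enabled"] is False               # default
assert execution_spec["distributed_spec"]["protocol"] is None            # default
assert execution_spec["session_config"]["allow_soft_placement"] is True  # default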
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from rlgraph.utils.util import default_dict

# Basics.
from rlgraph.components.layers.layer import Layer
# Preprocessing Layers.
from rlgraph.components.layers.preprocessing import *
# NN-Layers.
from rlgraph.components.layers.nn import *
# String Layers.
from rlgraph.components.layers.strings import *

# The Layers (Layers are also Stacks).
Layer.__lookup_classes__ = dict(
    nnlayer=NNLayer,
    preprocesslayer=PreprocessLayer
)
# Add all specific Layer sub-classes to this one.
default_dict(Layer.__lookup_classes__, NNLayer.__lookup_classes__)
default_dict(Layer.__lookup_classes__, PreprocessLayer.__lookup_classes__)
default_dict(Layer.__lookup_classes__, StringLayer.__lookup_classes__)

__all__ = ["Layer"] + \
          list(set(map(lambda x: x.__name__, Layer.__lookup_classes__.values())))
def _execute(self, num_timesteps=None, num_episodes=None, max_timesteps_per_episode=None, use_exploration=True, update_spec=None, frameskip=None, reset=True): """ Actual implementation underlying `execute_timesteps` and `execute_episodes`. Args: num_timesteps (Optional[int]): The maximum number of timesteps to run. At least one of `num_timesteps` or `num_episodes` must be provided. num_episodes (Optional[int]): The maximum number of episodes to run. At least one of `num_timesteps` or `num_episodes` must be provided. use_exploration (Optional[bool]): Indicates whether to utilize exploration (epsilon or noise based) when picking actions. Default: True. max_timesteps_per_episode (Optional[int]): Can be used to limit the number of timesteps per episode. Use None or 0 for no limit. Default: None. update_spec (Optional[dict]): Update parameters. If None, the worker only performs rollouts. Matches the structure of an Agent's update_spec dict and will be "defaulted" by that dict. See `input_parsing/parse_update_spec.py` for more details. frameskip (Optional[int]): How often actions are repeated after retrieving them from the agent. Rewards are accumulated over the number of skips. Use None for the Worker's default value. reset (bool): Whether to reset the environment and all the Worker's internal counters. Default: True. Returns: dict: Execution statistics. """ assert num_timesteps is not None or num_episodes is not None,\ "ERROR: One of `num_timesteps` or `num_episodes` must be provided!" # Are we updating or just acting/observing? update_spec = default_dict(update_spec, self.agent.update_spec) self.set_update_schedule(update_spec) num_timesteps = num_timesteps or 0 num_episodes = num_episodes or 0 max_timesteps_per_episode = [ max_timesteps_per_episode or 0 for _ in range_(self.num_environments) ] frameskip = frameskip or self.frameskip # Stats. timesteps_executed = 0 episodes_executed = 0 start = time.perf_counter() episode_terminals = self.episode_terminals if reset is True: self.env_frames = 0 self.episodes_since_update = 0 self.finished_episode_rewards = [ [] for _ in range_(self.num_environments) ] self.finished_episode_durations = [ [] for _ in range_(self.num_environments) ] self.finished_episode_timesteps = [ [] for _ in range_(self.num_environments) ] for i, env_id in enumerate(self.env_ids): self.episode_returns[i] = 0 self.episode_timesteps[i] = 0 self.episode_terminals[i] = False self.episode_starts[i] = time.perf_counter() if self.worker_executes_preprocessing: self.state_is_preprocessed[env_id] = False self.env_states = self.vector_env.reset_all() self.agent.reset() elif self.env_states[0] is None: raise RLGraphError( "Runner must be reset at the very beginning. Environment is in invalid state." ) # Only run everything for at most num_timesteps (if defined). env_states = self.env_states while not (0 < num_timesteps <= timesteps_executed): if self.render: self.vector_env.render() if self.worker_executes_preprocessing: for i, env_id in enumerate(self.env_ids): state = self.agent.state_space.force_batch(env_states[i]) if self.preprocessors[env_id] is not None: if self.state_is_preprocessed[env_id] is False: self.preprocessed_states_buffer[ i] = self.preprocessors[env_id].preprocess( state) self.state_is_preprocessed[env_id] = True else: self.preprocessed_states_buffer[i] = env_states[i] # TODO extra returns when worker is not applying preprocessing. 
actions = self.agent.get_action( states=self.preprocessed_states_buffer, use_exploration=use_exploration, apply_preprocessing=self.apply_preprocessing) preprocessed_states = np.array(self.preprocessed_states_buffer) else: actions, preprocessed_states = self.agent.get_action( states=np.array(env_states), use_exploration=use_exploration, apply_preprocessing=True, extra_returns="preprocessed_states") # Accumulate the reward over n env-steps (equals one action pick). n=self.frameskip. env_rewards = [0 for _ in range_(self.num_environments)] next_states = None # For Dict action spaces, we have to treat each key as an array with batch-rank at index 0. # The action-dict is then translated into a list of dicts where each dict contains the original data # but without the batch-rank. # E.g. {'A': array([0, 1]), 'B': array([2, 3])} -> [{'A': 0, 'B': 2}, {'A': 1, 'B': 3}] if isinstance(self.agent.action_space, Dict): some_key = next(iter(actions)) assert isinstance(actions, dict) and isinstance(actions[some_key], np.ndarray),\ "ERROR: Cannot flip Dict-action batch with dict keys if returned value is not a dict OR " \ "values of returned value are not np.ndarrays!" # TODO: What if actions come as nested dicts (more than one level deep)? # TODO: Use DataOpDict/Tuple's new `map` method. if hasattr(actions[some_key], "__len__"): env_actions = [{ key: value[i] for key, value in actions.items() } for i in range(len(actions[some_key]))] else: # Action was not array type. env_actions = [{ key: value for key, value in actions.items() }] # Tuple action Spaces: # E.g. Tuple(array([0, 1]), array([2, 3])) -> [(0, 2), (1, 3)] elif isinstance(self.agent.action_space, Tuple): assert isinstance(actions, tuple) and isinstance(actions[0], np.ndarray),\ "ERROR: Cannot flip tuple-action batch if returned value is not a tuple OR " \ "values of returned value are not np.ndarrays!" # TODO: Use DataOpDict/Tuple's new `map` method. env_actions = [ tuple(value[i] for _, value in enumerate(actions)) for i in range(len(actions[0])) ] # No container batch-flipping necessary. else: env_actions = actions if self.num_environments == 1 and env_actions.shape == (): env_actions = [env_actions] for _ in range_(frameskip): next_states, step_rewards, episode_terminals, _ = self.vector_env.step( actions=env_actions) self.env_frames += self.num_environments for i, step_reward in enumerate(step_rewards): env_rewards[i] += step_reward if np.any(episode_terminals): break # Only render once per action. #if self.render: # self.vector_env.environments[0].render() for i, env_id in enumerate(self.env_ids): self.episode_returns[i] += env_rewards[i] self.episode_timesteps[i] += 1 if 0 < max_timesteps_per_episode[i] <= self.episode_timesteps[ i]: episode_terminals[i] = True if self.worker_executes_preprocessing: self.state_is_preprocessed[env_id] = False # Do accounting for finished episodes. if episode_terminals[i]: episodes_executed += 1 self.episodes_since_update += 1 episode_duration = time.perf_counter( ) - self.episode_starts[i] self.finished_episode_rewards[i].append( self.episode_returns[i]) self.finished_episode_durations[i].append(episode_duration) self.finished_episode_timesteps[i].append( self.episode_timesteps[i]) self.log_finished_episode( episode_return=self.episode_returns[i], duration=episode_duration, timesteps=self.episode_timesteps[i], env_num=i) # Reset this environment and its preprocecssor stack. 
env_states[i] = self.vector_env.reset(i) if self.worker_executes_preprocessing and self.preprocessors[ env_id] is not None: self.preprocessors[env_id].reset() # This re-fills the sequence with the reset state. state = self.agent.state_space.force_batch( env_states[i]) # Pre - process, add to buffer self.preprocessed_states_buffer[i] = np.array( self.preprocessors[env_id].preprocess(state)) self.state_is_preprocessed[env_id] = True self.episode_returns[i] = 0 self.episode_timesteps[i] = 0 self.episode_starts[i] = time.perf_counter() else: # Otherwise assign states to next states env_states[i] = next_states[i] if self.worker_executes_preprocessing and self.preprocessors[ env_id] is not None: #next_state = self.agent.state_space.force_batch(env_states[i]) next_states[i] = np.array( self.preprocessors[env_id].preprocess( env_states[i])) # next_state self._observe(self.env_ids[i], preprocessed_states[i], env_actions[i], env_rewards[i], next_states[i], episode_terminals[i]) self.update_if_necessary() timesteps_executed += self.num_environments num_timesteps_reached = (0 < num_timesteps <= timesteps_executed) if 0 < num_episodes <= episodes_executed or num_timesteps_reached: break total_time = (time.perf_counter() - start) or 1e-10 # Return values for current episode(s) if None have been completed. if episodes_executed == 0: mean_episode_runtime = 0 mean_episode_reward = np.mean(self.episode_returns) max_episode_reward = np.max(self.episode_returns) final_episode_reward = self.episode_returns[0] else: all_finished_durations = [] all_finished_rewards = [] for i in range_(self.num_environments): all_finished_rewards.extend(self.finished_episode_rewards[i]) all_finished_durations.extend( self.finished_episode_durations[i]) mean_episode_runtime = np.mean(all_finished_durations) mean_episode_reward = np.mean(all_finished_rewards) max_episode_reward = np.max(all_finished_rewards) final_episode_reward = all_finished_rewards[-1] self.episode_terminals = episode_terminals self.env_states = env_states results = dict( runtime=total_time, # Agent act/observe throughput. timesteps_executed=timesteps_executed, ops_per_second=(timesteps_executed / total_time), # Env frames including action repeats. env_frames=self.env_frames, env_frames_per_second=(self.env_frames / total_time), episodes_executed=episodes_executed, episodes_per_minute=(episodes_executed / (total_time / 60)), mean_episode_runtime=mean_episode_runtime, mean_episode_reward=mean_episode_reward, max_episode_reward=max_episode_reward, final_episode_reward=final_episode_reward) # Total time of run. self.logger.info("Finished execution in {} s".format(total_time)) # Total (RL) timesteps (actions) done (and timesteps/sec). self.logger.info("Time steps (actions) executed: {} ({} ops/s)".format( results['timesteps_executed'], results['ops_per_second'])) # Total env-timesteps done (including action repeats) (and env-timesteps/sec). self.logger.info( "Env frames executed (incl. action repeats): {} ({} frames/s)". format(results['env_frames'], results['env_frames_per_second'])) # Total episodes done (and episodes/min). self.logger.info("Episodes finished: {} ({} episodes/min)".format( results['episodes_executed'], results['episodes_per_minute'])) self.logger.info("Mean episode runtime: {}s".format( results['mean_episode_runtime'])) self.logger.info("Mean episode reward: {}".format( results['mean_episode_reward'])) self.logger.info("Max. 
episode reward: {}".format( results['max_episode_reward'])) self.logger.info("Final episode reward: {}".format( results['final_episode_reward'])) return results