def __init__(self, name, state_space, action_space, reward_space=None, rl_algo=None):
    """
    Args:
        name (str): Some name for this Actor.
        state_space (Space): The state Space that this Actor will receive from the Env.
        action_space (Space): The action Space that this Actor will be able to execute on.
        reward_space (Optional[Space]): The reward Space that this Actor will use. Default: float.
        rl_algo (Optional[RLAlgo]): The RLAlgo that this Actor will query for actions given some
            observation state from the Env.
    """
    super().__init__()

    # Some unique name for this Actor.
    self.name = name
    # The Algo controlling this Actor.
    self.rl_algo = rl_algo  # type: RLAlgo
    # The state Space (observations of this Actor).
    self.state_space = Space.make(state_space)
    # The action Space.
    self.action_space = Space.make(action_space)
    # The reward Space (will default to float if None).
    self.reward_space = Space.make(reward_space)
def __init__(self, config, name=None):
    super().__init__(config, name)

    self.Phi = Preprocessor.make(config.preprocessor)
    self.x = self.Phi(Space.make(config.state_space).with_batch())  # preprocessed states (x)
    self.a = Space.make(config.action_space).with_batch()  # actions (a)
    self.Q = Network.make(
        network=config.q_network,
        input_space=self.x,
        output_space=Dict(A=self.a, V=Float().with_batch()),  # dueling network outputs
        adapters=dict(A=dict(pre_network=config.dueling_a_network),
                      V=dict(pre_network=config.dueling_v_network))
    )
    self.Qt = self.Q.copy(trainable=False)
    self.memory = PrioritizedReplayBuffer.make(
        record_space=Dict(dict(s=self.x, a=self.a, r=float, t=bool, n=int), main_axes="B"),
        capacity=config.memory_capacity,
        alpha=config.memory_alpha,
        beta=config.memory_beta,
        next_record_setup=dict(s="s_", n_step=config.n_step)
    )
    self.n_step = NStep(config.gamma, n_step=config.n_step, n_step_only=True)  # N-step component
    self.L = DDDQNLoss()  # double/dueling/n-step Q-loss
    self.optimizer = Optimizer.make(self.config.optimizer)
    self.epsilon = Decay.make(self.config.epsilon)  # for epsilon-greedy exploration
    self.Phi.reset()  # make sure the Preprocessor is clean
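
# For reference, the dueling heads configured above (advantage "A" and value "V") are
# conventionally combined as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). The sketch below
# is illustrative only (plain tensors, hypothetical names), not the DDDQNLoss/Network internals.
import tensorflow as tf

def dueling_q_values(v, a):
    """Combine dueling heads into Q-values; `v` has shape [B, 1] (or [B]), `a` has shape [B, num_actions]."""
    v = tf.reshape(v, (-1, 1))
    return v + a - tf.reduce_mean(a, axis=-1, keepdims=True)

# Example: batch of 2 states, 3 actions each.
q_example = dueling_q_values(tf.constant([[0.5], [1.0]]),
                             tf.constant([[1.0, 2.0, 3.0], [0.0, 0.0, 3.0]]))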
def __init__(self, config, name=None):
    super().__init__(config, name)

    self.preprocessor = Preprocessor.make(config.preprocessor)
    self.s = self.preprocessor(Space.make(config.state_space).with_batch())  # preprocessed states (s)
    self.a = Space.make(config.action_space).with_batch()  # actions (a)
    self.a_soft = self.a.as_one_hot_float_space()  # soft-one-hot actions (if Int elements in action space)
    self.pi = Network.make(  # policy (π)
        distributions=dict(
            bounded_distribution_type=config.bounded_distribution_type,
            discrete_distribution_type="gumbel-softmax",
            gumbel_softmax_temperature=config.gumbel_softmax_temperature
        ),
        input_space=self.s,
        output_space=self.a,
        **config.policy_network
    )
    self.Q = []  # the Q-networks
    for i in range(config.num_q_networks):
        self.Q.append(Network.make(input_space=Dict(s=self.s, a=self.a), output_space=float, **config.q_network))
    self.Qt = [self.Q[i].copy(trainable=False) for i in range(config.num_q_networks)]  # target Q-network(s)
    record_space = Dict(
        default_dict(dict(s=self.s, a=self.a_soft, r=float, t=bool), {"n": int} if config.n_step > 1 else {}),
        main_axes="B"
    )
    self.memory = Memory.make(record_space=record_space, **config.memory_spec)
    self.alpha = tf.Variable(config.initial_alpha, name="alpha", dtype=tf.float32)  # the temperature parameter α
    self.entropy_target = Decay.make(config.entropy_target)
    self.n_step = NStep(config.gamma, n_step=config.n_step, n_step_only=True)
    self.L, self.Ls_critic, self.L_actor, self.L_alpha = SACLoss(), [0, 0], 0, 0  # SAC loss function and loss values
    # TEST
    self.log_pi, self.entropy_error_term, self.log_alpha = 0, 0, 0
    # END: TEST
    self.optimizers = dict(
        q=Optimizer.make(self.config.q_optimizer),
        pi=Optimizer.make(self.config.policy_optimizer),
        alpha=Optimizer.make(self.config.alpha_optimizer)
    )
    self.preprocessor.reset()  # make sure the Preprocessor is clean
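
# For context, `self.alpha` and `self.entropy_target` above feed SAC's temperature objective.
# Below is a minimal, self-contained sketch of that standard objective with dummy tensors;
# names and values are illustrative assumptions, not SACLoss's internals:
#     J(alpha) = E[ -alpha * (log_pi(a|s) + entropy_target) ]
import tensorflow as tf

alpha = tf.Variable(1.0, dtype=tf.float32)
log_pi = tf.constant([-1.2, -0.7, -2.3])  # log-probs of sampled actions (dummy values)
entropy_target = -3.0                     # e.g. -dim(A)

with tf.GradientTape() as tape:
    alpha_loss = -tf.reduce_mean(alpha * tf.stop_gradient(log_pi + entropy_target))
alpha_grad = tape.gradient(alpha_loss, alpha)  # would then be applied by the alpha optimizer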
def __init__(self, input_space):
    """
    Args:
        input_space (Space): The input Space.
    """
    super().__init__()

    self.input_space = Space.make(input_space)
    # How many samples have we seen (after last reset)?
    self.sample_count = None
    # Current estimate of the mean.
    self.mean_est = None
    # Current estimate of the sum of stds.
    self.std_sum_est = None
    self.reset()
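
# The component above tracks a sample count, a mean estimate, and a sum-of-stds estimate.
# Below is a generic Welford-style running-statistics sketch, assuming a comparable
# incremental update is intended; it is not necessarily this component's exact update logic.
import numpy as np

count, mean_est, m2 = 0, 0.0, 0.0  # m2: sum of squared deviations (analogue of the std-sum estimate)
for x in np.array([1.0, 2.0, 4.0, 7.0]):
    count += 1
    delta = x - mean_est
    mean_est += delta / count
    m2 += delta * (x - mean_est)
std = np.sqrt(m2 / count)  # population std of the samples seen so far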
def __init__(self, network, *, output_space, adapters=None, distributions=False, deterministic=False,
             input_space=None, pre_concat_networks=None, auto_flatten_inputs=True):
    """
    Args:
        network (Union[tf.keras.models.Model,tf.keras.layers.Layer,callable]): The neural network callable
            (w/o the final action-layer) for this function approximator.

        output_space (Space): The output Space (may be a ContainerSpace).

        adapters (dict): Optional spec(s) for the DistributionAdapter(s) linking the main NN's output to the
            output space. May be a single (generic) spec applied to all output components, a (possibly
            incomplete) dict matching the output space's structure, or an already constructed
            DistributionAdapter.

        distributions (Union[Dict,bool,str]): Distribution specification for the different output components.
            Supported values are:
            Dict[str,any]: A dictionary matching the output space's structure and specifying for each
                component what the distribution should be (or False/None for no distribution).
            bool: True if all components should have the default distribution according to their Space type.
                False if no component should have a distribution.
            "default": See True.
            None: See False.
            Values of True/False/"default"/None may also be given inside a nested dict (see Dict above)
            for specific components of the output space.

        deterministic (bool): Whether to sample (from possible distributions) deterministically.
            Default: False (stochastic sampling).

        input_space (Optional[Space]): Input space may be provided to ensure immediate build of the network
            (and its variables). Also, if it is a ContainerSpace, will build additional "pre-concat" NNs,
            through which input components are passed before being concat'd and sent further through the
            main NN.

        pre_concat_networks (Union[Dict,Tuple]): The neural network callable(s) for the different input
            components. Only applicable if `input_space` is given and is a ContainerSpace.

        auto_flatten_inputs (bool): If True, will try to automatically flatten (or one-hot) all input
            components, but only if no `pre_concat_network` has been specified for that input component.
            For Int: One-hot along all non-main-axes.
                E.g. [[2, 3], [1, 2]] -> [0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0]
            For Float: Flatten along all non-main axes. E.g. [[2.0, 3.0], [1.0, 2.0]] -> [2.0 3.0 1.0 2.0]
            For Bool: Flatten along all non-main axes and convert to 0.0 (False) or 1.0 (True).
            Default: True.
    """
    super().__init__()

    # Store the given tf.keras.Model.
    self.network = network
    # Whether distribution outputs should be sampled deterministically.
    self.deterministic = deterministic

    # Create the output adapters.
    self.output_space = None
    self.flat_output_space = None
    # The adapters linking the main NN's output to the output layer(s)/distributions.
    self.adapters = []
    # The distributions to use (if any) for different components of the output space.
    self.distributions = []
    self._create_adapters_and_distributions(output_space, adapters, distributions)

    # Input space given explicitly.
    self.input_space = Space.make(input_space).with_batch() if input_space is not None else None
    self.flat_input_space = None
    self.pre_concat_networks = []  # One per input component.
    if self.input_space is not None:
        # If container space, build input NNs, then concat and connect to `self.network`.
        if isinstance(self.input_space, ContainerSpace):
            self._create_pre_concat_networks(pre_concat_networks, auto_flatten_inputs)
        # Push through a sample to build our weights.
        self(self.input_space.sample())
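
# The `auto_flatten_inputs` behavior documented above (one-hot for Int, flatten for Float) can be
# reproduced with plain TensorFlow ops. The sketch below mirrors the docstring's examples for a single
# sample with 4 Int categories; it is illustrative only, not the internal pre-concat logic.
import tensorflow as tf

int_input = tf.constant([[2, 3], [1, 2]])            # one Int sample of shape [2, 2], values in {0..3}
one_hot = tf.one_hot(int_input, depth=4)             # shape [2, 2, 4]
flat_int = tf.reshape(one_hot, (-1,))                # [0 0 1 0  0 0 0 1  0 1 0 0  0 0 1 0]

float_input = tf.constant([[2.0, 3.0], [1.0, 2.0]])
flat_float = tf.reshape(float_input, (-1,))          # [2.0 3.0 1.0 2.0]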
def _create_adapters_and_distributions(self, output_space, adapters, distributions):
    if output_space is None:
        adapter = DistributionAdapter.make(adapters)
        self.output_space = adapter.output_space
        # Assert single component output space.
        assert isinstance(self.output_space, PrimitiveSpace), \
            "ERROR: Output space must not be ContainerSpace if no `output_space` is given in Network constructor!"
    else:
        self.output_space = Space.make(output_space)
    self.flat_output_space = tf.nest.flatten(self.output_space)

    # Find out whether we have a generic adapter-spec (one for all output components).
    generic_adapter_spec = None
    if isinstance(adapters, dict) and not any(key in adapters for key in self.output_space):
        generic_adapter_spec = adapters
    # `adapters` may be incomplete (add Nones to non-defined leafs).
    elif isinstance(adapters, dict):
        adapters = complement_struct(adapters, reference_struct=self.output_space)
        flat_output_adapter_spec = flatten_alongside(adapters, alongside=self.output_space)

    # Find out whether we have a generic distribution-spec (one for all output components).
    generic_distribution_spec = None
    if isinstance(self.output_space, PrimitiveSpace) or \
            (isinstance(distributions, dict) and not any(key in distributions for key in self.output_space)):
        generic_distribution_spec = distributions
        flat_distribution_spec = tf.nest.map_structure(lambda s: distributions, self.flat_output_space)
    else:
        # `distributions` may be incomplete (add Nones to non-defined leafs).
        if isinstance(distributions, dict):
            distributions = complement_struct(distributions, reference_struct=self.output_space)
        # No distributions whatsoever.
        elif not distributions:
            distributions = complement_struct({}, reference_struct=self.output_space)
        # Use default distributions (depending on output-space(s)).
        elif distributions is True or distributions == "default":
            distributions = complement_struct({}, reference_struct=self.output_space, value=True)
        flat_distribution_spec = tf.nest.flatten(distributions)

    # Figure out our Distributions.
    for i, output_component in enumerate(self.flat_output_space):
        # Generic spec -> Use it.
        if generic_adapter_spec:
            da_spec = copy.deepcopy(generic_adapter_spec)
            da_spec["output_space"] = output_component
        # Spec dict -> find setting in possibly incomplete spec.
        elif isinstance(adapters, dict):
            # If not specified in dict -> auto-generate AA-spec.
            da_spec = flat_output_adapter_spec[i]
            da_spec["output_space"] = output_component
        # Simple type spec.
        elif not isinstance(adapters, DistributionAdapter):
            da_spec = dict(output_space=output_component)
        # Direct object.
        else:
            da_spec = adapters

        # We have to get the type of the adapter from a distribution.
        if isinstance(da_spec, dict) and "type" not in da_spec:
            # Single distribution settings for all output components.
            if generic_distribution_spec is not None:
                settings = {} if generic_distribution_spec in ["default", True, False] else \
                    (generic_distribution_spec or {})
            else:
                settings = flat_distribution_spec[i] if isinstance(flat_distribution_spec[i], dict) else {}
            # `distributions` could be simply a direct spec dict.
            if (isinstance(settings, dict) and "type" in settings) or isinstance(settings, Distribution):
                dist_spec = settings
            else:
                dist_spec = get_default_distribution_from_space(output_component, **settings)

            # No distribution.
            if not generic_distribution_spec and not flat_distribution_spec[i]:
                self.distributions.append(None)
            # Some distribution.
            else:
                self.distributions.append(Distribution.make(dist_spec))
                if self.distributions[-1] is None:
                    raise SurrealError(
                        "`output_component` is of type {} and not allowed in {} Component!".format(
                            type(output_component).__name__, type(self).__name__
                        )
                    )
            # Special case: No distribution AND float -> plain output adapter.
            if not generic_distribution_spec and \
                    (not flat_distribution_spec[i] and isinstance(da_spec["output_space"], Float)):
                da_spec["type"] = "plain-output-adapter"
            # All other cases: Get adapter type from distribution spec
            # (even if we don't use a distribution in the end).
            else:
                default_dict(da_spec, get_adapter_spec_from_distribution_spec(dist_spec))

            self.adapters.append(DistributionAdapter.make(da_spec))

        # da_spec is completely defined -> Use it to get the distribution.
        else:
            self.adapters.append(DistributionAdapter.make(da_spec))
            if distributions[i]:
                dist_spec = get_distribution_spec_from_adapter(self.adapters[-1])
                self.distributions.append(Distribution.make(dist_spec))
def __init__(
        self, *,
        policy_network,
        q_network,
        state_space,
        action_space,
        sac_config,
        num_q_experts=4,  # 4 used in paper.
        q_predicts_states_diff=False,
        num_denominator_samples_for_ri=250,  # 50-500 used in paper.
        dim_skill_vectors=10,
        discrete_skills=False,
        episode_horizon=200,
        skill_horizon=None,
        preprocessor=None,
        supervised_optimizer=None,
        num_steps_per_supervised_update=1,
        episode_buffer_capacity=200,
        summaries=None
):
    """
    Args:
        policy_network (Network): The policy-network (pi) to use as a function approximator for the
            learnt policy.

        q_network (Network): The dynamics-network (q) to use as a function approximator for the learnt
            env dynamics. NOTE: Not to be confused with a Q-learning Q-net! In the paper, the dynamics
            function is called `q`, hence the same nomenclature here.

        state_space (Space): The state/observation Space.
        action_space (Space): The action Space.

        sac_config (SACConfig): The config for the internal SAC-Algo used to learn the skills using
            intrinsic rewards.

        num_q_experts (int): The number of experts used in the Mixture distribution output by the
            q-network to predict the next state (s') given s (state) and z (skill vector).

        q_predicts_states_diff (bool): Whether the q-network predicts the difference between s and s'
            rather than s' directly. Default: False.

        num_denominator_samples_for_ri (int): The number of samples to calculate for the denominator of
            the intrinsic reward function (`L` in the paper).

        dim_skill_vectors (int): The number of dimensions of the learnt skill vectors.
        discrete_skills (bool): Whether skill vectors are discrete (one-hot).
        episode_horizon (int): The episode horizon (He) to move within when gathering episode samples.

        skill_horizon (Optional[int]): The horizon for which to use one skill vector (before sampling a
            new one). Default: Use value of `episode_horizon`.

        preprocessor (Preprocessor): The preprocessor (if any) to use.
        supervised_optimizer (Optimizer): The optimizer to use for the supervised (q) model learning task.

        num_steps_per_supervised_update (int): The number of gradient descent iterations per update
            (each iteration uses the same environment samples).

        episode_buffer_capacity (int): The capacity of the episode (experience) FIFOBuffer.

        summaries (List[any]): A list of summaries to produce if `UseTfSummaries` in debug.json is true.
            In the simplest case, this is a list of `self.[...]`-property names of the SAC object that
            should be tracked after each tick.
    """
    # Clean up network configs to be passable as **kwargs to `make`.
    # Networks are given as sequential config or directly as Keras objects -> prepend "network" key to spec.
    if isinstance(policy_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
        policy_network = dict(network=policy_network)
    if isinstance(q_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
        q_network = dict(network=q_network)

    # Make state/action space.
    state_space = Space.make(state_space)
    action_space = Space.make(action_space)

    # Fix SAC config, add correct state- and action-spaces.
    sac_config = SACConfig.make(
        sac_config,
        state_space=Dict(s=state_space, z=Float(-1.0, 1.0, shape=(dim_skill_vectors,))),
        action_space=action_space,
        # Use no memory. Updates are done from DADS' own buffer.
        memory_capacity=1, memory_batch_size=1,
        # Share policy network between DADS and the underlying learning SAC.
        policy_network=policy_network
    )

    if skill_horizon is None:
        skill_horizon = episode_horizon

    super().__init__(locals())  # Config will store all c'tor variables automatically.
    # Keep track of the time steps at which stuff happened. Only important for by-time-step frequencies.
    self.last_update = 0
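
# `num_denominator_samples_for_ri` above is the `L` in the denominator of DADS' intrinsic reward.
# Below is a minimal sketch of that reward, assuming precomputed log-likelihoods from the dynamics
# network q; the function name, argument names, and shapes are assumptions, not this class's API.
import tensorflow as tf

def dads_intrinsic_reward(log_q_s_next_given_z, log_q_s_next_given_sampled_zs):
    """Sketch: r_i = log q(s'|s, z) - log( (1/L) * sum_i q(s'|s, z_i) ).

    Args:
        log_q_s_next_given_z: [B] log-likelihoods under the behavior skill z.
        log_q_s_next_given_sampled_zs: [B, L] log-likelihoods under L skills sampled from the prior.
    """
    L = tf.cast(tf.shape(log_q_s_next_given_sampled_zs)[-1], tf.float32)
    log_denominator = tf.reduce_logsumexp(log_q_s_next_given_sampled_zs, axis=-1) - tf.math.log(L)
    return log_q_s_next_given_z - log_denominator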
def __init__(
        self, *,
        q_network,
        state_space,
        action_space,
        policy_network=None,
        preprocessor=None,
        default_optimizer=None,
        q_optimizer=None,
        policy_optimizer=None,
        alpha_optimizer=None,
        optimize_alpha=True,
        bounded_distribution_type="squashed-normal",
        gumbel_softmax_temperature=1.0,
        gamma=0.99,
        num_q_networks=2,
        memory_capacity=10000,
        memory_batch_size=256,
        use_prioritized_replay=False,
        memory_alpha=1.0,
        memory_beta=0.0,
        initial_alpha=1.0,
        entropy_target=None,  # default: -dim(A), but this won't work for Atari.
        n_step=1,
        max_time_steps=None,
        update_after=0,
        update_frequency=1,
        num_steps_per_update=1,
        sync_frequency=1,
        sync_tau=0.005,
        time_unit="time_step",
        summaries=None
):
    """
    Args:
        q_network (Network): The Q-network to use as a function approximator for the learnt Q-function.
        state_space (Space): The state/observation Space.
        action_space (Space): The action Space.

        policy_network (Network): The policy-network (pi) to use as a function approximator for the
            learnt policy. Default: Use the same setup as the q-network(s).

        preprocessor (Preprocessor): The preprocessor (if any) to use.

        default_optimizer (Optimizer): The optimizer to use for any of Q/pi/alpha that does not have its
            own optimizer defined.

        q_optimizer (Optimizer): The optimizer to use for the Q-network. If None, use `default_optimizer`.
        policy_optimizer (Optimizer): The optimizer to use for the policy (pi). If None, use `default_optimizer`.
        alpha_optimizer (Optimizer): The optimizer to use for the alpha parameter. If None, use `default_optimizer`.

        optimize_alpha (bool): Whether to use the alpha loss term and an optimizer step to update alpha.
            False for keeping alpha constant at `initial_alpha`.

        bounded_distribution_type (str): Which distribution type to use for continuous, bounded output
            spaces. Must be a Distribution class type string. See components/distributions/__init__.py.

        gumbel_softmax_temperature (float): Iff `discrete_distribution_type`="gumbel-softmax" (which is
            fixed and required for SAC), which temperature parameter to use.

        gamma (float): The discount factor (gamma).
        memory_capacity (int): The memory's capacity (max number of records to store).
        memory_batch_size (int): The batch size to use for updating from memory.

        use_prioritized_replay (bool): Whether to use a PrioritizedReplayBuffer (instead of a plain
            ReplayBuffer).

        memory_alpha (float): The alpha value for the PrioritizedReplayBuffer.
        memory_beta (float): The beta value for the PrioritizedReplayBuffer.
        initial_alpha (float): The initial value for alpha (before optimization).
        entropy_target (float): The value of "Hbar" in the loss function for alpha. Default is -dim(A).

        n_step (int): The number of steps (n) to "look ahead/back" when converting 1-step tuples into
            n-step ones.
        #n_step_only (bool): Whether to exclude samples that are shorter than `n_step` AND don't have a
        #    terminal at the end.

        max_time_steps (Optional[int]): The maximum number of time steps (across all actors) to
            learn/update. If None, use a value given by the environment.

        update_after (Union[int,str]): The `time_unit`s to wait before starting any updates.
            Special values (only valid iff time_unit == "time_step"!):
            - "when-memory-full" for same as `memory_capacity`.
            - "when-memory-ready" for same as `memory_batch_size`.

        update_frequency (int): The frequency (in `time_unit`) with which to update our Q-network.

        num_steps_per_update (int): The number of gradient descent iterations per update (each iteration
            uses a different sample).

        sync_frequency (int): The frequency (in `time_unit`) with which to sync our target network.
        sync_tau (float): The target smoothing coefficient with which to synchronize the target Q-network.
        time_unit (str["time_step","env_tick"]): The time units we are using for update/sync decisions.

        summaries (List[any]): A list of summaries to produce if `UseTfSummaries` in debug.json is true.
            In the simplest case, this is a list of `self.[...]`-property names of the SAC object that
            should be tracked after each tick.
    """
    # If one is not given, use the other NN's config; make sure the given network is not an already
    # built Keras object.
    if policy_network is None:
        assert isinstance(q_network, (dict, list, tuple))
        policy_network = q_network
    elif q_network is None:
        assert isinstance(policy_network, (dict, list, tuple))
        q_network = policy_network

    # Clean up network configs to be passable as **kwargs to `make`.
    # Networks are given as sequential config or directly as Keras objects -> prepend "network" key to spec.
    if isinstance(q_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
        q_network = dict(network=q_network)
    if isinstance(policy_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
        policy_network = dict(network=policy_network)

    # Make sure our optimizers are defined ok.
    if default_optimizer is None:
        assert q_optimizer and policy_optimizer and alpha_optimizer
    if q_optimizer and policy_optimizer and alpha_optimizer:
        if default_optimizer:
            logging.warning(
                "***WARNING: `default_optimizer` defined, but has no effect b/c `q_optimizer`, "
                "`policy_optimizer` and `alpha_optimizer` are already provided!"
            )
    if q_optimizer is None:
        q_optimizer = default_optimizer
    if policy_optimizer is None:
        policy_optimizer = default_optimizer
    if alpha_optimizer is None:
        alpha_optimizer = default_optimizer

    assert time_unit in ["time_step", "env_tick"]

    # Special value for start-train parameter -> When memory full.
    if update_after == "when-memory-full":
        update_after = memory_capacity
    # Special value for start-train parameter -> When memory has enough records to pull a batch.
    elif update_after == "when-memory-ready":
        update_after = memory_batch_size
    assert isinstance(update_after, int)

    # Make sure sync-freq >= update-freq.
    assert sync_frequency >= update_frequency
    # Make sure memory batch size is not larger than capacity.
    assert memory_batch_size <= memory_capacity

    # Derive memory_spec for SAC c'tor.
    # If PR -> Check that alpha is not 0.0.
    if use_prioritized_replay is True:
        if memory_alpha == 0.0:
            logging.warning(
                "***WARNING: `use_prioritized_replay` is True, but memory's alpha is set to 0.0 (which "
                "implies no prioritization whatsoever)!"
            )
        memory_spec = dict(type="prioritized-replay-buffer", alpha=memory_alpha, beta=memory_beta)
    else:
        memory_spec = dict(type="replay-buffer")
    memory_spec["capacity"] = memory_capacity
    memory_spec["next_record_setup"] = dict(s="s_", n_step=n_step)  # setup: s' is the next-record of s (after n steps).

    # Make action space.
    action_space = Space.make(action_space)

    # Default Hbar: -dim(A) (according to the paper).
    if entropy_target is None:
        entropy_target = -(action_space.flat_dim_with_categories if isinstance(action_space, Int)
                           else action_space.flat_dim)
        print("entropy_target={}".format(entropy_target))

    super().__init__(locals())  # Config will store all c'tor variables automatically.

    # Keep track of the time steps at which stuff happened. Only important for by-time-step frequencies.
    self.last_update = 0
    self.last_sync = 0
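
# `sync_tau` above controls a soft ("Polyak") target-network sync:
#     target <- tau * online + (1 - tau) * target
# A minimal sketch of that update, assuming plain Keras models; the function name and structure are
# illustrative, not the library's sync mechanism.
import tensorflow as tf

def soft_sync(online_net: tf.keras.Model, target_net: tf.keras.Model, tau: float = 0.005):
    """Blend each target weight toward its online counterpart by factor `tau`."""
    for w_online, w_target in zip(online_net.weights, target_net.weights):
        w_target.assign(tau * w_online + (1.0 - tau) * w_target)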