Example #1
    def __init__(self,
                 name,
                 state_space,
                 action_space,
                 reward_space=None,
                 rl_algo=None):
        """
        Args:
            name (str): Some name for this Actor.
            state_space (Space): The state Space that this Actor will receive from the Env.
            action_space (Space): The action Space that this Actor will be able to execute on.
            reward_space (Optional[Space]): The reward Space that this Actor will use.
                Default: float.

            rl_algo (Optional[RLAlgo]): The RLAlgo that this Actor will query for actions given some observation
                state from the Env.
        """
        super().__init__()

        # Some unique name for this Actor.
        self.name = name
        # The Algo controlling this Actor.
        self.rl_algo = rl_algo  # type: RLAlgo

        # The state Space (observations of this Actor).
        self.state_space = Space.make(state_space)
        # The action Space.
        self.action_space = Space.make(action_space)
        # The reward Space (will default to float if None).
        self.reward_space = Space.make(reward_space)
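
A minimal, hypothetical usage sketch of the constructor above. The Float/Int space constructor signatures are assumed from the other examples on this page; the concrete name and spaces are made up.

# Hedged usage sketch (argument values are illustrative, not taken from the library's docs).
actor = Actor(
    name="actor_0",
    state_space=Float(-1.0, 1.0, shape=(4,)),  # small continuous observation space (assumed signature)
    action_space=Int(2),                       # two discrete actions (assumed signature)
    rl_algo=None,                              # an RLAlgo can be attached later
)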
Example #2
    def __init__(self, config, name=None):
        super().__init__(config, name)
        self.Phi = Preprocessor.make(config.preprocessor)
        self.x = self.Phi(Space.make(config.state_space).with_batch())  # preprocessed states (x)
        self.a = Space.make(config.action_space).with_batch()  # actions (a)
        self.Q = Network.make(
            network=config.q_network,
            input_space=self.x,
            output_space=Dict(A=self.a, V=Float().with_batch()),  # dueling network outputs
            adapters=dict(A=dict(pre_network=config.dueling_a_network),
                          V=dict(pre_network=config.dueling_v_network)))
        self.Qt = self.Q.copy(trainable=False)  # target Q-network
        self.memory = PrioritizedReplayBuffer.make(
            record_space=Dict(dict(s=self.x, a=self.a, r=float, t=bool, n=int), main_axes="B"),
            capacity=config.memory_capacity,
            alpha=config.memory_alpha,
            beta=config.memory_beta,
            next_record_setup=dict(s="s_", n_step=config.n_step))
        self.n_step = NStep(config.gamma, n_step=config.n_step, n_step_only=True)  # N-step component
        self.L = DDDQNLoss()  # double/dueling/n-step Q-loss
        self.optimizer = Optimizer.make(self.config.optimizer)
        self.epsilon = Decay.make(self.config.epsilon)  # for epsilon-greedy exploration
        self.Phi.reset()  # make sure the Preprocessor is clean
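
The NStep component above is configured with gamma and n_step; as a plain-Python illustration (not library code), the n-step return it is built around accumulates discounted rewards over the next n steps:

# Illustrative only: R_t^(n) = r_t + gamma*r_{t+1} + ... + gamma^(n-1)*r_{t+n-1}
def n_step_return(rewards, gamma=0.99, n_step=3):
    ret, discount = 0.0, 1.0
    for r in rewards[:n_step]:
        ret += discount * r
        discount *= gamma
    return ret

assert abs(n_step_return([1.0, 1.0, 1.0]) - (1.0 + 0.99 + 0.99 ** 2)) < 1e-8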
Example #3
    def __init__(self, config, name=None):
        super().__init__(config, name)
        self.preprocessor = Preprocessor.make(config.preprocessor)
        self.s = self.preprocessor(Space.make(config.state_space).with_batch())  # preprocessed states (s)
        self.a = Space.make(config.action_space).with_batch()  # actions (a)
        self.a_soft = self.a.as_one_hot_float_space()  # soft-one-hot actions (if Int elements in action space)
        self.pi = Network.make(distributions=dict(  # policy (π)
            bounded_distribution_type=config.bounded_distribution_type, discrete_distribution_type="gumbel-softmax",
            gumbel_softmax_temperature=config.gumbel_softmax_temperature
        ), input_space=self.s, output_space=self.a, **config.policy_network)
        self.Q = []  # the Q-networks
        for i in range(config.num_q_networks):
            self.Q.append(Network.make(input_space=Dict(s=self.s, a=self.a), output_space=float, **config.q_network))
        self.Qt = [self.Q[i].copy(trainable=False) for i in range(config.num_q_networks)]  # target q-network(s)
        record_space = Dict(default_dict(dict(s=self.s, a=self.a_soft, r=float, t=bool),
                                         {"n": int} if config.n_step > 1 else {}), main_axes="B")
        self.memory = Memory.make(record_space=record_space, **config.memory_spec)
        self.alpha = tf.Variable(config.initial_alpha, name="alpha", dtype=tf.float32)  # the temperature parameter α
        self.entropy_target = Decay.make(config.entropy_target)
        self.n_step = NStep(config.gamma, n_step=config.n_step, n_step_only=True)
        self.L, self.Ls_critic, self.L_actor, self.L_alpha = SACLoss(), [0, 0], 0, 0  # SAC loss function and values.

        # TEST
        self.log_pi, self.entropy_error_term, self.log_alpha = 0, 0, 0
        # END: TEST

        self.optimizers = dict(
            q=Optimizer.make(self.config.q_optimizer), pi=Optimizer.make(self.config.policy_optimizer),
            alpha=Optimizer.make(self.config.alpha_optimizer)
        )
        self.preprocessor.reset()  # make sure the Preprocessor is clean
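
The temperature variable alpha and the entropy_target created above are typically tied together through SAC's temperature objective. The following is a hedged sketch of the standard form (not necessarily this file's exact implementation):

import tensorflow as tf

# J(alpha) = E[-alpha * (log_pi(a|s) + Hbar)], with Hbar the entropy target (default -dim(A)).
def alpha_loss(alpha, log_pi, entropy_target):
    return -tf.reduce_mean(alpha * (tf.stop_gradient(log_pi) + entropy_target))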
Example #4
    def __init__(self, input_space):
        """
        Args:
            input_space (Space): The input Space whose incoming samples' statistics (mean/std) are estimated.
        """
        super().__init__()

        self.input_space = Space.make(input_space)

        # How many samples have we seen (after last reset)?
        self.sample_count = None
        # Current estimate of the mean.
        self.mean_est = None
        # Current estimate of the sum of stds.
        self.std_sum_est = None

        self.reset()
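
The fields above (sample_count, mean_est, std_sum_est) describe an incremental mean/std estimator. A Welford-style update is one common way to maintain such statistics; this sketch is illustrative and not the class's actual update code:

import numpy as np

count, mean_est, m2 = 0, 0.0, 0.0
for x in [1.0, 2.0, 4.0, 7.0]:
    count += 1
    delta = x - mean_est
    mean_est += delta / count
    m2 += delta * (x - mean_est)             # running sum of squared deviations
std_est = np.sqrt(m2 / max(count - 1, 1))    # sample standard deviation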
Example #5
    def __init__(self,
                 network,
                 *,
                 output_space,
                 adapters=None,
                 distributions=False,
                 deterministic=False,
                 input_space=None,
                 pre_concat_networks=None,
                 auto_flatten_inputs=True):
        """
        Args:
            network (Union[tf.keras.models.Model,tf.keras.layers.Layer,callable]): The neural network callable
                (w/o the final action-layer) for this function approximator.

            output_space (Space): The output Space (may be a ContainerSpace).

            adapters (Optional[dict]): Specification(s) for the DistributionAdapter(s) linking the main NN's output
                to the output layer(s)/distributions. May be a (possibly incomplete) dict matching the output
                space's structure, specifying each output component's adapter individually.

            distributions (Union[Dict,bool,str]): Distribution specification for the different output components.
                Supported values are:
                Dict[str,any]: A dictionary, matching the output space's structure and specifying for each component,
                    what the distribution should be (or False/None for no distribution).
                bool: True if all components should have the default distribution according to their Space type.
                    False if no component should have a distribution.
                "default": See True.
                None: See False.
                Values of True/False/"default"/None may also be given inside a nested dict (see Dict above) for
                    specific components of the output space.

            deterministic (bool): Whether to sample (from possible distributions) deterministically.
                Default: False (stochastic sampling).

            input_space (Optional[Space]): Input space may be provided to ensure immediate build of the network (and
                its variables). Also, if it's a ContainerSpace, will build additional "pre-concat" NNs, through
                which input components are passed before being concatenated and sent further through the main NN.

            pre_concat_networks (Union[Dict,Tuple]): The neural network callable(s) for the different input
                components. Only applicable if `input_space` is given and is a ContainerSpace.

            auto_flatten_inputs (bool): If True, will try to automatically flatten (or one-hot) all input components,
                but only if for that input-component, no `pre_concat_network` has been specified.
                For Int: One-hot along all non-main-axes. E.g. [[2, 3], [1, 2]] -> [0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0]
                For Float: Flatten along all non-main axes. E.g. [[2.0, 3.0], [1.0, 2.0]] -> [2.0 3.0 1.0 2.0]
                For Bool: Flatten along all non-main axes and convert to 0.0 (False) or 1.0 (True).
                Default: True.
        """
        super().__init__()

        # Store the given tf.keras.Model.
        self.network = network

        # Whether distribution outputs should be sampled deterministically.
        self.deterministic = deterministic

        # Create the output adapters.
        self.output_space = None
        self.flat_output_space = None
        # The adapters linking the main NN's output to the output layer(s)/distributions.
        self.adapters = []
        # The distributions to use (if any) for different components of the output space.
        self.distributions = []
        self._create_adapters_and_distributions(output_space, adapters,
                                                distributions)

        # Input space given explicitly.
        self.input_space = Space.make(input_space).with_batch() if input_space is not None else None
        self.flat_input_space = None
        self.pre_concat_networks = []  # One per input component.
        if self.input_space is not None:
            # If container space, build input NNs, then concat and connect to `self.network`.
            if isinstance(self.input_space, ContainerSpace):
                self._create_pre_concat_networks(pre_concat_networks,
                                                 auto_flatten_inputs)
            # Push through a sample to build our weights.
            self(self.input_space.sample())
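
The auto_flatten_inputs behavior described in the docstring can be mimicked with plain NumPy. The following only illustrates the documented Int/Float/Bool cases and is not the library's own flattening code:

import numpy as np

int_input = np.array([[2, 3], [1, 2]])
one_hot_flat = np.eye(4)[int_input.reshape(-1)].reshape(-1)   # Int -> one-hot (4 categories assumed), then flatten
# -> [0 0 1 0  0 0 0 1  0 1 0 0  0 0 1 0]

float_input = np.array([[2.0, 3.0], [1.0, 2.0]])
float_flat = float_input.reshape(-1)                          # Float -> flatten: [2.0 3.0 1.0 2.0]

bool_input = np.array([[True, False]])
bool_flat = bool_input.astype(np.float32).reshape(-1)         # Bool -> 0.0/1.0, then flatten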
Example #6
    def _create_adapters_and_distributions(self, output_space, adapters,
                                           distributions):
        if output_space is None:
            adapter = DistributionAdapter.make(adapters)
            self.output_space = adapter.output_space
            # Assert single component output space.
            assert isinstance(self.output_space, PrimitiveSpace), \
                "ERROR: Output space must not be ContainerSpace if no `output_space` is given in Network constructor!"
        else:
            self.output_space = Space.make(output_space)
        self.flat_output_space = tf.nest.flatten(self.output_space)

        # Find out whether we have a generic adapter-spec (one for all output components).
        generic_adapter_spec = None
        if isinstance(adapters, dict) and not any(key in adapters for key in self.output_space):
            generic_adapter_spec = adapters
        # `adapters` may be incomplete (add Nones to non-defined leaves).
        elif isinstance(adapters, dict):
            adapters = complement_struct(adapters, reference_struct=self.output_space)
        flat_output_adapter_spec = flatten_alongside(adapters, alongside=self.output_space)

        # Find out whether we have a generic distribution-spec (one for all output components).
        generic_distribution_spec = None
        if isinstance(self.output_space, PrimitiveSpace) or \
                (isinstance(distributions, dict) and not any(key in distributions for key in self.output_space)):
            generic_distribution_spec = distributions
            flat_distribution_spec = tf.nest.map_structure(
                lambda s: distributions, self.flat_output_space)
        else:
            # `distributions` may be incomplete (add Nones to non-defined leaves).
            if isinstance(distributions, dict):
                distributions = complement_struct(distributions, reference_struct=self.output_space)
            # No distributions whatsoever.
            elif not distributions:
                distributions = complement_struct({}, reference_struct=self.output_space)
            # Use default distributions (depending on output-space(s)).
            elif distributions is True or distributions == "default":
                distributions = complement_struct({}, reference_struct=self.output_space, value=True)
            flat_distribution_spec = tf.nest.flatten(distributions)

        # Figure out our Distributions.
        for i, output_component in enumerate(self.flat_output_space):
            # Generic spec -> Use it.
            if generic_adapter_spec:
                da_spec = copy.deepcopy(generic_adapter_spec)
                da_spec["output_space"] = output_component
            # Spec dict -> find setting in possibly incomplete spec.
            elif isinstance(adapters, dict):
                # If not specified in dict -> auto-generate AA-spec.
                da_spec = flat_output_adapter_spec[i]
                da_spec["output_space"] = output_component
            # Simple type spec.
            elif not isinstance(adapters, DistributionAdapter):
                da_spec = dict(output_space=output_component)
            # Direct object.
            else:
                da_spec = adapters

            # We have to get the type of the adapter from a distribution.
            if isinstance(da_spec, dict) and "type" not in da_spec:
                # Single distribution settings for all output components.
                if generic_distribution_spec is not None:
                    settings = {} if generic_distribution_spec in [
                        "default", True, False
                    ] else (generic_distribution_spec or {})
                else:
                    settings = flat_distribution_spec[i] if isinstance(
                        flat_distribution_spec[i], dict) else {}
                # `distributions` could be simply a direct spec dict.
                if (isinstance(settings, dict)
                        and "type" in settings) or isinstance(
                            settings, Distribution):
                    dist_spec = settings
                else:
                    dist_spec = get_default_distribution_from_space(
                        output_component, **settings)

                # No distribution.
                if not generic_distribution_spec and not flat_distribution_spec[
                        i]:
                    self.distributions.append(None)
                # Some distribution.
                else:
                    self.distributions.append(Distribution.make(dist_spec))
                    if self.distributions[-1] is None:
                        raise SurrealError(
                            "`output_component` is of type {} and not allowed in {} Component!".format(
                                type(output_component).__name__, type(self).__name__))
                # Special case: No distribution AND float -> plain output adapter.
                if not generic_distribution_spec and \
                        (not flat_distribution_spec[i] and isinstance(da_spec["output_space"], Float)):
                    da_spec["type"] = "plain-output-adapter"
                # All other cases: Get adapter type from distribution spec
                # (even if we don't use a distribution in the end).
                else:
                    default_dict(da_spec, get_adapter_spec_from_distribution_spec(dist_spec))

                self.adapters.append(DistributionAdapter.make(da_spec))

            # da_spec is completely defined  -> Use it to get distribution.
            else:
                self.adapters.append(DistributionAdapter.make(da_spec))
                if distributions[i]:
                    dist_spec = get_distribution_spec_from_adapter(
                        self.adapters[-1])
                    self.distributions.append(Distribution.make(dist_spec))
Exemple #7
0
    def __init__(
            self,
            *,
            policy_network,
            q_network,
            state_space,
            action_space,
            sac_config,
            num_q_experts=4,  # 4 used in paper.
            q_predicts_states_diff=False,
            num_denominator_samples_for_ri=250,  # 50-500 used in paper
            dim_skill_vectors=10,
            discrete_skills=False,
            episode_horizon=200,
            skill_horizon=None,
            preprocessor=None,
            supervised_optimizer=None,
            num_steps_per_supervised_update=1,
            episode_buffer_capacity=200,
            summaries=None):
        """
        Args:
            policy_network (Network): The policy-network (pi) to use as a function approximator for the learnt policy.

            q_network (Network): The dynamics-network (q) to use as a function approximator for the learnt env
                dynamics. NOTE: Not to be confused with a Q-learning Q-net! In the paper, the dynamics function is
                called `q`, hence the same nomenclature here.

            state_space (Space): The state/observation Space.
            action_space (Space): The action Space.
            sac_config (SACConfig): The config for the internal SAC-Algo used to learn the skills using intrinsic rewards.

            num_q_experts (int): The number of experts used in the Mixture distribution output by the q-network to
                predict the next state (s') given s (state) and z (skill vector).

            q_predicts_states_diff (bool): Whether the q-network predicts the difference between s and s' rather than
                s' directly. Default: False.

            num_denominator_samples_for_ri (int): The number of samples to calculate for the denominator of the
                intrinsic reward function (`L` in the paper).

            dim_skill_vectors (int): The number of dimensions of the learnt skill vectors.
            discrete_skills (bool): Whether skill vectors are discrete (one-hot).
            episode_horizon (int): The episode horizon (He) to move within, when gathering episode samples.

            skill_horizon (Optional[int]): The horizon for which to use one skill vector (before sampling a new one).
                Default: Use value of `episode_horizon`.

            preprocessor (Preprocessor): The preprocessor (if any) to use.
            supervised_optimizer (Optimizer): The optimizer to use for the supervised (q) model learning task.

            num_steps_per_supervised_update (int): The number of gradient descent iterations per update
                (each iteration uses the same environment samples).

            episode_buffer_capacity (int): The capacity of the episode (experience) FIFOBuffer.

            summaries (List[any]): A list of summaries to produce if `UseTfSummaries` in debug.json is true.
                In the simplest case, this is a list of `self.[...]`-property names of the SAC object that should
                be tracked after each tick.
        """
        # Clean up network configs to be passable as **kwargs to `make`.
        # Networks are given as sequential config or directly as Keras objects -> prepend "network" key to spec.
        if isinstance(policy_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
            policy_network = dict(network=policy_network)
        if isinstance(q_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
            q_network = dict(network=q_network)

        # Make state/action space.
        state_space = Space.make(state_space)
        action_space = Space.make(action_space)

        # Fix SAC config, add correct state- and action-spaces.
        sac_config = SACConfig.make(
            sac_config,
            state_space=Dict(s=state_space,
                             z=Float(-1.0, 1.0, shape=(dim_skill_vectors, ))),
            action_space=action_space,
            # Use no memory. Updates are done from DADS' own buffer.
            memory_capacity=1,
            memory_batch_size=1,
            # Share policy network between DADS and underlying learning SAC.
            policy_network=policy_network)

        if skill_horizon is None:
            skill_horizon = episode_horizon

        super().__init__(locals())  # Config will store all c'tor variables automatically.

        # Keep track of which time-step stuff happened. Only important for by-time-step frequencies.
        self.last_update = 0
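
Given dim_skill_vectors and discrete_skills above (and the Float(-1.0, 1.0, ...) skill space passed to SAC), skill vectors z could be sampled as in this NumPy sketch; it is illustrative only and not DADS' own sampling code:

import numpy as np

dim_skill_vectors, discrete_skills = 10, False
if discrete_skills:
    z = np.eye(dim_skill_vectors)[np.random.randint(dim_skill_vectors)]  # one-hot skill
else:
    z = np.random.uniform(-1.0, 1.0, size=(dim_skill_vectors,))          # continuous skill in [-1, 1]^dim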
Example #8
    def __init__(
            self, *,
            q_network, state_space, action_space,
            policy_network=None,
            preprocessor=None,
            default_optimizer=None, q_optimizer=None, policy_optimizer=None, alpha_optimizer=None,
            optimize_alpha=True,
            bounded_distribution_type="squashed-normal", gumbel_softmax_temperature=1.0,
            gamma=0.99,
            num_q_networks=2,
            memory_capacity=10000, memory_batch_size=256,
            use_prioritized_replay=False, memory_alpha=1.0, memory_beta=0.0,
            initial_alpha=1.0, entropy_target=None,  # default: -dim(A), but this won't work for Atari.
            n_step=1,
            max_time_steps=None, update_after=0, update_frequency=1, num_steps_per_update=1,
            sync_frequency=1, sync_tau=0.005,
            time_unit="time_step",
            summaries=None
    ):
        """
        Args:
            q_network (Network): The Q-network to use as a function approximator for the learnt Q-function.
            state_space (Space): The state/observation Space.
            action_space (Space): The action Space.

            policy_network (Network): The policy-network (pi) to use as a function approximator for the learnt policy.
                Default: Use the same setup as the q-network(s).

            preprocessor (Preprocessor): The preprocessor (if any) to use.
            default_optimizer (Optimizer): The optimizer to use for any of Q/pi/alpha that doesn't have its own
                optimizer defined.
            q_optimizer (Optimizer): The optimizer to use for the Q-network(s). If None, use `default_optimizer`.
            policy_optimizer (Optimizer): The optimizer to use for the policy (pi). If None, use `default_optimizer`.
            alpha_optimizer (Optimizer): The optimizer to use for the alpha parameter. If None, use `default_optimizer`.

            optimize_alpha (bool): Whether to use the alpha loss term and an optimizer step to update alpha. False
                for keeping alpha constant at `initial_alpha`.

            bounded_distribution_type (str): Which distribution type to use for continuous, bounded output spaces.
                Must be a Distribution class type string. See components/distributions/__init__.py

            gumbel_softmax_temperature (float): The temperature parameter to use iff
                `discrete_distribution_type` == "gumbel-softmax" (which is fixed and required for SAC).

            gamma (float): The discount factor (gamma).
            memory_capacity (int): The memory's capacity (max number of records to store).
            memory_batch_size (int): The batch size to use for updating from memory.
            use_prioritized_replay (bool): Whether to use a PrioritizedReplayBuffer (instead of a plain ReplayBuffer).
            memory_alpha (float): The alpha value for the PrioritizedReplayBuffer.
            memory_beta (float): The beta value for the PrioritizedReplayBuffer.

            initial_alpha (float): The initial value for alpha (before optimization).
            entropy_target (float): The value of "Hbar" in the loss function for alpha. Default is -dim(A).
            n_step (int): The number of steps (n) to "look ahead/back" when converting 1-step tuples into n-step ones.

            #n_step_only (bool): Whether to exclude samples that are shorter than `n_step` AND don't have a terminal
            #    at the end.

            max_time_steps (Optional[int]): The maximum number of time steps (across all actors) to learn/update.
                If None, use a value given by the environment.

            update_after (Union[int,str]): The `time_unit`s to wait before starting any updates.
                Special values (only valid iff time_unit == "time_step"!):
                - "when-memory-full" for same as `memory_capacity`.
                - "when-memory-ready" for same as `memory_batch_size`.

            update_frequency (int): The frequency (in `time_unit`) with which to update our Q-network.

            num_steps_per_update (int): The number of gradient descent iterations per update (each iteration uses
                a different sample).

            sync_frequency (int): The frequency (in `time_unit`) with which to sync our target network.
            sync_tau (float): The target smoothing coefficient with which to synchronize the target Q-network.
            time_unit (str["time_step","env_tick"]): The time units we are using for update/sync decisions.

            summaries (List[any]): A list of summaries to produce if `UseTfSummaries` in debug.json is true.
                In the simplest case, this is a list of `self.[...]`-property names of the SAC object that should
                be tracked after each tick.
        """
        # If one network is not given, use a copy of the other's spec and make sure the given network
        # is not an already-built Keras object.
        if policy_network is None:
            assert isinstance(q_network, (dict, list, tuple))
            policy_network = q_network
        elif q_network is None:
            assert isinstance(policy_network, (dict, list, tuple))
            q_network = policy_network

        # Clean up network configs to be passable as **kwargs to `make`.
        # Networks are given as sequential config or directly as Keras objects -> prepend "network" key to spec.
        if isinstance(q_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
            q_network = dict(network=q_network)
        if isinstance(policy_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
            policy_network = dict(network=policy_network)

        # Make sure our optimizers are defined ok.
        if default_optimizer is None:
            assert q_optimizer and policy_optimizer and alpha_optimizer
        if q_optimizer and policy_optimizer and alpha_optimizer:
            if default_optimizer:
                logging.warning(
                    "***WARNING: `default_optimizer` defined, but has no effect b/c `q_optimizer`, `policy_optimizer` "
                    "and `alpha_optimizer` are already provided!"
                )
        if q_optimizer is None:
            q_optimizer = default_optimizer
        if policy_optimizer is None:
            policy_optimizer = default_optimizer
        if alpha_optimizer is None:
            alpha_optimizer = default_optimizer

        assert time_unit in ["time_step", "env_tick"]

        # Special value for start-train parameter -> When memory full.
        if update_after == "when-memory-full":
            update_after = memory_capacity
        # Special value for start-train parameter -> When memory has enough records to pull a batch.
        elif update_after == "when-memory-ready":
            update_after = memory_batch_size
        assert isinstance(update_after, int)

        # Make sure sync-freq >= update-freq:
        assert sync_frequency >= update_frequency
        # Make sure memory batch size is less than capacity.
        assert memory_batch_size <= memory_capacity

        # Derive memory_spec for SAC c'tor.
        # If PR -> Check that alpha is not 0.0.
        if use_prioritized_replay is True:
            if memory_alpha == 0.0:
                logging.warning(
                    "***WARNING: `use_prioritized_replay` is True, but memory's alpha is set to 0.0 (which implies no "
                    "prioritization whatsoever)!"
                )
            memory_spec = dict(type="prioritized-replay-buffer", alpha=memory_alpha, beta=memory_beta)
        else:
            memory_spec = dict(type="replay-buffer")
        memory_spec["capacity"] = memory_capacity
        memory_spec["next_record_setup"] = dict(s="s_", n_step=n_step)  # setup: s' is next-record of s (after n-steps).

        # Make action space.
        action_space = Space.make(action_space)

        # Default Hbar: -dim(A) (according to the paper).
        if entropy_target is None:
            entropy_target = -(action_space.flat_dim_with_categories if isinstance(action_space, Int) else
                               action_space.flat_dim)
            print("entropy_target={}".format(entropy_target))

        super().__init__(locals())  # Config will store all c'tor variables automatically.

        # Keep track of which time-step stuff happened. Only important for by-time-step frequencies.
        self.last_update = 0
        self.last_sync = 0
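
The `sync_tau` parameter documented above describes a soft ("Polyak") target sync. A hedged sketch of that update rule, assuming lists of tf.Variables for the online and target networks (this is the standard form, not necessarily the library's exact sync code):

# theta_target <- tau * theta_online + (1 - tau) * theta_target
def soft_sync(online_vars, target_vars, tau=0.005):
    for w, w_target in zip(online_vars, target_vars):
        w_target.assign(tau * w + (1.0 - tau) * w_target)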