Code Example #1
    def select_action(
        self, agent: str, observation: types.NestedArray
    ) -> Tuple[types.NestedArray, types.NestedArray]:
        """select an action for a single agent in the system

        Args:
            agent (str): agent id
            observation (types.NestedArray): observation tensor received from the
                environment.

        Returns:
            Tuple[types.NestedArray, types.NestedArray]: action and policy.
        """

        # TODO Mask actions here using observation.legal_actions
        # Initialize the RNN state if necessary.
        if self._states[agent] is None:
            # index network either on agent type or on agent id
            agent_key = agent.split("_")[0] if self._shared_weights else agent
            self._states[agent] = self._policy_networks[
                agent_key].initial_state(1)

        # Step the recurrent policy forward given the current observation and state.
        action, policy, new_state = self._policy(agent,
                                                 observation.observation,
                                                 self._states[agent])

        # Bookkeeping of recurrent states for the observe method.
        self._update_state(agent, new_state)

        # Return a numpy array with squeezed out batch dimension.
        action = tf2_utils.to_numpy_squeeze(action)
        policy = tf2_utils.to_numpy_squeeze(policy)
        return action, policy
Code Example #2
File: execution.py  Project: NetColby/DNRL
    def select_action(self, agent: str,
                      observation: types.NestedArray) -> types.NestedArray:
        """select an action for a single agent in the system

        Args:
            agent (str): agent id.
            observation (types.NestedArray): observation tensor received from the
                environment.

        Returns:
            types.NestedArray: agent action
        """

        # Step the recurrent policy/value network forward
        # given the current observation and state.
        self._prev_log_probs[agent], action = self._policy(agent, observation)

        # Return a numpy array with squeezed out batch dimension.
        action = tf2_utils.to_numpy_squeeze(action)

        # TODO(Kale-ab) : Remove. This is for debugging.
        if np.isnan(action).any():
            print(
                f"Value error- Log Probs:{self._prev_log_probs[agent]} Action: {action} "  # noqa: E501
            )

        return action
Code Example #3
    def select_action(self,
                      observation: types.NestedArray) -> types.NestedArray:
        # Add a dummy batch dimension and as a side effect convert numpy to TF.
        batched_obs = tf2_utils.add_batch_dim(observation)

        # Initialize the RNN state if necessary.
        if self._state is None:
            self._state = self._network.initial_state(1)

        # Forward.
        policy_output, new_state = self._policy(batched_obs, self._state)

        # If the policy network parameterises a distribution, sample from it.
        def maybe_sample(output):
            if isinstance(output, tfd.Distribution):
                output = output.sample()
            return output

        policy_output = tree.map_structure(maybe_sample, policy_output)

        self._prev_state = self._state
        self._state = new_state

        # Convert to numpy and squeeze out the batch dimension.
        action = tf2_utils.to_numpy_squeeze(policy_output)

        return action
Code Example #4
    def select_action(self,
                      observation: types.NestedArray) -> types.NestedArray:
        # Pass the observation through the policy network.
        action = self._policy(observation)

        # Return a numpy array with squeezed out batch dimension.
        return tf2_utils.to_numpy_squeeze(action)
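
Every snippet on this page funnels the network output through tf2_utils.to_numpy_squeeze before returning it. As a minimal standalone sketch of what that helper does (assuming tf2_utils is acme.tf.utils, as these projects import it): it maps every tensor in a possibly nested structure to a NumPy array and squeezes out the leading batch dimension of size one. The names below are illustrative and not taken from any of the projects above.

# Standalone sketch, not from the projects above.
import tensorflow as tf
from acme.tf import utils as tf2_utils

batched_output = {
    "action": tf.constant([[0.1, 0.2]]),       # shape (1, 2)
    "logits": tf.constant([[1.0, 2.0, 3.0]]),  # shape (1, 3)
}

# Every leaf becomes a NumPy array with the size-1 batch axis removed.
unbatched = tf2_utils.to_numpy_squeeze(batched_output)
assert unbatched["action"].shape == (2,)
assert unbatched["logits"].shape == (3,)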
Code Example #5
    def observe(self, action: types.NestedArray,
                next_timestep: dm_env.TimeStep):
        if not self._adder:
            return

        numpy_state = tf2_utils.to_numpy_squeeze(self._prev_state)
        self._adder.add(action, next_timestep, extras=(numpy_state, ))
Code Example #6
    def observe(
        self,
        action: types.NestedArray,
        next_timestep: dm_env.TimeStep,
    ):
        if not self._adder:
            return

        extras = {'logits': self._prev_logits, 'core_state': self._prev_state}
        extras = tf2_utils.to_numpy_squeeze(extras)
        self._adder.add(action, next_timestep, extras)
Code Example #7
    def select_action(self,
                      observation: types.NestedArray) -> types.NestedArray:
        # Add a dummy batch dimension and as a side effect convert numpy to TF.
        batched_obs = tf2_utils.add_batch_dim(observation)

        # Forward the policy network.
        action = self._policy(batched_obs)

        # Convert to numpy and squeeze out the batch dimension.
        action = tf2_utils.to_numpy_squeeze(action)

        return action
Code Example #8
    def select_action(self,
                      observation: types.NestedArray) -> types.NestedArray:
        # Add a dummy batch dimension and as a side effect convert numpy to TF.
        batched_observation = tf2_utils.add_batch_dim(observation)

        # Compute the policy, conditioned on the observation.
        policy = self._policy_network(batched_observation)
        if self._deterministic_policy:
            action = policy.mean()
        else:
            action = policy.sample()
        self._log_prob = policy.log_prob(action)
        return tf2_utils.to_numpy_squeeze(action)
Code Example #9
    def select_action(
        self, agent: str, observation: types.NestedArray
    ) -> Tuple[types.NestedArray, types.NestedArray]:
        """select an action for a single agent in the system

        Args:
            agent (str): agent id.
            observation (types.NestedArray): observation tensor received from the
                environment.

        Returns:
            Tuple[types.NestedArray, types.NestedArray]: agent action and policy.
        """

        # Step the recurrent policy/value network forward
        # given the current observation and state.
        action, policy = self._policy(agent, observation.observation)

        # Return a numpy array with squeezed out batch dimension.
        action = tf2_utils.to_numpy_squeeze(action)
        policy = tf2_utils.to_numpy_squeeze(policy)
        return action, policy
Code Example #10
    def select_action(self,
                      observation: types.NestedArray) -> types.NestedArray:
        # Add a dummy batch dimension and as a side effect convert numpy to TF.
        batched_observation = tf2_utils.add_batch_dim(observation)

        # Compute the policy, conditioned on the observation.
        action, policy, log_prob = self._policy_network.getAll(
            batched_observation)

        self._prev_logP = log_prob
        self._prev_means = policy

        # Return a numpy array with squeezed out batch dimension.
        return tf2_utils.to_numpy_squeeze(action)
Code Example #11
File: actors.py  Project: staylonging/acme
  def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Initialize the RNN state if necessary.
    if self._state is None:
      self._state = self._network.initial_state(1)

    # Step the recurrent policy forward given the current observation and state.
    policy_output, new_state = self._policy(observation, self._state)

    # Bookkeeping of recurrent states for the observe method.
    self._prev_state = self._state
    self._state = new_state

    # Return a numpy array with squeezed out batch dimension.
    return tf2_utils.to_numpy_squeeze(policy_output)
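
The select_action and observe methods in these examples are not called in isolation; in Acme-style code they are driven by an environment loop. The following is a sketch under the assumption that the actors implement acme.core.Actor; env and actor are hypothetical placeholders, not objects from the snippets above.

# Hypothetical driver loop: `env` is a dm_env.Environment, `actor` is one of the
# actors shown on this page (recurrent actors keep their RNN state internally).
timestep = env.reset()
actor.observe_first(timestep)

while not timestep.last():
    # Reads (and, for recurrent actors, updates) the actor's internal state.
    action = actor.select_action(timestep.observation)
    timestep = env.step(action)
    # Forwards the transition plus any extras (previous RNN state, log-probs,
    # logits) to the adder, as the observe methods on this page do.
    actor.observe(action, next_timestep=timestep)
    actor.update()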
Code Example #12
  def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Add a dummy batch dimension and as a side effect convert numpy to TF.
    batched_obs = tf2_utils.add_batch_dim(observation)

    if self._state is None:
      self._state = self._network.initial_state(1)

    # Forward.
    (logits, _), new_state = self._policy(batched_obs, self._state)

    self._prev_logits = logits
    self._prev_state = self._state
    self._state = new_state

    action = tfd.Categorical(logits).sample()
    action = tf2_utils.to_numpy_squeeze(action)

    return action
Code Example #13
    def select_actions(
            self, observations: Dict[str,
                                     OLT]) -> Dict[str, types.NestedArray]:
        """select the actions for all agents in the system

        Args:
            observations (Dict[str, OLT]): transition object containing observations,
                legal actions and terminals.

        Returns:
            Dict[str, types.NestedArray]: actions for all agents in the system.
        """

        actions = {}
        for agent, observation in observations.items():
            # Pass the observation through the policy network.
            if not self._evaluator:
                epsilon = self._trainer.get_epsilon()
            else:
                # Note (dries): For some reason 0 epsilon breaks on StarCraft.
                epsilon = 1e-10

            epsilon = tf.convert_to_tensor(epsilon)

            if self._fingerprint:
                trainer_step = self._trainer.get_trainer_steps()
                fingerprint = tf.concat([epsilon, trainer_step], axis=0)
                fingerprint = tf.expand_dims(fingerprint, axis=0)
                fingerprint = tf.cast(fingerprint, "float32")
            else:
                fingerprint = None

            action = self._policy(
                agent,
                observation.observation,
                observation.legal_actions,
                epsilon,
                fingerprint,
            )

            actions[agent] = tf2_utils.to_numpy_squeeze(action)

        # Return a numpy array with squeezed out batch dimension.
        return actions
Code Example #14
    def select_actions(
            self, observations: Dict[str,
                                     OLT]) -> Dict[str, types.NestedArray]:
        """select the actions for all agents in the system

        Args:
            observations (Dict[str, OLT]): transition object containing observations,
                legal actions and terminals.

        Returns:
            Dict[str, types.NestedArray]: actions for all agents in the system.
        """

        actions = {}

        message_inputs = self._communication_module.process_messages(
            self._messages)

        for agent, observation in observations.items():

            # Pass the observation through the policy network.
            if self._trainer is not None:
                epsilon = self._trainer.get_epsilon()
            else:
                epsilon = 0.0

            epsilon = tf.convert_to_tensor(epsilon)

            (policy_output, new_message), new_state = self._policy(
                agent,
                observation.observation,
                self._states[agent],
                message_inputs[agent],
                observation.legal_actions,
                epsilon,
            )

            self._states[agent] = new_state
            self._messages[agent] = new_message

            actions[agent] = tf2_utils.to_numpy_squeeze(policy_output)

        # Return a numpy array with squeezed out batch dimension.
        return actions
Code Example #15
    def select_action(self,
                      observation: types.NestedArray) -> types.NestedArray:
        # Add a dummy batch dimension and as a side effect convert numpy to TF.
        batched_obs = tf2_utils.add_batch_dim(observation)

        # Initialize the RNN state if necessary.
        if self._state is None:
            self._state = self._network.initial_state(1)

        # Forward.
        policy_output, new_state = self._policy(batched_obs, self._state)

        self._prev_state = self._state
        self._state = new_state

        # Convert to numpy and squeeze out the batch dimension.
        action = tf2_utils.to_numpy_squeeze(policy_output)

        return action
Code Example #16
    def select_action(self,
                      observation: types.NestedArray) -> types.NestedArray:
        # Add a dummy batch dimension and as a side effect convert numpy to TF.
        batched_obs = tf2_utils.add_batch_dim(observation)

        # Forward the policy network.
        policy_output = self._policy_network(batched_obs)

        # If the policy network parameterises a distribution, sample from it.
        def maybe_sample(output):
            if isinstance(output, tfd.Distribution):
                output = output.sample()
            return output

        policy_output = tree.map_structure(maybe_sample, policy_output)

        # Convert to numpy and squeeze out the batch dimension.
        action = tf2_utils.to_numpy_squeeze(policy_output)

        return action
Code Example #17
    def select_action2(self, observation: types.NestedArray,
                       mask: types.NestedArray) -> types.NestedArray:
        # Initialize the RNN state if necessary.
        if self._state is None:
            self._state = self._network.initial_state(1)

        # Step the recurrent policy forward given the current observation and state.
        policy_output, new_state = self._policy(observation, self._state, mask)
        #counter=0
        #while mask[policy_output]==0 and counter<1:
        #    policy_output, new_state = self._policy(observation, self._state, mask)
        #    counter+=1
        #if counter==1:
        #    print("Valid actions are hard to find here! ->"+str(set(mask)))
        # Bookkeeping of recurrent states for the observe method.
        self._prev_state = self._state
        self._state = new_state

        # Return a numpy array with squeezed out batch dimension.
        return tf2_utils.to_numpy_squeeze(policy_output)
Code Example #18
    def select_action(self, agent: str,
                      observation: types.NestedArray) -> types.NestedArray:
        """select an action for a single agent in the system

        Args:
            agent (str): agent id
            observation (types.NestedArray): observation tensor received from the
                environment.

        Returns:
            types.NestedArray: agent action
        """

        if not self._evaluator:
            epsilon = self._trainer.get_epsilon()
        else:
            epsilon = 1e-10

        epsilon = tf.convert_to_tensor(epsilon)

        if self._fingerprint:
            trainer_step = self._trainer.get_trainer_steps()
            fingerprint = tf.concat([epsilon, trainer_step], axis=0)
            fingerprint = tf.expand_dims(fingerprint, axis=0)
            fingerprint = tf.cast(fingerprint, "float32")
        else:
            fingerprint = None

        action = self._policy(
            agent,
            observation.observation,
            observation.legal_actions,
            epsilon,
            fingerprint,
        )

        action = tf2_utils.to_numpy_squeeze(action)

        return action
Code Example #19
    def observe(
        self,
        actions: Dict[str, types.NestedArray],
        next_timestep: dm_env.TimeStep,
        next_extras: Optional[Dict[str, types.NestedArray]] = {},
    ) -> None:
        """record observed timestep from the environment

        Args:
            actions (Dict[str, types.NestedArray]): system agents' actions.
            next_timestep (dm_env.TimeStep): data emitted by an environment during
                interaction.
            next_extras (Dict[str, types.NestedArray], optional): possible extra
                information to record during the transition. Defaults to {}.
        """

        if not self._adder:
            return
        _, policy = actions
        if not self._store_recurrent_state:
            if next_extras:
                # TODO (dries): Sort out this mypy issue.
                self._adder.add(policy, next_timestep,
                                next_extras)  # type: ignore
            else:
                self._adder.add(policy, next_timestep)  # type: ignore
            return

        numpy_states = {
            agent: tf2_utils.to_numpy_squeeze(_state)
            for agent, _state in self._states.items()
        }
        if next_extras:
            next_extras.update({"core_states": numpy_states})
            self._adder.add(policy, next_timestep, next_extras)  # type: ignore
        else:
            self._adder.add(policy, next_timestep,
                            numpy_states)  # type: ignore
Code Example #20
File: execution.py  Project: NetColby/DNRL
    def observe(
        self,
        actions: Dict[str, types.NestedArray],
        next_timestep: dm_env.TimeStep,
        next_extras: Dict[str, types.NestedArray] = {},
    ) -> None:
        """record observed timestep from the environment

        Args:
            actions (Dict[str, types.NestedArray]): system agents' actions.
            next_timestep (dm_env.TimeStep): data emitted by an environment during
                interaction.
            next_extras (Dict[str, types.NestedArray], optional): possible extra
                information to record during the transition. Defaults to {}.
        """

        if not self._adder:
            return

        next_extras.update({"log_probs": self._prev_log_probs})

        next_extras = tf2_utils.to_numpy_squeeze(next_extras)

        self._adder.add(actions, next_timestep, next_extras)
Code Example #21
File: agent.py  Project: dzorlu/acme
    def __init__(
        self,
        environment_spec: specs.EnvironmentSpec,
        network: snt.RNNCore,
        target_network: snt.RNNCore,
        burn_in_length: int,
        trace_length: int,
        replay_period: int,
        demonstration_generator: iter,
        demonstration_ratio: float,
        model_directory: str,
        counter: counting.Counter = None,
        logger: loggers.Logger = None,
        discount: float = 0.99,
        batch_size: int = 32,
        target_update_period: int = 100,
        importance_sampling_exponent: float = 0.2,
        epsilon: float = 0.01,
        learning_rate: float = 1e-3,
        log_to_bigtable: bool = False,
        log_name: str = 'agent',
        checkpoint: bool = True,
        min_replay_size: int = 1000,
        max_replay_size: int = 1000000,
        samples_per_insert: float = 32.0,
    ):

        extra_spec = {
            'core_state': network.initial_state(1),
        }
        # replay table
        # Remove batch dimensions.
        extra_spec = tf2_utils.squeeze_batch_dim(extra_spec)
        replay_table = reverb.Table(
            name=adders.DEFAULT_PRIORITY_TABLE,
            sampler=reverb.selectors.Prioritized(0.8),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(min_size_to_sample=1),
            signature=adders.SequenceAdder.signature(environment_spec,
                                                     extra_spec))
        # demonstration table.
        demonstration_table = reverb.Table(
            name='demonstration_table',
            sampler=reverb.selectors.Prioritized(0.8),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(min_size_to_sample=1),
            signature=adders.SequenceAdder.signature(environment_spec,
                                                     extra_spec))

        # launch server
        self._server = reverb.Server([replay_table, demonstration_table],
                                     port=None)
        address = f'localhost:{self._server.port}'

        sequence_length = burn_in_length + trace_length + 1

        # Component to add things into replay and demo
        sequence_kwargs = dict(
            period=replay_period,
            sequence_length=sequence_length,
        )
        adder = adders.SequenceAdder(client=reverb.Client(address),
                                     **sequence_kwargs)
        priority_function = {demonstration_table.name: lambda x: 1.}
        demo_adder = adders.SequenceAdder(client=reverb.Client(address),
                                          priority_fns=priority_function,
                                          **sequence_kwargs)
        # play demonstrations and write
        # exhaust the generator
        # TODO: MAX REPLAY SIZE
        _prev_action = 1  # this has to come from spec
        _add_first = True
        # include this to make datasets equivalent
        numpy_state = tf2_utils.to_numpy_squeeze(network.initial_state(1))
        for ts, action in demonstration_generator:
            if _add_first:
                demo_adder.add_first(ts)
                _add_first = False
            else:
                demo_adder.add(_prev_action, ts, extras=(numpy_state, ))
            _prev_action = action
            # reset to new episode
            if ts.last():
                _prev_action = None
                _add_first = True

        # replay dataset
        max_in_flight_samples_per_worker = 2 * batch_size if batch_size else 100
        dataset = reverb.ReplayDataset.from_table_signature(
            server_address=address,
            table=adders.DEFAULT_PRIORITY_TABLE,
            max_in_flight_samples_per_worker=max_in_flight_samples_per_worker,
            # Memory perf improvement attempt: https://github.com/deepmind/acme/issues/33
            num_workers_per_iterator=2,
            sequence_length=sequence_length,
            emit_timesteps=sequence_length is None)

        # demonstration dataset
        d_dataset = reverb.ReplayDataset.from_table_signature(
            server_address=address,
            table=demonstration_table.name,
            max_in_flight_samples_per_worker=max_in_flight_samples_per_worker,
            num_workers_per_iterator=2,
            sequence_length=sequence_length,
            emit_timesteps=sequence_length is None)

        dataset = tf.data.experimental.sample_from_datasets(
            [dataset, d_dataset],
            [1 - demonstration_ratio, demonstration_ratio])

        # Batch and prefetch.
        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        tf2_utils.create_variables(network, [environment_spec.observations])
        tf2_utils.create_variables(target_network,
                                   [environment_spec.observations])

        learner = learning.R2D2Learner(
            environment_spec=environment_spec,
            network=network,
            target_network=target_network,
            burn_in_length=burn_in_length,
            dataset=dataset,
            reverb_client=reverb.TFClient(address),
            counter=counter,
            logger=logger,
            sequence_length=sequence_length,
            discount=discount,
            target_update_period=target_update_period,
            importance_sampling_exponent=importance_sampling_exponent,
            max_replay_size=max_replay_size,
            learning_rate=learning_rate,
            store_lstm_state=False,
        )

        self._checkpointer = tf2_savers.Checkpointer(
            directory=model_directory,
            subdirectory='r2d2_learner_v1',
            time_delta_minutes=15,
            objects_to_save=learner.state,
            enable_checkpointing=checkpoint,
        )

        self._snapshotter = tf2_savers.Snapshotter(objects_to_save=None,
                                                   time_delta_minutes=15000.,
                                                   directory=model_directory)

        policy_network = snt.DeepRNN([
            network,
            lambda qs: trfl.epsilon_greedy(qs, epsilon=epsilon).sample(),
        ])

        actor = actors.RecurrentActor(policy_network, adder)
        observations_per_step = (float(replay_period * batch_size) /
                                 samples_per_insert)
        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=replay_period *
                         max(batch_size, min_replay_size),
                         observations_per_step=observations_per_step)
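
The demonstration_ratio mixing in the constructor above uses tf.data's weighted dataset sampling. Below is a small self-contained sketch of just that mechanism, with toy datasets standing in for the Reverb replay and demonstration datasets; the variable names are illustrative only.

import tensorflow as tf

# Toy stand-ins for the replay (zeros) and demonstration (ones) datasets.
replay_dataset = tf.data.Dataset.from_tensor_slices(tf.zeros(1000))
demo_dataset = tf.data.Dataset.from_tensor_slices(tf.ones(1000))

demonstration_ratio = 0.25
# Each element is drawn from demo_dataset with probability demonstration_ratio.
mixed = tf.data.experimental.sample_from_datasets(
    [replay_dataset, demo_dataset],
    [1 - demonstration_ratio, demonstration_ratio])

batch = next(iter(mixed.batch(100)))
print(float(tf.reduce_mean(batch)))  # roughly 0.25 on average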
Code Example #22
    def observe(self, action: types.NestedArray,
                next_timestep: dm_env.TimeStep):
        extras = {'logP': self._prev_logP, 'policy': self._prev_means}
        extras = tf2_utils.to_numpy_squeeze(extras)
        self._adder.add(action, next_timestep, extras)