Example #1
def main(_):
    # Create an environment and create the spec.
    environment, environment_spec = _build_environment(
        FLAGS.environment_name, max_steps=FLAGS.max_steps_per_episode)

    if FLAGS.model_name:
        loaded_network = load_wb_model(FLAGS.model_name, FLAGS.model_tag)

        if FLAGS.stochastic:
            head = networks.StochasticSamplingHead()
        else:
            head = lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample()

        policy_network = snt.Sequential([
            loaded_network,
            head,
        ])
        actor = actors.FeedForwardActor(policy_network)

    else:
        actor = RandomActor(environment_spec)

    recorder = DemonstrationRecorder(environment, actor)

    recorder.collect_n_episodes(FLAGS.n_episodes)
    recorder.make_tf_dataset()
    recorder.save(FLAGS.save_dir)
Example #2
File: agent.py Project: vishalbelsare/acme
  def make_actor(
      self,
      policy_network: snt.Module,
      adder: Optional[adders.Adder] = None,
      variable_source: Optional[core.VariableSource] = None,
  ):
    """Create an actor instance."""
    if variable_source:
      # Create the variable client responsible for keeping the actor up-to-date.
      variable_client = variable_utils.VariableClient(
          client=variable_source,
          variables={'policy': policy_network.variables},
          update_period=1000,
      )

      # Make sure not to use a random policy after checkpoint restoration by
      # assigning variables before running the environment loop.
      variable_client.update_and_wait()

    else:
      variable_client = None

    # Create the actor which defines how we take actions.
    return actors.FeedForwardActor(
        policy_network=policy_network,
        adder=adder,
        variable_client=variable_client,
    )
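Note: the actor returned by make_actor above is typically dropped straight into an acme.EnvironmentLoop. The following is only a minimal sketch, not code from the project; `agent` (an object exposing this make_actor), `policy_network`, `learner` (acting as the variable source) and `environment` are assumed names.

# Hypothetical wiring; `agent`, `policy_network`, `learner` and `environment`
# are assumptions, not defined in the example above.
actor = agent.make_actor(
    policy_network=policy_network,
    adder=None,               # no adder: this actor will not write to replay
    variable_source=learner,  # keeps the actor's weights up to date
)
loop = acme.EnvironmentLoop(environment, actor)
loop.run(num_episodes=10)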
Example #3
    def actor(
        self,
        replay: reverb.Client,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ) -> acme.EnvironmentLoop:
        """The actor process."""

        action_spec = self._environment_spec.actions
        observation_spec = self._environment_spec.observations

        # Create environment and target networks to act with.
        environment = self._environment_factory(False)
        agent_networks = self._network_factory(action_spec,
                                               self._num_critic_heads)

        # Make sure observation network is defined.
        observation_network = agent_networks.get('observation', tf.identity)

        # Create a stochastic behavior policy.
        behavior_network = snt.Sequential([
            observation_network,
            agent_networks['policy'],
            networks.StochasticSamplingHead(),
        ])

        # Ensure network variables are created.
        tf2_utils.create_variables(behavior_network, [observation_spec])
        policy_variables = {'policy': behavior_network.variables}

        # Create the variable client responsible for keeping the actor up-to-date.
        variable_client = tf2_variable_utils.VariableClient(variable_source,
                                                            policy_variables,
                                                            update_period=1000)

        # Make sure not to use a random policy after checkpoint restoration by
        # assigning variables before running the environment loop.
        variable_client.update_and_wait()

        # Component to add things into replay.
        adder = adders.NStepTransitionAdder(
            client=replay,
            n_step=self._n_step,
            max_in_flight_items=self._max_in_flight_items,
            discount=self._additional_discount)

        # Create the agent.
        actor = actors.FeedForwardActor(policy_network=behavior_network,
                                        adder=adder,
                                        variable_client=variable_client)

        # Create logger and counter; actors will not spam bigtable.
        counter = counting.Counter(counter, 'actor')
        logger = loggers.make_default_logger('actor',
                                             save_data=False,
                                             time_delta=self._log_every,
                                             steps_key='actor_steps')

        # Create the run loop and return it.
        return acme.EnvironmentLoop(environment, actor, counter, logger)
Example #4
File: run_sac.py Project: novatig/acme
def main(_):
    # Create an environment, grab the spec, and use it to create networks.
    environment = make_environment(FLAGS.task_name)
    environment_spec = specs.make_environment_spec(environment)
    agent_networks = make_networks(environment_spec)

    # Construct the agent.
    agent = sac.SAC(
        environment_spec=environment_spec,
        policy_network=agent_networks['policy'],
        critic_network=agent_networks['critic'],
        encoder_network=agent_networks['observation'],
        #sigma=0.3,  # pytype: disable=wrong-arg-types
    )

    # Create the environment loop used for training.
    train_loop = acme.EnvironmentLoop(environment, agent, label='train_loop')

    # Create the evaluation policy.
    eval_policy = agent.behavior_network

    # Create the evaluation actor and loop.
    eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
    eval_env = make_environment(FLAGS.task_name)
    eval_loop = acme.EnvironmentLoop(eval_env, eval_actor, label='eval_loop')

    for _ in range(FLAGS.num_episodes // FLAGS.num_episodes_per_eval):
        train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
        eval_loop.run(num_episodes=1)
Example #5
File: run_d4pg.py Project: pchtsp/acme
def main(_):
  # Create an environment, grab the spec, and use it to create networks.
  environment = make_environment()
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = make_networks(environment_spec.actions)

  # Construct the agent.
  agent = d4pg.D4PG(
      environment_spec=environment_spec,
      policy_network=agent_networks['policy'],
      critic_network=agent_networks['critic'],
      observation_network=agent_networks['observation'],  # pytype: disable=wrong-arg-types
  )

  # Create the environment loop used for training.
  train_loop = acme.EnvironmentLoop(environment, agent, label='train_loop')

  # Create the evaluation policy.
  eval_policy = snt.Sequential([
      agent_networks['observation'],
      agent_networks['policy'],
  ])

  # Create the evaluation actor and loop.
  eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
  eval_env = make_environment()
  eval_loop = acme.EnvironmentLoop(eval_env, eval_actor, label='eval_loop')

  for _ in range(FLAGS.num_episodes // FLAGS.num_episodes_per_eval):
    train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
    eval_loop.run(num_episodes=1)
Example #6
File: run_bcq.py Project: deepmind/acme
def main(_):
    # Create an environment and grab the spec.
    environment = atari.environment(FLAGS.game)
    environment_spec = specs.make_environment_spec(environment)

    # Create dataset.
    dataset = atari.dataset(path=FLAGS.dataset_path,
                            game=FLAGS.game,
                            run=FLAGS.run,
                            num_shards=FLAGS.num_shards)
    # Discard extra inputs
    dataset = dataset.map(lambda x: x._replace(data=x.data[:5]))

    # Batch and prefetch.
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Build network.
    g_network = make_network(environment_spec.actions)
    q_network = make_network(environment_spec.actions)
    network = networks.DiscreteFilteredQNetwork(g_network=g_network,
                                                q_network=q_network,
                                                threshold=FLAGS.bcq_threshold)
    tf2_utils.create_variables(network, [environment_spec.observations])

    evaluator_network = snt.Sequential([
        q_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    # Counters.
    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # Create the actor which defines how we take actions.
    evaluation_network = actors.FeedForwardActor(evaluator_network)

    eval_loop = acme.EnvironmentLoop(environment=environment,
                                     actor=evaluation_network,
                                     counter=counter,
                                     logger=loggers.TerminalLogger(
                                         'evaluation', time_delta=1.))

    # The learner updates the parameters (and initializes them).
    learner = bcq.DiscreteBCQLearner(
        network=network,
        dataset=dataset,
        learning_rate=FLAGS.learning_rate,
        discount=FLAGS.discount,
        importance_sampling_exponent=FLAGS.importance_sampling_exponent,
        target_update_period=FLAGS.target_update_period,
        counter=counter)

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)
Example #7
  def evaluator(
      self,
      variable_source: acme.VariableSource,
      counter: counting.Counter,
  ):
    """The evaluation process."""

    action_spec = self._environment_spec.actions
    observation_spec = self._environment_spec.observations

    # Create environment and target networks to act with.
    environment = self._environment_factory(True)
    agent_networks = self._network_factory(action_spec)

    # Make sure observation network is defined.
    observation_network = agent_networks.get('observation', tf.identity)

    # Create a stochastic behavior policy.
    evaluator_network = snt.Sequential([
        observation_network,
        agent_networks['policy'],
        networks.StochasticMeanHead(),
    ])

    # Ensure network variables are created.
    tf2_utils.create_variables(evaluator_network, [observation_spec])
    policy_variables = {'policy': evaluator_network.variables}

    # Create the variable client responsible for keeping the actor up-to-date.
    variable_client = tf2_variable_utils.VariableClient(
        variable_source,
        policy_variables,
        update_period=self._variable_update_period)

    # Make sure not to evaluate a random actor by assigning variables before
    # running the environment loop.
    variable_client.update_and_wait()

    # Create the agent.
    evaluator = actors.FeedForwardActor(
        policy_network=evaluator_network, variable_client=variable_client)

    # Create logger and counter.
    counter = counting.Counter(counter, 'evaluator')
    logger = loggers.make_default_logger(
        'evaluator', time_delta=self._log_every, steps_key='evaluator_steps')
    observers = self._make_observers() if self._make_observers else ()

    # Create the run loop and return it.
    return acme.EnvironmentLoop(
        environment,
        evaluator,
        counter,
        logger,
        observers=observers)
Example #8
    def actor(
        self,
        replay: reverb.Client,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ):
        """The actor process."""

        action_spec = self._environment_spec.actions
        observation_spec = self._environment_spec.observations

        # Create environment and behavior networks
        environment = self._environment_factory(False)
        agent_networks = self._network_factory(action_spec)

        # Create behavior network by adding some random dithering.
        behavior_network = snt.Sequential([
            agent_networks.get('observation', tf.identity),
            agent_networks.get('policy'),
            networks.ClippedGaussian(self._sigma),
        ])

        # Ensure network variables are created.
        tf2_utils.create_variables(behavior_network, [observation_spec])
        variables = {'policy': behavior_network.variables}

        # Create the variable client responsible for keeping the actor up-to-date.
        variable_client = tf2_variable_utils.VariableClient(
            variable_source,
            variables,
            update_period=self._variable_update_period)

        # Make sure not to use a random policy after checkpoint restoration by
        # assigning variables before running the environment loop.
        variable_client.update_and_wait()

        # Component to add things into replay.
        adder = adders.NStepTransitionAdder(client=replay,
                                            n_step=self._n_step,
                                            discount=self._discount)

        # Create the agent.
        actor = actors.FeedForwardActor(behavior_network,
                                        adder=adder,
                                        variable_client=variable_client)

        # Create logger and counter; actors will not spam bigtable.
        counter = counting.Counter(counter, 'actor')
        logger = loggers.make_default_logger('actor',
                                             save_data=False,
                                             time_delta=self._log_every,
                                             steps_key='actor_steps')

        # Create the loop to connect environment and agent.
        return acme.EnvironmentLoop(environment, actor, counter, logger)
Example #9
def main(_):
    wb_run = init_or_resume()

    if FLAGS.seed:
        tf.random.set_seed(FLAGS.seed)

    # Create an environment and grab the spec.
    environment, env_spec = _build_environment(FLAGS.environment_name)

    # Load demonstration dataset.
    raw_dataset = load_tf_dataset(directory=FLAGS.dataset_dir)

    dataset = preprocess_dataset(raw_dataset, FLAGS.batch_size,
                                 FLAGS.n_step_returns, FLAGS.discount)

    # Create the policy network.
    policy_network = networks.get_default_critic(env_spec)

    # Ensure that we create the variables before proceeding (maybe not needed).
    tf2_utils.create_variables(policy_network, [env_spec.observations])

    # If the agent is non-autoregressive, using epsilon=0 gives a greedy
    # policy.
    evaluator_network = snt.Sequential([
        policy_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    # Create the actor which defines how we take actions.
    evaluation_actor = actors.FeedForwardActor(evaluator_network)

    counter = counting.Counter()

    disp, disp_loop = _build_custom_loggers(wb_run)

    eval_loop = EnvironmentLoop(environment=environment,
                                actor=evaluation_actor,
                                counter=counter,
                                logger=disp_loop)

    # The learner updates the parameters (and initializes them).
    learner = BCLearner(network=policy_network,
                        learning_rate=FLAGS.learning_rate,
                        dataset=dataset,
                        counter=counter)

    # Run the environment loop.
    for _ in tqdm(range(FLAGS.epochs)):
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        eval_loop.run(FLAGS.evaluation_episodes)

    learner.save(tag=FLAGS.logs_tag)
Example #10
def main(_):
    # Initialize Neptune and create an experiment.
    neptune.init(FLAGS.neptune_project_name)
    experiment = neptune.create_experiment(name='Acme example')

    # Create an environment, grab the spec, and use it to create networks.
    environment = make_environment()
    environment_spec = specs.make_environment_spec(environment)
    agent_networks = make_networks(environment_spec.actions)

    # Construct the agent.
    agent = d4pg.D4PG(
        environment_spec=environment_spec,
        policy_network=agent_networks['policy'],
        critic_network=agent_networks['critic'],
        observation_network=agent_networks['observation'],
        sigma=1.0,  # pytype: disable=wrong-arg-types
        logger=make_logger(experiment, prefix='learner'),
    )

    # Create the environment loop used for training.
    train_loop = acme.EnvironmentLoop(environment,
                                      agent,
                                      label='train_loop',
                                      logger=make_logger(
                                          experiment,
                                          prefix='train',
                                          smoothing_regex='return'))

    # Create the evaluation policy.
    eval_policy = snt.Sequential([
        agent_networks['observation'],
        agent_networks['policy'],
    ])

    # Create the evaluation actor and loop.
    eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
    eval_env = make_environment()
    eval_logger = make_logger(experiment,
                              prefix='eval',
                              aggregate_regex='return')
    eval_loop = acme.EnvironmentLoop(
        eval_env,
        eval_actor,
        label='eval_loop',
        logger=eval_logger,
    )

    for _ in range(FLAGS.num_episodes // FLAGS.num_episodes_per_eval):
        train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
        eval_loop.run(num_episodes=5)
        eval_logger.dump()
Example #11
    def evaluator(
        self,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ):
        """The evaluation process."""

        action_spec = self._environment_spec.actions
        observation_spec = self._environment_spec.observations

        # Create environment and evaluator networks
        environment = self._environment_factory(True)
        agent_networks = self._network_factory(action_spec)

        # Create evaluator network.
        evaluator_network = snt.Sequential([
            agent_networks.get('observation', tf.identity),
            agent_networks.get('policy'),
        ])

        # Ensure network variables are created.
        tf2_utils.create_variables(evaluator_network, [observation_spec])
        variables = {'policy': evaluator_network.variables}

        # Create the variable client responsible for keeping the actor up-to-date.
        variable_client = tf2_variable_utils.VariableClient(
            variable_source,
            variables,
            update_period=self._variable_update_period)

        # Make sure not to evaluate a random actor by assigning variables before
        # running the environment loop.
        variable_client.update_and_wait()

        # Create the evaluator; note it will not add experience to replay.
        evaluator = actors.FeedForwardActor(evaluator_network,
                                            variable_client=variable_client)

        # Create logger and counter.
        counter = counting.Counter(counter, 'evaluator')
        logger = loggers.make_default_logger('evaluator',
                                             time_delta=self._log_every,
                                             steps_key='evaluator_steps')

        # Create the run loop and return it.
        return acme.EnvironmentLoop(environment, evaluator, counter, logger)
Example #12
    def actor(
        self,
        replay: reverb.Client,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
        epsilon: float,
    ) -> acme.EnvironmentLoop:
        """The actor process."""
        environment = self._environment_factory(False)
        network = self._network_factory(self._env_spec.actions)

        # Just inline the policy network here.
        policy_network = snt.Sequential([
            network,
            lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
        ])

        tf2_utils.create_variables(policy_network,
                                   [self._env_spec.observations])
        variable_client = tf2_variable_utils.VariableClient(
            client=variable_source,
            variables={'policy': policy_network.trainable_variables},
            update_period=self._variable_update_period)

        # Make sure not to use a random policy after checkpoint restoration by
        # assigning variables before running the environment loop.
        variable_client.update_and_wait()

        # Component to add things into replay.
        adder = adders.NStepTransitionAdder(
            client=replay,
            n_step=self._n_step,
            discount=self._discount,
        )

        # Create the agent.
        actor = actors.FeedForwardActor(policy_network, adder, variable_client)

        # Create the loop to connect environment and agent.
        counter = counting.Counter(counter, 'actor')
        logger = loggers.make_default_logger('actor',
                                             save_data=False,
                                             steps_key='actor_steps')
        return acme.EnvironmentLoop(environment, actor, counter, logger)
Example #13
    def evaluator(
        self,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ):
        """The evaluation process."""

        # Create environment and target networks to act with.
        environment = self._environment_factory(True)
        agent_networks = self._network_factory(self._environment_spec)

        # Create a stochastic behavior policy.
        evaluator_network = snt.Sequential([
            agent_networks['observation'],
            agent_networks['policy'],
            networks.StochasticMeanHead(),
        ])

        # Create the variable client responsible for keeping the actor up-to-date.
        variable_client = tf2_variable_utils.VariableClient(
            variable_source,
            variables={'policy': evaluator_network.variables},
            update_period=1000)

        # Make sure not to evaluate a random actor by assigning variables before
        # running the environment loop.
        variable_client.update_and_wait()

        # Create the agent.
        evaluator = actors.FeedForwardActor(policy_network=evaluator_network,
                                            variable_client=variable_client)

        # Create logger and counter.
        counter = counting.Counter(counter, 'evaluator')
        logger = loggers.make_default_logger('evaluator',
                                             time_delta=self._log_every)

        # Create the run loop and return it.
        return acme.EnvironmentLoop(environment, evaluator, counter, logger)
Example #14
    def evaluator(
        self,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ):
        """The evaluation process."""
        environment = self._environment_factory(True)
        network = self._network_factory(self._env_spec.actions)

        # Just inline the policy network here.
        policy_network = snt.Sequential([
            network,
            lambda q: trfl.epsilon_greedy(q, self._evaluator_epsilon).sample(),
        ])

        tf2_utils.create_variables(policy_network,
                                   [self._env_spec.observations])

        variable_client = tf2_variable_utils.VariableClient(
            client=variable_source,
            variables={'policy': policy_network.trainable_variables},
            update_period=self._variable_update_period)

        # Make sure not to use a random policy after checkpoint restoration by
        # assigning variables before running the environment loop.
        variable_client.update_and_wait()

        # Create the agent.
        actor = actors.FeedForwardActor(policy_network,
                                        variable_client=variable_client)

        # Create the run loop and return it.
        logger = loggers.make_default_logger('evaluator',
                                             steps_key='evaluator_steps')
        counter = counting.Counter(counter, 'evaluator')
        return acme.EnvironmentLoop(environment,
                                    actor,
                                    counter=counter,
                                    logger=logger)
Example #15
def cal_mse(value_func, policy_net, environment, mse_samples, discount):
    sample_count = 0
    actor = actors.FeedForwardActor(policy_network=policy_net)
    timestep = environment.reset()
    actor.observe_first(timestep)
    mse = 0.0
    while sample_count < mse_samples:
        current_obs = timestep.observation
        action = actor.select_action(current_obs)
        timestep = environment.step(action)
        actor.observe(action, next_timestep=timestep)
        next_obs = timestep.observation
        reward = timestep.reward

        if timestep.last():
            timestep = environment.reset()
            actor.observe_first(timestep)
            current_obs = tf2_utils.add_batch_dim(current_obs)
            action = tf2_utils.add_batch_dim(action)
            mse_one = (reward - value_func(current_obs, action))**2
            print(value_func(current_obs, action).numpy().squeeze())
            print(f'reward = {reward}')
            print('=====End Episode=====')

        else:
            next_action = tf2_utils.add_batch_dim(
                actor.select_action(next_obs))
            action = tf2_utils.add_batch_dim(action)
            current_obs = tf2_utils.add_batch_dim(current_obs)
            next_obs = tf2_utils.add_batch_dim(next_obs)
            mse_one = (reward + discount * value_func(next_obs, next_action) -
                       value_func(current_obs, action))**2
            print(value_func(current_obs, action).numpy().squeeze())
        mse = mse + mse_one.numpy()
        sample_count += 1
    return mse.squeeze() / mse_samples
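A hedged usage sketch for cal_mse above; `value_func` (a critic taking batched observation/action pairs), `policy_net` and `env` are assumed, illustrative names, not defined in the example.

# Hypothetical call; the networks and environment are assumptions.
mse = cal_mse(value_func, policy_net, env, mse_samples=1000, discount=0.99)
print(f'Average squared TD error over 1000 samples: {mse:.4f}')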
Example #16
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = single_precision.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Build demonstration dataset.
    if hasattr(raw_environment, 'raw_env'):
        raw_environment = raw_environment.raw_env

    batch_dataset = bsuite_demonstrations.make_dataset(raw_environment,
                                                       stochastic=False)
    # Combine with demonstration dataset.
    transition = functools.partial(_n_step_transition_from_episode,
                                   n_step=1,
                                   additional_discount=1.)

    dataset = batch_dataset.map(transition)

    # Batch and prefetch.
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Create the networks to optimize.
    policy_network = make_policy_network(environment_spec.actions)

    # If the agent is non-autoregressive, using epsilon=0 gives a greedy
    # policy.
    evaluator_network = snt.Sequential([
        policy_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    # Ensure that we create the variables before proceeding (maybe not needed).
    tf2_utils.create_variables(policy_network, [environment_spec.observations])

    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # Create the actor which defines how we take actions.
    evaluation_network = actors.FeedForwardActor(evaluator_network)

    eval_loop = acme.EnvironmentLoop(environment=environment,
                                     actor=evaluation_network,
                                     counter=counter,
                                     logger=loggers.TerminalLogger(
                                         'evaluation', time_delta=1.))

    # The learner updates the parameters (and initializes them).
    learner = learning.BCLearner(network=policy_network,
                                 learning_rate=FLAGS.learning_rate,
                                 dataset=dataset,
                                 counter=learner_counter)

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)
Example #17
    def __init__(
        self,
        environment_spec: specs.EnvironmentSpec,
        network: snt.Module,
        batch_size: int = 256,
        prefetch_size: int = 4,
        target_update_period: int = 100,
        samples_per_insert: float = 32.0,
        min_replay_size: int = 1000,
        max_replay_size: int = 1000000,
        importance_sampling_exponent: float = 0.2,
        priority_exponent: float = 0.6,
        n_step: int = 5,
        epsilon: tf.Tensor = None,
        learning_rate: float = 1e-3,
        discount: float = 0.99,
        cql_alpha: float = 1.,
        logger: loggers.Logger = None,
        counter: counting.Counter = None,
        checkpoint_subpath: str = '~/acme/',
    ):
        """Initialize the agent.

    Args:
      environment_spec: description of the actions, observations, etc.
      network: the online Q network (the one being optimized)
      batch_size: batch size for updates.
      prefetch_size: size to prefetch from replay.
      target_update_period: number of learner steps to perform before updating
        the target networks.
      samples_per_insert: number of samples to take from replay for every insert
        that is made.
      min_replay_size: minimum replay size before updating. This and all
        following arguments are related to dataset construction and will be
        ignored if a dataset argument is passed.
      max_replay_size: maximum replay size.
      importance_sampling_exponent: power to which importance weights are raised
        before normalizing.
      priority_exponent: exponent used in prioritized sampling.
      n_step: number of steps to squash into a single transition.
      epsilon: probability of taking a random action; ignored if a policy
        network is given.
      learning_rate: learning rate for the q-network update.
      discount: discount to use for TD updates.
      cql_alpha: coefficient on the conservative (CQL) regularization term.
      logger: logger object to be used by learner.
      counter: counter object used to keep track of steps.
      checkpoint_subpath: directory for the checkpoint.
    """

        # Create a replay server to add data to. This uses no limiter behavior in
        # order to allow the Agent interface to handle it.
        replay_table = reverb.Table(
            name=adders.DEFAULT_PRIORITY_TABLE,
            sampler=reverb.selectors.Prioritized(priority_exponent),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(1),
            signature=adders.NStepTransitionAdder.signature(environment_spec))
        self._server = reverb.Server([replay_table], port=None)

        # The adder is used to insert observations into replay.
        address = f'localhost:{self._server.port}'
        adder = adders.NStepTransitionAdder(client=reverb.Client(address),
                                            n_step=n_step,
                                            discount=discount)

        # The dataset provides an interface to sample from replay.
        replay_client = reverb.TFClient(address)
        dataset = datasets.make_reverb_dataset(
            client=replay_client,
            environment_spec=environment_spec,
            batch_size=batch_size,
            prefetch_size=prefetch_size,
            transition_adder=True)

        # Use constant 0.05 epsilon greedy policy by default.
        if epsilon is None:
            epsilon = tf.Variable(0.05, trainable=False)
        policy_network = snt.Sequential([
            network,
            lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
        ])

        # Create a target network.
        target_network = copy.deepcopy(network)

        # Ensure that we create the variables before proceeding (maybe not needed).
        tf2_utils.create_variables(network, [environment_spec.observations])
        tf2_utils.create_variables(target_network,
                                   [environment_spec.observations])

        # Create the actor which defines how we take actions.
        actor = actors.FeedForwardActor(policy_network, adder)

        # The learner updates the parameters (and initializes them).
        learner = CQLLearner(
            network=network,
            discount=discount,
            importance_sampling_exponent=importance_sampling_exponent,
            learning_rate=learning_rate,
            cql_alpha=cql_alpha,
            target_update_period=target_update_period,
            dataset=dataset,
            replay_client=replay_client,
            logger=logger,
            counter=counter,
            checkpoint_subpath=checkpoint_subpath)

        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=max(batch_size, min_replay_size),
                         observations_per_step=float(batch_size) /
                         samples_per_insert)
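For context, the super().__init__ call above hands the FeedForwardActor and the CQLLearner to acme's generic Agent wrapper, which interleaves acting and learning. A rough sketch of that interaction pattern, assuming a dm_env-style `environment` and an already constructed `agent` (both hypothetical names here):

# Hypothetical interaction loop; `environment` and `agent` are assumptions.
timestep = environment.reset()
agent.observe_first(timestep)
while not timestep.last():
    action = agent.select_action(timestep.observation)
    timestep = environment.step(action)
    agent.observe(action, next_timestep=timestep)
    agent.update()  # steps the learner once enough transitions have been added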
Example #18
File: agent.py Project: BlackDeal/acme
    def __init__(
        self,
        environment_spec: specs.EnvironmentSpec,
        policy_network: snt.Module,
        critic_network: snt.Module,
        observation_network: types.TensorTransformation = tf.identity,
        discount: float = 0.99,
        batch_size: int = 256,
        prefetch_size: int = 4,
        target_policy_update_period: int = 100,
        target_critic_update_period: int = 100,
        min_replay_size: int = 1000,
        max_replay_size: int = 1000000,
        samples_per_insert: float = 32.0,
        policy_loss_module: snt.Module = None,
        policy_optimizer: snt.Optimizer = None,
        critic_optimizer: snt.Optimizer = None,
        n_step: int = 5,
        num_samples: int = 20,
        clipping: bool = True,
        logger: loggers.Logger = None,
        counter: counting.Counter = None,
        checkpoint: bool = True,
        replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE,
    ):
        """Initialize the agent.

    Args:
      environment_spec: description of the actions, observations, etc.
      policy_network: the online (optimized) policy.
      critic_network: the online critic.
      observation_network: optional network to transform the observations before
        they are fed into any network.
      discount: discount to use for TD updates.
      batch_size: batch size for updates.
      prefetch_size: size to prefetch from replay.
      target_policy_update_period: number of updates to perform before updating
        the target policy network.
      target_critic_update_period: number of updates to perform before updating
        the target critic network.
      min_replay_size: minimum replay size before updating.
      max_replay_size: maximum replay size.
      samples_per_insert: number of samples to take from replay for every insert
        that is made.
      policy_loss_module: configured MPO loss function for the policy
        optimization; defaults to sensible values on the control suite.
        See `acme/tf/losses/mpo.py` for more details.
      policy_optimizer: optimizer to be used on the policy.
      critic_optimizer: optimizer to be used on the critic.
      n_step: number of steps to squash into a single transition.
      num_samples: number of actions to sample when doing a Monte Carlo
        integration with respect to the policy.
      clipping: whether to clip gradients by global norm.
      logger: logging object used to write to logs.
      counter: counter object used to keep track of steps.
      checkpoint: boolean indicating whether to checkpoint the learner.
      replay_table_name: string indicating what name to give the replay table.
    """

        # Create a replay server to add data to.
        replay_table = reverb.Table(
            name=adders.DEFAULT_PRIORITY_TABLE,
            sampler=reverb.selectors.Uniform(),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(min_size_to_sample=1),
            signature=adders.NStepTransitionAdder.signature(environment_spec))
        self._server = reverb.Server([replay_table], port=None)

        # The adder is used to insert observations into replay.
        address = f'localhost:{self._server.port}'
        adder = adders.NStepTransitionAdder(client=reverb.Client(address),
                                            n_step=n_step,
                                            discount=discount)

        # The dataset object to learn from.
        dataset = datasets.make_reverb_dataset(
            table=replay_table_name,
            client=reverb.TFClient(address),
            batch_size=batch_size,
            prefetch_size=prefetch_size,
            environment_spec=environment_spec,
            transition_adder=True)

        # Make sure observation network is a Sonnet Module.
        observation_network = tf2_utils.to_sonnet_module(observation_network)

        # Create target networks before creating online/target network variables.
        target_policy_network = copy.deepcopy(policy_network)
        target_critic_network = copy.deepcopy(critic_network)
        target_observation_network = copy.deepcopy(observation_network)

        # Get observation and action specs.
        act_spec = environment_spec.actions
        obs_spec = environment_spec.observations
        emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])

        # Create the behavior policy.
        behavior_network = snt.Sequential([
            observation_network,
            policy_network,
            networks.StochasticSamplingHead(),
        ])

        # Create variables.
        tf2_utils.create_variables(policy_network, [emb_spec])
        tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
        tf2_utils.create_variables(target_policy_network, [emb_spec])
        tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
        tf2_utils.create_variables(target_observation_network, [obs_spec])

        # Create the actor which defines how we take actions.
        actor = actors.FeedForwardActor(policy_network=behavior_network,
                                        adder=adder)

        # Create optimizers.
        policy_optimizer = policy_optimizer or snt.optimizers.Adam(1e-4)
        critic_optimizer = critic_optimizer or snt.optimizers.Adam(1e-4)

        # The learner updates the parameters (and initializes them).
        learner = learning.MPOLearner(
            policy_network=policy_network,
            critic_network=critic_network,
            observation_network=observation_network,
            target_policy_network=target_policy_network,
            target_critic_network=target_critic_network,
            target_observation_network=target_observation_network,
            policy_loss_module=policy_loss_module,
            policy_optimizer=policy_optimizer,
            critic_optimizer=critic_optimizer,
            clipping=clipping,
            discount=discount,
            num_samples=num_samples,
            target_policy_update_period=target_policy_update_period,
            target_critic_update_period=target_critic_update_period,
            dataset=dataset,
            logger=logger,
            counter=counter,
            checkpoint=checkpoint)

        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=max(batch_size, min_replay_size),
                         observations_per_step=float(batch_size) /
                         samples_per_insert)
Example #19
def main(_):
    # TODO(yutian): Create environment.
    # # Create an environment and grab the spec.
    # raw_environment = bsuite.load_and_record_to_csv(
    #     bsuite_id=FLAGS.bsuite_id,
    #     results_dir=FLAGS.results_dir,
    #     overwrite=FLAGS.overwrite,
    # )
    # environment = single_precision.SinglePrecisionWrapper(raw_environment)
    # environment_spec = specs.make_environment_spec(environment)

    # TODO(yutian): Create dataset.
    # Build the dataset.
    # if hasattr(raw_environment, 'raw_env'):
    #   raw_environment = raw_environment.raw_env
    #
    # batch_dataset = bsuite_demonstrations.make_dataset(raw_environment)
    # # Combine with demonstration dataset.
    # transition = functools.partial(
    #     _n_step_transition_from_episode, n_step=1, additional_discount=1.)
    #
    # dataset = batch_dataset.map(transition)
    #
    # # Batch and prefetch.
    # dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Create the networks to optimize.
    networks = make_networks(environment_spec.actions)
    treatment_net = networks['treatment_net']
    instrumental_net = networks['instrumental_net']
    policy_net = networks['policy_net']

    # If the agent is non-autoregressive, using epsilon=0 gives a greedy
    # policy.
    evaluator_net = snt.Sequential([
        policy_net,
        # Sample actions.
        acme_nets.StochasticSamplingHead()
    ])

    # Ensure that we create the variables before proceeding (maybe not needed).
    tf2_utils.create_variables(policy_net, [environment_spec.observations])
    # TODO(liyuan): set the proper input spec using environment_spec.observations
    # and environment_spec.actions.
    tf2_utils.create_variables(treatment_net, [environment_spec.observations])
    tf2_utils.create_variables(
        instrumental_net,
        [environment_spec.observations, environment_spec.actions])

    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # Create the actor which defines how we take actions.
    evaluator_net = actors.FeedForwardActor(evaluator_net)

    eval_loop = acme.EnvironmentLoop(environment=environment,
                                     actor=evaluator_net,
                                     counter=counter,
                                     logger=loggers.TerminalLogger(
                                         'evaluation', time_delta=1.))

    # The learner updates the parameters (and initializes them).
    learner = learning.DFIVLearner(
        treatment_net=treatment_net,
        instrumental_net=instrumental_net,
        policy_net=policy_net,
        treatment_learning_rate=FLAGS.treatment_learning_rate,
        instrumental_learning_rate=FLAGS.instrumental_learning_rate,
        policy_learning_rate=FLAGS.policy_learning_rate,
        dataset=dataset,
        counter=learner_counter)

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)
Example #20
    def __init__(
        self,
        environment_spec: specs.EnvironmentSpec,
        network: snt.Module,
        params=None,
        logger: loggers.Logger = None,
        checkpoint: bool = True,
        paths: Save_paths = None,
    ):
        """Initialize the agent.

        Args:
          environment_spec: description of the actions, observations, etc.
          network: the online Q network (the one being optimized).
          params: optional dictionary of hyperparameters (batch_size,
            prefetch_size, target_update_period, samples_per_insert,
            min_replay_size, max_replay_size, importance_sampling_exponent,
            priority_exponent, n_step, epsilon, learning_rate, discount);
            sensible defaults are used when None.
          logger: logger object to be used by learner.
          checkpoint: boolean indicating whether to checkpoint the learner.
          paths: Save_paths object giving the checkpoint directory and
            experiment name.
        """

        # Create a replay server to add data to. This uses no limiter behavior in
        # order to allow the Agent interface to handle it.
        if params is None:
            params = {
                'batch_size': 256,
                'prefetch_size': 4,
                'target_update_period': 100,
                'samples_per_insert': 32.0,
                'min_replay_size': 1000,
                'max_replay_size': 1000000,
                'importance_sampling_exponent': 0.2,
                'priority_exponent': 0.6,
                'n_step': 5,
                'epsilon': 0.05,
                'learning_rate': 1e-3,
                'discount': 0.99,
            }
        replay_table = reverb.Table(
            name=adders.DEFAULT_PRIORITY_TABLE,
            sampler=reverb.selectors.Prioritized(params['priority_exponent']),
            remover=reverb.selectors.Fifo(),
            max_size=params['max_replay_size'],
            rate_limiter=reverb.rate_limiters.MinSize(1))
        self._server = reverb.Server([replay_table], port=None)

        # The adder is used to insert observations into replay.
        address = f'localhost:{self._server.port}'
        adder = adders.NStepTransitionAdder(client=reverb.Client(address),
                                            n_step=params['n_step'],
                                            discount=params['discount'])

        # The dataset provides an interface to sample from replay.
        replay_client = reverb.TFClient(address)
        dataset = datasets.make_reverb_dataset(
            client=replay_client,
            environment_spec=environment_spec,
            batch_size=params['batch_size'],
            prefetch_size=params['prefetch_size'],
            transition_adder=True)

        # Use constant 0.05 epsilon greedy policy by default.
        epsilon = tf.Variable(params['epsilon'], trainable=False)

        policy_network = snt.Sequential([
            network,
            lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
        ])

        # Create a target network.
        target_network = copy.deepcopy(network)

        # Ensure that we create the variables before proceeding (maybe not needed).
        # tf2_utils.create_variables(network, [environment_spec.observations])
        # tf2_utils.create_variables(target_network, [environment_spec.observations])

        # Create the actor which defines how we take actions.
        actor = actors.FeedForwardActor(policy_network, adder)

        # The learner updates the parameters (and initializes them).
        learner = learning.DQNLearner(
            network=network,
            target_network=target_network,
            discount=params['discount'],
            importance_sampling_exponent=params[
                'importance_sampling_exponent'],
            learning_rate=params['learning_rate'],
            target_update_period=params['target_update_period'],
            dataset=dataset,
            replay_client=replay_client,
            logger=logger,
            checkpoint=checkpoint)

        if checkpoint:
            self._checkpointer = tf2_savers.Checkpointer(
                add_uid=False,
                objects_to_save=learner.state,
                directory=paths.data_dir,
                subdirectory=paths.experiment_name,
                time_delta_minutes=60.)
        else:
            self._checkpointer = None

        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=max(params['batch_size'],
                                              params['min_replay_size']),
                         observations_per_step=float(params['batch_size']) /
                         params['samples_per_insert'])
Example #21
        max_replay_size=args.replay_table_max_replay_size,
        min_replay_size=args.min_replay_size,
        shutdown_table_name=args.shutdown_table_name,
        device_placement=args.learner_device_placement,
        batch_size=args.batch_size,
        broadcaster_table_name=args.broadcaster_table_name)

    # Create the evaluation policy.
    with tf.device(args.learner_device_placement):

        # Create the behavior policy.
        eval_policy = snt.Sequential([
            agent_networks['observation'],
            agent_networks['policy'],
        ])
        eval_actor = actors.FeedForwardActor(policy_network=eval_policy)

    eval_env = make_environment(args.taskstr)
    eval_loop = CustomEnvironmentLoop(eval_env,
                                      eval_actor,
                                      label='%s/' % (args.logpath))

    def broadcast_shutdown(should_shutdown):
        learner.client.insert(should_shutdown, {args.shutdown_table_name: 1.0})

    steps = 0

    def broadcast_variables(weights):
        if weights is None:
            weights = [
                tf2_utils.to_numpy(v)
Example #22
def main(_):
    problem_config = FLAGS.problem_config

    # Load the offline dataset and environment.
    _, _, environment = utils.load_data_and_env(
        task_name=problem_config['task_name'],
        noise_level=problem_config['noise_level'],
        near_policy_dataset=problem_config['near_policy_dataset'],
        dataset_path=FLAGS.dataset_path,
        batch_size=1)
    environment_spec = specs.make_environment_spec(environment)

    # Load pretrained target policy network.
    policy_net = utils.load_policy_net(
        task_name=problem_config['task_name'],
        noise_level=problem_config['noise_level'],
        near_policy_dataset=problem_config['near_policy_dataset'],
        dataset_path=FLAGS.dataset_path,
        environment_spec=environment_spec)

    actor = actors.FeedForwardActor(policy_network=policy_net)

    logger = loggers.TerminalLogger('ground_truth')

    discount = problem_config['discount']

    returns = []
    lengths = []

    t_start = time.time()
    timestep = environment.reset()
    actor.observe_first(timestep)
    cur_return = 0.
    cur_step = 0
    while len(returns) < FLAGS.num_episodes:

        action = actor.select_action(timestep.observation)
        timestep = environment.step(action)
        # Have the agent observe the timestep and let the actor update itself.
        actor.observe(action, next_timestep=timestep)

        cur_return += pow(discount, cur_step) * timestep.reward
        cur_step += 1

        if timestep.last():
            # Append return of the current episode, and reset the environment.
            returns.append(cur_return)
            lengths.append(cur_step)
            timestep = environment.reset()
            actor.observe_first(timestep)
            cur_return = 0.
            cur_step = 0

            if len(returns) % (FLAGS.num_episodes // 10) == 0:
                print(
                    f'Run time {time.time() - t_start:0.0f} secs, '
                    f'evaluated episode {len(returns)} / {FLAGS.num_episodes}')

    # Returned data include problem configs.
    results = {
        '_'.join(keys): value
        for keys, value in tree.flatten_with_path(problem_config)
    }

    # And computed results.
    results.update({
        'metric_value': np.mean(returns),
        'metric_std_dev': np.std(returns, ddof=0),
        'metric_std_err': np.std(returns, ddof=0) / np.sqrt(len(returns)),
        'length_mean': np.mean(lengths),
        'length_std': np.std(lengths, ddof=0),
        'num_episodes': len(returns),
    })
    logger.write(results)
Example #23
def _generate_data(
    policy_net,
    environment,
    n_samples,
    batch_size,
    shuffle,
    include_terminal=False,  # Include terminal absorbing state.
    ignore_d_tm1=False  # Set d_tm1 as constant 1.0 if True.
):
    sample_count = 0
    actor = actors.FeedForwardActor(policy_network=policy_net)
    timestep = environment.reset()
    actor.observe_first(timestep)

    current_obs_list = []
    action_list = []
    next_obs_list = []
    reward_list = []
    discount_list = []
    nonterminal_list = []
    while sample_count < n_samples:
        current_obs = timestep.observation
        action = actor.select_action(current_obs)
        timestep = environment.step(action)
        actor.observe(action, next_timestep=timestep)
        next_obs = timestep.observation
        reward = timestep.reward
        discount = np.array(1.0, dtype=np.float32)
        if timestep.last() and not include_terminal:
            discount = np.array(0.0, dtype=np.float32)

        current_obs_list.append(tf2_utils.add_batch_dim(current_obs))
        action_list.append(tf2_utils.add_batch_dim(action))
        reward_list.append(tf2_utils.add_batch_dim(reward))
        discount_list.append(tf2_utils.add_batch_dim(discount))
        next_obs_list.append(tf2_utils.add_batch_dim(next_obs))
        nonterminal_list.append(
            tf2_utils.add_batch_dim(np.array(1.0, dtype=np.float32)))

        if timestep.last():
            if include_terminal:
                # Make another transition tuple from s, a -> s, a with 0 reward.
                current_obs = next_obs
                # action = actor.select_action(current_obs)
                reward = np.zeros_like(timestep.reward)
                discount = np.array(1.0, dtype=np.float32)
                next_obs = current_obs

                if ignore_d_tm1:
                    d_tm1 = np.array(1.0, dtype=np.float32)
                else:
                    d_tm1 = np.array(0.0, dtype=np.float32)

                for i in range(environment.action_spec().num_values):
                    action_ = np.array(i, dtype=action.dtype).reshape(
                        action.shape)

                    current_obs_list.append(
                        tf2_utils.add_batch_dim(current_obs))
                    action_list.append(tf2_utils.add_batch_dim(action_))
                    reward_list.append(tf2_utils.add_batch_dim(reward))
                    discount_list.append(tf2_utils.add_batch_dim(discount))
                    next_obs_list.append(tf2_utils.add_batch_dim(next_obs))
                    nonterminal_list.append(tf2_utils.add_batch_dim(d_tm1))

            timestep = environment.reset()
            actor.observe_first(timestep)

        sample_count += 1

    current_obs_data = tf.concat(current_obs_list, axis=0)
    action_data = tf.concat(action_list, axis=0)
    next_obs_data = tf.concat(next_obs_list, axis=0)
    reward_data = tf.concat(reward_list, axis=0)
    discount_data = tf.concat(discount_list, axis=0)
    nonterminal_data = tf.concat(nonterminal_list, axis=0)

    dataset = tf.data.Dataset.from_tensor_slices((
        current_obs_data,
        action_data,
        reward_data,
        discount_data,
        next_obs_data,
        # The last action is not valid
        # and should not be used.
        action_data,
        nonterminal_data))

    def _reverb_sample(*data_tuple):
        info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                                 probability=tf.constant(1.0, tf.float64),
                                 table_size=tf.constant(0, tf.int64),
                                 priority=tf.constant(1.0, tf.float64))
        return reverb.ReplaySample(info=info, data=data_tuple)

    dataset = dataset.map(_reverb_sample,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.cache()
    if shuffle:
        dataset = dataset.shuffle(batch_size * 10)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset
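As an illustration of how the generated dataset above might be consumed, here is a hedged sketch; `policy_net` and `env` (a discrete-action dm_env environment) are assumed names, not defined by the example.

# Hypothetical usage; names are assumptions for illustration only.
dataset = _generate_data(policy_net, env, n_samples=10000,
                         batch_size=256, shuffle=True)
for sample in dataset.take(1):
    # Each element is a reverb.ReplaySample whose data tuple mirrors an
    # N-step transition: (o_tm1, a_tm1, r_t, d_t, o_t, a_t, d_tm1).
    o_tm1, a_tm1, r_t, d_t, o_t, _, _ = sample.data
    print(o_tm1.shape, a_tm1.shape)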
Example #24
    def __init__(
        self,
        environment_spec: specs.EnvironmentSpec,
        network: snt.Module,
        demonstration_dataset: tf.data.Dataset,
        demonstration_ratio: float,
        batch_size: int = 256,
        prefetch_size: int = 4,
        target_update_period: int = 100,
        samples_per_insert: float = 32.0,
        min_replay_size: int = 1000,
        max_replay_size: int = 1000000,
        importance_sampling_exponent: float = 0.2,
        n_step: int = 5,
        epsilon: Optional[tf.Tensor] = None,
        learning_rate: float = 1e-3,
        discount: float = 0.99,
    ):
        """Initialize the agent.

    Args:
      environment_spec: description of the actions, observations, etc.
      network: the online Q network (the one being optimized)
      demonstration_dataset: tf.data.Dataset producing (timestep, action)
        tuples containing full episodes.
      demonstration_ratio: Ratio of transitions coming from demonstrations.
      batch_size: batch size for updates.
      prefetch_size: size to prefetch from replay.
      target_update_period: number of learner steps to perform before updating
        the target networks.
      samples_per_insert: number of samples to take from replay for every insert
        that is made.
      min_replay_size: minimum replay size before updating.
      max_replay_size: maximum replay size.
      importance_sampling_exponent: power to which importance weights are raised
        before normalizing.
      n_step: number of steps to squash into a single transition.
      epsilon: probability of taking a random action in the epsilon-greedy
        behavior policy.
      learning_rate: learning rate for the q-network update.
      discount: discount to use for TD updates.
    """

        # Create a replay server to add data to. This uses no limiter behavior in
        # order to allow the Agent interface to handle it.
        replay_table = reverb.Table(
            name=adders.DEFAULT_PRIORITY_TABLE,
            sampler=reverb.selectors.Uniform(),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(1))
        self._server = reverb.Server([replay_table], port=None)

        # The adder is used to insert observations into replay.
        address = f'localhost:{self._server.port}'
        adder = adders.NStepTransitionAdder(client=reverb.Client(address),
                                            n_step=n_step,
                                            discount=discount)

        # The dataset provides an interface to sample from replay.
        replay_client = reverb.TFClient(address)
        dataset = datasets.make_reverb_dataset(
            client=replay_client,
            environment_spec=environment_spec,
            transition_adder=True)

        # Combine with demonstration dataset.
        transition = functools.partial(_n_step_transition_from_episode,
                                       n_step=n_step,
                                       discount=discount)
        dataset_demos = demonstration_dataset.map(transition)
        dataset = tf.data.experimental.sample_from_datasets(
            [dataset, dataset_demos],
            [1 - demonstration_ratio, demonstration_ratio])

        # Batch and prefetch.
        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.prefetch(prefetch_size)

        # Use constant 0.05 epsilon greedy policy by default.
        if epsilon is None:
            epsilon = tf.Variable(0.05, trainable=False)
        policy_network = snt.Sequential([
            network,
            lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
        ])

        # Create a target network.
        target_network = copy.deepcopy(network)

        # Ensure that we create the variables before proceeding (maybe not needed).
        tf2_utils.create_variables(network, [environment_spec.observations])
        tf2_utils.create_variables(target_network,
                                   [environment_spec.observations])

        # Create the actor which defines how we take actions.
        actor = actors.FeedForwardActor(policy_network, adder)

        # The learner updates the parameters (and initializes them).
        learner = dqn.DQNLearner(
            network=network,
            target_network=target_network,
            discount=discount,
            importance_sampling_exponent=importance_sampling_exponent,
            learning_rate=learning_rate,
            target_update_period=target_update_period,
            dataset=dataset,
            replay_client=replay_client)

        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=max(batch_size, min_replay_size),
                         observations_per_step=float(batch_size) /
                         samples_per_insert)
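
The effect of demonstration_ratio above comes entirely from the weights passed to tf.data.experimental.sample_from_datasets. A standalone toy sketch of that mixing (the integer placeholder datasets stand in for replay and demonstration samples and are not part of the agent):

import tensorflow as tf

demonstration_ratio = 0.25
replay = tf.data.Dataset.from_tensor_slices([0] * 1000)  # stands in for replay transitions
demos = tf.data.Dataset.from_tensor_slices([1] * 1000)   # stands in for demonstration transitions
mixed = tf.data.experimental.sample_from_datasets(
    [replay, demos], weights=[1 - demonstration_ratio, demonstration_ratio])
demo_fraction = sum(mixed.take(1000).as_numpy_iterator()) / 1000
print(f'~{demo_fraction:.0%} of sampled elements came from demonstrations')
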
Example #25
0
def main(_):
    wb_run = init_or_resume()

    if FLAGS.seed:
        tf.random.set_seed(FLAGS.seed)

    # Create an environment and grab the spec.
    environment, env_spec = _build_environment(
        FLAGS.environment_name, max_steps=FLAGS.max_eval_episode_len)

    # Load demonstration dataset.
    raw_dataset = load_tf_dataset(directory=FLAGS.dataset_dir)
    empirical_policy = compute_empirical_policy(raw_dataset)

    dataset = preprocess_dataset(raw_dataset, FLAGS.batch_size,
                                 FLAGS.n_step_returns, FLAGS.discount)

    # Create the policy and critic networks.
    critic_network = networks.get_default_critic(env_spec)

    policy_network = snt.Sequential(
        [copy.deepcopy(critic_network), tfp.distributions.Categorical])

    if FLAGS.greedy:
        head = networks.GreedyHead()
    else:
        head = StochasticSamplingHead()

    behaviour_network = snt.Sequential([policy_network, head])

    # Ensure that we create the variables before proceeding (maybe not needed).
    tf2_utils.create_variables(policy_network, [env_spec.observations])
    tf2_utils.create_variables(critic_network, [env_spec.observations])

    # Create the actor which defines how we take actions.
    evaluation_actor = actors.FeedForwardActor(behaviour_network)

    counter = counting.Counter()

    disp, disp_loop = _build_custom_loggers(wb_run)

    eval_loop = EnvironmentLoop(environment=environment,
                                actor=evaluation_actor,
                                counter=counter,
                                logger=disp_loop)

    learner = CRRLearner(
        policy_network=policy_network,
        critic_network=critic_network,
        dataset=dataset,
        discount=0.99,
        policy_improvement_modes=FLAGS.policy_improvement_mode,
        beta=FLAGS.crr_beta,
        cql_alpha=FLAGS.cql_alpha,
        empirical_policy=empirical_policy,
        logger=disp,
        counter=counter)

    # Run the environment loop.
    for e in tqdm(range(FLAGS.epochs)):
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        eval_loop.run(FLAGS.evaluation_episodes)
        # Visualization of the policy
        Q = evaluate_q(learner._critic_network, environment)
        plot = visualize_policy(Q, environment)
        wb_run.log({'chart': plot, 'epoch_counter': e})

    learner.save(tag=FLAGS.logs_tag)
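
The two evaluation heads chosen above presumably boil down to an argmax over Q-values in the greedy case and a sample from a Categorical whose logits are the Q-values in the stochastic case. The snippet below only illustrates that idea; it is not the implementation of networks.GreedyHead or StochasticSamplingHead:

import tensorflow as tf
import tensorflow_probability as tfp

q_values = tf.constant([[1.0, 3.0, 0.5, 2.0]])
greedy_action = tf.argmax(q_values, axis=-1)                                 # -> [1]
stochastic_action = tfp.distributions.Categorical(logits=q_values).sample()
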
Example #26
0
    def __init__(
        self,
        environment_spec: specs.EnvironmentSpec,
        network: snt.Module,
        batch_size: int = 32,
        prefetch_size: int = 4,
        target_update_period: int = 100,
        samples_per_insert: float = 32.0,
        min_replay_size: int = 1000,
        max_replay_size: int = 100000,
        importance_sampling_exponent: float = 0.2,
        priority_exponent: float = 0.6,
        n_step: int = 5,
        epsilon: Optional[float] = 0.05,
        learning_rate: float = 1e-3,
        discount: float = 0.99,
        logger: Optional[loggers.Logger] = None,
        max_gradient_norm: Optional[float] = None,
        expert_data: Optional[List[Dict]] = None,
    ) -> None:
        """ Initialize the agent. """

        # Create a replay server to add data to. This uses no limiter behavior
        # in order to allow the Agent interface to handle it.
        replay_table = reverb.Table(
            name=adders.DEFAULT_PRIORITY_TABLE,
            sampler=reverb.selectors.Prioritized(priority_exponent),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(1),
            signature=adders.NStepTransitionAdder.signature(environment_spec))
        self._server = reverb.Server([replay_table], port=None)

        # The adder is used to insert observations into replay.
        address = f'localhost:{self._server.port}'
        adder = adders.NStepTransitionAdder(client=reverb.Client(address),
                                            n_step=n_step,
                                            discount=discount)

        # Adding expert data to the replay memory:
        if expert_data is not None:
            for d in expert_data:
                adder.add_first(d["first"])
                for (action, next_ts) in d["mid"]:
                    adder.add(np.int32(action), next_ts)

        # The dataset provides an interface to sample from replay.
        replay_client = reverb.TFClient(address)
        dataset = datasets.make_reverb_dataset(server_address=address,
                                               batch_size=batch_size,
                                               prefetch_size=prefetch_size)

        # Creating the epsilon greedy policy network:
        epsilon = tf.Variable(epsilon)
        policy_network = snt.Sequential([
            network,
            lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
        ])

        # Create a target network.
        target_network = copy.deepcopy(network)

        # Ensure that we create the variables before proceeding (maybe not
        # needed).
        tf2_utils.create_variables(network, [environment_spec.observations])
        tf2_utils.create_variables(target_network,
                                   [environment_spec.observations])

        # Create the actor which defines how we take actions.
        actor = actors.FeedForwardActor(policy_network, adder)

        # The learner updates the parameters (and initializes them).
        learner = learning.DQNLearner(
            network=network,
            target_network=target_network,
            discount=discount,
            importance_sampling_exponent=importance_sampling_exponent,
            learning_rate=learning_rate,
            target_update_period=target_update_period,
            dataset=dataset,
            replay_client=replay_client,
            max_gradient_norm=max_gradient_norm,
            logger=logger,
        )

        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=max(batch_size, min_replay_size),
                         observations_per_step=float(batch_size) /
                         samples_per_insert)
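
The expert_data handling above implies a concrete layout: a list of episodes, each holding the initial timestep under "first" and a list of (action, next_timestep) pairs under "mid". A hedged sketch of how such episodes could be recorded with dm_env (record_episode and select_action are illustrative names, not part of the agent):

import dm_env

def record_episode(environment: dm_env.Environment, select_action) -> dict:
    timestep = environment.reset()
    episode = {'first': timestep, 'mid': []}
    while not timestep.last():
        action = select_action(timestep.observation)
        timestep = environment.step(action)
        episode['mid'].append((action, timestep))
    return episode

# expert_data = [record_episode(environment, expert_policy) for _ in range(10)]
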
Example #27
0
behavior_network = snt.Sequential([
    observation_network,
    policy_network,
    networks.ClippedGaussian(0.3),  # sigma = 0.3
    networks.ClipToSpec(act_spec),
])

# We must create the variables in the networks before passing them to learner.
# Create variables.
tf2_utils.create_variables(policy_network, [emb_spec])
tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_policy_network, [emb_spec])
tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_observation_network, [obs_spec])

actor = actors.FeedForwardActor(behavior_network, adder=adder)

learner = d4pg.D4PGLearner(policy_network=policy_network,
                           critic_network=critic_network,
                           observation_network=observation_network,
                           target_policy_network=target_policy_network,
                           target_critic_network=target_critic_network,
                           target_observation_network=target_observation_network,
                           dataset=dataset,
                           discount=0.99,
                           clipping=True,
                           target_update_period=100,
                           policy_optimizer=snt.optimizers.Adam(1e-4),
                           critic_optimizer=snt.optimizers.Adam(1e-4),
                           # Log learner updates to console every 10 seconds.
                           logger=loggers.TerminalLogger(time_delta=10.))
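
In practice the actor and learner above would be driven by acme's EnvironmentLoop together with an Agent wrapper; a minimal hand-rolled loop (assuming the environment, actor and learner variables from this snippet) makes the data flow explicit:

timestep = environment.reset()
actor.observe_first(timestep)
for _ in range(10000):
    action = actor.select_action(timestep.observation)
    timestep = environment.step(action)
    actor.observe(action, next_timestep=timestep)  # forwards the transition to the adder
    learner.step()                                 # one learner update from replay
    if timestep.last():
        timestep = environment.reset()
        actor.observe_first(timestep)
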
Example #28
0
def main(_):
    wb_run = init_or_resume()

    if FLAGS.seed:
        tf.random.set_seed(FLAGS.seed)
    # Create an environment and grab the spec.
    environment, env_spec = _build_environment(
        FLAGS.environment_name, max_steps=FLAGS.max_eval_episode_len)

    # Load demonstration dataset.
    raw_dataset = load_tf_dataset(directory=FLAGS.dataset_dir)
    empirical_policy = compute_empirical_policy(raw_dataset)

    dataset = preprocess_dataset(raw_dataset, FLAGS.batch_size,
                                 FLAGS.n_step_returns, FLAGS.discount)

    # Create the main critic network
    critic_network = networks.get_default_critic(env_spec)

    policy_network = snt.Sequential([
        critic_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    tf2_utils.create_variables(critic_network, [env_spec.observations])

    # Create the actor which defines how we take actions.
    evaluation_actor = actors.FeedForwardActor(policy_network)

    counter = counting.Counter()

    disp, disp_loop = _build_custom_loggers(wb_run)

    eval_loop = EnvironmentLoop(environment=environment,
                                actor=evaluation_actor,
                                counter=counter,
                                logger=disp_loop)

    learner = CQLLearner(network=critic_network,
                         dataset=dataset,
                         discount=FLAGS.discount,
                         importance_sampling_exponent=0.2,
                         learning_rate=FLAGS.learning_rate,
                         cql_alpha=FLAGS.cql_alpha,
                         translate_lse=FLAGS.translate_lse,
                         target_update_period=100,
                         empirical_policy=empirical_policy,
                         logger=disp,
                         counter=counter)

    # Alternate learner updates with periodic evaluation.
    for e in tqdm(range(FLAGS.epochs)):
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        eval_loop.run(FLAGS.evaluation_episodes)
        # Visualization of the policy
        Q = evaluate_q(learner._network, environment)
        plot = visualize_policy(Q, environment)
        wb_run.log({'chart': plot, 'epoch_counter': e})

    learner.save(tag=FLAGS.logs_tag)
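
preprocess_dataset above receives FLAGS.n_step_returns and FLAGS.discount; its implementation is not shown, but the standard n-step target those arguments suggest is R = r_0 + gamma * r_1 + ... + gamma^(n-1) * r_(n-1), with an effective discount of gamma^n on the bootstrap value. A small hedged sketch of that arithmetic:

import numpy as np

def n_step_return(rewards: np.ndarray, gamma: float):
    """Returns the discounted n-step reward and the bootstrap discount gamma^n."""
    weights = gamma ** np.arange(len(rewards))
    return float(np.dot(weights, rewards)), gamma ** len(rewards)

# n_step_return(np.array([1.0, 0.0, 2.0]), 0.99) -> (2.9602, 0.970299)
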
Example #29
0
    def __init__(self,
                 environment_spec: specs.EnvironmentSpec,
                 policy_network: snt.Module,
                 critic_network: snt.Module,
                 observation_network: types.TensorTransformation = tf.identity,
                 discount: float = 0.99,
                 batch_size: int = 256,
                 prefetch_size: int = 4,
                 target_update_period: int = 100,
                 min_replay_size: int = 1000,
                 max_replay_size: int = 1000000,
                 samples_per_insert: float = 32.0,
                 n_step: int = 5,
                 sigma: float = 0.3,
                 clipping: bool = True,
                 logger: loggers.Logger = None,
                 counter: counting.Counter = None,
                 checkpoint: bool = True,
                 replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE):
        """Initialize the agent.

    Args:
      environment_spec: description of the actions, observations, etc.
      policy_network: the online (optimized) policy.
      critic_network: the online critic.
      observation_network: optional network to transform the observations before
        they are fed into any network.
      discount: discount to use for TD updates.
      batch_size: batch size for updates.
      prefetch_size: size to prefetch from replay.
      target_update_period: number of learner steps to perform before updating
        the target networks.
      min_replay_size: minimum replay size before updating.
      max_replay_size: maximum replay size.
      samples_per_insert: number of samples to take from replay for every insert
        that is made.
      n_step: number of steps to squash into a single transition.
      sigma: standard deviation of zero-mean, Gaussian exploration noise.
      clipping: whether to clip gradients by global norm.
      logger: logger object to be used by learner.
      counter: counter object used to keep track of steps.
      checkpoint: boolean indicating whether to checkpoint the learner.
      replay_table_name: string indicating what name to give the replay table.
    """
        # Create a replay server to add data to. This uses no limiter behavior in
        # order to allow the Agent interface to handle it.
        replay_table = reverb.Table(
            name=replay_table_name,
            sampler=reverb.selectors.Uniform(),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(1),
            signature=adders.NStepTransitionAdder.signature(environment_spec))
        self._server = reverb.Server([replay_table], port=None)

        # The adder is used to insert observations into replay.
        address = f'localhost:{self._server.port}'
        adder = adders.NStepTransitionAdder(
            priority_fns={replay_table_name: lambda x: 1.},
            client=reverb.Client(address),
            n_step=n_step,
            discount=discount)

        # The dataset provides an interface to sample from replay.
        dataset = datasets.make_reverb_dataset(
            table=replay_table_name,
            client=reverb.TFClient(address),
            environment_spec=environment_spec,
            batch_size=batch_size,
            prefetch_size=prefetch_size,
            transition_adder=True)

        # Get observation and action specs.
        act_spec = environment_spec.actions
        obs_spec = environment_spec.observations
        emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])  # pytype: disable=wrong-arg-types

        # Make sure observation network is a Sonnet Module.
        observation_network = tf2_utils.to_sonnet_module(observation_network)

        # Create target networks.
        target_policy_network = copy.deepcopy(policy_network)
        target_critic_network = copy.deepcopy(critic_network)
        target_observation_network = copy.deepcopy(observation_network)

        # Create the behavior policy.
        behavior_network = snt.Sequential([
            observation_network,
            policy_network,
            networks.ClippedGaussian(sigma),
            networks.ClipToSpec(act_spec),
        ])

        # Create variables.
        tf2_utils.create_variables(policy_network, [emb_spec])
        tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
        tf2_utils.create_variables(target_policy_network, [emb_spec])
        tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
        tf2_utils.create_variables(target_observation_network, [obs_spec])

        # Create the actor which defines how we take actions.
        actor = actors.FeedForwardActor(behavior_network, adder=adder)

        # Create optimizers.
        policy_optimizer = snt.optimizers.Adam(learning_rate=1e-4)
        critic_optimizer = snt.optimizers.Adam(learning_rate=1e-4)

        # The learner updates the parameters (and initializes them).
        learner = learning.DDPGLearner(
            policy_network=policy_network,
            critic_network=critic_network,
            observation_network=observation_network,
            target_policy_network=target_policy_network,
            target_critic_network=target_critic_network,
            target_observation_network=target_observation_network,
            policy_optimizer=policy_optimizer,
            critic_optimizer=critic_optimizer,
            clipping=clipping,
            discount=discount,
            target_update_period=target_update_period,
            dataset=dataset,
            counter=counter,
            logger=logger,
            checkpoint=checkpoint,
        )

        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=max(batch_size, min_replay_size),
                         observations_per_step=float(batch_size) /
                         samples_per_insert)
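
The two numbers handed to super().__init__ above determine the update cadence of the acme Agent base class (assuming this constructor inherits from it, as the arguments suggest): learning only starts once min_observations transitions have been observed, and afterwards there is roughly one learner step per observations_per_step environment steps. With the defaults used here:

batch_size, samples_per_insert, min_replay_size = 256, 32.0, 1000

observations_per_step = batch_size / samples_per_insert  # 8.0 env steps per learner step
min_observations = max(batch_size, min_replay_size)      # 1000 steps before learning starts
print(observations_per_step, min_observations)
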
Example #30
0
File: acme_agent.py Project: GAIPS/ILU-RL
    def __init__(
        self,
        environment_spec: specs.EnvironmentSpec,
        network: snt.Module,
        batch_size: int = 256,
        prefetch_size: int = 4,
        target_update_period: int = 100,
        samples_per_insert: float = 32.0,
        min_replay_size: int = 20,
        max_replay_size: int = 1000000,
        importance_sampling_exponent: float = 0.2,
        priority_exponent: float = 0.6,
        n_step: int = 5,
        epsilon_init: float = 1.0,
        epsilon_final: float = 0.01,
        epsilon_schedule_timesteps: int = 20000,
        learning_rate: float = 1e-3,
        discount: float = 0.99,
        max_gradient_norm: Optional[float] = None,
        logger: Optional[loggers.Logger] = None,
    ):
        """Initialize the agent.

        Args:
        environment_spec: description of the actions, observations, etc.
        network: the online Q network (the one being optimized)
        batch_size: batch size for updates.
        prefetch_size: size to prefetch from replay.
        target_update_period: number of learner steps to perform before updating
            the target networks.
        samples_per_insert: number of samples to take from replay for every insert
            that is made.
        min_replay_size: minimum replay size before updating. This and all
            following arguments are related to dataset construction and will be
            ignored if a dataset argument is passed.
        max_replay_size: maximum replay size.
        importance_sampling_exponent: power to which importance weights are raised
            before normalizing (beta). See https://arxiv.org/pdf/1710.02298.pdf
        priority_exponent: exponent used in prioritized sampling (omega).
            See https://arxiv.org/pdf/1710.02298.pdf
        n_step: number of steps to squash into a single transition.
        epsilon_init: initial epsilon value (probability of taking a random action).
        epsilon_final: final epsilon value (probability of taking a random action).
        epsilon_schedule_timesteps: timesteps to decay epsilon from 'epsilon_init'
            to 'epsilon_final'.
        learning_rate: learning rate for the q-network update.
        discount: discount to use for TD updates.
        logger: logger object to be used by learner.
        max_gradient_norm: used for gradient clipping.
        """

        # Create a replay server to add data to. This uses no limiter behavior in
        # order to allow the Agent interface to handle it.
        replay_table = reverb.Table(
            name=adders.DEFAULT_PRIORITY_TABLE,
            sampler=reverb.selectors.Prioritized(priority_exponent),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(1),
            signature=adders.NStepTransitionAdder.signature(environment_spec))
        self._server = reverb.Server([replay_table], port=None)

        # The adder is used to insert observations into replay.
        address = f'localhost:{self._server.port}'
        self._adder = adders.NStepTransitionAdder(
            client=reverb.Client(address), n_step=n_step, discount=discount)

        # The dataset provides an interface to sample from replay.
        replay_client = reverb.TFClient(address)
        dataset = make_reverb_dataset(server_address=address,
                                      batch_size=batch_size,
                                      prefetch_size=prefetch_size)

        policy_network = snt.Sequential([
            network,
            EpsilonGreedyExploration(
                epsilon_init=epsilon_init,
                epsilon_final=epsilon_final,
                epsilon_schedule_timesteps=epsilon_schedule_timesteps)
        ])

        # Create a target network.
        target_network = copy.deepcopy(network)

        # Ensure that we create the variables before proceeding (maybe not needed).
        tf2_utils.create_variables(network, [environment_spec.observations])
        tf2_utils.create_variables(target_network,
                                   [environment_spec.observations])

        # Create the actor which defines how we take actions.
        actor = actors_tf2.FeedForwardActor(policy_network, self._adder)

        # The learner updates the parameters (and initializes them).
        learner = learning.DQNLearner(
            network=network,
            target_network=target_network,
            discount=discount,
            importance_sampling_exponent=importance_sampling_exponent,
            learning_rate=learning_rate,
            target_update_period=target_update_period,
            dataset=dataset,
            replay_client=replay_client,
            max_gradient_norm=max_gradient_norm,
            logger=logger,
            checkpoint=False)

        self._saver = tf2_savers.Saver(learner.state)

        # Deterministic (max-Q) actor.
        max_Q_network = snt.Sequential([
            network,
            lambda q: trfl.epsilon_greedy(q, epsilon=0.0).sample(),
        ])
        self._deterministic_actor = actors_tf2.FeedForwardActor(max_Q_network)

        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=max(batch_size, min_replay_size),
                         observations_per_step=float(batch_size) /
                         samples_per_insert)
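
The EpsilonGreedyExploration module above is configured by epsilon_init, epsilon_final and epsilon_schedule_timesteps; its implementation is not shown, but the linear decay those arguments suggest looks like the following hedged sketch:

def linear_epsilon(step: int,
                   epsilon_init: float = 1.0,
                   epsilon_final: float = 0.01,
                   epsilon_schedule_timesteps: int = 20000) -> float:
    """Linearly anneals epsilon from epsilon_init to epsilon_final."""
    fraction = min(step / epsilon_schedule_timesteps, 1.0)
    return epsilon_init + fraction * (epsilon_final - epsilon_init)

# linear_epsilon(0) -> 1.0, linear_epsilon(20000) -> 0.01
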