    def run(self,
            steps: int,
            agent_ctor: Callable[[EnvironmentSpec, Logger], Tuple[Agent, Actor]],
            eval_every_steps: int = 10000
            ):

        train_logger = PrefixedTensorBoardLogger(base_logger=self._logger, prefix='train')
        test_logger = PrefixedTensorBoardLogger(base_logger=self._logger, prefix='test')

        env_spec = make_environment_spec(self.train_env)
        agent, eval_actor = agent_ctor(env_spec, train_logger)

        model_snapshotter = savers.Snapshotter(objects_to_save={
            'policy': eval_actor._network
        }, directory=self._logdir)

        t = self._counter.get_counts()['steps']
        iterations = 0
        render_interval = 5
        best_mean_progress = -np.inf
        while t < steps:
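            # Render only on every `render_interval`-th evaluation round,
            # i.e. roughly every render_interval * eval_every_steps environment steps.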
            should_render = t >= render_interval * eval_every_steps * iterations
            if should_render:
                iterations += 1
            test_result = self.test(eval_actor, render=should_render, timestep=t)
            if test_result['progress_mean'] > best_mean_progress:
                best_mean_progress = test_result['progress_mean']
                model_snapshotter.save()

            test_logger.write(test_result, step=t)
            self.train(steps=eval_every_steps, agent=agent, counter=self._counter, logger=train_logger)
            t = self._counter.get_counts()['steps']
        self.train_env.environment.close()
        self.test_env.environment.close()
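The `agent_ctor` argument above must build both the learning agent and a separate evaluation actor. A minimal sketch of a compatible constructor is shown below; `MyAgent`, `make_greedy_eval_actor` and `runner` (an instance of the surrounding class) are hypothetical placeholders, since the concrete agent used with this runner is not shown here.

def my_agent_ctor(env_spec: EnvironmentSpec, logger: Logger) -> Tuple[Agent, Actor]:
    # Hypothetical: build the learning agent (acts and updates its parameters) ...
    agent = MyAgent(env_spec, logger=logger)
    # ... and a separate actor that follows the greedy policy for evaluation only.
    eval_actor = make_greedy_eval_actor(agent, env_spec)
    return agent, eval_actor

runner.run(steps=1_000_000, agent_ctor=my_agent_ctor, eval_every_steps=10_000)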
Example #2
def main(_):
  env = helpers.make_environment(FLAGS.level)
  env_spec = acme.make_environment_spec(env)
  network = networks.DQNAtariNetwork(env_spec.actions.num_values)

  agent = dqn.DQN(env_spec, network)

  loop = acme.EnvironmentLoop(env, agent)
  loop.run(FLAGS.num_episodes)
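This snippet assumes absl flags defined elsewhere in the same script; a typical setup (the default values here are only illustrative) looks like:

from absl import app, flags

flags.DEFINE_string('level', 'PongNoFrameskip-v4', 'Which Atari level to play.')
flags.DEFINE_integer('num_episodes', 1000, 'Number of episodes to run for.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
  app.run(main)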
def pong_experiment():

    seeds = [0, 42, 69, 360, 420]
    seed = seeds[0]
    start_time = time.strftime("%Y-%m-%d_%H-%M-%S")

    # setting torch random seed here to make sure random initialization is the same
    torch.random.manual_seed(seed)

    # creating the environment
    env_name = "PongNoFrameskip-v4"

    env_train = make_environment_atari(env_name, seed)
    env_test = make_environment_atari(env_name, seed)

    env_train_spec = acme.make_environment_spec(env_train)

    # creating the neural network
    network = VanillaNetwork(env_train_spec.observations[0].shape,
                             env_train_spec.actions[0].num_values)

    # creating the logger
    training_logger = TensorBoardLogger("runs/DQN-train-" + env_name +
                                        f"-rnd{seed}-" + start_time)
    testing_logger = TensorBoardLogger("runs/DQN-test-" + env_name +
                                       f"-rnd{seed}-" + start_time)

    # creating the agent
    agent = VanillaPartialDQN(network,
                              env_train_spec.actions[0].num_values,
                              training_logger,
                              gradient_clipping=True,
                              device='gpu',
                              seed=seed)

    training_loop = acme.EnvironmentLoop(env_train,
                                         agent,
                                         logger=training_logger)
    testing_loop = acme.EnvironmentLoop(env_test,
                                        agent,
                                        logger=testing_logger,
                                        should_update=False)

    for epoch in range(200):
        agent.training()
        training_loop.run(num_steps=250000)

        torch.save(
            network.state_dict(), "runs/DQN-train-" + env_name +
            f"-rnd{seed}-" + start_time + f"/ep{epoch}.model")

        agent.testing()
        testing_loop.run(num_episodes=30)

    training_logger.close()
    testing_logger.close()
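A hypothetical entry point, not part of the original snippet:

if __name__ == "__main__":
    pong_experiment()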
Example #4
def cartpole_experiment():
    seeds = [0, 42, 69, 360, 420]
    seed = seeds[3]
    
    torch.random.manual_seed(seed)
    start_time = time.strftime("%Y-%m-%d_%H-%M-%S")
    
    # creating the environment
    env_name = "CartPole-v0"
    
    env_train = make_environment(env_name, seed)
    env_test = make_environment(env_name, seed)
    
    env_train_spec = acme.make_environment_spec(env_train)
    
    # creating the neural network
    from torch import nn
    network = nn.Sequential(
        nn.Linear(*env_train_spec.observations.shape, 64),
        nn.ReLU(),
        nn.Linear(64, env_train_spec.actions.num_values))
    
    # creating the logger
    training_logger = TensorBoardLogger("runs/DQN-train-" + env_name + f"-rnd{seed}-" + start_time)
    testing_logger = TensorBoardLogger("runs/DQN-test-" + env_name + f"-rnd{seed}-" + start_time)
    
    # creating the agent
    agent = VanillaDQN(
        network,
        env_train_spec.actions.num_values,
        training_logger,
        device='gpu',
        seed=seed,
        gradient_clipping=True,
        reward_clipping=True,
        learning_rate=0.000025,
        gamma=1,
    )
    
    training_loop = acme.EnvironmentLoop(env_train, agent, logger=training_logger)
    testing_loop = acme.EnvironmentLoop(env_test, agent, logger=testing_logger, should_update=False)
    
    for epoch in range(200):
        agent.training()
        training_loop.run(num_steps=250000)
        
        torch.save(network.state_dict(),
                   "runs/DQN-train-" + env_name + f"-rnd{seed}-" + start_time + f"/ep{epoch}.model")
        
        agent.testing()
        testing_loop.run(num_episodes=30)
    
    training_logger.close()
    testing_logger.close()
Example #5
def main(_):
    env = helpers.make_environment(level=FLAGS.level, oar_wrapper=True)
    env_spec = acme.make_environment_spec(env)

    config = r2d2.R2D2Config(batch_size=16,
                             trace_length=20,
                             burn_in_length=10,
                             sequence_period=10)

    agent = r2d2.R2D2(env_spec,
                      networks=r2d2.make_atari_networks(
                          config.batch_size, env_spec),
                      config=config,
                      seed=FLAGS.seed)

    loop = acme.EnvironmentLoop(env, agent)
    loop.run(FLAGS.num_episodes)
Example #6
    def __init__(
        self,
        actor_id,
        environment_module,
        environment_fn_name,
        environment_kwargs,
        network_module,
        network_fn_name,
        network_kwargs,
        adder_module,
        adder_fn_name,
        adder_kwargs,
        replay_server_address,
        variable_server_name,
        variable_server_address,
        counter: counting.Counter = None,
        logger: loggers.Logger = None,
    ):
        # Counter and Logger
        self._actor_id = actor_id
        self._counter = counter or counting.Counter()
        self._logger = logger or loggers.make_default_logger(
            f'actor_{actor_id}')

        # Create the environment
        self._environment = getattr(environment_module,
                                    environment_fn_name)(**environment_kwargs)
        env_spec = acme.make_environment_spec(self._environment)

        # Create actor's network
        self._network = getattr(network_module,
                                network_fn_name)(**network_kwargs)
        tf2_utils.create_variables(self._network, [env_spec.observations])

        self._variables = tree.flatten(self._network.variables)
        self._policy = tf.function(self._network)

        # The adder is used to insert observations into replay.
        self._adder = getattr(adder_module, adder_fn_name)(
            reverb.Client(replay_server_address), **adder_kwargs)

        variable_client = reverb.TFClient(variable_server_address)
        self._variable_dataset = variable_client.dataset(
            table=variable_server_name,
            dtypes=[tf.float32 for _ in self._variables],
            shapes=[v.shape for v in self._variables])
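The class presumably refreshes its weights from the variable server in a method that is not shown here. A minimal sketch of what such an update could look like, assuming each dataset sample carries one full set of weights in `sample.data`:

    def _update_variables(self):
        # Hypothetical sketch: take the newest entry from the variable table
        # and copy it into the local network weights.
        for sample in self._variable_dataset.take(1):
            for var, value in zip(self._variables, sample.data):
                var.assign(value)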
Example #7
def get_env_agent():
    """Create env and agent.

    Returns:
        env_acme (acme.wrappers.observation_action_reward.
            ObservationActionRewardWrapper).

        agent (acme.agents.tf.dqn.agent.DQN).
    """
    # Get environment
    env_acme = make_environmment()
    env_spec = acme.make_environment_spec(env_acme)

    # Create agent and network
    network = networks.DQNAtariNetwork(env_spec.actions.num_values)
    agent = dqn.DQN(env_spec, network, checkpoint_subpath="./acme")

    return env_acme, agent
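For reference, a typical way to use this helper (the episode count is chosen arbitrarily):

env, agent = get_env_agent()
loop = acme.EnvironmentLoop(env, agent)
loop.run(num_episodes=100)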
Example #8
def train(network=None, expert_data_path=None):
    env = make_env()
    env_spec = acme.make_environment_spec(env)

    if network is None:
        network = make_dqn(env_spec.actions.num_values)

    expert_data = None
    if expert_data_path is not None:
        with open(expert_data_path, "rb") as handle:
            expert_data = pickle.load(handle)
        num_timesteps = np.sum([1 + len(ep["mid"]) for ep in expert_data])
        print(f"Using expert data from {expert_data_path}. "
              f"Episodes: {len(expert_data)}. Timesteps: {num_timesteps}.")

    agent = DQNAgent(environment_spec=env_spec,
                     network=network,
                     batch_size=32,
                     learning_rate=1e-4,
                     logger=loggers.NoOpLogger(),
                     min_replay_size=1000,
                     max_replay_size=int(1e5),
                     target_update_period=2500,
                     epsilon=tf.Variable(0.025),
                     n_step=20,
                     discount=0.97,
                     expert_data=expert_data)

    loop = EnvironmentLoop(environment=env,
                           actor=agent,
                           module2save=network)
    reward_history = loop.run(num_steps=int(1e6),
                              render=True,
                              checkpoint=True,
                              checkpoint_freq=15)

    avg_hist = [np.mean(reward_history[i:(i+50)])
                for i in range(len(reward_history) - 50)]
    plt.plot(list(range(len(avg_hist))), avg_hist)
    plt.show()

    env.close()
    return network
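A sketch of how this might be invoked; the pickle path is only a placeholder:

if __name__ == "__main__":
    trained_network = train(expert_data_path="expert_episodes.pkl")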
    def test_loop_run(self):
        raw_env = rl_environment.Environment('tic_tac_toe')
        env = open_spiel_wrapper.OpenSpielWrapper(raw_env)
        env = wrappers.SinglePrecisionWrapper(env)
        environment_spec = acme.make_environment_spec(env)

        actors = []
        for _ in range(env.num_players):
            actors.append(RandomActor(environment_spec))

        loop = open_spiel_environment_loop.OpenSpielEnvironmentLoop(
            env, actors)
        result = loop.run_episode()
        self.assertIn('episode_length', result)
        self.assertIn('episode_return', result)
        self.assertIn('steps_per_second', result)

        loop.run(num_episodes=10)
        loop.run(num_steps=100)
Example #10
    def test_agent(self):

        env_factory = lambda seed: fakes.fake_atari_wrapped(oar_wrapper=True)

        config = r2d2.R2D2Config(batch_size=1,
                                 trace_length=5,
                                 sequence_period=1,
                                 samples_per_insert=1.,
                                 min_replay_size=32,
                                 burn_in_length=1,
                                 prefetch_size=2,
                                 target_update_period=2500,
                                 max_replay_size=100_000,
                                 importance_sampling_exponent=0.6,
                                 priority_exponent=0.9,
                                 max_priority_weight=0.9,
                                 bootstrap_n=5,
                                 clip_rewards=False,
                                 variable_update_period=400)

        dummy_seed = 1
        agent = r2d2.DistributedR2D2FromConfig(
            environment_factory=env_factory,
            environment_spec=acme.make_environment_spec(
                env_factory(dummy_seed)),
            network_factory=functools.partial(r2d2.make_atari_networks,
                                              config.batch_size),
            config=config,
            seed=0,
            num_actors=1,
        )

        program = agent.build()
        (learner_node, ) = program.groups['learner']
        learner_node.disable_run()  # pytype: disable=attribute-error

        lp.launch(program, launch_type='test_mt')

        learner: acme.Learner = learner_node.create_handle().dereference()

        for _ in range(5):
            learner.step()
Example #11
def main(_):
    # Create an environment and grab the spec.
    env_configs = {'players': FLAGS.num_players} if FLAGS.num_players else {}
    raw_environment = rl_environment.Environment(FLAGS.game, **env_configs)

    environment = open_spiel_wrapper.OpenSpielWrapper(raw_environment)
    environment = wrappers.SinglePrecisionWrapper(
        environment)  # type: open_spiel_wrapper.OpenSpielWrapper
    environment_spec = acme.make_environment_spec(environment)

    # Build the networks.
    networks = []
    policy_networks = []
    for _ in range(environment.num_players):
        network = legal_actions.MaskedSequential([
            snt.Flatten(),
            snt.nets.MLP([50, 50, environment_spec.actions.num_values])
        ])
        policy_network = snt.Sequential([
            network,
            legal_actions.EpsilonGreedy(epsilon=0.1, threshold=-1e8)
        ])
        networks.append(network)
        policy_networks.append(policy_network)

    # Construct the agents.
    agents = []

    for network, policy_network in zip(networks, policy_networks):
        agents.append(
            dqn.DQN(environment_spec=environment_spec,
                    network=network,
                    policy_network=policy_network))

    # Run the environment loop.
    loop = open_spiel_environment_loop.OpenSpielEnvironmentLoop(
        environment, agents)
    loop.run(num_episodes=100000)
Example #12
def main(_):
    env = helpers.make_environment(level=FLAGS.level, oar_wrapper=True)
    env_spec = acme.make_environment_spec(env)

    config = impala.IMPALAConfig(
        batch_size=16,
        sequence_period=10,
        seed=FLAGS.seed,
    )

    networks = impala.make_atari_networks(env_spec)
    agent = impala.IMPALAFromConfig(
        environment_spec=env_spec,
        forward_fn=networks.forward_fn,
        unroll_init_fn=networks.unroll_init_fn,
        unroll_fn=networks.unroll_fn,
        initial_state_init_fn=networks.initial_state_init_fn,
        initial_state_fn=networks.initial_state_fn,
        config=config,
    )

    loop = acme.EnvironmentLoop(env, agent)
    loop.run(FLAGS.num_episodes)
Example #13
def get_env_agent():
    """Creates env and agent.

    Returns:
        env_acme (acme.wrappers.observation_action_reward.
            ObservationActionRewardWrapper).

        agent (acme.agents.tf.r2d2.agent.R2D2).
    """
    # Get environment
    env_acme = make_environmment()
    env_spec = acme.make_environment_spec(env_acme)

    # Create agent and network
    network = networks.R2D2AtariNetwork(env_spec.actions.num_values)
    agent = r2d2.R2D2(
        environment_spec=env_spec,
        network=network,
        burn_in_length=2,
        trace_length=6,
        replay_period=4,
    )

    return env_acme, agent
Example #14
def main(_):
    # Access flag value.
    level = FLAGS.task
    environment_factory = (
        lambda seed: helpers.make_environment(level=level, oar_wrapper=True))
    config = r2d2.R2D2Config()

    def net_factory(spec: specs.EnvironmentSpec):
        return r2d2_networks.make_atari_networks(config.batch_size,
                                                 env_spec=spec)

    env = environment_factory(False)
    env_spec = acme.make_environment_spec(env)

    program = r2d2.DistributedR2D2FromConfig(
        seed=0,
        environment_factory=environment_factory,
        network_factory=net_factory,
        config=config,
        num_actors=FLAGS.num_actors,
        environment_spec=env_spec,
    ).build()

    lp.launch(program, lp.LaunchType.LOCAL_MULTI_PROCESSING)
import acme
import ray
import reverb
import tensorflow as tf

from acme.tf import networks
from acme.tf import utils as tf2_utils

from acme_dist_toolkit.remote_actors import RemoteRecurrentActor
from acme_dist_toolkit.remote_variable_client import RemoteVariableClient
from acme_dist_toolkit.remote_environments import create_env_fns

# Set gpu config
gpus = tf.config.list_physical_devices(device_type='GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(device=gpu, enable=True)
tf.config.set_visible_devices(devices=gpus[-1], device_type='GPU')
ray.init()

# Create network
env = create_env_fns['r2d2_atari']('PongNoFrameskip-v4')
env_spec = acme.make_environment_spec(env)
network = networks.R2D2AtariNetwork(env_spec.actions.num_values)
tf2_utils.create_variables(network, [env_spec.observations])

# Create a variable replay buffer for sharing parameters
# between the learner and the actor
variable_server = reverb.Server(tables=[
    reverb.Table(name='variable_server',
                 sampler=reverb.selectors.Fifo(),
                 remover=reverb.selectors.Fifo(),
                 max_size=5,
                 rate_limiter=reverb.rate_limiters.MinSize(1)),
])
variable_server_address = f'localhost:{variable_server.port}'
variable_client = RemoteVariableClient.remote('variable_server',
                                              variable_server_address)
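How the learner publishes its weights into this table is not shown here. A minimal sketch, assuming the learner simply inserts its flattened variables into the 'variable_server' table through a plain reverb client (`push_variables` is a hypothetical helper):

import numpy as np

def push_variables(client: reverb.Client, variables) -> None:
    # The FIFO sampler/remover with max_size=5 keeps only the most recent
    # snapshots, so the priority value is irrelevant here.
    client.insert([np.asarray(v) for v in variables],
                  priorities={'variable_server': 1.0})

push_variables(reverb.Client(variable_server_address), network.variables)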
    def configure_agent(self, agent_ctor):
        return agent_ctor(make_environment_spec(self.train_env),
                          self.train_logger)