def run(self,
        steps: int,
        agent_ctor: Callable[[EnvironmentSpec, Logger], Tuple[Agent, Actor]],
        eval_every_steps: int = 10000):
    train_logger = PrefixedTensorBoardLogger(base_logger=self._logger, prefix='train')
    test_logger = PrefixedTensorBoardLogger(base_logger=self._logger, prefix='test')

    env_spec = make_environment_spec(self.train_env)
    agent, eval_actor = agent_ctor(env_spec, train_logger)

    # Snapshot the evaluation policy whenever it achieves a new best score.
    model_snapshotter = savers.Snapshotter(
        objects_to_save={'policy': eval_actor._network},
        directory=self._logdir)

    t = self._counter.get_counts()['steps']
    iterations = 0
    render_interval = 5
    best_mean_progress = -np.inf
    while t < steps:
        # Render only every `render_interval` evaluation rounds.
        should_render = t >= render_interval * eval_every_steps * iterations
        if should_render:
            iterations += 1

        test_result = self.test(eval_actor, render=should_render, timestep=t)
        if test_result['progress_mean'] > best_mean_progress:
            best_mean_progress = test_result['progress_mean']
            model_snapshotter.save()
        test_logger.write(test_result, step=t)

        self.train(steps=eval_every_steps, agent=agent,
                   counter=self._counter, logger=train_logger)
        t = self._counter.get_counts()['steps']

    self.train_env.environment.close()
    self.test_env.environment.close()
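A hedged usage sketch for run(): the agent_ctor must return an (agent, eval_actor) pair whose evaluation actor exposes a `_network` attribute for the snapshotter. All helper names below are hypothetical, not the original code.

# Hypothetical agent constructor matching run()'s expected signature.
def my_agent_ctor(spec: EnvironmentSpec, logger: Logger) -> Tuple[Agent, Actor]:
    network = build_network(spec)               # hypothetical helper
    agent = build_agent(spec, network, logger)  # hypothetical helper
    eval_actor = build_eval_actor(network)      # must expose `._network`
    return agent, eval_actor

experiment.run(steps=1_000_000, agent_ctor=my_agent_ctor, eval_every_steps=10_000)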
def main(_):
    env = helpers.make_environment(FLAGS.level)
    env_spec = acme.make_environment_spec(env)
    network = networks.DQNAtariNetwork(env_spec.actions.num_values)
    agent = dqn.DQN(env_spec, network)
    loop = acme.EnvironmentLoop(env, agent)
    loop.run(FLAGS.num_episodes)
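A hedged sketch of the absl-flags boilerplate this main() assumes; the flag names match the usages above, but the defaults and help strings are illustrative assumptions.

from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('level', 'PongNoFrameskip-v4', 'Which Atari level to play.')
flags.DEFINE_integer('num_episodes', 1000, 'Number of episodes to run for.')

if __name__ == '__main__':
    app.run(main)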
def pong_experiment():
    seeds = [0, 42, 69, 360, 420]
    seed = seeds[0]
    start_time = time.strftime("%Y-%m-%d_%H-%M-%S")

    # Set the torch random seed so network initialization is reproducible.
    torch.random.manual_seed(seed)

    # Create the training and evaluation environments.
    env_name = "PongNoFrameskip-v4"
    env_train = make_environment_atari(env_name, seed)
    env_test = make_environment_atari(env_name, seed)
    env_train_spec = acme.make_environment_spec(env_train)

    # Create the neural network.
    network = VanillaNetwork(env_train_spec.observations[0].shape,
                             env_train_spec.actions[0].num_values)

    # Create the loggers.
    training_logger = TensorBoardLogger(
        "runs/DQN-train-" + env_name + f"-rnd{seed}-" + start_time)
    testing_logger = TensorBoardLogger(
        "runs/DQN-test-" + env_name + f"-rnd{seed}-" + start_time)

    # Create the agent.
    agent = VanillaPartialDQN(network,
                              env_train_spec.actions[0].num_values,
                              training_logger,
                              gradient_clipping=True,
                              device='gpu',
                              seed=seed)

    training_loop = acme.EnvironmentLoop(env_train, agent, logger=training_logger)
    testing_loop = acme.EnvironmentLoop(env_test, agent, logger=testing_logger,
                                        should_update=False)

    # Alternate 250k training steps with a 30-episode evaluation,
    # checkpointing the network weights after every epoch.
    for epoch in range(200):
        agent.training()
        training_loop.run(num_steps=250000)
        torch.save(network.state_dict(),
                   "runs/DQN-train-" + env_name + f"-rnd{seed}-" + start_time
                   + f"/ep{epoch}.model")
        agent.testing()
        testing_loop.run(num_episodes=30)

    training_logger.close()
    testing_logger.close()
def cartpole_experiment():
    seeds = [0, 42, 69, 360, 420]
    seed = seeds[3]
    torch.random.manual_seed(seed)
    start_time = time.strftime("%Y-%m-%d_%H-%M-%S")

    # Create the training and evaluation environments.
    env_name = "CartPole-v0"
    env_train = make_environment(env_name, seed)
    env_test = make_environment(env_name, seed)
    env_train_spec = acme.make_environment_spec(env_train)

    # Create the neural network.
    from torch import nn
    network = nn.Sequential(
        nn.Linear(*env_train_spec.observations.shape, 64),
        nn.ReLU(),
        nn.Linear(64, env_train_spec.actions.num_values))

    # Create the loggers.
    training_logger = TensorBoardLogger(
        "runs/DQN-train-" + env_name + f"-rnd{seed}-" + start_time)
    testing_logger = TensorBoardLogger(
        "runs/DQN-test-" + env_name + f"-rnd{seed}-" + start_time)

    # Create the agent.
    agent = VanillaDQN(
        network,
        env_train_spec.actions.num_values,
        training_logger,
        device='gpu',
        seed=seed,
        gradient_clipping=True,
        reward_clipping=True,
        learning_rate=0.000025,
        gamma=1,
    )

    training_loop = acme.EnvironmentLoop(env_train, agent, logger=training_logger)
    testing_loop = acme.EnvironmentLoop(env_test, agent, logger=testing_logger,
                                        should_update=False)

    # Alternate 250k training steps with a 30-episode evaluation, saving
    # the network weights after every epoch.
    for epoch in range(200):
        agent.training()
        training_loop.run(num_steps=250000)
        torch.save(network.state_dict(),
                   "runs/DQN-train-" + env_name + f"-rnd{seed}-" + start_time
                   + f"/ep{epoch}.model")
        agent.testing()
        testing_loop.run(num_episodes=30)

    training_logger.close()
    testing_logger.close()
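A minimal sketch of the make_environment helper assumed by this experiment, built from Acme's standard Gym wrappers; the original helper may differ (make_environment_atari in the Pong experiment likely adds Atari frame preprocessing on top of this pattern).

import gym
from acme import wrappers

def make_environment(env_name: str, seed: int):
    env = gym.make(env_name)
    env.seed(seed)
    # Convert the Gym environment to dm_env and cast to single precision.
    env = wrappers.GymWrapper(env)
    return wrappers.SinglePrecisionWrapper(env)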
def main(_):
    env = helpers.make_environment(level=FLAGS.level, oar_wrapper=True)
    env_spec = acme.make_environment_spec(env)
    config = r2d2.R2D2Config(batch_size=16,
                             trace_length=20,
                             burn_in_length=10,
                             sequence_period=10)
    agent = r2d2.R2D2(env_spec,
                      networks=r2d2.make_atari_networks(config.batch_size, env_spec),
                      config=config,
                      seed=FLAGS.seed)
    loop = acme.EnvironmentLoop(env, agent)
    loop.run(FLAGS.num_episodes)
def __init__(
    self,
    actor_id,
    environment_module,
    environment_fn_name,
    environment_kwargs,
    network_module,
    network_fn_name,
    network_kwargs,
    adder_module,
    adder_fn_name,
    adder_kwargs,
    replay_server_address,
    variable_server_name,
    variable_server_address,
    counter: counting.Counter = None,
    logger: loggers.Logger = None,
):
    # Counter and logger.
    self._actor_id = actor_id
    self._counter = counter or counting.Counter()
    self._logger = logger or loggers.make_default_logger(f'actor_{actor_id}')

    # Create the environment.
    self._environment = getattr(environment_module,
                                environment_fn_name)(**environment_kwargs)
    env_spec = acme.make_environment_spec(self._environment)

    # Create the actor's network.
    self._network = getattr(network_module, network_fn_name)(**network_kwargs)
    tf2_utils.create_variables(self._network, [env_spec.observations])
    self._variables = tree.flatten(self._network.variables)
    self._policy = tf.function(self._network)

    # The adder is used to insert observations into replay.
    self._adder = getattr(adder_module, adder_fn_name)(
        reverb.Client(replay_server_address), **adder_kwargs)

    # Dataset over the variable server, used to fetch the latest parameters.
    variable_client = reverb.TFClient(variable_server_address)
    self._variable_dataset = variable_client.dataset(
        table=variable_server_name,
        dtypes=[tf.float32 for _ in self._variables],
        shapes=[v.shape for v in self._variables])
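A hedged sketch of how this actor might refresh its weights from the variable dataset: take one sample from the Reverb table and assign it into the local network variables. The method name and the exact dataset semantics are assumptions, not the original class.

def update(self):
    # Fetch the most recently inserted parameter set (hypothetical sync step).
    sample = next(iter(self._variable_dataset.take(1)))
    for variable, value in zip(self._variables, sample.data):
        variable.assign(value)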
def get_env_agent(): """Create env and agent. Returns: env_acme (acme.wrappers.observation_action_reward. ObservationActionRewardWrappe). agent (acme.agents.tf.dqn.agent.DQN). """ # Get environment env_acme = make_environmment() env_spec = acme.make_environment_spec(env_acme) # Create agent and network network = networks.DQNAtariNetwork(env_spec.actions.num_values) agent = dqn.DQN(env_spec, network, checkpoint_subpath="./acme") return env_acme, agent
def train(network=None, expert_data_path=None):
    env = make_env()
    env_spec = acme.make_environment_spec(env)
    if network is None:
        network = make_dqn(env_spec.actions.num_values)

    # Optionally load expert demonstrations to seed the replay buffer.
    expert_data = None
    if expert_data_path is not None:
        with open(expert_data_path, "rb") as handle:
            expert_data = pickle.load(handle)
        num_timesteps = np.sum([1 + len(ep["mid"]) for ep in expert_data])
        print(f"Using expert data from {expert_data_path}. "
              f"Episodes: {len(expert_data)}. Timesteps: {num_timesteps}.")

    agent = DQNAgent(environment_spec=env_spec,
                     network=network,
                     batch_size=32,
                     learning_rate=1e-4,
                     logger=loggers.NoOpLogger(),
                     min_replay_size=1000,
                     max_replay_size=int(1e5),
                     target_update_period=2500,
                     epsilon=tf.Variable(0.025),
                     n_step=20,
                     discount=0.97,
                     expert_data=expert_data)

    loop = EnvironmentLoop(environment=env, actor=agent, module2save=network)
    reward_history = loop.run(num_steps=int(1e6),
                              render=True,
                              checkpoint=True,
                              checkpoint_freq=15)

    # Plot a 50-episode moving average of the returns.
    avg_hist = [np.mean(reward_history[i:(i + 50)])
                for i in range(len(reward_history) - 50)]
    plt.plot(list(range(len(avg_hist))), avg_hist)
    plt.show()

    env.close()
    return network
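A hedged note on the expert-data format implied by the timestep count above: a pickled list of episode dicts where ep["mid"] holds the non-initial transitions, hence the `1 +` for each episode's first timestep. The structure sketched here is inferred, not confirmed.

# expert_data: List[Dict]; structure inferred from train(), illustrative only.
# [{'first': <initial timestep>, 'mid': [<transition>, ...]}, ...]
trained_network = train(expert_data_path="expert_data.pkl")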
def test_loop_run(self):
    raw_env = rl_environment.Environment('tic_tac_toe')
    env = open_spiel_wrapper.OpenSpielWrapper(raw_env)
    env = wrappers.SinglePrecisionWrapper(env)
    environment_spec = acme.make_environment_spec(env)

    actors = []
    for _ in range(env.num_players):
        actors.append(RandomActor(environment_spec))

    loop = open_spiel_environment_loop.OpenSpielEnvironmentLoop(env, actors)
    result = loop.run_episode()
    self.assertIn('episode_length', result)
    self.assertIn('episode_return', result)
    self.assertIn('steps_per_second', result)

    loop.run(num_episodes=10)
    loop.run(num_steps=100)
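A hedged sketch of the RandomActor exercised by this test: a minimal acme.core.Actor that samples uniformly over the legal actions exposed by the OpenSpiel observation (an OLT namedtuple carrying a legal_actions mask). The details are assumptions, not the original class.

import numpy as np
from acme import core

class RandomActor(core.Actor):
    def __init__(self, spec):
        self._spec = spec

    def select_action(self, observation):
        # Sample uniformly among the actions flagged legal in the mask.
        legal = np.flatnonzero(observation.legal_actions)
        return np.int32(np.random.choice(legal))

    def observe_first(self, timestep):
        pass

    def observe(self, action, next_timestep):
        pass

    def update(self, wait=False):
        pass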
def test_agent(self):
    env_factory = lambda seed: fakes.fake_atari_wrapped(oar_wrapper=True)

    config = r2d2.R2D2Config(batch_size=1,
                             trace_length=5,
                             sequence_period=1,
                             samples_per_insert=1.,
                             min_replay_size=32,
                             burn_in_length=1,
                             prefetch_size=2,
                             target_update_period=2500,
                             max_replay_size=100_000,
                             importance_sampling_exponent=0.6,
                             priority_exponent=0.9,
                             max_priority_weight=0.9,
                             bootstrap_n=5,
                             clip_rewards=False,
                             variable_update_period=400)

    dummy_seed = 1
    agent = r2d2.DistributedR2D2FromConfig(
        environment_factory=env_factory,
        environment_spec=acme.make_environment_spec(env_factory(dummy_seed)),
        network_factory=functools.partial(r2d2.make_atari_networks,
                                          config.batch_size),
        config=config,
        seed=0,
        num_actors=1,
    )

    program = agent.build()
    (learner_node,) = program.groups['learner']
    learner_node.disable_run()  # pytype: disable=attribute-error

    lp.launch(program, launch_type='test_mt')

    learner: acme.Learner = learner_node.create_handle().dereference()
    for _ in range(5):
        learner.step()
def main(_):
    # Create an environment and grab the spec.
    env_configs = {'players': FLAGS.num_players} if FLAGS.num_players else {}
    raw_environment = rl_environment.Environment(FLAGS.game, **env_configs)
    environment = open_spiel_wrapper.OpenSpielWrapper(raw_environment)
    environment = wrappers.SinglePrecisionWrapper(
        environment)  # type: open_spiel_wrapper.OpenSpielWrapper
    environment_spec = acme.make_environment_spec(environment)

    # Build the networks.
    networks = []
    policy_networks = []
    for _ in range(environment.num_players):
        network = legal_actions.MaskedSequential([
            snt.Flatten(),
            snt.nets.MLP([50, 50, environment_spec.actions.num_values])
        ])
        policy_network = snt.Sequential([
            network,
            legal_actions.EpsilonGreedy(epsilon=0.1, threshold=-1e8)
        ])
        networks.append(network)
        policy_networks.append(policy_network)

    # Construct the agents.
    agents = []
    for network, policy_network in zip(networks, policy_networks):
        agents.append(
            dqn.DQN(environment_spec=environment_spec,
                    network=network,
                    policy_network=policy_network))

    # Run the environment loop.
    loop = open_spiel_environment_loop.OpenSpielEnvironmentLoop(environment, agents)
    loop.run(num_episodes=100000)
def main(_):
    env = helpers.make_environment(level=FLAGS.level, oar_wrapper=True)
    env_spec = acme.make_environment_spec(env)

    config = impala.IMPALAConfig(
        batch_size=16,
        sequence_period=10,
        seed=FLAGS.seed,
    )

    networks = impala.make_atari_networks(env_spec)
    agent = impala.IMPALAFromConfig(
        environment_spec=env_spec,
        forward_fn=networks.forward_fn,
        unroll_init_fn=networks.unroll_init_fn,
        unroll_fn=networks.unroll_fn,
        initial_state_init_fn=networks.initial_state_init_fn,
        initial_state_fn=networks.initial_state_fn,
        config=config,
    )

    loop = acme.EnvironmentLoop(env, agent)
    loop.run(FLAGS.num_episodes)
def get_env_agent(): """Creates env and agent. Returns: env_acme (acme.wrappers.observation_action_reward. ObservationActionRewardWrappe). agent (acme.agents.tf.r2d2.agent.R2D2). """ # Get environment env_acme = make_environmment() env_spec = acme.make_environment_spec(env_acme) # Create agent and network network = networks.R2D2AtariNetwork(env_spec.actions.num_values) agent = r2d2.R2D2( environment_spec=env_spec, network=network, burn_in_length=2, trace_length=6, replay_period=4, ) return env_acme, agent
def main(_):
    # Access flag value.
    level = FLAGS.task
    environment_factory = (
        lambda seed: helpers.make_environment(level=level, oar_wrapper=True))

    config = r2d2.R2D2Config()

    def net_factory(spec: specs.EnvironmentSpec):
        return r2d2_networks.make_atari_networks(config.batch_size, env_spec=spec)

    # Build a throwaway environment just to extract the spec; the seed
    # argument is unused by this factory.
    env = environment_factory(False)
    env_spec = acme.make_environment_spec(env)

    program = r2d2.DistributedR2D2FromConfig(
        seed=0,
        environment_factory=environment_factory,
        network_factory=net_factory,
        config=config,
        num_actors=FLAGS.num_actors,
        environment_spec=env_spec,
    ).build()

    lp.launch(program, lp.LaunchType.LOCAL_MULTI_PROCESSING)
from acme.tf import utils as tf2_utils
from acme_dist_toolkit.remote_actors import RemoteRecurrentActor
from acme_dist_toolkit.remote_variable_client import RemoteVariableClient
from acme_dist_toolkit.remote_environments import create_env_fns

# Set GPU config.
gpus = tf.config.list_physical_devices(device_type='GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(device=gpu, enable=True)
tf.config.set_visible_devices(devices=gpus[-1], device_type='GPU')

ray.init()

# Create network.
env = create_env_fns['r2d2_atari']('PongNoFrameskip-v4')
env_spec = acme.make_environment_spec(env)
network = networks.R2D2AtariNetwork(env_spec.actions.num_values)
tf2_utils.create_variables(network, [env_spec.observations])

# Create a variable replay buffer for sharing parameters
# between the learner and the actor.
variable_server = reverb.Server(tables=[
    reverb.Table(name='variable_server',
                 sampler=reverb.selectors.Fifo(),
                 remover=reverb.selectors.Fifo(),
                 max_size=5,
                 rate_limiter=reverb.rate_limiters.MinSize(1)),
])
variable_server_address = f'localhost:{variable_server.port}'
variable_client = RemoteVariableClient.remote('variable_server',
                                              variable_server_address)
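A hedged sketch of the learner-side counterpart: publishing the current network weights into the 'variable_server' table so remote actors can fetch them. The call follows reverb.Client.insert; the priority value is arbitrary here.

# Publish the flattened network variables to the FIFO variable table.
learner_client = reverb.Client(variable_server_address)
learner_client.insert([v.numpy() for v in network.variables],
                      priorities={'variable_server': 1.0})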
def configure_agent(self, agent_ctor):
    return agent_ctor(make_environment_spec(self.train_env), self.train_logger)