def run(bsuite_id: str) -> str:
  """Runs an A2C agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )
  obs_spec = env.observation_spec()
  action_spec = env.action_spec()
  num_actions = action_spec.num_values  # Reuse the bound spec rather than querying the env again.

  hidden_sizes = [FLAGS.num_units] * FLAGS.num_hidden_layers
  network = actor_critic_rnn.PolicyValueRNN(hidden_sizes, num_actions)
  agent = actor_critic_rnn.ActorCriticRNN(
      obs_spec=obs_spec,
      action_spec=action_spec,
      network=network,
      optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
      max_sequence_length=FLAGS.sequence_length,
      td_lambda=FLAGS.td_lambda,
      discount=FLAGS.discount,
      seed=FLAGS.seed,
  )

  num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=num_episodes,
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: str) -> str:
  """Runs an A2C agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  num_actions = env.action_spec().num_values
  hidden_sizes = [FLAGS.num_units] * FLAGS.num_hidden_layers
  network = actor_critic_rnn.PolicyValueRNN(hidden_sizes, num_actions)
  agent = actor_critic_rnn.ActorCriticRNN(
      obs_spec=env.observation_spec(),
      action_spec=env.action_spec(),
      network=network,
      optimizer=tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate),
      sequence_length=FLAGS.sequence_length,
      td_lambda=FLAGS.td_lambda,
      agent_discount=FLAGS.agent_discount,
      seed=FLAGS.seed,
  )

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: str) -> str:
  """Runs an A2C agent on a single bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  hidden_sizes = [FLAGS.num_units] * FLAGS.num_hidden_layers
  network = actor_critic.PolicyValueNet(
      hidden_sizes=hidden_sizes,
      action_spec=env.action_spec(),
  )
  agent = actor_critic.ActorCritic(
      obs_spec=env.observation_spec(),
      action_spec=env.action_spec(),
      network=network,
      optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
      sequence_length=FLAGS.sequence_length,
      td_lambda=FLAGS.td_lambda,
      discount=FLAGS.discount,
      seed=FLAGS.seed,
  )

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
      verbose=FLAGS.verbose)

  return bsuite_id

def test_run(self, bsuite_id: str):
  env = bsuite.load_from_id(bsuite_id)
  agent = actor_critic.default_agent(
      env.observation_spec(), env.action_spec())
  experiment.run(agent=agent, environment=env, num_episodes=5)

def run_random_agent(bsuite_id, save_path=SAVE_PATH_RAND, overwrite=True):
  env = bsuite.load_and_record(bsuite_id, save_path, overwrite=overwrite)
  print('bsuite_id={}, settings={}, num_episodes={}'.format(
      bsuite_id, sweep.SETTINGS[bsuite_id], env.bsuite_num_episodes))
  agent = random.default_agent(
      obs_spec=env.observation_spec(), action_spec=env.action_spec())
  experiment.run(agent, env, num_episodes=env.bsuite_num_episodes)

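# Illustrative sketch (not from the original runners): since each bsuite_id is
# an independent experiment, a per-environment runner like `run_random_agent`
# above can simply be mapped over the full benchmark. `sweep.SWEEP` is
# bsuite's list of all bsuite_ids; the helper name and pool size here are
# arbitrary choices for illustration.
from multiprocessing import Pool

from bsuite import sweep


def run_full_sweep(num_workers=4):
  # The sweep is embarrassingly parallel: one process per experiment,
  # each writing its own CSV results under SAVE_PATH_RAND.
  with Pool(num_workers) as pool:
    pool.map(run_random_agent, sweep.SWEEP)
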
def run(bsuite_id: Text) -> Text:
  """Runs a DQN agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  online_network = dqn.MLP(env.action_spec().num_values)
  target_network = dqn.MLP(env.action_spec().num_values)
  agent = dqn.Dqn(
      obs_spec=env.observation_spec(),
      action_spec=env.action_spec(),
      online_network=online_network,
      target_network=target_network,
      batch_size=FLAGS.batch_size,
      discount=FLAGS.discount,
      replay_capacity=FLAGS.replay_capacity,
      min_replay_size=FLAGS.min_replay_size,
      sgd_period=FLAGS.sgd_period,
      target_update_period=FLAGS.target_update_period,
      optimizer=tf.optimizers.Adam(learning_rate=FLAGS.learning_rate),
      seed=FLAGS.seed,
  )

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
      verbose=FLAGS.verbose)

  return bsuite_id

def test_run(self, bsuite_id: str):
  env = bsuite.load_from_id(bsuite_id)
  agent = boot_dqn.default_agent(
      env.observation_spec(), env.action_spec(), num_ensemble=2)
  experiment.run(agent=agent, environment=env, num_episodes=5)

def run_agent(bsuite_id, save_path=SAVE_PATH_RAND, overwrite=True):
  # Load environment
  env = bsuite.load_and_record(bsuite_id, save_path, overwrite=overwrite)
  agent = actor_critic_rnn.default_agent(
      obs_spec=env.observation_spec(),
      action_spec=env.action_spec(),
  )
  experiment.run(agent, env, num_episodes=env.bsuite_num_episodes)

def run_agent(bsuite_id, save_path=SAVE_PATH_RAND, overwrite=True):
  # Load environment
  env = bsuite.load_and_record(bsuite_id, save_path, overwrite=overwrite)
  print('bsuite_id={}, settings={}, num_episodes={}, start={}'.format(
      bsuite_id, sweep.SETTINGS[bsuite_id], env.bsuite_num_episodes,
      datetime.now().strftime('%H:%M:%S')))
  agent = DQNTF2.default_agent(
      obs_spec=env.observation_spec(), action_spec=env.action_spec())
  experiment.run(agent, env, num_episodes=env.bsuite_num_episodes)

def run(bsuite_id: str) -> str:
  """Runs a bootstrapped DQN agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )
  action_spec = env.action_spec()

  # Define network.
  prior_scale = 5.
  hidden_sizes = [50, 50]

  def network(inputs: jnp.ndarray) -> jnp.ndarray:
    """Simple Q-network with randomized prior function."""
    net = hk.nets.MLP([*hidden_sizes, action_spec.num_values])
    prior_net = hk.nets.MLP([*hidden_sizes, action_spec.num_values])
    x = hk.Flatten()(inputs)
    # The prior network is fixed: stop_gradient ensures only `net` is trained.
    return net(x) + prior_scale * lax.stop_gradient(prior_net(x))

  optimizer = optix.adam(learning_rate=1e-3)

  agent = boot_dqn.BootstrappedDqn(
      obs_spec=env.observation_spec(),
      action_spec=action_spec,
      network=network,
      optimizer=optimizer,
      num_ensemble=FLAGS.num_ensemble,
      batch_size=128,
      discount=.99,
      replay_capacity=10000,
      min_replay_size=128,
      sgd_period=1,
      target_update_period=4,
      mask_prob=1.0,
      noise_scale=0.,
  )

  num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=num_episodes,
      verbose=FLAGS.verbose)

  return bsuite_id

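# Illustrative sketch (not part of the runner above): how a Haiku network
# function like `network` is typically materialized into parameters and a pure
# apply function. The agent presumably performs the equivalent transform
# internally; the observation shape below is hypothetical.
import haiku as hk
import jax
import jax.numpy as jnp

# No RNG is needed at apply time since the MLPs are deterministic.
transformed = hk.without_apply_rng(hk.transform(network))

dummy_obs = jnp.zeros((1, 10, 5))  # Hypothetical [batch, height, width] input.
params = transformed.init(jax.random.PRNGKey(42), dummy_obs)
q_values = transformed.apply(params, dummy_obs)  # Shape: [batch, num_actions].
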
def run(bsuite_id: Text) -> Text:
  """Runs a BDQN agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  ensemble = boot_dqn.make_ensemble(
      num_actions=env.action_spec().num_values,
      num_ensemble=FLAGS.num_ensemble,
      num_hidden_layers=FLAGS.num_hidden_layers,
      num_units=FLAGS.num_units,
      prior_scale=FLAGS.prior_scale)
  target_ensemble = boot_dqn.make_ensemble(
      num_actions=env.action_spec().num_values,
      num_ensemble=FLAGS.num_ensemble,
      num_hidden_layers=FLAGS.num_hidden_layers,
      num_units=FLAGS.num_units,
      prior_scale=FLAGS.prior_scale)

  agent = boot_dqn.BootstrappedDqn(
      obs_spec=env.observation_spec(),
      action_spec=env.action_spec(),
      ensemble=ensemble,
      target_ensemble=target_ensemble,
      batch_size=FLAGS.batch_size,
      agent_discount=FLAGS.agent_discount,
      replay_capacity=FLAGS.replay_capacity,
      min_replay_size=FLAGS.min_replay_size,
      sgd_period=FLAGS.sgd_period,
      target_update_period=FLAGS.target_update_period,
      optimizer=tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate),
      mask_prob=FLAGS.mask_prob,
      noise_scale=FLAGS.noise_scale,
      epsilon_fn=lambda x: FLAGS.epsilon,
      seed=FLAGS.seed)

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: Text) -> Text:
  """Runs the agent against the environment specified by `bsuite_id`."""
  # Load the environment; here we opt for CSV logging.
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  # Making the networks.
  hidden_units = [FLAGS.num_units] * FLAGS.num_hidden_layers
  online_network = snt.Sequential([
      snt.BatchFlatten(),
      snt.nets.MLP(hidden_units + [env.action_spec().num_values]),
  ])
  target_network = snt.Sequential([
      snt.BatchFlatten(),
      snt.nets.MLP(hidden_units + [env.action_spec().num_values]),
  ])

  agent = dqn.DQN(
      obs_spec=env.observation_spec(),
      action_spec=env.action_spec(),
      online_network=online_network,
      target_network=target_network,
      batch_size=FLAGS.batch_size,
      discount=FLAGS.discount,
      replay_capacity=FLAGS.replay_capacity,
      min_replay_size=FLAGS.min_replay_size,
      sgd_period=FLAGS.sgd_period,
      target_update_period=FLAGS.target_update_period,
      optimizer=tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate),
      epsilon=FLAGS.epsilon,
      seed=FLAGS.seed,
  )

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: str) -> str:
  """Runs a Q-learning agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  agent = q_learning.QLearning(env)

  num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=num_episodes,
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: str) -> str:
  """Runs a DQN agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  # Making the networks.
  hidden_units = [FLAGS.num_units] * FLAGS.num_hidden_layers
  online_network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_units + [env.action_spec().num_values]),
  ])
  target_network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_units + [env.action_spec().num_values]),
  ])

  agent = dqn.DQNTF2(
      action_spec=env.action_spec(),
      online_network=online_network,
      target_network=target_network,
      batch_size=FLAGS.batch_size,
      discount=FLAGS.discount,
      replay_capacity=FLAGS.replay_capacity,
      min_replay_size=FLAGS.min_replay_size,
      sgd_period=FLAGS.sgd_period,
      target_update_period=FLAGS.target_update_period,
      optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
      epsilon=FLAGS.epsilon,
      seed=FLAGS.seed,
  )

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: str) -> str:
  """Runs a DQN agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  # Build the Q-network as a stax (init_fn, apply_fn) pair.
  layers = [stax.Flatten]
  for _ in range(FLAGS.num_hidden_layers):
    layers.append(stax.Dense(FLAGS.num_units))
    layers.append(stax.Relu)
  layers.append(stax.Dense(env.action_spec().num_values))
  network_init, network = stax.serial(*layers)

  # `random` here is jax.random; initialize parameters for a batched input.
  _, network_params = network_init(
      random.PRNGKey(seed=1), (-1,) + env.observation_spec().shape)

  agent = dqn.DQNJAX(
      action_spec=env.action_spec(),
      network=network,
      parameters=network_params,
      batch_size=FLAGS.batch_size,
      discount=FLAGS.discount,
      replay_capacity=FLAGS.replay_capacity,
      min_replay_size=FLAGS.min_replay_size,
      sgd_period=FLAGS.sgd_period,
      target_update_period=FLAGS.target_update_period,
      learning_rate=FLAGS.learning_rate,
      epsilon=FLAGS.epsilon,
      seed=FLAGS.seed,
  )

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: str) -> str:
  """Runs a DQN agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  agent = dqn.default_agent(env.observation_spec(), env.action_spec())

  num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=num_episodes,
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: Text) -> Text:
  """Runs a random agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  agent = random.default_agent(
      obs_spec=env.observation_spec(),
      action_spec=env.action_spec(),
      seed=FLAGS.seed)

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: Text) -> Text:
  """Runs an ISL agent on a given bsuite environment."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )
  num_actions = env.action_spec().num_values

  # Making the NNs (q, rho and l).
  hidden_units = [FLAGS.num_units] * FLAGS.num_hidden_layers
  q_network = snt.Sequential([
      snt.BatchFlatten(),
      snt.nets.MLP(hidden_units + [num_actions]),
  ])
  target_q_network = snt.Sequential([
      snt.BatchFlatten(),
      snt.nets.MLP(hidden_units + [num_actions]),
  ])
  rho_network = snt.Sequential([
      snt.BatchFlatten(),
      snt.nets.MLP(hidden_units + [num_actions]),
  ])

  def make_l_network():
    # Scalar network whose output is squashed into [min_l, max_l] by a
    # scaled sigmoid.
    return snt.Sequential([
        snt.BatchFlatten(),
        snt.nets.MLP(hidden_units, activate_final=True,
                     initializers={'b': tf.constant_initializer(0)}),
        snt.Linear(1, initializers={'b': tf.constant_initializer(0)}),
        lambda x: (FLAGS.max_l - FLAGS.min_l) * tf.math.sigmoid(x) + FLAGS.min_l,
    ])

  # One l-network (and target) per (approximator, action) pair.
  l_network = [[make_l_network() for _ in range(num_actions)]
               for _ in range(FLAGS.l_approximators)]
  target_l_network = [[make_l_network() for _ in range(num_actions)]
                      for _ in range(FLAGS.l_approximators)]

  agent = isl.ISL(
      obs_spec=env.observation_spec(),
      action_spec=env.action_spec(),
      q_network=q_network,
      target_q_network=target_q_network,
      rho_network=rho_network,
      l_network=l_network,
      target_l_network=target_l_network,
      batch_size=FLAGS.batch_size,
      discount=FLAGS.agent_discount,
      replay_capacity=FLAGS.replay_capacity,
      min_replay_size=FLAGS.min_replay_size,
      sgd_period=FLAGS.sgd_period,
      target_update_period=FLAGS.target_update_period,
      optimizer_primal=tf.train.AdamOptimizer(learning_rate=FLAGS.q_learning_rate),
      optimizer_dual=tf.train.AdamOptimizer(learning_rate=FLAGS.rho_learning_rate),
      optimizer_l=tf.train.AdamOptimizer(learning_rate=FLAGS.l_learning_rate),
      learn_iters=FLAGS.learn_iters,
      l_approximators=FLAGS.l_approximators,
      min_l=FLAGS.min_l,
      kappa=FLAGS.kappa,
      eta1=FLAGS.eta1,
      eta2=FLAGS.eta2,
      seed=FLAGS.seed)

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,
      verbose=FLAGS.verbose)

  return bsuite_id

def run(bsuite_id: str) -> str:
    """Runs a bsuite experiment and saves the results as CSV files.

    Args:
        bsuite_id: the id of the bsuite experiment to run.

    Returns:
        The bsuite_id that was run.
    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=save_path,
        logging_mode='csv',
        overwrite=True,
    )

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Settings for the neural network
    qnet_settings = {
        'layers_sizes': [50],
        'batch_size': 64,
        'noisy_nets': True,
        'distributional': True,
        'vmin': 0,
        'vmax': 1000,
        'number_atoms': 51,
    }

    # Settings for the specific agent
    settings = {
        'batch_size': qnet_settings["batch_size"],
        'epsilon_start': 0.0,
        'epsilon_decay': 0.00,
        'epsilon_min': 0.00,
        'gamma': 0.99,
        'buffer_size': 2**16,
        'lr': 1e-3,
        'qnet_settings': qnet_settings,
        'start_optimization': 64,
        'update_qnet_every': 2,
        'update_target_every': 50,
        'ddqn': True,
        'n_steps': 4,
        'duelling_dqn': True,
        'prioritized_buffer': True,
        'alpha': 0.6,
        'beta0': 0.4,
        'beta_increment': 1e-6,
    }

    agent = Agent(
        action_spec=env.action_spec(),
        observation_spec=env.observation_spec(),
        device=device,
        settings=settings,
    )
    experiment.run(
        agent=agent, environment=env,
        num_episodes=env.bsuite_num_episodes, verbose=False)
    return bsuite_id
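
# Illustrative sketch (not from any single runner above): all of these `run`
# functions share the same entry-point pattern, a `main` that resolves a
# --bsuite_id flag against the sweep and dispatches. The flag name and the
# 'SWEEP' convention here are assumptions modeled on the bsuite baselines.
from absl import app
from absl import flags
from bsuite import sweep

flags.DEFINE_string('bsuite_id', 'catch/0',
                    'A bsuite_id such as "catch/0", or "SWEEP" to run all.')
FLAGS = flags.FLAGS


def main(_):
  if FLAGS.bsuite_id == 'SWEEP':
    for bsuite_id in sweep.SWEEP:  # Run every experiment sequentially.
      run(bsuite_id)
  elif FLAGS.bsuite_id in sweep.SWEEP:
    run(FLAGS.bsuite_id)
  else:
    raise ValueError('Unknown bsuite_id: {}'.format(FLAGS.bsuite_id))


if __name__ == '__main__':
  app.run(main)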