def run(bsuite_id: str) -> str:
    """Trains an A2C agent on one bsuite environment, logging results to CSV.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.

    Returns:
        The `bsuite_id` that was run, for bookkeeping by the caller.
    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    # Shared torso: `num_hidden_layers` layers of `num_units` each.
    layer_sizes = [FLAGS.num_units] * FLAGS.num_hidden_layers
    policy_value_net = actor_critic.PolicyValueNet(
        hidden_sizes=layer_sizes,
        action_spec=env.action_spec(),
    )
    a2c_agent = actor_critic.ActorCritic(
        obs_spec=env.observation_spec(),
        action_spec=env.action_spec(),
        network=policy_value_net,
        optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
        max_sequence_length=FLAGS.sequence_length,
        td_lambda=FLAGS.td_lambda,
        discount=FLAGS.discount,
        seed=FLAGS.seed,
    )

    # Fall back to the environment's suggested budget when the flag is unset.
    episode_budget = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(
        agent=a2c_agent,
        environment=env,
        num_episodes=episode_budget,
        verbose=FLAGS.verbose)
    return bsuite_id
def run_random_agent(bsuite_id, save_path=SAVE_PATH_RAND, overwrite=True):
    """Runs a uniformly random agent on one bsuite environment, recording results.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.
        save_path: Directory where bsuite records results.
        overwrite: Whether existing results may be overwritten.
    """
    environment = bsuite.load_and_record(bsuite_id, save_path, overwrite=overwrite)
    print('bsuite_id={}, settings={}, num_episodes={}'.format(
        bsuite_id, sweep.SETTINGS[bsuite_id], environment.bsuite_num_episodes))
    # NOTE: `random` here is the project's random-agent module, not the stdlib.
    rand_agent = random.default_agent(
        obs_spec=environment.observation_spec(),
        action_spec=environment.action_spec())
    experiment.run(rand_agent, environment,
                   num_episodes=environment.bsuite_num_episodes)
def run_agent(bsuite_id, save_path=SAVE_PATH_RAND, overwrite=True):
    """Runs the default recurrent actor-critic agent on one bsuite environment.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.
        save_path: Directory where bsuite records results.
        overwrite: Whether existing results may be overwritten.
    """
    # Load (and record) the environment.
    environment = bsuite.load_and_record(bsuite_id, save_path, overwrite=overwrite)
    rnn_agent = actor_critic_rnn.default_agent(
        obs_spec=environment.observation_spec(),
        action_spec=environment.action_spec(),
    )
    experiment.run(rnn_agent, environment,
                   num_episodes=environment.bsuite_num_episodes)
def run_agent(bsuite_id, save_path=SAVE_PATH_RAND, overwrite=True):
    """Runs the default TF2 DQN agent on one bsuite environment.

    NOTE(review): a same-named `run_agent` appears earlier in this file —
    presumably these come from separate scripts; confirm they are not in the
    same module (the later def would shadow the earlier one).

    Args:
        bsuite_id: Identifier of the bsuite environment to load.
        save_path: Directory where bsuite records results.
        overwrite: Whether existing results may be overwritten.
    """
    # Load (and record) the environment.
    environment = bsuite.load_and_record(bsuite_id, save_path, overwrite=overwrite)
    print('bsuite_id={}, settings={}, num_episodes={}, start={}'.format(
        bsuite_id, sweep.SETTINGS[bsuite_id], environment.bsuite_num_episodes,
        datetime.now().strftime("%H:%M:%S")))
    dqn_agent = DQNTF2.default_agent(
        obs_spec=environment.observation_spec(),
        action_spec=environment.action_spec())
    experiment.run(dqn_agent, environment,
                   num_episodes=environment.bsuite_num_episodes)
def _load_env():
    """Builds the recorded bsuite environment and wraps it as a Gym env.

    NOTE(review): `bsuite_id` is a free variable — presumably closed over from
    an enclosing scope; confirm against the surrounding code.
    """
    dm_env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    # Optionally echo per-episode results to the terminal.
    if FLAGS.verbose:
        dm_env = terminal_logging.wrap_environment(dm_env, log_every=True)  # pytype: disable=wrong-arg-types
    return gym_wrapper.GymFromDMEnv(dm_env)
def run(bsuite_id: str) -> str:
    """Runs a bootstrapped DQN agent (randomized prior functions) on one bsuite env.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.

    Returns:
        The `bsuite_id` that was run, for bookkeeping by the caller.
    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    action_spec = env.action_spec()

    # Network hyperparameters.
    prior_scale = 5.
    hidden_sizes = [50, 50]

    def network(inputs: jnp.ndarray) -> jnp.ndarray:
        """Simple Q-network with randomized prior function."""
        # Module creation order is kept as-is: trainable net first, then the
        # fixed prior net, then Flatten (order matters for Haiku naming/RNG).
        trainable_net = hk.nets.MLP([*hidden_sizes, action_spec.num_values])
        prior_net = hk.nets.MLP([*hidden_sizes, action_spec.num_values])
        flat = hk.Flatten()(inputs)
        # The prior contributes a fixed offset; stop_gradient keeps it frozen.
        return trainable_net(flat) + prior_scale * lax.stop_gradient(prior_net(flat))

    optimizer = optix.adam(learning_rate=1e-3)
    boot_agent = boot_dqn.BootstrappedDqn(
        obs_spec=env.observation_spec(),
        action_spec=action_spec,
        network=network,
        optimizer=optimizer,
        num_ensemble=FLAGS.num_ensemble,
        batch_size=128,
        discount=.99,
        replay_capacity=10000,
        min_replay_size=128,
        sgd_period=1,
        target_update_period=4,
        mask_prob=1.0,
        noise_scale=0.,
    )

    episode_budget = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(
        agent=boot_agent,
        environment=env,
        num_episodes=episode_budget,
        verbose=FLAGS.verbose)
    return bsuite_id
def run(bsuite_id: str) -> str:
    """Runs an OpenAI-baselines DQN agent on one bsuite environment.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.

    Returns:
        The `bsuite_id` that was run, for bookkeeping by the caller.
    """
    raw_env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    if FLAGS.verbose:
        raw_env = terminal_logging.wrap_environment(raw_env, log_every=True)  # pytype: disable=wrong-arg-types
    env = gym_wrapper.GymFromDMEnv(raw_env)

    episode_limit = FLAGS.num_episodes or getattr(raw_env, 'bsuite_num_episodes')

    def callback(lcl, unused_glb):
        """Signals baselines to stop once the episode budget is exceeded."""
        try:
            return lcl['num_episodes'] > episode_limit
        except KeyError:
            return False

    # Note: we should never run for this many steps as we end after `num_episodes`.
    total_timesteps = FLAGS.total_timesteps
    deepq.learn(
        env=env,
        network='mlp',
        hiddens=[FLAGS.num_units] * FLAGS.num_hidden_layers,
        batch_size=FLAGS.batch_size,
        lr=FLAGS.learning_rate,
        total_timesteps=total_timesteps,
        buffer_size=FLAGS.replay_capacity,
        exploration_fraction=1. / total_timesteps,  # i.e. immediately anneal.
        exploration_final_eps=FLAGS.epsilon,  # constant epsilon.
        print_freq=None,  # pylint: disable=wrong-arg-types
        learning_starts=FLAGS.min_replay_size,
        target_network_update_freq=FLAGS.target_update_period,
        callback=callback,  # pytype: disable=wrong-arg-types
        gamma=FLAGS.agent_discount,
        checkpoint_freq=None,
    )
    return bsuite_id
def __init__(self, id: str, exp_kwargs: dict = None, external_logging: str = 'none',
             save_path: str = '', overwrite: bool = True):
    """Builds a bsuite environment wrapped with rlpyt-style action/observation spaces.

    Args:
        id: Either a pre-parameterized bsuite sweep id (in VALID_ENV_SWEEP_IDS),
            or a base experiment name (in VALID_ENV_IDS) combined with
            `exp_kwargs`.
        exp_kwargs: Keyword settings for a base experiment; required when `id`
            is not a sweep id. The keys 'noise_scale', 'noise_scale_seed' and
            'reward_scale' are consumed here; the rest go to `bsuite.load`.
        external_logging: 'none' to skip recording, otherwise the bsuite
            logging mode passed to `load_and_record`.
        save_path: Sub-directory under LOG_DIR used for recording.
        overwrite: Whether existing logs may be overwritten.
    """
    assert (id in VALID_ENV_SWEEP_IDS) or (
        id in VALID_ENV_IDS and exp_kwargs is not None
    )  # Either using one of presets or using base experiment with other settings
    aug_path = osp.join(LOG_DIR, save_path)  # LOG_DIR + save_path
    if id in VALID_ENV_SWEEP_IDS:  # Pre-parameterized experiments
        if external_logging == 'none':
            env = bsuite.load_from_id(id)  # No recording
        else:
            env = bsuite.load_and_record(
                id, aug_path, external_logging, overwrite=overwrite
            )  # Record in sql or csv. same sql for each id
        self.num_episodes = env.bsuite_num_episodes
    else:
        # Copy before popping so the caller's dict is not mutated (previously
        # the pops destructively modified the argument).
        exp_kwargs = dict(exp_kwargs)
        noise_scale = exp_kwargs.pop('noise_scale', 0.)
        noise_scale_seed = exp_kwargs.pop('noise_scale_seed', 0.)
        reward_scale = exp_kwargs.pop('reward_scale', 0.)
        env = bsuite.load(id, **exp_kwargs)
        if noise_scale:
            env = RewardNoise(env, noise_scale, noise_scale_seed)
        if reward_scale:
            env = RewardScale(env, reward_scale)
        self.num_episodes = 1e4  # Default
    self.env = env
    self._action_space = IntBox(low=0, high=self.env.action_spec().num_values)
    o_spec = self.env.observation_spec()
    if isinstance(o_spec, specs.BoundedArray):
        self._observation_space = FloatBox(
            low=o_spec.minimum.item(), high=o_spec.maximum.item(),
            shape=o_spec.shape, dtype=o_spec.dtype)
    else:
        self._observation_space = FloatBox(
            low=-float('inf'), high=float('inf'),
            shape=o_spec.shape, dtype=o_spec.dtype)
    self._last_observation = None
    # Bug fix: a trailing comma previously made this the truthy tuple
    # `(False,)`, so any `if env.game_over:` check would always fire.
    self.game_over = False
    self.viewer = None
def run(bsuite_id: str) -> str:
    """Runs a tabular Q-learning agent on one bsuite environment, logging to CSV.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.

    Returns:
        The `bsuite_id` that was run, for bookkeeping by the caller.
    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    learner = q_learning.QLearning(env)
    # Fall back to the environment's suggested budget when the flag is unset.
    episode_budget = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(
        agent=learner,
        environment=env,
        num_episodes=episode_budget,
        verbose=FLAGS.verbose)
    return bsuite_id
def run(bsuite_id: str) -> str:
    """Runs the default DQN agent on one bsuite environment, logging to CSV.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.

    Returns:
        The `bsuite_id` that was run, for bookkeeping by the caller.
    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    dqn_agent = dqn.default_agent(env.observation_spec(), env.action_spec())
    # Fall back to the environment's suggested budget when the flag is unset.
    episode_budget = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(
        agent=dqn_agent,
        environment=env,
        num_episodes=episode_budget,
        verbose=FLAGS.verbose)
    return bsuite_id
def run(bsuite_id: str) -> str:
    """Runs a seeded uniformly random agent on one bsuite environment.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.

    Returns:
        The `bsuite_id` that was run, for bookkeeping by the caller.
    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    # NOTE: `random` here is the project's random-agent module, not the stdlib.
    rand_agent = random.default_agent(
        obs_spec=env.observation_spec(),
        action_spec=env.action_spec(),
        seed=FLAGS.seed)
    experiment.run(
        agent=rand_agent,
        environment=env,
        num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
        verbose=FLAGS.verbose)
    return bsuite_id
def run(bsuite_id: str) -> str:
    """Runs a TF2/Sonnet DQN agent on one bsuite environment, logging to CSV.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.

    Returns:
        The `bsuite_id` that was run, for bookkeeping by the caller.
    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    # Q-network: flatten observations, then an MLP with one output per action.
    layer_sizes = [FLAGS.num_units] * FLAGS.num_hidden_layers
    q_network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP(layer_sizes + [env.action_spec().num_values]),
    ])

    dqn_agent = dqn.DQN(
        action_spec=env.action_spec(),
        network=q_network,
        batch_size=FLAGS.batch_size,
        discount=FLAGS.discount,
        replay_capacity=FLAGS.replay_capacity,
        min_replay_size=FLAGS.min_replay_size,
        sgd_period=FLAGS.sgd_period,
        target_update_period=FLAGS.target_update_period,
        optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
        epsilon=FLAGS.epsilon,
        seed=FLAGS.seed,
    )

    # Fall back to the environment's suggested budget when the flag is unset.
    episode_budget = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(
        agent=dqn_agent,
        environment=env,
        num_episodes=episode_budget,
        verbose=FLAGS.verbose)
    return bsuite_id
def run(bsuite_id: str) -> str:
    """Runs a bootstrapped DQN (BDQN) agent on one bsuite environment.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.

    Returns:
        The `bsuite_id` that was run, for bookkeeping by the caller.
    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    # One Q-network per ensemble member, each with its own random prior.
    q_ensemble = boot_dqn.make_ensemble(
        num_actions=env.action_spec().num_values,
        num_ensemble=FLAGS.num_ensemble,
        num_hidden_layers=FLAGS.num_hidden_layers,
        num_units=FLAGS.num_units,
        prior_scale=FLAGS.prior_scale)

    boot_agent = boot_dqn.BootstrappedDqn(
        obs_spec=env.observation_spec(),
        action_spec=env.action_spec(),
        ensemble=q_ensemble,
        batch_size=FLAGS.batch_size,
        discount=FLAGS.discount,
        replay_capacity=FLAGS.replay_capacity,
        min_replay_size=FLAGS.min_replay_size,
        sgd_period=FLAGS.sgd_period,
        target_update_period=FLAGS.target_update_period,
        optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
        mask_prob=FLAGS.mask_prob,
        noise_scale=FLAGS.noise_scale,
        epsilon_fn=lambda x: FLAGS.epsilon,  # constant-epsilon schedule
        seed=FLAGS.seed)

    # Fall back to the environment's suggested budget when the flag is unset.
    episode_budget = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(
        agent=boot_agent,
        environment=env,
        num_episodes=episode_budget,
        verbose=FLAGS.verbose)
    return bsuite_id
def run(bsuite_id: str) -> str:
    """Runs a bsuite experiment and saves the results as csv files.

    Args:
        bsuite_id: string, the id of the bsuite experiment to run.

    Returns:
        The `bsuite_id` that was run (the previous docstring said "none",
        which contradicted the actual return statement below).
    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        # NOTE(review): `save_path` is a free variable — presumably a
        # module-level constant; confirm against the surrounding file.
        save_path=save_path,
        logging_mode='csv',
        overwrite=True,
    )
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Settings for the neural network
    qnet_settings = {
        'layers_sizes': [50],
        'batch_size': 64,
        'noisy_nets': True,
        'distributional': True,
        'vmin': 0,
        'vmax': 1000,
        'number_atoms': 51
    }

    # Settings for the specific agent
    settings = {
        'batch_size': qnet_settings["batch_size"],
        'epsilon_start': 0.0,
        'epsilon_decay': 0.00,
        'epsilon_min': 0.00,
        'gamma': 0.99,
        'buffer_size': 2**16,
        'lr': 1e-3,
        'qnet_settings': qnet_settings,
        'start_optimization': 64,
        'update_qnet_every': 2,
        'update_target_every': 50,
        'ddqn': True,
        'n_steps': 4,
        'duelling_dqn': True,
        'prioritized_buffer': True,
        'alpha': 0.6,
        'beta0': 0.4,
        'beta_increment': 1e-6
    }

    agent = Agent(action_spec=env.action_spec(),
                  observation_spec=env.observation_spec(),
                  device=device,
                  settings=settings)
    experiment.run(agent=agent, environment=env,
                   num_episodes=env.bsuite_num_episodes, verbose=False)
    return bsuite_id
def run(bsuite_id: str) -> str:
    """Runs Dopamine DQN on a given bsuite environment, logging to CSV.

    Args:
        bsuite_id: Identifier of the bsuite environment to load.

    Returns:
        The `bsuite_id` that was run, for bookkeeping by the caller.
    """
    raw_env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    class Network(tf.keras.Model):
        """Build deep network compatible with dopamine/discrete_domains/gym_lib."""

        def __init__(self, num_actions: int, name='Network'):
            super(Network, self).__init__(name=name)
            # Flatten -> (num_hidden_layers x Dense+ReLU) -> linear action head.
            self.forward_fn = tf.keras.Sequential(
                [tf.keras.layers.Flatten()] + [
                    tf.keras.layers.Dense(FLAGS.num_units,
                                          activation=tf.keras.activations.relu)
                    for _ in range(FLAGS.num_hidden_layers)
                ] + [tf.keras.layers.Dense(num_actions, activation=None)])

        def call(self, state):
            """Creates the output tensor/op given the state tensor as input."""
            x = tf.cast(state, tf.float32)
            x = self.forward_fn(x)
            # Dopamine expects a named-tuple network output type.
            return atari_lib.DQNNetworkType(x)

    def create_agent(sess: tf.Session, environment: gym.Env, summary_writer=None):
        """Factory method for agent initialization in Dopamine."""
        del summary_writer
        return dqn_agent.DQNAgent(
            sess=sess,
            num_actions=environment.action_space.n,
            observation_shape=OBSERVATION_SHAPE,
            observation_dtype=tf.float32,
            stack_size=1,  # no frame stacking
            network=Network,
            gamma=FLAGS.agent_discount,
            update_horizon=1,
            min_replay_history=FLAGS.min_replay_size,
            update_period=FLAGS.sgd_period,
            target_update_period=FLAGS.target_update_period,
            epsilon_decay_period=FLAGS.epsilon_decay_period,
            epsilon_train=FLAGS.epsilon,
            optimizer=tf.train.AdamOptimizer(FLAGS.learning_rate),
        )

    def create_environment() -> gym.Env:
        """Factory method for environment initialization in Dopamine."""
        # Dopamine's gym_lib expects image-shaped observations.
        env = wrappers.ImageObservation(raw_env, OBSERVATION_SHAPE)
        if FLAGS.verbose:
            env = terminal_logging.wrap_environment(env, log_every=True)  # pytype: disable=wrong-arg-types
        env = gym_wrapper.GymFromDMEnv(env)
        env.game_over = False  # Dopamine looks for this
        return env

    runner = run_experiment.Runner(
        base_dir=FLAGS.base_dir,
        create_agent_fn=create_agent,
        create_environment_fn=create_environment,
    )
    num_episodes = FLAGS.num_episodes or getattr(raw_env, 'bsuite_num_episodes')
    # Drive episodes directly rather than via Runner's iteration schedule.
    for _ in range(num_episodes):
        runner._run_one_episode()  # pylint: disable=protected-access
    return bsuite_id
if not os.path.exists(result_path): os.makedirs(result_path) if not os.path.exists(agent_path): os.makedirs(agent_path) # use cartpole_swingup/19 from bsuite, set env. parameters to the ones used in the paper: bsuite_id = 'cartpole_swingup/19' sweep.SETTINGS[bsuite_id]['x_reward_threshold'] = 1.0 sweep.SETTINGS[bsuite_id]['x_threshold'] = 5. sweep.SETTINGS[bsuite_id]['move_cost'] = 0.05 # train agent over multiple seeds for seed in trange(81, 86, 1): env = bsuite.load_and_record(bsuite_id, result_path + str(seed), overwrite=True) np.random.seed(seed) torch.manual_seed(seed) agent = IndDQNAgent( action_set=[0, 1, 2], reward_function=functools.partial(cartpole_reward_function, reward_type='sparse'), feature_extractor=CartpoleIdentityFeature( ), # use feature from bsuite without any modification hidden_dims=[50, 50, 50], hidden_dims_std=[50, 50, 50], learning_rate=1e-3, buffer_size=int(1e6), batch_size=64,