def train_eval( root_dir, env_name='CartPole-v0', num_iterations=100000, fc_layer_params=(100, ), # Params for collect initial_collect_steps=1000, collect_steps_per_iteration=1, epsilon_greedy=0.1, replay_buffer_capacity=100000, # Params for target update target_update_tau=0.05, target_update_period=5, # Params for train train_steps_per_iteration=1, batch_size=64, learning_rate=1e-3, gamma=0.99, reward_scale_factor=1.0, gradient_clipping=None, # Params for eval num_eval_episodes=10, eval_interval=1000, # Params for checkpoints, summaries, and logging train_checkpoint_interval=10000, policy_checkpoint_interval=5000, rb_checkpoint_interval=20000, log_interval=1000, summary_interval=1000, summaries_flush_secs=10, debug_summaries=False, summarize_grads_and_vars=False, eval_metrics_callback=None): """A simple train and eval for DQN.""" root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') eval_dir = os.path.join(root_dir, 'eval') train_summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=summaries_flush_secs * 1000) train_summary_writer.set_as_default() eval_summary_writer = tf.contrib.summary.create_file_writer( eval_dir, flush_millis=summaries_flush_secs * 1000) eval_metrics = [ py_metrics.AverageReturnMetric(buffer_size=num_eval_episodes), py_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes), ] # TODO(kbanoop): Figure out if it is possible to avoid the with block. with tf.contrib.summary.record_summaries_every_n_global_steps( summary_interval): tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name)) eval_py_env = suite_gym.load(env_name) q_net = q_network.QNetwork(tf_env.time_step_spec().observation, tf_env.action_spec(), fc_layer_params=fc_layer_params) tf_agent = dqn_agent.DqnAgent( tf_env.time_step_spec(), tf_env.action_spec(), q_network=q_net, optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate), # TODO(kbanoop): Decay epsilon based on global step, cf. cl/188907839 epsilon_greedy=epsilon_greedy, target_update_tau=target_update_tau, target_update_period=target_update_period, td_errors_loss_fn=dqn_agent.element_wise_squared_loss, gamma=gamma, reward_scale_factor=reward_scale_factor, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars) replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( tf_agent.collect_data_spec(), batch_size=tf_env.batch_size, max_length=replay_buffer_capacity) eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy()) train_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(), tf_metrics.AverageEpisodeLengthMetric(), ] global_step = tf.train.get_or_create_global_step() replay_observer = [replay_buffer.add_batch] initial_collect_policy = random_tf_policy.RandomTFPolicy( tf_env.time_step_spec(), tf_env.action_spec()) initial_collect_op = dynamic_step_driver.DynamicStepDriver( tf_env, initial_collect_policy, observers=replay_observer, num_steps=initial_collect_steps).run() collect_policy = tf_agent.collect_policy() collect_op = dynamic_step_driver.DynamicStepDriver( tf_env, collect_policy, observers=replay_observer + train_metrics, num_steps=collect_steps_per_iteration).run() # Dataset generates trajectories with shape [Bx2x...] 
dataset = replay_buffer.as_dataset(num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3) iterator = dataset.make_initializable_iterator() trajectories, _ = iterator.get_next() train_op = tf_agent.train(experience=trajectories, train_step_counter=global_step) train_checkpointer = common_utils.Checkpointer( ckpt_dir=train_dir, agent=tf_agent, global_step=global_step, metrics=tf.contrib.checkpoint.List(train_metrics)) policy_checkpointer = common_utils.Checkpointer( ckpt_dir=os.path.join(train_dir, 'policy'), policy=tf_agent.policy(), global_step=global_step) rb_checkpointer = common_utils.Checkpointer( ckpt_dir=os.path.join(train_dir, 'replay_buffer'), max_to_keep=1, replay_buffer=replay_buffer) for train_metric in train_metrics: train_metric.tf_summaries(step_metrics=train_metrics[:2]) summary_op = tf.contrib.summary.all_summary_ops() with eval_summary_writer.as_default(), \ tf.contrib.summary.always_record_summaries(): for eval_metric in eval_metrics: eval_metric.tf_summaries() init_agent_op = tf_agent.initialize() with tf.Session() as sess: # Initialize the graph. train_checkpointer.initialize_or_restore(sess) rb_checkpointer.initialize_or_restore(sess) sess.run(iterator.initializer) # TODO(sguada) Remove once Periodically can be saved. common_utils.initialize_uninitialized_variables(sess) sess.run(init_agent_op) tf.contrib.summary.initialize(session=sess) sess.run(initial_collect_op) global_step_val = sess.run(global_step) metric_utils.compute_summaries( eval_metrics, eval_py_env, eval_py_policy, num_episodes=num_eval_episodes, global_step=global_step_val, callback=eval_metrics_callback, ) collect_call = sess.make_callable(collect_op) train_step_call = sess.make_callable( [train_op, summary_op, global_step]) timed_at_step = sess.run(global_step) collect_time = 0 train_time = 0 steps_per_second_ph = tf.placeholder(tf.float32, shape=(), name='steps_per_sec_ph') steps_per_second_summary = tf.contrib.summary.scalar( name='global_steps/sec', tensor=steps_per_second_ph) for _ in range(num_iterations): # Train/collect/eval. start_time = time.time() collect_call() collect_time += time.time() - start_time start_time = time.time() for _ in range(train_steps_per_iteration): loss_info_value, _, global_step_val = train_step_call() train_time += time.time() - start_time if global_step_val % log_interval == 0: tf.logging.info('step = %d, loss = %f', global_step_val, loss_info_value.loss) steps_per_sec = ((global_step_val - timed_at_step) / (collect_time + train_time)) sess.run(steps_per_second_summary, feed_dict={steps_per_second_ph: steps_per_sec}) tf.logging.info('%.3f steps/sec' % steps_per_sec) tf.logging.info( 'collect_time = {}, train_time = {}'.format( collect_time, train_time)) timed_at_step = global_step_val collect_time = 0 train_time = 0 if global_step_val % train_checkpoint_interval == 0: train_checkpointer.save(global_step=global_step_val) if global_step_val % policy_checkpoint_interval == 0: policy_checkpointer.save(global_step=global_step_val) if global_step_val % rb_checkpoint_interval == 0: rb_checkpointer.save(global_step=global_step_val) if global_step_val % eval_interval == 0: metric_utils.compute_summaries( eval_metrics, eval_py_env, eval_py_policy, num_episodes=num_eval_episodes, global_step=global_step_val, callback=eval_metrics_callback, )
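# A minimal, hypothetical launcher for the train_eval above; the root_dir
# path, the iteration count, and the use of tf.app.run are placeholders and
# not part of the original script. train_eval builds its own tf.Session, so
# a TF1-style main() can call it directly.
import tensorflow as tf


def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    train_eval(root_dir='/tmp/dqn_cartpole', num_iterations=20000)


if __name__ == '__main__':
    tf.app.run()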
def train_eval( root_dir, env_name='CartPole-v0', num_iterations=1000, actor_fc_layers=(100,), value_net_fc_layers=(100,), use_value_network=False, use_tf_functions=True, # Params for collect collect_episodes_per_iteration=2, replay_buffer_capacity=2000, # Params for train learning_rate=1e-3, gamma=0.9, gradient_clipping=None, normalize_returns=True, value_estimation_loss_coef=0.2, # Params for eval num_eval_episodes=10, eval_interval=100, # Params for checkpoints, summaries, and logging log_interval=100, summary_interval=100, summaries_flush_secs=1, debug_summaries=True, summarize_grads_and_vars=False, eval_metrics_callback=None): """A simple train and eval for Reinforce.""" root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') eval_dir = os.path.join(root_dir, 'eval') train_summary_writer = tf.compat.v2.summary.create_file_writer( train_dir, flush_millis=summaries_flush_secs * 1000) train_summary_writer.set_as_default() eval_summary_writer = tf.compat.v2.summary.create_file_writer( eval_dir, flush_millis=summaries_flush_secs * 1000) eval_metrics = [ tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes), tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes), ] with tf.compat.v2.summary.record_if( lambda: tf.math.equal(global_step % summary_interval, 0)): tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name)) eval_tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name)) actor_net = actor_distribution_network.ActorDistributionNetwork( tf_env.time_step_spec().observation, tf_env.action_spec(), fc_layer_params=actor_fc_layers) if use_value_network: value_net = value_network.ValueNetwork( tf_env.time_step_spec().observation, fc_layer_params=value_net_fc_layers) global_step = tf.compat.v1.train.get_or_create_global_step() tf_agent = reinforce_agent.ReinforceAgent( tf_env.time_step_spec(), tf_env.action_spec(), actor_network=actor_net, value_network=value_net if use_value_network else None, value_estimation_loss_coef=value_estimation_loss_coef, gamma=gamma, optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate), normalize_returns=normalize_returns, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=global_step) replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( tf_agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=replay_buffer_capacity) tf_agent.initialize() train_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(), tf_metrics.AverageEpisodeLengthMetric(), ] eval_policy = tf_agent.policy collect_policy = tf_agent.collect_policy collect_driver = dynamic_episode_driver.DynamicEpisodeDriver( tf_env, collect_policy, observers=[replay_buffer.add_batch] + train_metrics, num_episodes=collect_episodes_per_iteration) def train_step(): experience = replay_buffer.gather_all() return tf_agent.train(experience) if use_tf_functions: # To speed up collect use TF function. collect_driver.run = common.function(collect_driver.run) # To speed up train use TF function. tf_agent.train = common.function(tf_agent.train) train_step = common.function(train_step) # Compute evaluation metrics. 
metrics = metric_utils.eager_compute( eval_metrics, eval_tf_env, eval_policy, num_episodes=num_eval_episodes, train_step=global_step, summary_writer=eval_summary_writer, summary_prefix='Metrics', ) # TODO(b/126590894): Move this functionality into eager_compute_summaries if eval_metrics_callback is not None: eval_metrics_callback(metrics, global_step.numpy()) time_step = None policy_state = collect_policy.get_initial_state(tf_env.batch_size) timed_at_step = global_step.numpy() time_acc = 0 for _ in range(num_iterations): start_time = time.time() time_step, policy_state = collect_driver.run( time_step=time_step, policy_state=policy_state, ) total_loss = train_step() replay_buffer.clear() time_acc += time.time() - start_time global_step_val = global_step.numpy() if global_step_val % log_interval == 0: logging.info('step = %d, loss = %f', global_step_val, total_loss.loss) steps_per_sec = (global_step_val - timed_at_step) / time_acc logging.info('%.3f steps/sec', steps_per_sec) tf.compat.v2.summary.scalar( name='global_steps_per_sec', data=steps_per_sec, step=global_step) timed_at_step = global_step_val time_acc = 0 for train_metric in train_metrics: train_metric.tf_summaries( train_step=global_step, step_metrics=train_metrics[:2]) if global_step_val % eval_interval == 0: metrics = metric_utils.eager_compute( eval_metrics, eval_tf_env, eval_policy, num_episodes=num_eval_episodes, train_step=global_step, summary_writer=eval_summary_writer, summary_prefix='Metrics', ) # TODO(b/126590894): Move this functionality into # eager_compute_summaries. if eval_metrics_callback is not None: eval_metrics_callback(metrics, global_step_val)
def train_eval( root_dir, env_name='CartPole-v0', num_iterations=1000, # TODO(b/127576522): rename to policy_fc_layers. actor_fc_layers=(100, ), # Params for collect collect_episodes_per_iteration=2, replay_buffer_capacity=2000, # Params for train learning_rate=1e-3, gradient_clipping=None, normalize_returns=True, # Params for eval num_eval_episodes=10, eval_interval=100, # Params for checkpoints, summaries, and logging train_checkpoint_interval=100, policy_checkpoint_interval=100, rb_checkpoint_interval=200, log_interval=100, summary_interval=100, summaries_flush_secs=1, debug_summaries=True, summarize_grads_and_vars=False, eval_metrics_callback=None): """A simple train and eval for Reinforce.""" root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') eval_dir = os.path.join(root_dir, 'eval') train_summary_writer = tf.compat.v2.summary.create_file_writer( train_dir, flush_millis=summaries_flush_secs * 1000) train_summary_writer.set_as_default() eval_summary_writer = tf.compat.v2.summary.create_file_writer( eval_dir, flush_millis=summaries_flush_secs * 1000) eval_metrics = [ py_metrics.AverageReturnMetric(buffer_size=num_eval_episodes), py_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes), ] global_step = tf.compat.v1.train.get_or_create_global_step() with tf.compat.v2.summary.record_if( lambda: tf.math.equal(global_step % summary_interval, 0)): eval_py_env = suite_gym.load(env_name) tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name)) # TODO(b/127870767): Handle distributions without gin. actor_net = actor_distribution_network.ActorDistributionNetwork( tf_env.time_step_spec().observation, tf_env.action_spec(), fc_layer_params=actor_fc_layers) tf_agent = reinforce_agent.ReinforceAgent( tf_env.time_step_spec(), tf_env.action_spec(), actor_network=actor_net, optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=learning_rate), normalize_returns=normalize_returns, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=global_step) replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( tf_agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=replay_buffer_capacity) eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy) train_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(), tf_metrics.AverageEpisodeLengthMetric(), ] collect_policy = tf_agent.collect_policy collect_op = dynamic_episode_driver.DynamicEpisodeDriver( tf_env, collect_policy, observers=[replay_buffer.add_batch] + train_metrics, num_episodes=collect_episodes_per_iteration).run() experience = replay_buffer.gather_all() train_op = tf_agent.train(experience) clear_rb_op = replay_buffer.clear() train_checkpointer = common.Checkpointer( ckpt_dir=train_dir, agent=tf_agent, global_step=global_step, metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')) policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join( train_dir, 'policy'), policy=tf_agent.policy, global_step=global_step) rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join( train_dir, 'replay_buffer'), max_to_keep=1, replay_buffer=replay_buffer) summary_ops = [] for train_metric in train_metrics: summary_ops.append( train_metric.tf_summaries(train_step=global_step, step_metrics=train_metrics[:2])) with eval_summary_writer.as_default(), \ tf.compat.v2.summary.record_if(True): for eval_metric in eval_metrics: 
eval_metric.tf_summaries(train_step=global_step) init_agent_op = tf_agent.initialize() with tf.compat.v1.Session() as sess: # Initialize the graph. train_checkpointer.initialize_or_restore(sess) rb_checkpointer.initialize_or_restore(sess) # TODO(b/126239733): Remove once Periodically can be saved. common.initialize_uninitialized_variables(sess) sess.run(init_agent_op) sess.run(train_summary_writer.init()) sess.run(eval_summary_writer.init()) # Compute evaluation metrics. global_step_call = sess.make_callable(global_step) global_step_val = global_step_call() metric_utils.compute_summaries( eval_metrics, eval_py_env, eval_py_policy, num_episodes=num_eval_episodes, global_step=global_step_val, callback=eval_metrics_callback, ) collect_call = sess.make_callable(collect_op) train_step_call = sess.make_callable([train_op, summary_ops]) clear_rb_call = sess.make_callable(clear_rb_op) timed_at_step = global_step_call() time_acc = 0 steps_per_second_ph = tf.compat.v1.placeholder( tf.float32, shape=(), name='steps_per_sec_ph') steps_per_second_summary = tf.compat.v2.summary.scalar( name='global_steps_per_sec', data=steps_per_second_ph, step=global_step) for _ in range(num_iterations): start_time = time.time() collect_call() total_loss, _ = train_step_call() clear_rb_call() time_acc += time.time() - start_time global_step_val = global_step_call() if global_step_val % log_interval == 0: logging.info('step = %d, loss = %f', global_step_val, total_loss.loss) steps_per_sec = (global_step_val - timed_at_step) / time_acc logging.info('%.3f steps/sec', steps_per_sec) sess.run(steps_per_second_summary, feed_dict={steps_per_second_ph: steps_per_sec}) timed_at_step = global_step_val time_acc = 0 if global_step_val % train_checkpoint_interval == 0: train_checkpointer.save(global_step=global_step_val) if global_step_val % policy_checkpoint_interval == 0: policy_checkpointer.save(global_step=global_step_val) if global_step_val % rb_checkpoint_interval == 0: rb_checkpointer.save(global_step=global_step_val) if global_step_val % eval_interval == 0: metric_utils.compute_summaries( eval_metrics, eval_py_env, eval_py_policy, num_episodes=num_eval_episodes, global_step=global_step_val, callback=eval_metrics_callback, )
def train_eval( root_dir, env_name='CartPole-v0', num_iterations=100000, fc_layer_params=(100, ), # Params for collect initial_collect_steps=1000, collect_steps_per_iteration=1, epsilon_greedy=0.1, replay_buffer_capacity=100000, # Params for target update target_update_tau=0.05, target_update_period=5, # Params for train train_steps_per_iteration=1, batch_size=64, learning_rate=1e-3, gamma=0.99, reward_scale_factor=1.0, gradient_clipping=None, # Params for eval num_eval_episodes=10, eval_interval=1000, # Params for summaries and logging log_interval=1000, summary_interval=1000, summaries_flush_secs=10, debug_summaries=False, summarize_grads_and_vars=False, eval_metrics_callback=None): """A simple train and eval for DQN.""" root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') eval_dir = os.path.join(root_dir, 'eval') train_summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=summaries_flush_secs * 1000) train_summary_writer.set_as_default() eval_summary_writer = tf.contrib.summary.create_file_writer( eval_dir, flush_millis=summaries_flush_secs * 1000) eval_metrics = [ tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes), tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes) ] with tf.contrib.summary.record_summaries_every_n_global_steps( summary_interval): tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name)) eval_tf_env = tf_py_environment.TFPyEnvironment( suite_gym.load(env_name)) trajectory_spec = trajectory.from_transition( time_step=tf_env.time_step_spec(), action_step=policy_step.PolicyStep(action=tf_env.action_spec()), next_time_step=tf_env.time_step_spec()) replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( data_spec=trajectory_spec, batch_size=tf_env.batch_size, max_length=replay_buffer_capacity) q_net = q_network.QNetwork(tf_env.time_step_spec().observation, tf_env.action_spec(), fc_layer_params=fc_layer_params) tf_agent = dqn_agent.DqnAgent( tf_env.time_step_spec(), tf_env.action_spec(), q_network=q_net, # TODO(kbanoop): Decay epsilon based on global step, cf. cl/188907839 epsilon_greedy=epsilon_greedy, target_update_tau=target_update_tau, target_update_period=target_update_period, optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate), td_errors_loss_fn=dqn_agent.element_wise_squared_loss, gamma=gamma, reward_scale_factor=reward_scale_factor, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars) tf_agent.initialize() train_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(), tf_metrics.AverageEpisodeLengthMetric(), ] eval_policy = tf_agent.policy() collect_policy = tf_agent.collect_policy() collect_driver = dynamic_step_driver.DynamicStepDriver( tf_env, collect_policy, observers=[replay_buffer.add_batch] + train_metrics, num_steps=collect_steps_per_iteration) global_step = tf.train.get_or_create_global_step() initial_collect_policy = random_tf_policy.RandomTFPolicy( tf_env.time_step_spec(), tf_env.action_spec()) # Collect initial replay data. tf.logging.info( 'Initializing replay buffer by collecting experience for %d steps with ' 'a random policy.' 
            % initial_collect_steps)
        dynamic_step_driver.DynamicStepDriver(
            tf_env,
            initial_collect_policy,
            observers=[replay_buffer.add_batch],
            num_steps=initial_collect_steps).run()

        metrics = metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
        if eval_metrics_callback is not None:
            eval_metrics_callback(metrics, global_step.numpy())

        time_step = None
        policy_state = ()

        timed_at_step = global_step.numpy()
        time_acc = 0

        # Dataset generates trajectories with shape [Bx2x...]
        dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                           sample_batch_size=batch_size,
                                           num_steps=2).prefetch(3)
        iterator = iter(dataset)

        for _ in range(num_iterations):
            start_time = time.time()
            time_step, policy_state = collect_driver.run(
                time_step=time_step,
                policy_state=policy_state,
            )
            for _ in range(train_steps_per_iteration):
                experience, _ = next(iterator)
                train_loss = tf_agent.train(experience,
                                            train_step_counter=global_step)
            time_acc += time.time() - start_time

            if global_step.numpy() % log_interval == 0:
                tf.logging.info('step = %d, loss = %f', global_step.numpy(),
                                train_loss.loss)
                steps_per_sec = (global_step.numpy() - timed_at_step) / time_acc
                tf.logging.info('%.3f steps/sec' % steps_per_sec)
                tf.contrib.summary.scalar(name='global_steps/sec',
                                          tensor=steps_per_sec)
                timed_at_step = global_step.numpy()
                time_acc = 0

            for train_metric in train_metrics:
                train_metric.tf_summaries(step_metrics=train_metrics[:2])

            if global_step.numpy() % eval_interval == 0:
                metrics = metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )
                if eval_metrics_callback is not None:
                    eval_metrics_callback(metrics, global_step.numpy())

        return train_loss
def test_custom_max_steps(self): env = suite_gym.load('CartPole-v1', max_episode_steps=5) self.assertIsInstance(env, py_environment.PyEnvironment) self.assertIsInstance(env, wrappers.TimeLimit) self.assertEqual(5, env._duration)
def gen_env(self): display = suite_gym.load(self.name) env = tf_py_environment.TFPyEnvironment(display) return env, display
batch_size = 64 # @param {type:"integer"} learning_rate = 1e-3 # @param {type:"number"} log_interval = 200 # @param {type:"integer"} num_eval_episodes = 10 # @param {type:"integer"} eval_interval = 1000 # @param {type:"integer"} """## Environment In Reinforcement Learning (RL), an environment represents the task or problem to be solved. Standard environments can be created in TF-Agents using `tf_agents.environments` suites. TF-Agents has suites for loading environments from sources such as the OpenAI Gym, Atari, and DM Control. Load the CartPole environment from the OpenAI Gym suite. """ env_name = 'CartPole-v0' env = suite_gym.load(env_name) """You can render this environment to see how it looks. A free-swinging pole is attached to a cart. The goal is to move the cart right or left in order to keep the pole pointing up.""" #@test {"skip": true} env.reset() PIL.Image.fromarray(env.render()) """The `environment.step` method takes an `action` in the environment and returns a `TimeStep` tuple containing the next observation of the environment and the reward for the action. The `time_step_spec()` method returns the specification for the `TimeStep` tuple. Its `observation` attribute shows the shape of observations, the data types, and the ranges of allowed values. The `reward` attribute shows the same details for the reward. """ print('Observation Spec:') print(env.time_step_spec().observation) print('Reward Spec:') print(env.time_step_spec().reward)
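"""To make the description above concrete, reset the environment and apply a single action; the `TimeStep` returned by `step` carries the next observation and the reward. (The action value `1`, i.e. push the cart to the right, is just an illustrative choice.)"""

time_step = env.reset()
print('Time step:')
print(time_step)

action = 1  # push the cart to the right
next_time_step = env.step(action)
print('Next time step:')
print(next_time_step)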
def test_load_adds_time_limit_steps(self): env = suite_gym.load('CartPole-v1') self.assertIsInstance(env, py_environment.PyEnvironment) self.assertIsInstance(env, wrappers.TimeLimit)
def main(): parser = argparse.ArgumentParser() ## Essential parameters parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model stats and checkpoints will be written." ) parser.add_argument("--env", default=None, type=str, required=True, help="The environment to train the agent on") parser.add_argument("--approx_env_boundaries", default=False, type=bool, help="Whether to get the env boundaries approximately") parser.add_argument("--max_horizon", default=5, type=int) parser.add_argument("--atari", default=False, type=bool, help="Gets some data Types correctly") ##agent parameters parser.add_argument("--reward_scale_factor", default=1.0, type=float) parser.add_argument("--debug_summaries", default=False, type=bool) parser.add_argument("--summarize_grads_and_vars", default=False, type=bool) ##transformer parameters parser.add_argument("--d_model", default=64, type=int) parser.add_argument("--num_layers", default=3, type=int) parser.add_argument("--dff", default=256, type=int) ##Training parameters parser.add_argument('--num_iterations', type=int, default=150000, help="steps in the env") parser.add_argument('--num_iparallel', type=int, default=1, help="how many envs should run in parallel") parser.add_argument("--collect_steps_per_iteration", default=1, type=int) parser.add_argument("--train_steps_per_iteration", default=1, type=int) ## Other parameters parser.add_argument("--num_eval_episodes", default=10, type=int) parser.add_argument("--eval_interval", default=1000, type=int) parser.add_argument("--log_interval", default=1000, type=int) parser.add_argument("--summary_interval", default=1000, type=int) parser.add_argument("--run_graph_mode", default=True, type=bool) parser.add_argument("--checkpoint_interval", default=10000, type=int) parser.add_argument("--summary_flush", default=10, type=int) #what does this exactly do? # HP opt params parser.add_argument("--doubleQ", default=True, type=bool, help="Whether to use a DoubleQ agent") parser.add_argument("--custom_last_layer", default=False, type=bool) parser.add_argument("--custom_layer_init", default=1, type=float) parser.add_argument("--initial_collect_steps", default=500, type=int) parser.add_argument("--loss_function", default='element_wise_squared_loss', type=str) parser.add_argument("--num_heads", default=4, type=int) parser.add_argument("--normalize_env", default=False, type=bool) parser.add_argument('--custom_lr_schedule', default="No", type=str, help="whether to use a custom LR schedule") parser.add_argument("--epsilon_greedy", default=0.1, type=float) parser.add_argument("--target_update_period", default=10, type=int) parser.add_argument( "--rate", default=0.1, type=float ) # dropout rate (might be not used depending on the q network) #Setting this to 0.0 somehow break the code. 
    # Not relevant, though; just select a network without dropout.
    parser.add_argument("--gradient_clipping", default=True, type=bool)
    parser.add_argument("--replay_buffer_max_length", default=100000, type=int)
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--learning_rate", default=1e-5, type=float)
    parser.add_argument("--encoder_type",
                        default=3,
                        type=int,
                        help="Which Type of encoder is used for the model")
    parser.add_argument("--layer_type",
                        default=3,
                        type=int,
                        help="Which Type of layer is used for the encoder")
    parser.add_argument("--target_update_tau", default=1, type=float)
    parser.add_argument("--gamma", default=0.95, type=float)

    args = parser.parse_args()

    # List of encoder modules which we can use to change encoder based on a variable
    global_step = tf.compat.v1.train.get_or_create_global_step()

    baseEnv = gym.make(args.env)
    env = suite_gym.load(args.env)
    eval_env = suite_gym.load(args.env)
    if args.normalize_env == True:
        env = NormalizeWrapper(env, args.approx_env_boundaries, args.env)
        eval_env = NormalizeWrapper(eval_env, args.approx_env_boundaries,
                                    args.env)

    env = PyhistoryWrapper(env, args.max_horizon, args.atari)
    eval_env = PyhistoryWrapper(eval_env, args.max_horizon, args.atari)
    tf_env = tf_py_environment.TFPyEnvironment(env)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_env)

    q_net = QTransformer(tf_env.observation_spec(),
                         baseEnv.action_space.n,
                         num_layers=args.num_layers,
                         d_model=args.d_model,
                         num_heads=args.num_heads,
                         dff=args.dff,
                         rate=args.rate,
                         encoderType=args.encoder_type,
                         enc_layer_type=args.layer_type,
                         max_horizon=args.max_horizon,
                         custom_layer=args.custom_layer_init,
                         custom_last_layer=args.custom_last_layer)

    if args.custom_lr_schedule == "Transformer":
        # builds a lr schedule according to the original usage for the transformer
        learning_rate = CustomSchedule(args.d_model,
                                       int(args.num_iterations / 10))
        optimizer = tf.keras.optimizers.Adam(learning_rate,
                                             beta_1=0.9,
                                             beta_2=0.98,
                                             epsilon=1e-9)
    elif args.custom_lr_schedule == "Transformer_low":
        # builds a lr schedule according to the original usage for the transformer
        learning_rate = CustomSchedule(
            int(args.d_model / 2),
            int(args.num_iterations / 10))  # --> same schedule with lower general lr
        optimizer = tf.keras.optimizers.Adam(learning_rate,
                                             beta_1=0.9,
                                             beta_2=0.98,
                                             epsilon=1e-9)
    elif args.custom_lr_schedule == "Linear":
        lrs = LinearCustomSchedule(learning_rate, args.num_iterations)
        optimizer = tf.keras.optimizers.Adam(lrs,
                                             beta_1=0.9,
                                             beta_2=0.98,
                                             epsilon=1e-9)
    else:
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=args.learning_rate)

    if args.loss_function == "element_wise_huber_loss":
        lf = element_wise_huber_loss
    elif args.loss_function == "element_wise_squared_loss":
        lf = element_wise_squared_loss

    if args.doubleQ == False:
        # global step count
        agent = dqn_agent.DqnAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            q_network=q_net,
            epsilon_greedy=args.epsilon_greedy,
            #boltzmann_temperature = 1,
            target_update_tau=args.target_update_tau,
            target_update_period=args.target_update_period,
            td_errors_loss_fn=lf,
            optimizer=optimizer,
            gamma=args.gamma,
            reward_scale_factor=args.reward_scale_factor,
            gradient_clipping=args.gradient_clipping,
            debug_summaries=args.debug_summaries,
            summarize_grads_and_vars=args.summarize_grads_and_vars,
            train_step_counter=global_step)
    else:
        agent = dqn_agent.DdqnAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            q_network=q_net,
            epsilon_greedy=args.epsilon_greedy,
            #boltzmann_temperature = 1,
            target_update_tau=args.target_update_tau,
            td_errors_loss_fn=lf,
            target_update_period=args.target_update_period,
            optimizer=optimizer,
            gamma=args.gamma,
            reward_scale_factor=args.reward_scale_factor,
            gradient_clipping=args.gradient_clipping,
            debug_summaries=args.debug_summaries,
            summarize_grads_and_vars=args.summarize_grads_and_vars,
            train_step_counter=global_step)
    agent.initialize()

    count_weights(q_net)

    train_eval(root_dir=args.output_dir,
               tf_env=tf_env,
               eval_tf_env=eval_tf_env,
               agent=agent,
               num_iterations=args.num_iterations,
               initial_collect_steps=args.initial_collect_steps,
               collect_steps_per_iteration=args.collect_steps_per_iteration,
               replay_buffer_capacity=args.replay_buffer_max_length,
               train_steps_per_iteration=args.train_steps_per_iteration,
               batch_size=args.batch_size,
               use_tf_functions=args.run_graph_mode,
               num_eval_episodes=args.num_eval_episodes,
               eval_interval=args.eval_interval,
               train_checkpoint_interval=args.checkpoint_interval,
               policy_checkpoint_interval=args.checkpoint_interval,
               rb_checkpoint_interval=args.checkpoint_interval,
               log_interval=args.log_interval,
               summary_interval=args.summary_interval,
               summaries_flush_secs=args.summary_flush)

    pickle.dump(args, open(args.output_dir + "/training_args.p", "wb"))
    print("Successfully trained and evaluated.")
agentsDict, nameDict, networkDict, numAgents, loadingSeriesHP, chargingSeriesEV, \ genSeriesPV, genSeriesWind, loadingSeriesDSM = util.agentsInit(alg, startDay, endDay, numAgentsEachType=1) nameList = [agent.id for agent in agentsDict.values()] typeList = [agent.type for agent in agentsDict.values()] grid = Grid(numAgents, nameList, loadingSeriesHP, chargingSeriesEV, genSeriesPV, genSeriesWind, loadingSeriesDSM, numCPU=20) sm = SpotMarket() agentsList = [obj for name, obj in agentsDict.items()] sm.addParticipants(agentsList) dso = DSO(grid, startDay, endDay) dso.addflexAgents(agentsList) """load the train Gym environment""" env = suite_gym.load("gym_LocalFlexMarketEnv:LocalFlexMarketEnv-v0", gym_kwargs={'SpotMarket': sm, 'DSO': dso, 'alg': alg}) env.gym.startDay = startDay env.gym.endDay = endDay env.reset() train_env = tf_py_environment.TFPyEnvironment(env) agents = train_env.pyenv.envs[0].gym.agents if not os.path.exists("../results/" + alg): os.makedirs("../results/" + alg) filename = "../results/" + alg + "/agentList.pkl" with open(filename, "wb") as f: pickle.dump(nameList, f) """Training parameters""" num_iterations = 1
def train_eval( root_dir, env_name='gym_orbital_system:earth-v0', num_iterations=500000, actor_fc_layers=(400, 1000, 1000, 1000, 1000, 400), critic_obs_fc_layers=( 400, 1000, 1000, 400, ), critic_action_fc_layers=None, critic_joint_fc_layers=(400, ), # Params for collect initial_collect_steps=1000, collect_steps_per_iteration=1, replay_buffer_capacity=100000, exploration_noise_std=0.1, # Params for target update target_update_tau=0.05, target_update_period=5, # Params for train train_steps_per_iteration=1, batch_size=64, actor_update_period=2, actor_learning_rate=1e-4, critic_learning_rate=1e-3, td_errors_loss_fn=tf.compat.v1.losses.huber_loss, gamma=0.995, reward_scale_factor=1.0, gradient_clipping=None, use_tf_functions=True, # Params for eval num_eval_episodes=10, eval_interval=10000, # Params for checkpoints, summaries, and logging log_interval=1000, summary_interval=1000, summaries_flush_secs=10, debug_summaries=False, summarize_grads_and_vars=False, eval_metrics_callback=None): """A simple train and eval for TD3.""" root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') eval_dir = os.path.join(root_dir, 'eval') train_summary_writer = tf.compat.v2.summary.create_file_writer( train_dir, flush_millis=summaries_flush_secs * 1000) train_summary_writer.set_as_default() eval_summary_writer = tf.compat.v2.summary.create_file_writer( eval_dir, flush_millis=summaries_flush_secs * 1000) eval_metrics = [ tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes), tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes) ] global_step = tf.compat.v1.train.get_or_create_global_step() with tf.compat.v2.summary.record_if( lambda: tf.math.equal(global_step % summary_interval, 0)): tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name)) eval_tf_env = tf_py_environment.TFPyEnvironment( suite_gym.load(env_name)) actor_net = actor_network.ActorNetwork( tf_env.time_step_spec().observation, tf_env.action_spec(), fc_layer_params=actor_fc_layers, ) critic_net_input_specs = (tf_env.time_step_spec().observation, tf_env.action_spec()) critic_net = critic_network.CriticNetwork( critic_net_input_specs, observation_fc_layer_params=critic_obs_fc_layers, action_fc_layer_params=critic_action_fc_layers, joint_fc_layer_params=critic_joint_fc_layers, ) tf_agent = td3_agent.Td3Agent( tf_env.time_step_spec(), tf_env.action_spec(), actor_network=actor_net, critic_network=critic_net, actor_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=actor_learning_rate), critic_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=critic_learning_rate), exploration_noise_std=exploration_noise_std, target_update_tau=target_update_tau, target_update_period=target_update_period, actor_update_period=actor_update_period, td_errors_loss_fn=td_errors_loss_fn, gamma=gamma, reward_scale_factor=reward_scale_factor, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=global_step, ) tf_agent.initialize() train_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(), tf_metrics.AverageEpisodeLengthMetric(), ] eval_policy = tf_agent.policy collect_policy = tf_agent.collect_policy replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( tf_agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=replay_buffer_capacity) initial_collect_driver = dynamic_step_driver.DynamicStepDriver( tf_env, collect_policy, 
observers=[replay_buffer.add_batch], num_steps=initial_collect_steps) collect_driver = dynamic_step_driver.DynamicStepDriver( tf_env, collect_policy, observers=[replay_buffer.add_batch] + train_metrics, num_steps=collect_steps_per_iteration) if use_tf_functions: initial_collect_driver.run = common.function( initial_collect_driver.run) collect_driver.run = common.function(collect_driver.run) tf_agent.train = common.function(tf_agent.train) # Collect initial replay data. logging.info( 'Initializing replay buffer by collecting experience for %d steps with ' 'a random policy.', initial_collect_steps) initial_collect_driver.run() results = metric_utils.eager_compute( eval_metrics, eval_tf_env, eval_policy, num_episodes=num_eval_episodes, train_step=global_step, summary_writer=eval_summary_writer, summary_prefix='Metrics', ) if eval_metrics_callback is not None: eval_metrics_callback(results, global_step.numpy()) metric_utils.log_metrics(eval_metrics) time_step = None policy_state = collect_policy.get_initial_state(tf_env.batch_size) timed_at_step = global_step.numpy() time_acc = 0 # Dataset generates trajectories with shape [Bx2x...] dataset = replay_buffer.as_dataset(num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3) iterator = iter(dataset) def train_step(): experience, _ = next(iterator) return tf_agent.train(experience) if use_tf_functions: train_step = common.function(train_step) for _ in range(num_iterations): start_time = time.time() time_step, policy_state = collect_driver.run( time_step=time_step, policy_state=policy_state, ) for _ in range(train_steps_per_iteration): train_loss = train_step() time_acc += time.time() - start_time if global_step.numpy() % log_interval == 0: logging.info('step = %d, loss = %f', global_step.numpy(), train_loss.loss) steps_per_sec = (global_step.numpy() - timed_at_step) / time_acc logging.info('%.3f steps/sec', steps_per_sec) tf.compat.v2.summary.scalar(name='global_steps_per_sec', data=steps_per_sec, step=global_step) timed_at_step = global_step.numpy() time_acc = 0 for train_metric in train_metrics: train_metric.tf_summaries(train_step=global_step, step_metrics=train_metrics[:2]) if global_step.numpy() % eval_interval == 0: results = metric_utils.eager_compute( eval_metrics, eval_tf_env, eval_policy, num_episodes=num_eval_episodes, train_step=global_step, summary_writer=eval_summary_writer, summary_prefix='Metrics', ) if eval_metrics_callback is not None: eval_metrics_callback(results, global_step.numpy()) metric_utils.log_metrics(eval_metrics) return train_loss
from utils import compute_avg_return from tf_agents.environments import suite_gym from tf_agents.policies import random_tf_policy from tf_agents.environments import tf_py_environment from tf_agents.metrics.tf_metrics import AverageReturnMetric from drivers import TFRenderDriver if __name__ == '__main__': py_env = suite_gym.load('CartPole-v0') py_env.render(mode="human") env = tf_py_environment.TFPyEnvironment(py_env) policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(), env.action_spec()) average = AverageReturnMetric() metrics_observer = [average] metrics_driver = TFRenderDriver(env, policy, metrics_observer, max_episodes=5) time_step = env.reset() policy_state = policy.get_initial_state(batch_size=1) average.reset()
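    # Assumed continuation (not part of the original file): if TFRenderDriver
    # follows the usual TF-Agents driver interface, one run() call collects
    # the configured episodes while rendering, and the AverageReturnMetric
    # observer accumulates their returns.
    time_step, policy_state = metrics_driver.run(time_step, policy_state)
    print('Average return over 5 episodes:', average.result().numpy())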
# num_iterations = 10000 # @param {type:"integer"} initial_collect_steps = 1000 # @param {type:"integer"} collect_steps_per_iteration = 1 # @param {type:"integer"} replay_buffer_max_length = 100000 # @param {type:"integer"} batch_size = 64 # @param {type:"integer"} learning_rate = 1e-3 # @param {type:"number"} log_interval = 200 # @param {type:"integer"} num_eval_episodes = 10 # @param {type:"integer"} eval_interval = 1000 # @param {type:"integer"} env_name = 'CartPole-v0' env = suite_gym.load(env_name) #@test {"skip": true} env.reset() # PIL.Image.fromarray(env.render()) print('\nObservation Spec:') print(env.time_step_spec().observation) print('\nminimum') print(*env.time_step_spec().observation.minimum,sep='\n') print('\nmaximum') print(*env.time_step_spec().observation.maximum,sep='\n') print('\nReward Spec:')
def make_env(): return tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
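# Example usage of the factory above (assumed, not from the original file):
# build independent train and eval environments.
train_env = make_env()
eval_env = make_env()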
from tf_agents.drivers import dynamic_step_driver from tf_agents.environments import suite_gym from tf_agents.environments import tf_py_environment from tf_agents.eval import metric_utils from tf_agents.metrics import tf_metrics from tf_agents.networks import q_network from tf_agents.policies import random_tf_policy, epsilon_greedy_policy, random_tf_policy from tf_agents.replay_buffers import tf_uniform_replay_buffer from tf_agents.trajectories import trajectory from tf_agents.utils import common from Environment import BreakoutEnv env_name = 'Breakout-v0' train_py_env = BreakoutEnv(suite_gym.load(env_name)) train_env = tf_py_environment.TFPyEnvironment(train_py_env) eval_py_env = BreakoutEnv(suite_gym.load(env_name)) eval_env = tf_py_environment.TFPyEnvironment(eval_py_env) learning_rate = 1e-5 # @param {type:"number"} replay_buffer_max_length = 10000 # @param {type:"integer"} num_iterations = 10000 # @param {type:"integer"} initial_collect_steps = 2000 # @param {type:"integer"} collect_steps_per_iteration = 1 # @param {type:"integer"} batch_size = 32 # @param {type:"integer"} log_interval = 200 # @param {type:"integer"}
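# Sketch of how this setup might continue (the dqn_agent import, the
# preprocessing cast, the conv/fc layer sizes, and the variable names below
# are assumptions, not taken from the original script): build a Q-network
# over the Breakout frames and wrap it in a DQN agent using the
# hyperparameters defined above.
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent

q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    # Assumes raw uint8 frames; rescale to [0, 1] floats before the convs.
    preprocessing_layers=tf.keras.layers.Lambda(
        lambda obs: tf.cast(obs, tf.float32) / 255.0),
    conv_layer_params=((32, 8, 4), (64, 4, 2)),  # assumed conv stack
    fc_layer_params=(512,))

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=tf.compat.v2.Variable(0, dtype=tf.int64))
agent.initialize()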
def train_eval( root_dir, env_name='MaskedCartPole-v0', num_iterations=100000, input_fc_layer_params=(50, ), lstm_size=(20, ), output_fc_layer_params=(20, ), train_sequence_length=10, # Params for collect initial_collect_steps=50, collect_episodes_per_iteration=1, epsilon_greedy=0.1, replay_buffer_capacity=100000, # Params for target update target_update_tau=0.05, target_update_period=5, # Params for train train_steps_per_iteration=10, batch_size=128, learning_rate=1e-3, gamma=0.99, reward_scale_factor=1.0, gradient_clipping=None, # Params for eval num_eval_episodes=10, eval_interval=1000, # Params for summaries and logging train_checkpoint_interval=10000, policy_checkpoint_interval=5000, rb_checkpoint_interval=20000, log_interval=100, summary_interval=1000, summaries_flush_secs=10, debug_summaries=False, summarize_grads_and_vars=False, eval_metrics_callback=None): """A simple train and eval for DQN.""" root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') eval_dir = os.path.join(root_dir, 'eval') train_summary_writer = tf.compat.v2.summary.create_file_writer( train_dir, flush_millis=summaries_flush_secs * 1000) train_summary_writer.set_as_default() eval_summary_writer = tf.compat.v2.summary.create_file_writer( eval_dir, flush_millis=summaries_flush_secs * 1000) eval_metrics = [ py_metrics.AverageReturnMetric(buffer_size=num_eval_episodes), py_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes), ] global_step = tf.compat.v1.train.get_or_create_global_step() with tf.compat.v2.summary.record_if( lambda: tf.math.equal(global_step % summary_interval, 0)): eval_py_env = suite_gym.load(env_name) tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name)) q_net = q_rnn_network.QRnnNetwork( tf_env.time_step_spec().observation, tf_env.action_spec(), input_fc_layer_params=input_fc_layer_params, lstm_size=lstm_size, output_fc_layer_params=output_fc_layer_params) tf_agent = dqn_agent.DqnAgent( tf_env.time_step_spec(), tf_env.action_spec(), q_network=q_net, optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=learning_rate), # TODO(kbanoop): Decay epsilon based on global step, cf. cl/188907839 epsilon_greedy=epsilon_greedy, target_update_tau=target_update_tau, target_update_period=target_update_period, td_errors_loss_fn=dqn_agent.element_wise_squared_loss, gamma=gamma, reward_scale_factor=reward_scale_factor, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=global_step) replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( tf_agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=replay_buffer_capacity) eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy) train_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(), tf_metrics.AverageEpisodeLengthMetric(), ] initial_collect_policy = random_tf_policy.RandomTFPolicy( tf_env.time_step_spec(), tf_env.action_spec()) initial_collect_op = dynamic_episode_driver.DynamicEpisodeDriver( tf_env, initial_collect_policy, observers=[replay_buffer.add_batch] + train_metrics, num_episodes=initial_collect_steps).run() collect_policy = tf_agent.collect_policy collect_op = dynamic_episode_driver.DynamicEpisodeDriver( tf_env, collect_policy, observers=[replay_buffer.add_batch] + train_metrics, num_episodes=collect_episodes_per_iteration).run() # Need extra step to generate transitions of train_sequence_length. 
# Dataset generates trajectories with shape [BxTx...] dataset = replay_buffer.as_dataset(num_parallel_calls=3, sample_batch_size=batch_size, num_steps=train_sequence_length + 1).prefetch(3) iterator = tf.compat.v1.data.make_initializable_iterator(dataset) experience, _ = iterator.get_next() loss_info = tf_agent.train(experience=experience) train_checkpointer = common_utils.Checkpointer( ckpt_dir=train_dir, agent=tf_agent, global_step=global_step, metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')) policy_checkpointer = common_utils.Checkpointer( ckpt_dir=os.path.join(train_dir, 'policy'), policy=tf_agent.policy, global_step=global_step) rb_checkpointer = common_utils.Checkpointer( ckpt_dir=os.path.join(train_dir, 'replay_buffer'), max_to_keep=1, replay_buffer=replay_buffer) for train_metric in train_metrics: train_metric.tf_summaries(step_metrics=train_metrics[:2]) with eval_summary_writer.as_default(), \ tf.compat.v2.summary.record_if(True): for eval_metric in eval_metrics: eval_metric.tf_summaries() init_agent_op = tf_agent.initialize() with tf.compat.v1.Session() as sess: sess.run(train_summary_writer.init()) sess.run(eval_summary_writer.init()) # Initialize the graph. train_checkpointer.initialize_or_restore(sess) rb_checkpointer.initialize_or_restore(sess) sess.run(iterator.initializer) # TODO(sguada) Remove once Periodically can be saved. common_utils.initialize_uninitialized_variables(sess) sess.run(init_agent_op) logging.info('Collecting initial experience.') sess.run(initial_collect_op) # Compute evaluation metrics. global_step_val = sess.run(global_step) metric_utils.compute_summaries( eval_metrics, eval_py_env, eval_py_policy, num_episodes=num_eval_episodes, global_step=global_step_val, callback=eval_metrics_callback, log=True, ) collect_call = sess.make_callable(collect_op) train_step_call = sess.make_callable(loss_info) global_step_call = sess.make_callable(global_step) timed_at_step = global_step_call() time_acc = 0 steps_per_second_ph = tf.compat.v1.placeholder( tf.float32, shape=(), name='steps_per_sec_ph') steps_per_second_summary = tf.contrib.summary.scalar( name='global_steps/sec', tensor=steps_per_second_ph) for _ in range(num_iterations): # Train/collect/eval. start_time = time.time() collect_call() for _ in range(train_steps_per_iteration): loss_info_value = train_step_call() time_acc += time.time() - start_time global_step_val = global_step_call() if global_step_val % log_interval == 0: logging.info('step = %d, loss = %f', global_step_val, loss_info_value.loss) steps_per_sec = (global_step_val - timed_at_step) / time_acc logging.info('%.3f steps/sec', steps_per_sec) sess.run(steps_per_second_summary, feed_dict={steps_per_second_ph: steps_per_sec}) timed_at_step = global_step_val time_acc = 0 if global_step_val % train_checkpoint_interval == 0: train_checkpointer.save(global_step=global_step_val) if global_step_val % policy_checkpoint_interval == 0: policy_checkpointer.save(global_step=global_step_val) if global_step_val % rb_checkpoint_interval == 0: rb_checkpointer.save(global_step=global_step_val) if global_step_val % eval_interval == 0: metric_utils.compute_summaries( eval_metrics, eval_py_env, eval_py_policy, num_episodes=num_eval_episodes, global_step=global_step_val, log=True, callback=eval_metrics_callback, )
def load_multiple_mugs_env( universe, action_mode, env_name=None, render_size=128, observation_render_size=64, observations_whitelist=None, action_repeat=1, num_train_tasks=30, num_eval_tasks=10, eval_on_holdout_tasks=True, return_multiple_tasks=False, model_input=None, auto_reset_task_each_episode=False, ): ### HARDCODED # temporary sanity assert env_name == 'SawyerShelfMT-v0' assert return_multiple_tasks assert universe == 'gym' # get eval and train tasks by loading a sample env sample_env = suite_mujoco.load(env_name) # train env train_tasks = sample_env.init_tasks(num_tasks=num_train_tasks, is_eval_env=False) # eval env eval_tasks = sample_env.init_tasks(num_tasks=num_eval_tasks, is_eval_env=eval_on_holdout_tasks) del sample_env print("train weights", train_tasks) print("eval weights", eval_tasks) if env_name == 'SawyerShelfMT-v0': from meld.environments.envs.shelf.assets.generate_sawyer_shelf_xml import generate_and_save_xml_file else: raise NotImplementedError train_xml_path = generate_and_save_xml_file(train_tasks, action_mode, is_eval=False) eval_xml_path = generate_and_save_xml_file(eval_tasks, action_mode, is_eval=True) ### train env # get wrappers wrappers = get_wrappers(device_id=0, model_input=model_input, render_size=render_size, observation_render_size=observation_render_size, observations_whitelist=observations_whitelist) # load env gym_kwargs = {"action_mode": action_mode, "xml_path": train_xml_path} py_env = suite_gym.load(env_name, gym_env_wrappers=wrappers, gym_kwargs=gym_kwargs) if action_repeat > 1: py_env = wrappers.ActionRepeat(py_env, action_repeat) ### eval env # get wrappers wrappers = get_wrappers(device_id=1, model_input=model_input, render_size=render_size, observation_render_size=observation_render_size, observations_whitelist=observations_whitelist) # load env gym_kwargs = {"action_mode": action_mode, "xml_path": eval_xml_path} eval_py_env = suite_gym.load(env_name, gym_env_wrappers=wrappers, gym_kwargs=gym_kwargs) eval_py_env = video_wrapper.VideoWrapper(eval_py_env) if action_repeat > 1: eval_py_env = wrappers.ActionRepeat(eval_py_env, action_repeat) py_env.assign_tasks(train_tasks) eval_py_env.assign_tasks(eval_tasks) # set task list and reset variable to true if auto_reset_task_each_episode: py_env.wrapped_env().set_auto_reset_task(train_tasks) eval_py_env.wrapped_env().set_auto_reset_task(eval_tasks) return py_env, eval_py_env, train_tasks, eval_tasks
def test_load_disable_step_limit(self): env = suite_gym.load('CartPole-v1', max_episode_steps=0) self.assertIsInstance(env, py_environment.PyEnvironment) self.assertNotIsInstance(env, wrappers.TimeLimit)
def load_environments( universe, action_mode, env_name=None, render_size=128, observation_render_size=64, observations_whitelist=None, action_repeat=1, num_train_tasks=30, num_eval_tasks=10, eval_on_holdout_tasks=True, return_multiple_tasks=False, model_input=None, auto_reset_task_each_episode=False, ): """ Loads train and eval environments. """ assert universe == 'gym' tf.compat.v1.logging.info('Using environment {} from {} universe.'.format( env_name, universe)) is_shelf_env = (env_name == 'SawyerShelfMT-v0') or (env_name == 'SawyerShelfMT-v2') if is_shelf_env: return load_multiple_mugs_env( universe, action_mode, env_name=env_name, observations_whitelist=['state', 'pixels', 'env_info'], action_repeat=action_repeat, num_train_tasks=num_train_tasks, num_eval_tasks=num_eval_tasks, eval_on_holdout_tasks=eval_on_holdout_tasks, return_multiple_tasks=True, ) # select observation wrapper # puts either state or image into the 'pixels' location use_observation_wrapper = gym_wrappers.PixelObservationsGymWrapper if model_input is not None: if model_input == 'state': use_observation_wrapper = gym_wrappers.PixelObservationsGymWrapperState # wrappers for train env (put on GPU 0) gym_env_wrappers = [ functools.partial(gym_wrappers.RenderGymWrapper, render_kwargs={ 'height': render_size, 'width': render_size, 'device_id': 0 }), functools.partial(use_observation_wrapper, observations_whitelist=observations_whitelist, render_kwargs={ 'height': observation_render_size, 'width': observation_render_size, 'device_id': 0 }) ] # wrappers for eval env (put on GPU 1) eval_gym_env_wrappers = [ functools.partial(gym_wrappers.RenderGymWrapper, render_kwargs={ 'height': render_size, 'width': render_size, 'device_id': 1 }), # segfaults if the device is the same as train env functools.partial(use_observation_wrapper, observations_whitelist=observations_whitelist, render_kwargs={ 'height': observation_render_size, 'width': observation_render_size, 'device_id': 1 }) ] # segfaults if the device is the same as train env # create train/eval envs gym_kwargs = {"action_mode": action_mode} py_env = suite_gym.load(env_name, gym_env_wrappers=gym_env_wrappers, gym_kwargs=gym_kwargs) eval_py_env = suite_gym.load(env_name, gym_env_wrappers=eval_gym_env_wrappers, gym_kwargs=gym_kwargs) # set action mode py_env.wrapped_env().override_action_mode(action_mode) eval_py_env.wrapped_env().override_action_mode(action_mode) # video wrapper for eval saving eval_py_env = video_wrapper.VideoWrapper(eval_py_env) # action repeat if action_repeat > 1: py_env = wrappers.ActionRepeat(py_env, action_repeat) eval_py_env = wrappers.ActionRepeat(eval_py_env, action_repeat) ############################### # get possible tasks ############################### if return_multiple_tasks: # set env as being "train" or "eval" # used for defining the tasks used in the envs eval_env_is_true_eval = False if eval_on_holdout_tasks: eval_env_is_true_eval = True # train env train_tasks = py_env.init_tasks(num_tasks=num_train_tasks, is_eval_env=False) # eval env eval_tasks = eval_py_env.init_tasks(num_tasks=num_eval_tasks, is_eval_env=eval_env_is_true_eval) # set task list and reset variable to true if auto_reset_task_each_episode: py_env.wrapped_env().set_auto_reset_task(train_tasks) eval_py_env.wrapped_env().set_auto_reset_task(eval_tasks) return py_env, eval_py_env, train_tasks, eval_tasks else: return py_env, eval_py_env
def testGinConfig(self): gin.parse_config_file( test_utils.test_src_dir_path('environments/configs/suite_gym.gin')) env = suite_gym.load() self.assertIsInstance(env, py_environment.PyEnvironment) self.assertIsInstance(env, wrappers.TimeLimit)
collect_steps_per_iteration = 1 # @param replay_buffer_capacity = 100000 # @param fc_layer_params = (100,) batch_size = 64 # @param learning_rate = 1e-3 # @param log_interval = 200 # @param num_eval_episodes = 10 # @param eval_interval = 1000 # @param ### Environment #%% env = suite_gym.load(env_name) env.reset() PIL.Image.fromarray(env.render()) print('Observation Spec:') print(env.time_step_spec().observation) print('Action Spec:') print(env.action_spec()) time_step = env.reset() print('Time step:') print(time_step) action = 1 next_time_step = env.step(action) print('Next time step:') print(next_time_step)
from tf_agents.environments import tf_py_environment num_iterations = 500 # @param {type:"integer"} collect_episodes_per_iteration = 2 # @param {type:"integer"} replay_buffer_capacity = 2000 # @param {type:"integer"} fc_layer_params = (100,) learning_rate = 1e-3 # @param {type:"number"} log_interval = 25 # @param {type:"integer"} num_eval_episodes = 10 # @param {type:"integer"} eval_interval = 50 # @param {type:"integer"} #env = gym.make('qch-v0') env = suite_gym.load('qch-v0') env2 = suite_gym.load('qch-v0') train_env = tf_py_environment.TFPyEnvironment(env) eval_env = tf_py_environment.TFPyEnvironment(env2) ''' print('Observation Spec:') print(env.time_step_spec().observation) print('Observation Spec:') print(env.time_step_spec()) print('Reward Spec:')
def train_eval(root_dir, env_name="CartPole-v0", agent_class=Agent, num_iterations=10000, initial_collect_steps=1000, collect_steps_per_iteration=1, epsilon_greedy=0.1, replay_buffer_capacity=10000, train_steps_per_iteration=1, batch_size=32): global_step = tf.compat.v1.train.get_or_create_global_step() tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name)) eval_py_env = suite_gym.load(env_name) network = Network(input_tensor_spec=tf_env.time_step_spec().observation, action_spec=tf_env.action_spec()) tf_agent = agent_class(time_step_spec=tf_env.time_step_spec(), action_spec=tf_env.action_spec(), network=network, optimizer=tf.compat.v1.train.AdamOptimizer(), epsilon_greedy=epsilon_greedy) replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( tf_agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=replay_buffer_capacity) eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy) replay_observer = [replay_buffer.add_batch] initial_collect_policy = random_tf_policy.RandomTFPolicy( tf_env.time_step_spec(), tf_env.action_spec()) initial_collect_op = dynamic_step_driver.DynamicStepDriver( tf_env, initial_collect_policy, observers=replay_observer, num_steps=initial_collect_steps).run() collect_policy = tf_agent.collect_policy collect_op = dynamic_step_driver.DynamicStepDriver( tf_env, collect_policy, observers=replay_observer, num_steps=collect_steps_per_iteration).run() dataset = replay_buffer.as_dataset(num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3) iterator = tf.compat.v1.data.make_initializable_iterator(dataset) experience, _ = iterator.get_next() train_op = common.function(tf_agent.train)(experience=experience) init_agent_op = tf_agent.initialize() with tf.compat.v1.Session() as sess: sess.run(iterator.initializer) common.initialize_uninitialized_variables(sess) sess.run(init_agent_op) sess.run(initial_collect_op) global_step_val = sess.run(global_step) collect_call = sess.make_callable(collect_op) global_step_call = sess.make_callable(global_step) train_step_call = sess.make_callable(train_op) for _ in range(num_iterations): collect_call() for _ in range(train_steps_per_iteration): loss_info_value, _ = train_step_call() global_step_val = global_step_call() logging.info("step = %d, loss = %d", global_step_val, loss_info_value)
def train_eval(
        root_dir,
        env_name='CartPole-v0',
        num_iterations=100000,
        train_sequence_length=1,
        # Params for QNetwork
        fc_layer_params=(100,),
        # Params for QRnnNetwork
        input_fc_layer_params=(50,),
        lstm_size=(20,),
        output_fc_layer_params=(20,),
        # Params for collect
        initial_collect_steps=1000,
        collect_steps_per_iteration=1,
        epsilon_greedy=0.1,
        replay_buffer_capacity=100000,
        # Params for target update
        target_update_tau=0.05,
        target_update_period=5,
        # Params for train
        train_steps_per_iteration=1,
        batch_size=64,
        learning_rate=1e-3,
        n_step_update=1,
        gamma=0.99,
        reward_scale_factor=1.0,
        gradient_clipping=None,
        use_tf_functions=True,
        # Params for eval
        num_eval_episodes=10,
        eval_interval=1000,
        # Params for checkpoints
        train_checkpoint_interval=10000,
        policy_checkpoint_interval=5000,
        rb_checkpoint_interval=20000,
        # Params for summaries and logging
        log_interval=1000,
        summary_interval=1000,
        summaries_flush_secs=10,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None):
    """A simple train and eval for DQN."""
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))
        eval_tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))

        if train_sequence_length != 1 and n_step_update != 1:
            raise NotImplementedError(
                'train_eval does not currently support n-step updates with '
                'stateful networks (i.e., RNNs)')

        action_spec = tf_env.action_spec()
        num_actions = action_spec.maximum - action_spec.minimum + 1

        if train_sequence_length > 1:
            q_net = create_recurrent_network(input_fc_layer_params, lstm_size,
                                             output_fc_layer_params, num_actions)
        else:
            q_net = create_feedforward_network(fc_layer_params, num_actions)
            train_sequence_length = n_step_update

        # TODO(b/127301657): Decay epsilon based on global step, cf. cl/188907839
        tf_agent = dqn_agent.DqnAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            q_network=q_net,
            epsilon_greedy=epsilon_greedy,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=learning_rate),
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step)
        tf_agent.initialize()

        train_metrics = [
            tf_metrics.NumberOfEpisodes(),
            tf_metrics.EnvironmentSteps(),
            tf_metrics.AverageReturnMetric(),
            tf_metrics.AverageEpisodeLengthMetric(),
        ]

        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=tf_agent.collect_data_spec,
            batch_size=tf_env.batch_size,
            max_length=replay_buffer_capacity)

        collect_driver = dynamic_step_driver.DynamicStepDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_steps=collect_steps_per_iteration)

        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'policy'),
            policy=eval_policy,
            global_step=global_step)
        rb_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
            max_to_keep=1,
            replay_buffer=replay_buffer)

        train_checkpointer.initialize_or_restore()
        rb_checkpointer.initialize_or_restore()

        if use_tf_functions:
            # To speed up collect use common.function.
            collect_driver.run = common.function(collect_driver.run)
            tf_agent.train = common.function(tf_agent.train)

        initial_collect_policy = random_tf_policy.RandomTFPolicy(
            tf_env.time_step_spec(), tf_env.action_spec())

        # Collect initial replay data.
        logging.info(
            'Initializing replay buffer by collecting experience for %d steps '
            'with a random policy.', initial_collect_steps)
        dynamic_step_driver.DynamicStepDriver(
            tf_env,
            initial_collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_steps=initial_collect_steps).run()

        results = metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
        if eval_metrics_callback is not None:
            eval_metrics_callback(results, global_step.numpy())
        metric_utils.log_metrics(eval_metrics)

        time_step = None
        policy_state = collect_policy.get_initial_state(tf_env.batch_size)

        timed_at_step = global_step.numpy()
        time_acc = 0

        # Dataset generates trajectories with shape [Bx2x...]
        dataset = replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=train_sequence_length + 1).prefetch(3)
        iterator = iter(dataset)

        def train_step():
            experience, _ = next(iterator)
            return tf_agent.train(experience)

        if use_tf_functions:
            train_step = common.function(train_step)

        for _ in range(num_iterations):
            start_time = time.time()
            time_step, policy_state = collect_driver.run(
                time_step=time_step,
                policy_state=policy_state,
            )
            for _ in range(train_steps_per_iteration):
                train_loss = train_step()
            time_acc += time.time() - start_time

            if global_step.numpy() % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step.numpy(),
                             train_loss.loss)
                steps_per_sec = (global_step.numpy() - timed_at_step) / time_acc
                logging.info('%.3f steps/sec', steps_per_sec)
                tf.compat.v2.summary.scalar(
                    name='global_steps_per_sec', data=steps_per_sec,
                    step=global_step)
                timed_at_step = global_step.numpy()
                time_acc = 0

            for train_metric in train_metrics:
                train_metric.tf_summaries(
                    train_step=global_step, step_metrics=train_metrics[:2])

            if global_step.numpy() % train_checkpoint_interval == 0:
                train_checkpointer.save(global_step=global_step.numpy())

            if global_step.numpy() % policy_checkpoint_interval == 0:
                policy_checkpointer.save(global_step=global_step.numpy())

            if global_step.numpy() % rb_checkpoint_interval == 0:
                rb_checkpointer.save(global_step=global_step.numpy())

            if global_step.numpy() % eval_interval == 0:
                results = metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )
                if eval_metrics_callback is not None:
                    eval_metrics_callback(results, global_step.numpy())
                metric_utils.log_metrics(eval_metrics)
        return train_loss
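# A hypothetical invocation of the train_eval function defined above; the
# root_dir value and iteration count here are illustrative assumptions, not
# taken from the original script.
if __name__ == '__main__':
    tf.compat.v1.enable_v2_behavior()
    train_eval(root_dir=os.path.expanduser('~/tmp/dqn_cartpole'),
               num_iterations=20000)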
    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]


if __name__ == '__main__':
    # In theory this should be doing everything it can on the GPU; not sure
    # whether it actually is.
    with tf.device("GPU:0"):
        env_name = 'CartPole-v0'
        ATEMP = 0
        NAME = "ATEMP_%i" % ATEMP
        score_prom = None
        writer = tf.summary.create_file_writer("logs\\" + NAME)
        env = suite_gym.load(env_name)
        train_env = tf_py_environment.TFPyEnvironment(env)

        agent_net = build_actor_net(train_env.observation_spec(),
                                    train_env.action_spec())
        value_net = build_value_net(train_env.observation_spec())
        train_step_counter = tf.compat.v2.Variable(0)

        # Instantiate the agent that performs Proximal Policy Optimization.
        agent = tfa.agents.ppo.ppo_agent.PPOAgent(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
            actor_net=agent_net,
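# The PPOAgent call above is truncated in this excerpt, and the helpers
# `build_actor_net` and `build_value_net` it relies on are defined elsewhere.
# A minimal sketch of what such builders might look like, assuming the standard
# TF-Agents networks used with PPO (the layer sizes are illustrative
# assumptions):
def build_actor_net(observation_spec, action_spec):
    from tf_agents.networks import actor_distribution_network  # Assumed import.
    return actor_distribution_network.ActorDistributionNetwork(
        observation_spec, action_spec, fc_layer_params=(100,))


def build_value_net(observation_spec):
    from tf_agents.networks import value_network  # Assumed import.
    return value_network.ValueNetwork(observation_spec, fc_layer_params=(100,))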
def do_main(flags):
    encoded_chess = lambda: BoardEncoding(suite_gym.load('Chess-v0'))
    env = make_batched_environment(encoded_chess, flags.parallel_environments)
    actor = get_pendulum_net(env, **flags.flag_values_dict())
    trainAgent(env, actor, **flags.flag_values_dict())
def _make_env():
    # Function to create a TF environment.
    return tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
action = env.action_space.sample()  # Your agent here (this takes random actions).
observation, reward, done, info = env.step(action)
env.close()

################################################################################
"""
Another way to run the environment is with the use of TF agents.
"""
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.policies import random_tf_policy

py_env = suite_gym.load("gym_ctf:ctf-v0")
env = tf_py_environment.TFPyEnvironment(py_env)

# This creates a randomly initialized policy that the agent will follow.
# Similar to just taking random actions in the environment.
policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(),
                                         env.action_spec())

time_step = env.reset()
while not time_step.is_last():
    action_step = policy.action(time_step)
    time_step = env.step(action_step.action)
    py_env.render('human')  # Default for this render is rgb_array.
def train_eval(
        root_dir,
        env_name,
        # Training params
        train_sequence_length,
        initial_collect_steps=1000,
        collect_steps_per_iteration=1,
        num_iterations=100000,
        # RNN params.
        q_network_fn=q_lstm_network,  # defaults to q_lstm_network.
        # Agent params
        epsilon_greedy=0.1,
        batch_size=64,
        learning_rate=1e-3,
        gamma=0.99,
        target_update_tau=0.05,
        target_update_period=5,
        reward_scale_factor=1.0,
        # Replay params
        reverb_port=None,
        replay_capacity=100000,
        # Others
        policy_save_interval=1000,
        eval_interval=1000,
        eval_episodes=10):
    """Trains and evaluates DQN."""
    collect_env = suite_gym.load(env_name)
    eval_env = suite_gym.load(env_name)

    unused_observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
        spec_utils.get_tensor_specs(collect_env))

    train_step = train_utils.create_train_step()
    num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
    q_net = q_network_fn(num_actions=num_actions)

    sequence_length = train_sequence_length + 1
    agent = dqn_agent.DqnAgent(
        time_step_tensor_spec,
        action_tensor_spec,
        q_network=q_net,
        epsilon_greedy=epsilon_greedy,
        # n-step updates aren't supported with RNNs yet.
        n_step_update=1,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate),
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        train_step_counter=train_step)

    table_name = 'uniform_table'
    table = reverb.Table(
        table_name,
        max_size=replay_capacity,
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        rate_limiter=reverb.rate_limiters.MinSize(1))
    reverb_server = reverb.Server([table], port=reverb_port)
    reverb_replay = reverb_replay_buffer.ReverbReplayBuffer(
        agent.collect_data_spec,
        sequence_length=sequence_length,
        table_name=table_name,
        local_server=reverb_server)
    rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
        reverb_replay.py_client,
        table_name,
        sequence_length=sequence_length,
        stride_length=1)

    dataset = reverb_replay.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=sequence_length).prefetch(3)
    experience_dataset_fn = lambda: dataset

    saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
    env_step_metric = py_metrics.EnvironmentSteps()

    learning_triggers = [
        triggers.PolicySavedModelTrigger(
            saved_model_dir,
            agent,
            train_step,
            interval=policy_save_interval,
            metadata_metrics={triggers.ENV_STEP_METADATA_KEY: env_step_metric}),
        triggers.StepPerSecondLogTrigger(train_step, interval=100),
    ]

    dqn_learner = learner.Learner(
        root_dir,
        train_step,
        agent,
        experience_dataset_fn,
        triggers=learning_triggers)

    # If we haven't trained yet make sure we collect some random samples first
    # to fill up the Replay Buffer with some experience.
    random_policy = random_py_policy.RandomPyPolicy(
        collect_env.time_step_spec(), collect_env.action_spec())
    initial_collect_actor = actor.Actor(
        collect_env,
        random_policy,
        train_step,
        steps_per_run=initial_collect_steps,
        observers=[rb_observer])
    logging.info('Doing initial collect.')
    initial_collect_actor.run()

    tf_collect_policy = agent.collect_policy
    collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
        tf_collect_policy, use_tf_function=True)
    collect_actor = actor.Actor(
        collect_env,
        collect_policy,
        train_step,
        steps_per_run=collect_steps_per_iteration,
        observers=[rb_observer, env_step_metric],
        metrics=actor.collect_metrics(10),
        summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
    )

    tf_greedy_policy = agent.policy
    greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
        tf_greedy_policy, use_tf_function=True)
    eval_actor = actor.Actor(
        eval_env,
        greedy_policy,
        train_step,
        episodes_per_run=eval_episodes,
        metrics=actor.eval_metrics(eval_episodes),
        summary_dir=os.path.join(root_dir, 'eval'),
    )

    if eval_interval:
        logging.info('Evaluating.')
        eval_actor.run_and_log()

    logging.info('Training.')
    for _ in range(num_iterations):
        collect_actor.run()
        dqn_learner.run(iterations=1)

        if eval_interval and dqn_learner.train_step_numpy % eval_interval == 0:
            logging.info('Evaluating.')
            eval_actor.run_and_log()

    rb_observer.close()
    reverb_server.stop()
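# `q_lstm_network`, the default value of `q_network_fn` above, is defined
# elsewhere in the original code. A minimal sketch of such a factory, assuming
# it builds a recurrent Q-network with tf_agents.networks.sequential and Keras
# layers; the layer sizes are illustrative assumptions.
def q_lstm_network(num_actions):
    from tf_agents.networks import sequential  # Assumed import.
    return sequential.Sequential([
        tf.keras.layers.Dense(50, activation='relu'),
        # Keras RNN layers with return_state=True keep their state across steps
        # when wrapped by Sequential.
        tf.keras.layers.LSTM(20, return_state=True, return_sequences=True),
        tf.keras.layers.Dense(20, activation='relu'),
        tf.keras.layers.Dense(num_actions, activation=None),
    ])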
batch_size = 64  # @param
learning_rate = 1e-3  # @param
log_interval = 200  # @param
num_eval_episodes = 10  # @param
eval_interval = 1000  # @param

"""## Environment

Environments in RL represent the task or problem that we are trying to solve.
Standard environments can be easily created in TF-Agents using `suites`. We
have different `suites` for loading environments from sources such as the
OpenAI Gym, Atari, DM Control, etc., given a string environment name.

Now let us load the CartPole environment from the OpenAI Gym suite.
"""

import ipdb; ipdb.set_trace()
env = suite_gym.load(env_name)

"""We can render this environment to see how it looks. A free-swinging pole is
attached to a cart. The goal is to move the cart right or left in order to keep
the pole pointing up."""

#@test {"skip": true}
env.reset()
PIL.Image.fromarray(env.render())

"""The `time_step = environment.step(action)` statement takes `action` in the
environment. The `TimeStep` tuple returned contains the environment's next
observation and reward for that action. The `time_step_spec()` and
`action_spec()` methods in the environment return the specifications (types,
shapes, bounds) of the `time_step` and `action` respectively."""

print('observation_spec():')
print(env.observation_spec())
print('time_step_spec():')
print(env.time_step_spec())
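"""The stepping call described above can be exercised directly. A minimal
sketch, assuming the `env` loaded above; the action value of 1 is just an
illustrative choice for CartPole's two-action discrete action space."""

time_step = env.reset()
print('Initial time step:')
print(time_step)

action = 1  # Illustrative action: push the cart to the right.
next_time_step = env.step(action)
print('Next time step:')
print(next_time_step)
print('Reward:', next_time_step.reward)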
def __init__(self,
             environment: str = 'Seaquest-ram-v0',
             num_iterations: int = 20000,
             init_collect_steps: int = 1000,
             collect_steps_per_iteration: int = 1,
             replay_buffer_max_length: int = 100000,
             batch_size: int = 64,
             learning_rate: float = 0.001,
             log_interval: int = 200,
             num_eval_episodes: int = 10,
             eval_interval: int = 1000) -> None:
    # Initialize hyperparameters.
    self.num_iterations = num_iterations
    self.init_collect_steps = init_collect_steps
    self.collect_steps_per_iteration = collect_steps_per_iteration
    self.replay_buffer_max_length = replay_buffer_max_length
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.log_interval = log_interval
    self.num_eval_episodes = num_eval_episodes
    self.eval_interval = eval_interval

    # Create the OpenAI Gym training/evaluation environments.
    self.env_train = suite_gym.load(environment)
    self.env_eval = suite_gym.load(environment)
    self.train_env = tf_py_environment.TFPyEnvironment(self.env_train)
    self.eval_env = tf_py_environment.TFPyEnvironment(self.env_eval)

    # Instantiate a Deep Q-Network using the TF-Agents DQN agent.
    fc_layer_params = (100,)
    self.q_net = q_network.QNetwork(self.train_env.observation_spec(),
                                    self.train_env.action_spec(),
                                    fc_layer_params=fc_layer_params)
    self.optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate=self.learning_rate)
    self.train_step_counter = tf.Variable(0)
    self.agent = dqn_agent.DqnAgent(
        self.train_env.time_step_spec(),
        self.train_env.action_spec(),
        q_network=self.q_net,
        optimizer=self.optimizer,
        epsilon_greedy=0.8,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=0.95,
        train_step_counter=self.train_step_counter)
    self.agent.initialize()

    self.eval_policy = self.agent.policy
    self.collect_policy = self.agent.collect_policy
    self.random_policy = random_tf_policy.RandomTFPolicy(
        self.train_env.time_step_spec(), self.train_env.action_spec())

    # Collect initial data from the training environment.
    self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=self.agent.collect_data_spec,
        batch_size=self.train_env.batch_size,
        max_length=self.replay_buffer_max_length)
    self._collect_data(self.train_env, self.random_policy,
                       self.replay_buffer, steps=100)

    dataset = self.replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=self.batch_size,
        num_steps=2).prefetch(3)
    self.iterator = iter(dataset)
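# The `_collect_data` helper called above is not shown in this excerpt. A
# minimal sketch of what it might look like, assuming the usual TF-Agents
# pattern of stepping the environment and writing Trajectory objects into the
# replay buffer; the name and signature come from the call site, the body is
# an assumption.
def _collect_data(self, environment, policy, buffer, steps):
    from tf_agents.trajectories import trajectory  # Assumed import.
    for _ in range(steps):
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        # Package the transition as a Trajectory and add it to the buffer.
        traj = trajectory.from_transition(time_step, action_step,
                                          next_time_step)
        buffer.add_batch(traj)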