import os

import tensorflow as tf
from tf_agents.environments import parallel_py_environment, tf_py_environment
from tf_agents.metrics import tf_metrics

from adapter.bomberman_adapter import BombermanEnvironment


def train_eval_bomberman(root_dir, num_parallel_environments=4, summary_interval=1000):
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    ckpt_dir = os.path.join(root_dir, 'checkpoint')
    policy_dir = os.path.join(root_dir, 'policy')

    train_summary_writer = tf.summary.create_file_writer(train_dir, flush_millis=1000)
    train_summary_writer.set_as_default()
    eval_summary_writer = tf.summary.create_file_writer(eval_dir)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=10),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=10)
    ]

    global_step = tf.Variable(0)
    # Only record summaries every summary_interval global steps.
    with tf.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf_env = tf_py_environment.TFPyEnvironment(
            parallel_py_environment.ParallelPyEnvironment(
                [BombermanEnvironment] * num_parallel_environments))
        eval_tf_env = BombermanEnvironment()

        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
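# Usage sketch (not part of the original module; the run directory name is
# illustrative): train_eval_bomberman only needs a root directory, from which it
# derives the train/, eval/, checkpoint/ and policy/ paths; the defaults give
# four parallel collect environments and summaries every 1000 global steps.
if __name__ == '__main__':
    train_eval_bomberman('runs/bomberman')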
import os

import tensorflow as tf
from tf_agents.environments import tf_py_environment

from adapter.bomberman_adapter import BombermanEnvironment


if __name__ == '__main__':
    eval_tf_env = tf_py_environment.TFPyEnvironment(
        BombermanEnvironment(mode='base', replay=True))

    time_step = eval_tf_env.reset()
    policy = tf.saved_model.load("policy")
    policy_state = policy.get_initial_state(batch_size=eval_tf_env.batch_size)

    # Run one episode with the saved policy; the environment records a replay.
    while not time_step.is_last():
        policy_step = policy.action(time_step, policy_state)
        policy_state = policy_step.state
        time_step = eval_tf_env.step(int(policy_step.action))

    # Play back the most recently written replay file.
    replay_files = os.listdir("replays")
    replay_file = sorted(replay_files)[-1]
    command = f"python main.py replay \"replays/{replay_file}\" --update-interval 0.03 --fps 60"
    os.system(command)
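# Invocation sketch (an alternative, not part of the original script; it assumes
# main.py parses the same arguments when they are passed as a list): subprocess.run
# avoids manual shell quoting of the replay path and raises if the command fails.
#
#     import subprocess
#     subprocess.run(["python", "main.py", "replay", f"replays/{replay_file}",
#                     "--update-interval", "0.03", "--fps", "60"], check=True)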
import pickle

import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.metrics import tf_metrics
from tf_agents.networks.q_network import QNetwork
from tf_agents.policies import policy_saver, random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

from adapter.bomberman_adapter import BombermanEnvironment

# Assumes module-level constants EVAL_INTERVAL, NUM_EVAL_EPISODES and
# INITIAL_COLLECT_STEPS.


def main():
    def compute_avg_return(environment, policy, num_episodes=10):
        total_return = 0.0
        for _ in range(num_episodes):
            time_step = environment.reset()
            episode_return = 0.0
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = environment.step(action_step.action)
                episode_return += time_step.reward
            total_return += episode_return
        avg_return = total_return / num_episodes
        return avg_return.numpy()[0]

    class ShowProgress:
        def __init__(self, total):
            self.counter = 0
            self.total = total

        def __call__(self, trajectory):
            if not trajectory.is_boundary():
                self.counter += 1
            if self.counter % 100 == 0:
                print("\r{}/{}".format(self.counter, self.total), end="")

    def train_agent(n_iterations, save_each=10000, print_each=500):
        time_step = None
        policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
        iterator = iter(dataset)

        for iteration in range(n_iterations):
            step = agent.train_step_counter.numpy()
            current_metrics = []

            # Collect update_period environment steps, then train on one sampled batch.
            time_step, policy_state = collect_driver.run(time_step, policy_state)
            trajectories, buffer_info = next(iterator)

            train_loss = agent.train(trajectories)
            all_train_loss.append(train_loss.loss.numpy())

            for i in range(len(train_metrics)):
                current_metrics.append(train_metrics[i].result().numpy())
            all_metrics.append(current_metrics)

            if iteration % print_each == 0:
                print("\nIteration: {}, loss:{:.2f}".format(iteration, train_loss.loss.numpy()))
                for i in range(len(train_metrics)):
                    print('{}: {}'.format(train_metrics[i].name,
                                          train_metrics[i].result().numpy()))

            if step % EVAL_INTERVAL == 0:
                avg_return = compute_avg_return(eval_tf_env, agent.policy,
                                                NUM_EVAL_EPISODES)
                print(f'Step = {step}, Average Return = {avg_return}')
                returns.append((step, avg_return))

            if step % save_each == 0:
                print("Saving model")
                train_checkpointer.save(train_step)
                policy_save_handler.save("policy")
                with open("checkpoint/train_loss.pickle", "wb") as f:
                    pickle.dump(all_train_loss, f)
                with open("checkpoint/all_metrics.pickle", "wb") as f:
                    pickle.dump(all_metrics, f)
                with open("checkpoint/returns.pickle", "wb") as f:
                    pickle.dump(returns, f)

    eval_tf_env = tf_py_environment.TFPyEnvironment(BombermanEnvironment())
    # tf_env = tf_py_environment.TFPyEnvironment(
    #     parallel_py_environment.ParallelPyEnvironment(
    #         [BombermanEnvironment] * N_PARALLEL_ENVIRONMENTS))
    tf_env = tf_py_environment.TFPyEnvironment(BombermanEnvironment())

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     conv_layer_params=[(32, 3, 1), (32, 3, 1)],
                     fc_layer_params=[128, 64, 32])

    train_step = tf.Variable(0)
    update_period = 4
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)  # todo fine tune
    # Epsilon for the epsilon-greedy collect policy, decayed with the train step.
    epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=0.7,
        decay_steps=25000 // update_period,
        end_learning_rate=0.01)

    agent = dqn_agent.DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=0.99,
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step))
    agent.initialize()

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=10000)
    replay_buffer_observer = replay_buffer.add_batch

    train_metrics = [
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size)
    ]

    collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer_observer] + train_metrics,
        num_steps=update_period)

    # Fill the replay buffer with random experience before training starts.
    initial_collect_policy = random_tf_policy.RandomTFPolicy(
        tf_env.time_step_spec(), tf_env.action_spec())
    initial_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer.add_batch, ShowProgress(INITIAL_COLLECT_STEPS)],
        num_steps=INITIAL_COLLECT_STEPS)
    final_time_step, final_policy_state = initial_driver.run()

    # Sample mini-batches of 64 two-step trajectories (transitions) for training.
    dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    agent.train = common.function(agent.train)
    all_train_loss = []
    all_metrics = []
    returns = []

    checkpoint_dir = "checkpoint/"
    train_checkpointer = common.Checkpointer(ckpt_dir=checkpoint_dir,
                                             max_to_keep=1,
                                             agent=agent,
                                             policy=agent.policy,
                                             replay_buffer=replay_buffer,
                                             global_step=train_step)
    # train_checkpointer.initialize_or_restore()
    # train_step = tf.compat.v1.train.get_global_step()

    policy_save_handler = policy_saver.PolicySaver(agent.policy)

    # training here
    train_agent(2000)

    # save at end in every case
    policy_save_handler.save("policy")
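# Resume sketch (an assumption, mirroring the commented-out call above rather than
# the original workflow): to continue from a saved checkpoint instead of training
# from scratch, restore before training. initialize_or_restore() reloads the objects
# registered with the Checkpointer (agent, policy, replay_buffer, train_step), so
# the train-step counter and the epsilon schedule resume from their saved values.
#
#     train_checkpointer.initialize_or_restore()
#     train_agent(2000)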
import os

import tensorflow as tf
from tf_agents.environments import tf_py_environment

from adapter.bomberman_adapter import BombermanEnvironment, BombermanGame


if __name__ == '__main__':
    eval_tf_env = tf_py_environment.TFPyEnvironment(
        BombermanEnvironment(mode='no_bomb', live_preview=True))

    time_step = eval_tf_env.reset()
    policy = tf.saved_model.load("policies/policy_ppo")
    policy_state = policy.get_initial_state(batch_size=eval_tf_env.batch_size)

    while not time_step.is_last():
        policy_step = policy.action(time_step, policy_state)
        policy_state = policy_step.state
        time_step = eval_tf_env.step(int(policy_step.action))
            avg_return = compute_avg_return(eval_tf_env, agent.policy,
                                            NUM_EVAL_EPISODES)
            print(f'Step = {step}, Average Return = {avg_return}')
            returns.append((step, avg_return))

        if step % save_each == 0:
            policy_save_handler.save("policies/policy_ppo")


if __name__ == '__main__':
    # tf_env = tf_py_environment.TFPyEnvironment(
    #     parallel_py_environment.ParallelPyEnvironment(
    #         [BombermanEnvironment] * N_PARALLEL_ENVIRONMENTS))
    tf_env = tf_py_environment.TFPyEnvironment(
        BombermanEnvironment(mode="no_bomb"))
    eval_tf_env = tf_py_environment.TFPyEnvironment(
        BombermanEnvironment(mode="no_bomb"))

    actor_net, value_net = create_networks(tf_env)

    train_step = tf.Variable(0)
    update_period = 4
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)  # todo fine tune

    agent = ppo_agent.PPOAgent(tf_env.time_step_spec(),
                               tf_env.action_spec(),
                               optimizer,
                               actor_net=actor_net,
                               value_net=value_net,
                               num_epochs=25,
import pickle

import numpy as np

from adapter.bomberman_adapter import BombermanEnvironment
from deep_bomber.dqn import Agent
from deep_bomber.agent_network import network

SAVE_EACH_GAMES = 100


if __name__ == '__main__':
    env = BombermanEnvironment()
    lr = 0.01
    n_games = 50000

    q_net = network(lr=lr, n_actions=len(env.actions),
                    input_dims=env.observation_shape)
    q_net.load_weights('pre_training/best-network.hdf5')

    agent = Agent(q_net=q_net, input_dims=env.observation_shape,
                  n_actions=len(env.actions), gamma=0.99, epsilon=1.0,
                  epsilon_dec=1e-6, mem_size=100000, batch_size=64,
                  epsilon_end=0.01)

    scores = []
    eps_history = []

    for i in range(1, n_games + 1):
        done = False
        score = 0
            policy_save_handler.save("policy")
            with open("checkpoint/train_loss.pickle", "wb") as f:
                pickle.dump(all_train_loss, f)
            with open("checkpoint/all_metrics.pickle", "wb") as f:
                pickle.dump(all_metrics, f)
            with open("checkpoint/returns.pickle", "wb") as f:
                pickle.dump(returns, f)


if __name__ == '__main__':
    # tf_env = tf_py_environment.TFPyEnvironment(
    #     parallel_py_environment.ParallelPyEnvironment(
    #         [BombermanEnvironment] * N_PARALLEL_ENVIRONMENTS))
    tf_env = tf_py_environment.TFPyEnvironment(BombermanEnvironment())
    eval_tf_env = tf_py_environment.TFPyEnvironment(BombermanEnvironment())

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     conv_layer_params=[(32, 3, 1), (32, 3, 1)],
                     fc_layer_params=[128, 64, 32])

    train_step = tf.Variable(0)
    update_period = 4
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)  # todo fine tune
    epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,
        decay_steps=250000 // update_period,
        end_learning_rate=0.01)
import tensorflow as tf
from tf_agents.environments import tf_py_environment

from adapter.bomberman_adapter import BombermanEnvironment


if __name__ == '__main__':
    eval_tf_env = tf_py_environment.TFPyEnvironment(BombermanEnvironment(replay=True))

    time_step = eval_tf_env.reset()
    policy = tf.compat.v2.saved_model.load("policy")
    policy_state = policy.get_initial_state(batch_size=eval_tf_env.batch_size)

    while not time_step.is_last():
        policy_step = policy.action(time_step, policy_state)
        policy_state = policy_step.state
        time_step = eval_tf_env.step(policy_step.action)