class train_params:

    # Environment parameters
    ENV = 'Pendulum-v0'                 # Environment to use (must have a low-dimensional state space (i.e. not image-based) and a continuous action space)
    RENDER = False                      # Whether or not to display the environment on the screen during training
    RANDOM_SEED = 99999999              # Random seed for reproducibility
    NUM_AGENTS = 4                      # Number of distributed agents to run simultaneously

    # Create dummy environment to get all environment params
    if ENV == 'Pendulum-v0':
        dummy_env = PendulumWrapper()
    elif ENV == 'LunarLanderContinuous-v2':
        dummy_env = LunarLanderContinuousWrapper()
    elif ENV == 'BipedalWalker-v2':
        dummy_env = BipedalWalkerWrapper()
    elif ENV == 'BipedalWalkerHardcore-v2':
        dummy_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

    STATE_DIMS = dummy_env.get_state_dims()
    STATE_BOUND_LOW, STATE_BOUND_HIGH = dummy_env.get_state_bounds()
    ACTION_DIMS = dummy_env.get_action_dims()
    ACTION_BOUND_LOW, ACTION_BOUND_HIGH = dummy_env.get_action_bounds()
    V_MIN = dummy_env.v_min
    V_MAX = dummy_env.v_max
    del dummy_env

    # Training parameters
    BATCH_SIZE = 256
    NUM_STEPS_TRAIN = 1000000           # Number of steps to train for
    MAX_EP_LENGTH = 10000               # Maximum number of steps per episode
    REPLAY_MEM_SIZE = 1000000           # Soft maximum capacity of replay memory
    REPLAY_MEM_REMOVE_STEP = 200        # Check replay memory every REPLAY_MEM_REMOVE_STEP training steps and remove samples over REPLAY_MEM_SIZE capacity
    PRIORITY_ALPHA = 0.6                # Controls randomness vs prioritisation of the prioritised sampling (0.0 = uniform sampling, 1.0 = greedy prioritisation)
    PRIORITY_BETA_START = 0.4           # Starting value of beta - controls to what degree importance-sampling weights influence the gradient updates to correct for the bias introduced by priority sampling (0 = no correction, 1 = full correction)
    PRIORITY_BETA_END = 1.0             # Beta is linearly annealed from its start value to this value throughout training
    PRIORITY_EPSILON = 0.00001          # Small value added to updated priorities to ensure no sample has a probability of 0 of being chosen
    NOISE_SCALE = 0.3                   # Scaling to apply to Gaussian noise
    NOISE_DECAY = 0.9999                # Decay noise throughout training by scaling by NOISE_DECAY**training_step
    DISCOUNT_RATE = 0.99                # Discount rate (gamma) for future rewards
    N_STEP_RETURNS = 5                  # Number of future steps to collect experiences for N-step returns
    UPDATE_AGENT_EP = 10                # Agent gets latest parameters from learner every UPDATE_AGENT_EP episodes

    # Network parameters
    CRITIC_LEARNING_RATE = 0.0001
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_L2_LAMBDA = 0.0              # Coefficient for L2 weight regularisation in critic - if 0, no regularisation is performed
    DENSE1_SIZE = 400                   # Size of first hidden layer in networks
    DENSE2_SIZE = 300                   # Size of second hidden layer in networks
    FINAL_LAYER_INIT = 0.003            # Initialise networks' final layer weights in range +/- FINAL_LAYER_INIT
    NUM_ATOMS = 51                      # Number of atoms in output layer of distributional critic
    TAU = 0.001                         # Parameter for soft target network updates
    USE_BATCH_NORM = False              # Whether or not to use batch normalisation in the networks

    # Files/directories
    SAVE_CKPT_STEP = 10000              # Save checkpoint every SAVE_CKPT_STEP training steps
    CKPT_DIR = './ckpts/' + ENV         # Directory for saving/loading checkpoints
    CKPT_FILE = None                    # Checkpoint file to load and resume training from (if None, train from scratch)
    LOG_DIR = './logs/train/' + ENV     # Directory for saving Tensorboard logs (if None, do not save logs)
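# The PRIORITY_BETA_START/PRIORITY_BETA_END comments above describe a linear anneal of the
# importance-sampling exponent over training. A minimal sketch of that schedule is shown below;
# the helper name `priority_beta` is an assumption for illustration, not part of the original code.
def priority_beta(training_step):
    # Fraction of training completed, clipped to [0, 1]
    fraction = min(float(training_step) / train_params.NUM_STEPS_TRAIN, 1.0)
    # Linear interpolation from the start value to the end value
    return train_params.PRIORITY_BETA_START + fraction * (train_params.PRIORITY_BETA_END - train_params.PRIORITY_BETA_START)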
def __init__(self, sess, env, seed, n_agent=0):
    print("Initialising agent %02d... \n" % n_agent)

    self.sess = sess
    self.n_agent = n_agent

    # Create environment
    if env == 'Pendulum-v0':
        self.env_wrapper = PendulumWrapper(env)
    elif env == 'LunarLanderContinuous-v2':
        self.env_wrapper = LunarLanderContinuousWrapper(env)
    elif env == 'Ant-v2':
        self.env_wrapper = AntWrapper(env)
    elif env == 'BipedalWalker-v2':
        self.env_wrapper = BipedalWalkerWrapper(env)
    elif env == 'HalfCheetah-v2':
        self.env_wrapper = CheetahWrapper(env)
    elif env == 'Reacher-v2':
        self.env_wrapper = ReacherWrapper(env)
    elif env == 'Hopper-v2':
        self.env_wrapper = HopperWrapper(env)
    elif env == 'Swimmer-v2':
        self.env_wrapper = SwimmerWrapper(env)
    elif env == 'Walker2d-v2':
        self.env_wrapper = Walker2dWrapper(env)
    elif env == 'InvertedPendulum-v2':
        self.env_wrapper = InvertedPendulumWrapper(env)
    elif env == 'Humanoid-v2':
        # NOTE: the original mapped 'Humanoid-v2' to InvertedPendulumWrapper, which looks like a
        # copy-paste slip; a HumanoidWrapper is assumed here and would need to be defined in utils.env_wrapper.py
        self.env_wrapper = HumanoidWrapper(env)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

    self.env_wrapper.set_random_seed(seed * (n_agent + 1))
def __init__(self, sess, env, seed, n_agent=0):
    print("Initialising agent %02d... \n" % n_agent)

    self.sess = sess
    self.n_agent = n_agent

    # Create environment
    if env == 'Pendulum-v0':
        self.env_wrapper = PendulumWrapper(env)
    elif env == 'LunarLanderContinuous-v2':
        self.env_wrapper = LunarLanderContinuousWrapper(env)
    elif env == 'BipedalWalker-v2':
        self.env_wrapper = BipedalWalkerWrapper(env)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

    self.env_wrapper.set_random_seed(seed * (n_agent + 1))
def __init__(self, PER_memory, run_agent_event, stop_agent_event):
    self.PER_memory = PER_memory
    self.run_agent_event = run_agent_event
    self.stop_agent_event = stop_agent_event

    if train_params.ENV == 'Pendulum-v0':
        self.eval_env = PendulumWrapper()
    elif train_params.ENV == 'LunarLanderContinuous-v2':
        self.eval_env = LunarLanderContinuousWrapper()
    elif train_params.ENV == 'BipedalWalker-v2':
        self.eval_env = BipedalWalkerWrapper()
    elif train_params.ENV == 'BipedalWalkerHardcore-v2':
        self.eval_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

    self.summary_writer = tf.summary.create_file_writer(train_params.LOG_DIR + '/eval/')
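# The writer above uses the TF2 summary API, so evaluation scalars would be logged inside a
# `with writer.as_default()` block. A minimal sketch follows; the method name `log_eval_reward`
# and the tag string are assumptions for illustration, not part of the original code.
def log_eval_reward(self, eval_reward, training_step):
    # Record the evaluation episode reward so it appears in TensorBoard under LOG_DIR/eval/
    with self.summary_writer.as_default():
        tf.summary.scalar('Evaluation Reward', eval_reward, step=training_step)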
class play_params:
    ALGO = 'D4PG_2'
    ENV = 'BipedalWalker-v2'
    CKPT = '99000'

    # Create dummy environment to get all environment params
    if ENV == 'Pendulum-v0':
        dummy_env = PendulumWrapper()
    elif ENV == 'LunarLanderContinuous-v2':
        dummy_env = LunarLanderContinuousWrapper()
    elif ENV == 'BipedalWalker-v2':
        dummy_env = BipedalWalkerWrapper()
    elif ENV == 'BipedalWalkerHardcore-v2':
        dummy_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

    STATE_DIMS = dummy_env.get_state_dims()
    STATE_BOUND_LOW, STATE_BOUND_HIGH = dummy_env.get_state_bounds()
    ACTION_DIMS = dummy_env.get_action_dims()
    ACTION_BOUND_LOW, ACTION_BOUND_HIGH = dummy_env.get_action_bounds()
    V_MIN = dummy_env.v_min
    V_MAX = dummy_env.v_max
    del dummy_env

    import os
    ACTOR_MODEL_DIR = os.getcwd() + '/data/' + ENV + '/' + ALGO + '/eval/actor_' + CKPT
    CRITIC_MODEL_DIR = os.getcwd() + '/data/' + ENV + '/' + ALGO + '/eval/critic_' + CKPT
    RECORD_DIR = os.getcwd() + '/data/' + ENV + '/' + ALGO + '/eval/video_' + CKPT

    # Play parameters
    NUM_EPS_PLAY = 1                    # Number of episodes to play for
    MAX_EP_LENGTH = 10000               # Maximum number of steps per episode
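# For example, with the defaults above (ENV = 'BipedalWalker-v2', ALGO = 'D4PG_2', CKPT = '99000'),
# ACTOR_MODEL_DIR resolves to '<cwd>/data/BipedalWalker-v2/D4PG_2/eval/actor_99000' and
# RECORD_DIR to '<cwd>/data/BipedalWalker-v2/D4PG_2/eval/video_99000'.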
class Agent:
    def __init__(self, sess, env, seed, n_agent=0):
        print("Initialising agent %02d... \n" % n_agent)

        self.sess = sess
        self.n_agent = n_agent

        # Create environment
        if env == 'Pendulum-v0':
            self.env_wrapper = PendulumWrapper()
        elif env == 'LunarLanderContinuous-v2':
            self.env_wrapper = LunarLanderContinuousWrapper()
        elif env == 'BipedalWalker-v2':
            self.env_wrapper = BipedalWalkerWrapper()
        elif env == 'BipedalWalkerHardcore-v2':
            self.env_wrapper = BipedalWalkerWrapper(hardcore=True)
        else:
            raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

        self.env_wrapper.set_random_seed(seed * (n_agent + 1))

    def build_network(self, training):
        # Input placeholder
        self.state_ph = tf.placeholder(tf.float32, ((None,) + train_params.STATE_DIMS))

        if training:
            # Each agent has its own var_scope
            var_scope = 'actor_agent_%02d' % self.n_agent
        else:
            # When testing, var_scope comes from the main learner policy (actor) network
            var_scope = 'learner_actor_main'

        # Create policy (actor) network
        if train_params.USE_BATCH_NORM:
            self.actor_net = Actor_BN(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS,
                                      train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH,
                                      train_params.DENSE1_SIZE, train_params.DENSE2_SIZE,
                                      train_params.FINAL_LAYER_INIT, is_training=False, scope=var_scope)
            self.agent_policy_params = self.actor_net.network_params + self.actor_net.bn_params
        else:
            self.actor_net = Actor(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS,
                                   train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH,
                                   train_params.DENSE1_SIZE, train_params.DENSE2_SIZE,
                                   train_params.FINAL_LAYER_INIT, scope=var_scope)
            self.agent_policy_params = self.actor_net.network_params

    def build_update_op(self, learner_policy_params):
        # Update agent's policy network params from learner
        update_op = []
        from_vars = learner_policy_params
        to_vars = self.agent_policy_params

        for from_var, to_var in zip(from_vars, to_vars):
            update_op.append(to_var.assign(from_var))

        self.update_op = update_op

    def build_summaries(self, logdir):
        # Create summary writer to write summaries to disk
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        self.summary_writer = tf.summary.FileWriter(logdir, self.sess.graph)

        # Create summary op to save episode reward to Tensorboard log
        self.ep_reward_var = tf.Variable(0.0, trainable=False, name=('ep_reward_agent_%02d' % self.n_agent))
        tf.summary.scalar("Episode Reward", self.ep_reward_var)
        self.summary_op = tf.summary.merge_all()

        # Initialise reward var - this will not be initialised with the other network variables as these are copied over from the learner
        self.init_reward_var = tf.variables_initializer([self.ep_reward_var])

    def run(self, PER_memory, gaussian_noise, run_agent_event, stop_agent_event):
        # Continuously run agent in environment to collect experiences and add to replay memory

        # Initialise deque buffer to store experiences for N-step returns
        self.exp_buffer = deque()

        # Perform initial copy of params from learner to agent
        self.sess.run(self.update_op)

        # Initialise var for logging episode reward
        if train_params.LOG_DIR is not None:
            self.sess.run(self.init_reward_var)

        # Initially set threading event to allow agent to run until told otherwise
        run_agent_event.set()

        num_eps = 0

        while not stop_agent_event.is_set():
            num_eps += 1
            # Reset environment and experience buffer
            state = self.env_wrapper.reset()
            state = self.env_wrapper.normalise_state(state)
            self.exp_buffer.clear()

            num_steps = 0
            episode_reward = 0
            ep_done = False

            while not ep_done:
                num_steps += 1
                # Take action and store experience
                if train_params.RENDER:
                    self.env_wrapper.render()
                action = self.sess.run(self.actor_net.output, {self.state_ph: np.expand_dims(state, 0)})[0]  # Add batch dimension to single state input, and remove batch dimension from single action output
                action += (gaussian_noise() * train_params.NOISE_DECAY**num_eps)
                next_state, reward, terminal = self.env_wrapper.step(action)

                episode_reward += reward

                next_state = self.env_wrapper.normalise_state(next_state)
                reward = self.env_wrapper.normalise_reward(reward)

                self.exp_buffer.append((state, action, reward))

                # We need at least N steps in the experience buffer before we can compute Bellman rewards and add an N-step experience to replay memory
                if len(self.exp_buffer) >= train_params.N_STEP_RETURNS:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = train_params.DISCOUNT_RATE
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= train_params.DISCOUNT_RATE

                    # If learner is requesting a pause (to remove samples from PER), wait before adding more samples
                    run_agent_event.wait()
                    PER_memory.add(state_0, action_0, discounted_reward, next_state, terminal, gamma)

                state = next_state

                if terminal or num_steps == train_params.MAX_EP_LENGTH:
                    # Log total episode reward
                    if train_params.LOG_DIR is not None:
                        summary_str = self.sess.run(self.summary_op, {self.ep_reward_var: episode_reward})
                        self.summary_writer.add_summary(summary_str, num_eps)

                    # Compute Bellman rewards and add experiences to replay memory for the last N-1 experiences still remaining in the experience buffer
                    while len(self.exp_buffer) != 0:
                        state_0, action_0, reward_0 = self.exp_buffer.popleft()
                        discounted_reward = reward_0
                        gamma = train_params.DISCOUNT_RATE
                        for (_, _, r_i) in self.exp_buffer:
                            discounted_reward += r_i * gamma
                            gamma *= train_params.DISCOUNT_RATE

                        # If learner is requesting a pause (to remove samples from PER), wait before adding more samples
                        run_agent_event.wait()
                        PER_memory.add(state_0, action_0, discounted_reward, next_state, terminal, gamma)

                    # Start next episode
                    ep_done = True

            # Update agent networks with learner params every 'update_agent_ep' episodes
            if num_eps % train_params.UPDATE_AGENT_EP == 0:
                self.sess.run(self.update_op)

        self.env_wrapper.close()

    def test(self):
        # Test a saved ckpt of actor network and save results to file (optional)

        def load_ckpt(ckpt_dir, ckpt_file):
            # Load ckpt given by ckpt_file, or else load latest ckpt in ckpt_dir
            loader = tf.train.Saver()
            if ckpt_file is not None:
                ckpt = ckpt_dir + '/' + ckpt_file
            else:
                ckpt = tf.train.latest_checkpoint(ckpt_dir)

            loader.restore(self.sess, ckpt)
            sys.stdout.write('%s restored.\n\n' % ckpt)
            sys.stdout.flush()

            ckpt_split = ckpt.split('-')
            self.train_ep = ckpt_split[-1]

        # Load ckpt from ckpt_dir
        load_ckpt(test_params.CKPT_DIR, test_params.CKPT_FILE)

        # Create Tensorboard summaries to save episode rewards
        if test_params.LOG_DIR is not None:
            self.build_summaries(test_params.LOG_DIR)

        rewards = []

        for test_ep in range(1, test_params.NUM_EPS_TEST + 1):
            state = self.env_wrapper.reset()
            state = self.env_wrapper.normalise_state(state)
            ep_reward = 0
            step = 0
            ep_done = False

            while not ep_done:
                if test_params.RENDER:
                    self.env_wrapper.render()
                action = self.sess.run(self.actor_net.output, {self.state_ph: np.expand_dims(state, 0)})[0]  # Add batch dimension to single state input, and remove batch dimension from single action output
                state, reward, terminal = self.env_wrapper.step(action)
                state = self.env_wrapper.normalise_state(state)

                ep_reward += reward
                step += 1

                # Episode can finish either by reaching terminal state or max episode steps
                if terminal or step == test_params.MAX_EP_LENGTH:
                    sys.stdout.write('\x1b[2K\rTest episode {:d}/{:d}'.format(test_ep, test_params.NUM_EPS_TEST))
                    sys.stdout.flush()
                    rewards.append(ep_reward)
                    ep_done = True

        mean_reward = np.mean(rewards)
        error_reward = ss.sem(rewards)

        sys.stdout.write('\x1b[2K\rTesting complete \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(mean_reward, error_reward))
        sys.stdout.flush()

        # Log average episode reward for Tensorboard visualisation
        if test_params.LOG_DIR is not None:
            summary_str = self.sess.run(self.summary_op, {self.ep_reward_var: mean_reward})
            self.summary_writer.add_summary(summary_str, self.train_ep)

        # Write results to file
        if test_params.RESULTS_DIR is not None:
            if not os.path.exists(test_params.RESULTS_DIR):
                os.makedirs(test_params.RESULTS_DIR)
            output_file = open(test_params.RESULTS_DIR + '/' + test_params.ENV + '.txt', 'a')
            output_file.write('Training Episode {}: \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(self.train_ep, mean_reward, error_reward))
            output_file.flush()
            sys.stdout.write('Results saved to file \n\n')
            sys.stdout.flush()

        self.env_wrapper.close()

    def play(self):
        # Play a saved ckpt of actor network in the environment, visualise performance on screen and save a GIF (optional)

        def load_ckpt(ckpt_dir, ckpt_file):
            # Load ckpt given by ckpt_file, or else load latest ckpt in ckpt_dir
            loader = tf.train.Saver()
            if ckpt_file is not None:
                ckpt = ckpt_dir + '/' + ckpt_file
            else:
                ckpt = tf.train.latest_checkpoint(ckpt_dir)

            loader.restore(self.sess, ckpt)
            sys.stdout.write('%s restored.\n\n' % ckpt)
            sys.stdout.flush()

            ckpt_split = ckpt.split('-')
            self.train_ep = ckpt_split[-1]

        # Load ckpt from ckpt_dir
        load_ckpt(play_params.CKPT_DIR, play_params.CKPT_FILE)

        # Create record directory
        if not os.path.exists(play_params.RECORD_DIR):
            os.makedirs(play_params.RECORD_DIR)

        for ep in range(1, play_params.NUM_EPS_PLAY + 1):
            state = self.env_wrapper.reset()
            state = self.env_wrapper.normalise_state(state)
            step = 0
            ep_done = False

            while not ep_done:
                frame = self.env_wrapper.render()
                if play_params.RECORD_DIR is not None:
                    filepath = play_params.RECORD_DIR + '/Ep%03d_Step%04d.jpg' % (ep, step)
                    cv2.imwrite(filepath, frame)
                action = self.sess.run(self.actor_net.output, {self.state_ph: np.expand_dims(state, 0)})[0]  # Add batch dimension to single state input, and remove batch dimension from single action output
                state, _, terminal = self.env_wrapper.step(action)
                state = self.env_wrapper.normalise_state(state)
                step += 1

                # Episode can finish either by reaching terminal state or max episode steps
                if terminal or step == play_params.MAX_EP_LENGTH:
                    ep_done = True

        # Convert saved frames to gif
        if play_params.RECORD_DIR is not None:
            images = []
            for file in sorted(os.listdir(play_params.RECORD_DIR)):
                # Load image
                filename = play_params.RECORD_DIR + '/' + file
                im = cv2.imread(filename)
                images.append(im)
                # Delete static image once loaded
                os.remove(filename)
            # Save as gif
            imageio.mimsave(play_params.RECORD_DIR + '/%s.gif' % play_params.ENV, images, duration=0.01)

        self.env_wrapper.close()
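# Agent.run() expects `gaussian_noise` to be a zero-argument callable returning one noise sample per
# action dimension. A minimal sketch of such a callable, built from NOISE_SCALE and the action bounds,
# is shown below; the factory name `make_gaussian_noise` and the exact scaling are assumptions for
# illustration, not necessarily the original noise generator.
def make_gaussian_noise():
    # Scale zero-mean Gaussian noise relative to the action range of the chosen environment
    scale = train_params.NOISE_SCALE * (np.array(train_params.ACTION_BOUND_HIGH) - np.array(train_params.ACTION_BOUND_LOW))
    return lambda: np.random.normal(size=train_params.ACTION_DIMS) * scale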
def play():
    # Create environment
    if play_params.ENV == 'Pendulum-v0':
        play_env = PendulumWrapper()
    elif play_params.ENV == 'LunarLanderContinuous-v2':
        play_env = LunarLanderContinuousWrapper()
    elif play_params.ENV == 'BipedalWalker-v2':
        play_env = BipedalWalkerWrapper()
    elif play_params.ENV == 'BipedalWalkerHardcore-v2':
        play_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

    # Build actor and critic networks and load the saved weights
    actor_net = Actor(play_params.STATE_DIMS, play_params.ACTION_DIMS, play_params.ACTION_BOUND_LOW,
                      play_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE,
                      train_params.FINAL_LAYER_INIT, name='actor_play')
    critic_net = Critic(play_params.STATE_DIMS, play_params.ACTION_DIMS, train_params.DENSE1_SIZE,
                        train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS,
                        train_params.V_MIN, train_params.V_MAX, name='critic_play')

    actor_net.load_weights(play_params.ACTOR_MODEL_DIR)
    critic_net.load_weights(play_params.CRITIC_MODEL_DIR)

    # Create record directory
    if not os.path.exists(play_params.RECORD_DIR):
        os.makedirs(play_params.RECORD_DIR)

    for ep in tqdm(range(1, play_params.NUM_EPS_PLAY + 1), desc='playing'):
        state = play_env.reset()
        state = play_env.normalise_state(state)
        step = 0
        ep_done = False

        while not ep_done:
            frame = play_env.render()
            if play_params.RECORD_DIR is not None:
                filepath = play_params.RECORD_DIR + '/Ep%03d_Step%04d.jpg' % (ep, step)
                cv2.imwrite(filepath, frame)
            action = actor_net(np.expand_dims(state.astype(np.float32), 0))[0]
            state, _, terminal = play_env.step(action)
            state = play_env.normalise_state(state)
            step += 1

            # Episode can finish either by reaching terminal state or max episode steps
            if terminal or step == play_params.MAX_EP_LENGTH:
                ep_done = True

    # Convert saved frames to gif
    if play_params.RECORD_DIR is not None:
        images = []
        for file in tqdm(sorted(os.listdir(play_params.RECORD_DIR)), desc='converting to gif'):
            # Load image
            filename = play_params.RECORD_DIR + '/' + file
            im = cv2.imread(filename)
            images.append(im)
            # Delete static image once loaded
            os.remove(filename)
        # Save as gif
        print("Saving to ", play_params.RECORD_DIR)
        imageio.mimsave(play_params.RECORD_DIR + '/%s.gif' % play_params.ENV, images[:-1], duration=0.01)

    play_env.close()
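# A minimal entry point for running the play routine above directly as a script (assumes play()
# is defined at module level, as shown):
if __name__ == '__main__':
    play()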