def build_network(self):
    # Define input placeholders
    self.state_ph = tf.placeholder(tf.float32, ((train_params.BATCH_SIZE,) + train_params.STATE_DIMS))
    self.action_ph = tf.placeholder(tf.float32, ((train_params.BATCH_SIZE,) + train_params.ACTION_DIMS))
    self.noise_ph = tf.placeholder(tf.float32, (train_params.BATCH_SIZE, train_params.NUM_ATOMS, train_params.NOISE_DIMS))
    self.action_grads_ph = tf.placeholder(tf.float32, ((train_params.BATCH_SIZE,) + train_params.ACTION_DIMS))   # Gradient of critic's value output wrt action input - for actor training
    self.weights_ph = tf.placeholder(tf.float32, (train_params.BATCH_SIZE))   # Batch of IS weights to weigh gradient updates based on sample priorities
    self.real_samples_ph = tf.placeholder(tf.float32, (train_params.BATCH_SIZE, train_params.NUM_ATOMS))   # Samples of target network with Bellman update applied

    # Create value (critic) network + target network
    if train_params.USE_BATCH_NORM:
        self.critic_net = Critic_BN(self.state_ph, self.action_ph, self.noise_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.NOISE_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, is_training=True, scope='learner_critic_main')
        self.critic_target_net = Critic_BN(self.state_ph, self.action_ph, self.noise_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.NOISE_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, is_training=True, scope='learner_critic_target')
    else:
        self.critic_net = Critic(self.state_ph, self.action_ph, self.noise_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.NOISE_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, scope='learner_critic_main')
        self.critic_target_net = Critic(self.state_ph, self.action_ph, self.noise_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.NOISE_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, scope='learner_critic_target')

    # Create policy (actor) network + target network
    if train_params.USE_BATCH_NORM:
        self.actor_net = Actor_BN(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, is_training=True, scope='learner_actor_main')
        self.actor_target_net = Actor_BN(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, is_training=True, scope='learner_actor_target')
    else:
        self.actor_net = Actor(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, scope='learner_actor_main')
        self.actor_target_net = Actor(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, scope='learner_actor_target')

    # Create training step ops
    self.critic_train_step = self.critic_net.train_step(self.real_samples_ph, self.weights_ph, train_params.CRITIC_LEARNING_RATE, train_params.CRITIC_L2_LAMBDA, train_params.NUM_ATOMS)
    self.actor_train_step = self.actor_net.train_step(self.action_grads_ph, train_params.ACTOR_LEARNING_RATE, train_params.BATCH_SIZE)

    # Create saver for saving model ckpts (we only save learner network vars)
    model_name = train_params.ENV + '.ckpt'
    self.checkpoint_path = os.path.join(train_params.CKPT_DIR, model_name)
    if not os.path.exists(train_params.CKPT_DIR):
        os.makedirs(train_params.CKPT_DIR)
    saver_vars = [v for v in tf.global_variables() if 'learner' in v.name]
    self.saver = tf.train.Saver(var_list=saver_vars, max_to_keep=1001)
def build_network(self, training):
    # Input placeholder
    self.state_ph = tf.placeholder(tf.float32, ((None,) + train_params.STATE_DIMS))

    if training:
        # Each agent has their own var_scope
        var_scope = ('actor_agent_%02d' % self.n_agent)
    else:
        # When testing, var_scope comes from main learner policy (actor) network
        var_scope = ('learner_actor_main')

    # Create policy (actor) network
    if train_params.USE_BATCH_NORM:
        self.actor_net = Actor_BN(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, is_training=False, scope=var_scope)
        self.agent_policy_params = self.actor_net.network_params + self.actor_net.bn_params
    else:
        self.actor_net = Actor(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, scope=var_scope)
        self.agent_policy_params = self.actor_net.network_params
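# Note: how the agent's policy weights are kept in sync with the learner is not shown in this
# excerpt; self.agent_policy_params is presumably assigned from the learner's
# 'learner_actor_main' variables at intervals. A minimal sketch of such a sync op
# (the function name is illustrative, not the repo's actual API):
import tensorflow as tf

def build_policy_sync_op(learner_policy_params, agent_policy_params):
    # Ops that overwrite the agent's policy weights with a hard copy of the learner's weights
    return [to_var.assign(from_var) for from_var, to_var in zip(learner_policy_params, agent_policy_params)]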
def play(args):
    # Create environment
    env = gym.make(args.env)
    state_dims = env.observation_space.shape
    action_dims = env.action_space.shape
    action_bound_low = env.action_space.low
    action_bound_high = env.action_space.high

    # Define input placeholders
    state_ph = tf.placeholder(tf.float32, ((None,) + state_dims))

    # Create policy (actor) network
    if args.use_batch_norm:
        actor = Actor_BN(state_ph, state_dims, action_dims, action_bound_low, action_bound_high, args, is_training=False, scope='actor_main')
    else:
        actor = Actor(state_ph, state_dims, action_dims, action_bound_low, action_bound_high, args, scope='actor_main')

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)
    loader.restore(sess, ckpt)
    print('%s restored.\n\n' % ckpt)

    # Create record directory
    if args.record_dir is not None:
        if not os.path.exists(args.record_dir):
            os.makedirs(args.record_dir)

    for ep in range(args.num_eps):
        state = env.reset()
        for step in range(args.max_ep_length):
            frame = env.render(mode='rgb_array')
            if args.record_dir is not None:
                filepath = args.record_dir + '/Ep%03d_Step%04d.jpg' % (ep, step)
                cv2.imwrite(filepath, frame)
            # Add batch dimension to single state input, and remove batch dimension from single action output
            action = sess.run(actor.output, {state_ph: np.expand_dims(state, 0)})[0]
            state, _, terminal, _ = env.step(action)
            if terminal:
                break

    env.close()

    # Convert saved frames to gif
    if args.record_dir is not None:
        images = []
        for file in sorted(os.listdir(args.record_dir)):
            # Load image
            filename = args.record_dir + '/' + file
            im = cv2.imread(filename)
            images.append(im)
            # Delete static image once loaded
            os.remove(filename)
        # Save as gif
        imageio.mimsave(args.record_dir + '/%s.gif' % args.env, images, duration=0.01)
class Learner:
    def __init__(self, sess, PER_memory, run_agent_event, stop_agent_event):
        print("Initialising learner... \n\n")

        self.sess = sess
        self.PER_memory = PER_memory
        self.run_agent_event = run_agent_event
        self.stop_agent_event = stop_agent_event

    def build_network(self):
        # Define input placeholders
        self.state_ph = tf.placeholder(tf.float32, ((train_params.BATCH_SIZE,) + train_params.STATE_DIMS))
        self.action_ph = tf.placeholder(tf.float32, ((train_params.BATCH_SIZE,) + train_params.ACTION_DIMS))
        self.target_atoms_ph = tf.placeholder(tf.float32, (train_params.BATCH_SIZE, train_params.NUM_ATOMS))   # Atom values of target network with Bellman update applied
        self.target_Z_ph = tf.placeholder(tf.float32, (train_params.BATCH_SIZE, train_params.NUM_ATOMS))       # Future Z-distribution - for critic training
        self.action_grads_ph = tf.placeholder(tf.float32, ((train_params.BATCH_SIZE,) + train_params.ACTION_DIMS))   # Gradient of critic's value output wrt action input - for actor training
        self.weights_ph = tf.placeholder(tf.float32, (train_params.BATCH_SIZE))   # Batch of IS weights to weigh gradient updates based on sample priorities

        # Create value (critic) network + target network
        if train_params.USE_BATCH_NORM:
            self.critic_net = Critic_BN(self.state_ph, self.action_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, is_training=True, scope='learner_critic_main')
            self.critic_target_net = Critic_BN(self.state_ph, self.action_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, is_training=True, scope='learner_critic_target')
        else:
            self.critic_net = Critic(self.state_ph, self.action_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, scope='learner_critic_main')
            self.critic_target_net = Critic(self.state_ph, self.action_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, scope='learner_critic_target')

        # Create policy (actor) network + target network
        if train_params.USE_BATCH_NORM:
            self.actor_net = Actor_BN(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, is_training=True, scope='learner_actor_main')
            self.actor_target_net = Actor_BN(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, is_training=True, scope='learner_actor_target')
        else:
            self.actor_net = Actor(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, scope='learner_actor_main')
            self.actor_target_net = Actor(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, scope='learner_actor_target')

        # Create training step ops
        self.critic_train_step = self.critic_net.train_step(self.target_Z_ph, self.target_atoms_ph, self.weights_ph, train_params.CRITIC_LEARNING_RATE, train_params.CRITIC_L2_LAMBDA)
        self.actor_train_step = self.actor_net.train_step(self.action_grads_ph, train_params.ACTOR_LEARNING_RATE, train_params.BATCH_SIZE)

        # Create saver for saving model ckpts (we only save learner network vars)
        model_name = train_params.ENV + '.ckpt'
        self.checkpoint_path = os.path.join(train_params.CKPT_DIR, model_name)
        if not os.path.exists(train_params.CKPT_DIR):
            os.makedirs(train_params.CKPT_DIR)
        saver_vars = [v for v in tf.global_variables() if 'learner' in v.name]
        self.saver = tf.train.Saver(var_list=saver_vars, max_to_keep=201)

    def build_update_ops(self):
        network_params = self.actor_net.network_params + self.critic_net.network_params
        target_network_params = self.actor_target_net.network_params + self.critic_target_net.network_params

        # Create ops which update target network params with hard copy of main network params
        init_update_op = []
        for from_var, to_var in zip(network_params, target_network_params):
            init_update_op.append(to_var.assign(from_var))

        # Create ops which update target network params with a fraction (tau) of main network params
        update_op = []
        for from_var, to_var in zip(network_params, target_network_params):
            update_op.append(to_var.assign((tf.multiply(from_var, train_params.TAU) + tf.multiply(to_var, 1. - train_params.TAU))))

        self.init_update_op = init_update_op
        self.update_op = update_op

    def initialise_vars(self):
        # Load ckpt file if given, otherwise initialise variables and hard copy to target networks
        if train_params.CKPT_FILE is not None:
            # Restore all learner variables from ckpt
            ckpt = train_params.CKPT_DIR + '/' + train_params.CKPT_FILE
            ckpt_split = ckpt.split('-')
            step_str = ckpt_split[-1]
            self.start_step = int(step_str)
            self.saver.restore(self.sess, ckpt)
        else:
            self.start_step = 0
            self.sess.run(tf.global_variables_initializer())
            # Perform hard copy (tau=1.0) of initial params to target networks
            self.sess.run(self.init_update_op)

    def run(self):
        # Sample batches of experiences from replay memory and train learner networks

        # Initialise beta to start value
        priority_beta = train_params.PRIORITY_BETA_START
        beta_increment = (train_params.PRIORITY_BETA_END - train_params.PRIORITY_BETA_START) / train_params.NUM_STEPS_TRAIN

        # Can only train when we have at least batch_size num of samples in replay memory
        while len(self.PER_memory) <= train_params.BATCH_SIZE:
            sys.stdout.write('\rPopulating replay memory up to batch_size samples...')
            sys.stdout.flush()

        # Training
        sys.stdout.write('\n\nTraining...\n')
        sys.stdout.flush()

        for train_step in range(self.start_step + 1, train_params.NUM_STEPS_TRAIN + 1):
            # Get minibatch
            minibatch = self.PER_memory.sample(train_params.BATCH_SIZE, priority_beta)

            states_batch = minibatch[0]
            actions_batch = minibatch[1]
            rewards_batch = minibatch[2]
            next_states_batch = minibatch[3]
            terminals_batch = minibatch[4]
            gammas_batch = minibatch[5]
            weights_batch = minibatch[6]
            idx_batch = minibatch[7]

            # Critic training step
            # Predict actions for next states by passing next states through policy target network
            future_action = self.sess.run(self.actor_target_net.output, {self.state_ph: next_states_batch})
            # Predict future Z distribution by passing next states and actions through value target network, also get target network's Z-atom values
            target_Z_dist, target_Z_atoms = self.sess.run([self.critic_target_net.output_probs, self.critic_target_net.z_atoms], {self.state_ph: next_states_batch, self.action_ph: future_action})
            # Create batch of target network's Z-atoms
            target_Z_atoms = np.repeat(np.expand_dims(target_Z_atoms, axis=0), train_params.BATCH_SIZE, axis=0)
            # Value of terminal states is 0 by definition
            target_Z_atoms[terminals_batch, :] = 0.0
            # Apply Bellman update to each atom
            target_Z_atoms = np.expand_dims(rewards_batch, axis=1) + (target_Z_atoms * np.expand_dims(gammas_batch, axis=1))
            # Train critic
            TD_error, _ = self.sess.run([self.critic_net.loss, self.critic_train_step], {self.state_ph: states_batch, self.action_ph: actions_batch, self.target_Z_ph: target_Z_dist, self.target_atoms_ph: target_Z_atoms, self.weights_ph: weights_batch})
            # Use critic TD errors to update sample priorities
            self.PER_memory.update_priorities(idx_batch, (np.abs(TD_error) + train_params.PRIORITY_EPSILON))

            # Actor training step
            # Get policy network's action outputs for selected states
            actor_actions = self.sess.run(self.actor_net.output, {self.state_ph: states_batch})
            # Compute gradients of critic's value output distribution wrt actions
            action_grads = self.sess.run(self.critic_net.action_grads, {self.state_ph: states_batch, self.action_ph: actor_actions})
            # Train actor
            self.sess.run(self.actor_train_step, {self.state_ph: states_batch, self.action_grads_ph: action_grads[0]})

            # Update target networks
            self.sess.run(self.update_op)

            # Increment beta value at end of every step
            priority_beta += beta_increment

            # Periodically check capacity of replay mem and remove samples (by FIFO process) above this capacity
            if train_step % train_params.REPLAY_MEM_REMOVE_STEP == 0:
                if len(self.PER_memory) > train_params.REPLAY_MEM_SIZE:
                    # Prevent agent from adding new experiences to replay memory while learner removes samples
                    self.run_agent_event.clear()
                    samples_to_remove = len(self.PER_memory) - train_params.REPLAY_MEM_SIZE
                    self.PER_memory.remove(samples_to_remove)
                    # Allow agent to continue adding experiences to replay memory
                    self.run_agent_event.set()

            sys.stdout.write('\rStep {:d}/{:d}'.format(train_step, train_params.NUM_STEPS_TRAIN))
            sys.stdout.flush()

            # Save ckpt periodically
            if train_step % train_params.SAVE_CKPT_STEP == 0:
                self.saver.save(self.sess, self.checkpoint_path, global_step=train_step)
                sys.stdout.write('\nCheckpoint saved.\n')
                sys.stdout.flush()

        # Stop the agents
        self.stop_agent_event.set()
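# Note: the categorical projection of the Bellman-updated atoms onto the critic's fixed
# support is presumably performed inside critic_net.train_step(), which is not shown in this
# excerpt. For reference, a minimal NumPy sketch of that standard C51/D4PG projection
# (the function name and signature here are illustrative, not the repo's actual API):
import numpy as np

def l2_project(z_p, p, z_q):
    # z_p: (batch, n_atoms) Bellman-updated atom values (r + gamma * z)
    # p:   (batch, n_atoms) probabilities predicted by the target critic
    # z_q: (n_atoms,)       fixed support of the online critic (V_MIN ... V_MAX)
    v_min, v_max = z_q[0], z_q[-1]
    d_z = z_q[1] - z_q[0]                      # Atom spacing (assumes a uniform support)
    batch_size, n_atoms = z_p.shape
    projected = np.zeros((batch_size, n_atoms))
    z_p = np.clip(z_p, v_min, v_max)           # Clamp updated atoms to the support
    b = (z_p - v_min) / d_z                    # Fractional index of each updated atom
    lower = np.floor(b).astype(np.int64)
    upper = np.ceil(b).astype(np.int64)
    # Split each atom's probability between its two neighbouring support points
    for i in range(batch_size):
        for j in range(n_atoms):
            if lower[i, j] == upper[i, j]:     # Atom falls exactly on a support point
                projected[i, lower[i, j]] += p[i, j]
            else:
                projected[i, lower[i, j]] += p[i, j] * (upper[i, j] - b[i, j])
                projected[i, upper[i, j]] += p[i, j] * (b[i, j] - lower[i, j])
    return projected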
def train(args):
    # Create environment
    env = gym.make(args.env)
    state_dims = env.observation_space.shape
    action_dims = env.action_space.shape
    action_bound_low = env.action_space.low
    action_bound_high = env.action_space.high

    # Set random seeds for reproducibility
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    # Initialise replay memory
    replay_mem = ReplayMemory(args, state_dims, action_dims)

    # Initialise Ornstein-Uhlenbeck noise generator
    exploration_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dims))
    noise_scaling = args.noise_scale * (action_bound_high - action_bound_low)

    # Define input placeholders
    state_ph = tf.placeholder(tf.float32, ((None,) + state_dims))
    action_ph = tf.placeholder(tf.float32, ((None,) + action_dims))
    target_ph = tf.placeholder(tf.float32, (None, 1))   # Target Q-value - for critic training
    action_grads_ph = tf.placeholder(tf.float32, ((None,) + action_dims))   # Gradient of critic's value output wrt action input - for actor training
    is_training_ph = tf.placeholder_with_default(True, shape=None)

    # Create value (critic) network + target network
    if args.use_batch_norm:
        critic = Critic_BN(state_ph, action_ph, state_dims, action_dims, args, is_training=is_training_ph, scope='critic_main')
        critic_target = Critic_BN(state_ph, action_ph, state_dims, action_dims, args, is_training=is_training_ph, scope='critic_target')
    else:
        critic = Critic(state_ph, action_ph, state_dims, action_dims, args, scope='critic_main')
        critic_target = Critic(state_ph, action_ph, state_dims, action_dims, args, scope='critic_target')

    # Create policy (actor) network + target network
    if args.use_batch_norm:
        actor = Actor_BN(state_ph, state_dims, action_dims, action_bound_low, action_bound_high, args, is_training=is_training_ph, scope='actor_main')
        actor_target = Actor_BN(state_ph, state_dims, action_dims, action_bound_low, action_bound_high, args, is_training=is_training_ph, scope='actor_target')
    else:
        actor = Actor(state_ph, state_dims, action_dims, action_bound_low, action_bound_high, args, scope='actor_main')
        actor_target = Actor(state_ph, state_dims, action_dims, action_bound_low, action_bound_high, args, scope='actor_target')

    # Create training step ops
    critic_train_step = critic.train_step(target_ph)
    actor_train_step = actor.train_step(action_grads_ph)

    # Create ops to update target networks
    update_critic_target = update_target_network(critic.network_params, critic_target.network_params, args.tau)
    update_actor_target = update_target_network(actor.network_params, actor_target.network_params, args.tau)

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Define saver for saving model ckpts
    model_name = args.env + '.ckpt'
    checkpoint_path = os.path.join(args.ckpt_dir, model_name)
    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    saver = tf.train.Saver(max_to_keep=201)

    # Load ckpt file if given
    if args.ckpt_file is not None:
        # Restore all variables from ckpt
        loader = tf.train.Saver()
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
        ckpt_split = ckpt.split('-')
        step_str = ckpt_split[-1]
        start_ep = int(step_str)
        loader.restore(sess, ckpt)
    else:
        start_ep = 0
        sess.run(tf.global_variables_initializer())
        # Perform hard copy (tau=1.0) of initial params to target networks
        sess.run(update_target_network(critic.network_params, critic_target.network_params))
        sess.run(update_target_network(actor.network_params, actor_target.network_params))

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Create summary op to save episode reward to Tensorboard log
    ep_reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Episode Reward", ep_reward_var)
    summary_op = tf.summary.merge_all()

    ## Training

    # Initially populate replay memory by taking random actions
    sys.stdout.write('\nPopulating replay memory with random actions...\n')
    sys.stdout.flush()
    env.reset()

    for random_step in range(1, args.initial_replay_mem_size + 1):
        if args.render:
            env.render()
        action = env.action_space.sample()
        state, reward, terminal, _ = env.step(action)
        replay_mem.add(action, reward, state, terminal)
        if terminal:
            env.reset()
        sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size))
        sys.stdout.flush()

    sys.stdout.write('\n\nTraining...\n')
    sys.stdout.flush()

    for train_ep in range(start_ep + 1, args.num_eps_train + 1):
        # Reset environment and noise process
        state = env.reset()
        exploration_noise.reset()

        train_step = 0
        episode_reward = 0
        duration_values = []
        ep_done = False

        sys.stdout.write('\n')
        sys.stdout.flush()

        while not ep_done:
            train_step += 1
            start_time = time.time()

            ## Take action and store experience
            if args.render:
                env.render()
            if args.use_batch_norm:
                # Add batch dimension to single state input, and remove batch dimension from single action output
                action = sess.run(actor.output, {state_ph: np.expand_dims(state, 0), is_training_ph: False})[0]
            else:
                action = sess.run(actor.output, {state_ph: np.expand_dims(state, 0)})[0]
            action += exploration_noise() * noise_scaling
            action = np.clip(action, action_bound_low, action_bound_high)   # Clip action to its valid bounds
            state, reward, terminal, _ = env.step(action)
            replay_mem.add(action, reward, state, terminal)
            episode_reward += reward

            ## Train networks
            # Get minibatch
            states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch()

            # Critic training step
            # Predict actions for next states by passing next states through policy target network
            future_action = sess.run(actor_target.output, {state_ph: next_states_batch})
            # Predict target Q values by passing next states and actions through value target network
            # future_Q is of shape [batch_size, 1]; remove second dimension for ops with terminals_batch and rewards_batch, which are of shape [batch_size]
            future_Q = sess.run(critic_target.output, {state_ph: next_states_batch, action_ph: future_action})[:, 0]
            # Q values of the terminal states is 0 by definition
            future_Q[terminals_batch] = 0
            targets = rewards_batch + (future_Q * args.discount_rate)
            # Train critic
            sess.run(critic_train_step, {state_ph: states_batch, action_ph: actions_batch, target_ph: np.expand_dims(targets, 1)})

            # Actor training step
            # Get policy network's action outputs for selected states
            actor_actions = sess.run(actor.output, {state_ph: states_batch})
            # Compute gradients of critic's value output wrt actions
            action_grads = sess.run(critic.action_grads, {state_ph: states_batch, action_ph: actor_actions})
            # Train actor
            sess.run(actor_train_step, {state_ph: states_batch, action_grads_ph: action_grads[0]})

            # Update target networks
            sess.run(update_critic_target)
            sess.run(update_actor_target)

            # Display progress
            duration = time.time() - start_time
            duration_values.append(duration)
            ave_duration = sum(duration_values) / float(len(duration_values))
            sys.stdout.write('\x1b[2K\rEpisode {:d}/{:d} \t Steps = {:d} \t Reward = {:.3f} \t ({:.3f} s/step)'.format(train_ep, args.num_eps_train, train_step, episode_reward, ave_duration))
            sys.stdout.flush()

            if terminal or train_step == args.max_ep_length:
                # Log total episode reward and begin next episode
                summary_str = sess.run(summary_op, {ep_reward_var: episode_reward})
                summary_writer.add_summary(summary_str, train_ep)
                ep_done = True

        if train_ep % args.save_ckpt_step == 0:
            saver.save(sess, checkpoint_path, global_step=train_ep)
            sys.stdout.write('\n Checkpoint saved.')
            sys.stdout.flush()

    env.close()
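# Note: the helper update_target_network() used above is not shown in this excerpt. A minimal
# sketch of what it presumably returns, consistent with how it is called (two-argument call
# for the initial hard copy, three-argument call for the soft update):
import tensorflow as tf

def update_target_network(network_params, target_network_params, tau=1.0):
    # Build ops that assign target <- tau * main + (1 - tau) * target for each variable pair;
    # the default tau=1.0 gives the hard copy used to initialise the target networks
    op_holder = []
    for from_var, to_var in zip(network_params, target_network_params):
        op_holder.append(to_var.assign(tf.multiply(from_var, tau) + tf.multiply(to_var, 1. - tau)))
    return op_holder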
def play():
    if play_params.ENV == 'Pendulum-v0':
        play_env = PendulumWrapper()
    elif play_params.ENV == 'LunarLanderContinuous-v2':
        play_env = LunarLanderContinuousWrapper()
    elif play_params.ENV == 'BipedalWalker-v2':
        play_env = BipedalWalkerWrapper()
    elif play_params.ENV == 'BipedalWalkerHardcore-v2':
        play_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

    actor_net = Actor(play_params.STATE_DIMS, play_params.ACTION_DIMS, play_params.ACTION_BOUND_LOW, play_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, name='actor_play')
    critic_net = Critic(play_params.STATE_DIMS, play_params.ACTION_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, name='critic_play')

    actor_net.load_weights(play_params.ACTOR_MODEL_DIR)
    critic_net.load_weights(play_params.CRITIC_MODEL_DIR)

    if not os.path.exists(play_params.RECORD_DIR):
        os.makedirs(play_params.RECORD_DIR)

    for ep in tqdm(range(1, play_params.NUM_EPS_PLAY + 1), desc='playing'):
        state = play_env.reset()
        state = play_env.normalise_state(state)
        step = 0
        ep_done = False

        while not ep_done:
            frame = play_env.render()
            if play_params.RECORD_DIR is not None:
                filepath = play_params.RECORD_DIR + '/Ep%03d_Step%04d.jpg' % (ep, step)
                cv2.imwrite(filepath, frame)
            action = actor_net(np.expand_dims(state.astype(np.float32), 0))[0]
            state, _, terminal = play_env.step(action)
            state = play_env.normalise_state(state)

            step += 1
            # Episode can finish either by reaching terminal state or max episode steps
            if terminal or step == play_params.MAX_EP_LENGTH:
                ep_done = True

    # Convert saved frames to gif
    if play_params.RECORD_DIR is not None:
        images = []
        for file in tqdm(sorted(os.listdir(play_params.RECORD_DIR)), desc='converting to gif'):
            # Load image
            filename = play_params.RECORD_DIR + '/' + file
            im = cv2.imread(filename)
            images.append(im)
            # Delete static image once loaded
            os.remove(filename)
        # Save as gif
        print("Saving to ", play_params.RECORD_DIR)
        imageio.mimsave(play_params.RECORD_DIR + '/%s.gif' % play_params.ENV, images[:-1], duration=0.01)

    play_env.close()
def test(args):
    # Create environment
    env = gym.make(args.env)
    state_dims = env.observation_space.shape
    action_dims = env.action_space.shape
    action_bound_low = env.action_space.low
    action_bound_high = env.action_space.high

    # Set random seeds for reproducibility
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    # Define input placeholder
    state_ph = tf.placeholder(tf.float32, ((None,) + state_dims))

    # Create policy (actor) network
    if args.use_batch_norm:
        actor = Actor_BN(state_ph, state_dims, action_dims, action_bound_low, action_bound_high, args, is_training=False, scope='actor_main')
    else:
        actor = Actor(state_ph, state_dims, action_dims, action_bound_low, action_bound_high, args, scope='actor_main')

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)
    loader.restore(sess, ckpt)
    sys.stdout.write('%s restored.\n\n' % ckpt)
    sys.stdout.flush()

    ckpt_split = ckpt.split('-')
    train_ep = ckpt_split[-1]

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Create summary op to save episode reward to Tensorboard log
    reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Average Test Reward", reward_var)
    summary_op = tf.summary.merge_all()

    # Start testing
    rewards = []
    for test_ep in range(args.num_eps_test):
        state = env.reset()
        ep_reward = 0
        step = 0
        ep_done = False

        while not ep_done:
            if args.render:
                env.render()
            # Add batch dimension to single state input, and remove batch dimension from single action output
            action = sess.run(actor.output, {state_ph: np.expand_dims(state, 0)})[0]
            state, reward, terminal, _ = env.step(action)

            ep_reward += reward
            step += 1

            # Episode can finish either by reaching terminal state or max episode steps
            if terminal or step == args.max_ep_length:
                sys.stdout.write('\x1b[2K\rTest episode {:d}/{:d}'.format(test_ep, args.num_eps_test))
                sys.stdout.flush()
                rewards.append(ep_reward)
                ep_done = True

    mean_reward = np.mean(rewards)
    error_reward = ss.sem(rewards)   # Standard error of the mean

    sys.stdout.write('\x1b[2K\rTesting complete \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(mean_reward, error_reward))
    sys.stdout.flush()

    # Log average episode reward for Tensorboard visualisation
    summary_str = sess.run(summary_op, {reward_var: mean_reward})
    summary_writer.add_summary(summary_str, train_ep)

    # Write results to file
    if args.results_dir is not None:
        if not os.path.exists(args.results_dir):
            os.makedirs(args.results_dir)
        output_file = open(args.results_dir + '/' + args.env + '.txt', 'a')
        output_file.write('Training Episode {}: \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(train_ep, mean_reward, error_reward))
        output_file.flush()
        sys.stdout.write('Results saved to file \n\n')
        sys.stdout.flush()

    env.close()
class Learner:
    def __init__(self, PER_memory, run_agent_event, stop_agent_event):
        self.PER_memory = PER_memory
        self.run_agent_event = run_agent_event
        self.stop_agent_event = stop_agent_event

        if train_params.ENV == 'Pendulum-v0':
            self.eval_env = PendulumWrapper()
        elif train_params.ENV == 'LunarLanderContinuous-v2':
            self.eval_env = LunarLanderContinuousWrapper()
        elif train_params.ENV == 'BipedalWalker-v2':
            self.eval_env = BipedalWalkerWrapper()
        elif train_params.ENV == 'BipedalWalkerHardcore-v2':
            self.eval_env = BipedalWalkerWrapper(hardcore=True)
        else:
            raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

        self.summary_writer = tf.summary.create_file_writer(train_params.LOG_DIR + '/eval/')

    def build_network(self):
        # Create value (critic) network + target network
        if train_params.USE_BATCH_NORM:
            pass   # for now
            # self.critic_net = Critic_BN(self.state_ph, self.action_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, is_training=True, scope='learner_critic_main')
            # self.critic_target_net = Critic_BN(self.state_ph, self.action_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, is_training=True, scope='learner_critic_target')
        else:
            self.critic_net = Critic(train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, name='critic')
            self.critic_target_net = Critic(train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS, train_params.V_MIN, train_params.V_MAX, name='critic_target')

        # Create policy (actor) network + target network
        if train_params.USE_BATCH_NORM:
            pass   # for now
            # self.actor_net = Actor_BN(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, is_training=True, scope='learner_actor_main')
            # self.actor_target_net = Actor_BN(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, is_training=True, scope='learner_actor_target')
        else:
            self.actor_net = Actor(train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, name='actor')
            self.actor_target_net = Actor(train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, name='actor_target')

    def target_network_update(self, tau):
        network_params = self.actor_net.trainable_variables + self.critic_net.trainable_variables
        target_network_params = self.actor_target_net.trainable_variables + self.critic_target_net.trainable_variables
        for from_var, to_var in zip(network_params, target_network_params):
            to_var.assign((tf.multiply(from_var, tau) + tf.multiply(to_var, 1. - tau)))

    def initialise_vars(self):
        # Load initial models if given, otherwise start from scratch; training always starts at step 0
        self.start_step = 0
        if train_params.INITIAL_ACTOR_MODEL is not None:
            self.actor_net.load_weights(train_params.INITIAL_ACTOR_MODEL)
            self.critic_net.load_weights(train_params.INITIAL_CRITIC_MODEL)
        # Perform hard copy (tau=1.0) of initial params to target networks
        self.target_network_update(1.0)

    def run(self):
        # Sample batches of experiences from replay memory and train learner networks

        # Initialise beta to start value
        priority_beta = train_params.PRIORITY_BETA_START
        beta_increment = (train_params.PRIORITY_BETA_END - train_params.PRIORITY_BETA_START) / train_params.NUM_STEPS_TRAIN

        avg_return = compute_avg_return(self.eval_env, self.actor_net, train_params.MAX_EP_LENGTH)
        scalar_summary(self.summary_writer, "Average Return", avg_return, step=1)

        # Can only train when we have at least batch_size num of samples in replay memory
        while len(self.PER_memory) <= train_params.BATCH_SIZE:
            sys.stdout.write('\rPopulating replay memory up to batch_size samples...')
            sys.stdout.flush()

        t = trange(self.start_step + 1, train_params.NUM_STEPS_TRAIN + 1, desc='[Train]')
        for train_step in t:
            # Get minibatch
            minibatch = self.PER_memory.sample(train_params.BATCH_SIZE, priority_beta)

            states_batch = minibatch[0].astype(np.float32)
            actions_batch = minibatch[1].astype(np.float32)
            rewards_batch = minibatch[2].astype(np.float32)
            next_states_batch = minibatch[3].astype(np.float32)
            terminals_batch = minibatch[4]
            gammas_batch = minibatch[5].astype(np.float32)
            weights_batch = minibatch[6].astype(np.float32)
            idx_batch = minibatch[7]

            # ==================================================================
            # Critic training step
            # ==================================================================
            # Predict actions for next states by passing next states through policy target network
            future_action = self.actor_target_net(next_states_batch)
            # Predict future Z distribution by passing next states and actions through value target network, also get target network's Z-atom values
            _, target_Z_dist = self.critic_target_net(next_states_batch, future_action)
            target_Z_atoms = self.critic_target_net.z_atoms
            # Create batch of target network's Z-atoms
            target_Z_atoms = np.repeat(np.expand_dims(target_Z_atoms, axis=0), train_params.BATCH_SIZE, axis=0)
            # Value of terminal states is 0 by definition
            target_Z_atoms[terminals_batch, :] = 0.0
            # Apply Bellman update to each atom
            target_Z_atoms = np.expand_dims(rewards_batch, axis=1) + (target_Z_atoms * np.expand_dims(gammas_batch, axis=1))
            # Train critic
            td_error, total_loss = self.critic_net.train(states_batch, actions_batch, target_Z_atoms, target_Z_dist, weights_batch)
            # Use critic TD errors to update sample priorities
            self.PER_memory.update_priorities(idx_batch, (np.abs(td_error.numpy()) + train_params.PRIORITY_EPSILON))

            # ==================================================================
            # Actor training step
            # ==================================================================
            # Get policy network's action outputs for selected states
            actor_actions = self.actor_net(states_batch)
            action_grads = self.critic_net.get_action_grads(states_batch, actor_actions)
            # Train actor
            self.actor_net.train(states_batch, action_grads)

            # Update target networks
            self.target_network_update(train_params.TAU)

            # Increment beta value at end of every step
            priority_beta += beta_increment

            # Periodically check capacity of replay mem and remove samples (by FIFO process) above this capacity
            if train_step % train_params.REPLAY_MEM_REMOVE_STEP == 0:
                if len(self.PER_memory) > train_params.REPLAY_MEM_SIZE:
                    # Prevent agent from adding new experiences to replay memory while learner removes samples
                    self.run_agent_event.clear()
                    samples_to_remove = len(self.PER_memory) - train_params.REPLAY_MEM_SIZE
                    self.PER_memory.remove(samples_to_remove)
                    # Allow agent to continue adding experiences to replay memory
                    self.run_agent_event.set()

            if train_step % train_params.PRINTOUT_STEP == 0:
                t.set_description('[Train] loss={0:.4f}, avg_return={1:.2f}'.format(total_loss, avg_return))

            if train_step % train_params.EVALUATE_SAVE_MODEL_STEP == 0:
                self.actor_net.save_weights(train_params.LOG_DIR + '/eval/actor_%d' % train_step)
                self.critic_net.save_weights(train_params.LOG_DIR + '/eval/critic_%d' % train_step)
                avg_return = compute_avg_return(self.eval_env, self.actor_net, train_params.MAX_EP_LENGTH)
                scalar_summary(self.summary_writer, "Average Return", avg_return, step=train_step)

        # Stop the agents
        self.stop_agent_event.set()
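# Note: compute_avg_return() and scalar_summary() are imported helpers not shown in this
# excerpt. Minimal sketches of both, assuming the env-wrapper API used in play() above
# (reset()/normalise_state()/step() returning (state, reward, terminal)); the num_eps
# default is an illustrative assumption:
import numpy as np
import tensorflow as tf

def compute_avg_return(env, policy_net, max_ep_length, num_eps=5):
    # Roll out the deterministic policy for a few episodes and average the undiscounted return
    returns = []
    for _ in range(num_eps):
        state = env.normalise_state(env.reset())
        ep_return, step, terminal = 0.0, 0, False
        while not terminal and step < max_ep_length:
            action = policy_net(np.expand_dims(state.astype(np.float32), 0))[0]
            state, reward, terminal = env.step(action)
            state = env.normalise_state(state)
            ep_return += reward
            step += 1
        returns.append(ep_return)
    return float(np.mean(returns))

def scalar_summary(writer, name, value, step):
    # Write a single scalar to the tf.summary writer created in Learner.__init__
    with writer.as_default():
        tf.summary.scalar(name, value, step=step)
        writer.flush()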