def __init__(self, sess, env, seed, n_agent=0):
    print("Initialising agent %02d... \n" % n_agent)

    self.sess = sess
    self.n_agent = n_agent

    # Create environment
    if env == 'Pendulum-v0':
        self.env_wrapper = PendulumWrapper(env)
    elif env == 'LunarLanderContinuous-v2':
        self.env_wrapper = LunarLanderContinuousWrapper(env)
    elif env == 'Ant-v2':
        self.env_wrapper = AntWrapper(env)
    elif env == 'BipedalWalker-v2':
        self.env_wrapper = BipedalWalkerWrapper(env)
    elif env == 'HalfCheetah-v2':
        self.env_wrapper = CheetahWrapper(env)
    elif env == 'Reacher-v2':
        self.env_wrapper = ReacherWrapper(env)
    elif env == 'Hopper-v2':
        self.env_wrapper = HopperWrapper(env)
    elif env == 'Swimmer-v2':
        self.env_wrapper = SwimmerWrapper(env)
    elif env == 'Walker2d-v2':
        self.env_wrapper = Walker2dWrapper(env)
    elif env == 'InvertedPendulum-v2':
        self.env_wrapper = InvertedPendulumWrapper(env)
    elif env == 'Humanoid-v2':
        # NOTE: there is no dedicated Humanoid wrapper, so this falls back to
        # InvertedPendulumWrapper; a proper HumanoidWrapper would belong in utils.env_wrapper.py.
        self.env_wrapper = InvertedPendulumWrapper(env)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. '
                        'Please choose an environment with an environment wrapper defined, '
                        'or create a wrapper for this environment in utils.env_wrapper.py')

    # Seed each agent differently so distributed agents do not follow identical trajectories
    self.env_wrapper.set_random_seed(seed * (n_agent + 1))
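# The same if/elif dispatch over environment names recurs in the agent, learner,
# params, and play code in this repo. A dict-based factory is one way to collapse it.
# The sketch below is only an illustration: make_env_wrapper and _WRAPPERS are
# hypothetical names (not part of the repo), and it assumes the wrapper classes live
# in utils.env_wrapper.py as stated by the exception message above.
from functools import partial

from utils.env_wrapper import (PendulumWrapper, LunarLanderContinuousWrapper,
                               BipedalWalkerWrapper, AntWrapper, CheetahWrapper,
                               ReacherWrapper, HopperWrapper, SwimmerWrapper,
                               Walker2dWrapper, InvertedPendulumWrapper)

_WRAPPERS = {
    'Pendulum-v0': PendulumWrapper,
    'LunarLanderContinuous-v2': LunarLanderContinuousWrapper,
    'BipedalWalker-v2': BipedalWalkerWrapper,
    'BipedalWalkerHardcore-v2': partial(BipedalWalkerWrapper, hardcore=True),
    'Ant-v2': AntWrapper,
    'HalfCheetah-v2': CheetahWrapper,
    'Reacher-v2': ReacherWrapper,
    'Hopper-v2': HopperWrapper,
    'Swimmer-v2': SwimmerWrapper,
    'Walker2d-v2': Walker2dWrapper,
    'InvertedPendulum-v2': InvertedPendulumWrapper,
}

def make_env_wrapper(env_name, *args, **kwargs):
    # Look up the wrapper class for the given Gym environment name and construct it.
    try:
        wrapper_cls = _WRAPPERS[env_name]
    except KeyError:
        raise Exception('Chosen environment does not have an environment wrapper defined. '
                        'Please create a wrapper for it in utils.env_wrapper.py') from None
    return wrapper_cls(*args, **kwargs)

# Usage in the agent above would then reduce to something like:
#   self.env_wrapper = make_env_wrapper(env, env)   # the agent passes the env name through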
class train_params:

    # Environment parameters
    ENV = 'Pendulum-v0'             # Environment to use (must have a low-dimensional state space (i.e. not images) and a continuous action space)
    RENDER = False                  # Whether or not to display the environment on the screen during training
    RANDOM_SEED = 99999999          # Random seed for reproducibility
    NUM_AGENTS = 4                  # Number of distributed agents to run simultaneously

    # Create a dummy environment to read off all environment parameters
    if ENV == 'Pendulum-v0':
        dummy_env = PendulumWrapper()
    elif ENV == 'LunarLanderContinuous-v2':
        dummy_env = LunarLanderContinuousWrapper()
    elif ENV == 'BipedalWalker-v2':
        dummy_env = BipedalWalkerWrapper()
    elif ENV == 'BipedalWalkerHardcore-v2':
        dummy_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. '
                        'Please choose an environment with an environment wrapper defined, '
                        'or create a wrapper for this environment in utils.env_wrapper.py')

    STATE_DIMS = dummy_env.get_state_dims()
    STATE_BOUND_LOW, STATE_BOUND_HIGH = dummy_env.get_state_bounds()
    ACTION_DIMS = dummy_env.get_action_dims()
    ACTION_BOUND_LOW, ACTION_BOUND_HIGH = dummy_env.get_action_bounds()
    V_MIN = dummy_env.v_min
    V_MAX = dummy_env.v_max
    del dummy_env

    # Training parameters
    BATCH_SIZE = 256
    NUM_STEPS_TRAIN = 1000000       # Number of steps to train for
    MAX_EP_LENGTH = 10000           # Maximum number of steps per episode
    REPLAY_MEM_SIZE = 1000000       # Soft maximum capacity of replay memory
    REPLAY_MEM_REMOVE_STEP = 200    # Check replay memory every REPLAY_MEM_REMOVE_STEP training steps and remove samples over REPLAY_MEM_SIZE capacity
    PRIORITY_ALPHA = 0.6            # Controls the randomness vs prioritisation of the prioritised sampling (0.0 = uniform sampling, 1.0 = greedy prioritisation)
    PRIORITY_BETA_START = 0.4       # Starting value of beta - controls to what degree IS weights influence the gradient updates to correct for the bias introduced by priority sampling (0 - no correction, 1 - full correction)
    PRIORITY_BETA_END = 1.0         # Beta is linearly annealed from its start value to this value throughout training
    PRIORITY_EPSILON = 0.00001      # Small value added to updated priorities to ensure no sample has a probability of 0 of being chosen
    NOISE_SCALE = 0.3               # Scaling to apply to Gaussian noise
    NOISE_DECAY = 0.9999            # Decay noise throughout training by scaling by NOISE_DECAY**training_step
    DISCOUNT_RATE = 0.99            # Discount rate (gamma) for future rewards
    N_STEP_RETURNS = 5              # Number of future steps to collect experiences for N-step returns
    UPDATE_AGENT_EP = 10            # Agent gets latest parameters from learner every UPDATE_AGENT_EP episodes

    # Network parameters
    CRITIC_LEARNING_RATE = 0.0001
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_L2_LAMBDA = 0.0          # Coefficient for L2 weight regularisation in the critic - if 0, no regularisation is performed
    DENSE1_SIZE = 400               # Size of first hidden layer in networks
    DENSE2_SIZE = 300               # Size of second hidden layer in networks
    FINAL_LAYER_INIT = 0.003        # Initialise networks' final layer weights in the range +/-FINAL_LAYER_INIT
    NUM_ATOMS = 51                  # Number of atoms in the output layer of the distributional critic
    TAU = 0.001                     # Parameter for soft target network updates
    USE_BATCH_NORM = False          # Whether or not to use batch normalisation in the networks

    # Files/directories
    SAVE_CKPT_STEP = 10000          # Save a checkpoint every SAVE_CKPT_STEP training steps
    CKPT_DIR = './ckpts/' + ENV     # Directory for saving/loading checkpoints
    CKPT_FILE = None                # Checkpoint file to load and resume training from (if None, train from scratch)
    LOG_DIR = './logs/train/' + ENV # Directory for saving Tensorboard logs (if None, do not save logs)
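# As a concrete reading of the comments above, a minimal sketch of how
# NOISE_SCALE/NOISE_DECAY and PRIORITY_BETA_START/END would typically be turned into
# per-training-step values. The helper names exploration_noise_scale and
# priority_beta are illustrative, not the repo's API.
def exploration_noise_scale(training_step):
    # Gaussian exploration noise decays geometrically: NOISE_SCALE * NOISE_DECAY**step
    return train_params.NOISE_SCALE * (train_params.NOISE_DECAY ** training_step)

def priority_beta(training_step):
    # Beta is annealed linearly from PRIORITY_BETA_START to PRIORITY_BETA_END over
    # the NUM_STEPS_TRAIN training steps, then held at its end value.
    fraction = min(float(training_step) / train_params.NUM_STEPS_TRAIN, 1.0)
    return (train_params.PRIORITY_BETA_START
            + fraction * (train_params.PRIORITY_BETA_END - train_params.PRIORITY_BETA_START))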
def __init__(self, sess, env, seed, n_agent=0):
    print("Initialising agent %02d... \n" % n_agent)

    self.sess = sess
    self.n_agent = n_agent

    # Create environment
    if env == 'Pendulum-v0':
        self.env_wrapper = PendulumWrapper(env)
    elif env == 'LunarLanderContinuous-v2':
        self.env_wrapper = LunarLanderContinuousWrapper(env)
    elif env == 'BipedalWalker-v2':
        self.env_wrapper = BipedalWalkerWrapper(env)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. '
                        'Please choose an environment with an environment wrapper defined, '
                        'or create a wrapper for this environment in utils.env_wrapper.py')

    self.env_wrapper.set_random_seed(seed * (n_agent + 1))
def __init__(self, PER_memory, run_agent_event, stop_agent_event):
    self.PER_memory = PER_memory
    self.run_agent_event = run_agent_event
    self.stop_agent_event = stop_agent_event

    if train_params.ENV == 'Pendulum-v0':
        self.eval_env = PendulumWrapper()
    elif train_params.ENV == 'LunarLanderContinuous-v2':
        self.eval_env = LunarLanderContinuousWrapper()
    elif train_params.ENV == 'BipedalWalker-v2':
        self.eval_env = BipedalWalkerWrapper()
    elif train_params.ENV == 'BipedalWalkerHardcore-v2':
        self.eval_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. '
                        'Please choose an environment with an environment wrapper defined, '
                        'or create a wrapper for this environment in utils.env_wrapper.py')

    self.summary_writer = tf.summary.create_file_writer(train_params.LOG_DIR + '/eval/')
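# For reference, a writer created with tf.summary.create_file_writer (as above) is
# written to via TF2's summary API. A minimal sketch - log_eval_return, the tag
# 'eval/average_return', and the argument names are illustrative, not the repo's API:
import tensorflow as tf

def log_eval_return(summary_writer, average_return, train_step):
    # Record the average evaluation return under the learner's eval log directory.
    with summary_writer.as_default():
        tf.summary.scalar('eval/average_return', average_return, step=train_step)
    summary_writer.flush()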
import os

class play_params:

    ALGO = 'D4PG_2'
    ENV = 'BipedalWalker-v2'
    CKPT = '99000'

    # Create a dummy environment to read off all environment parameters
    if ENV == 'Pendulum-v0':
        dummy_env = PendulumWrapper()
    elif ENV == 'LunarLanderContinuous-v2':
        dummy_env = LunarLanderContinuousWrapper()
    elif ENV == 'BipedalWalker-v2':
        dummy_env = BipedalWalkerWrapper()
    elif ENV == 'BipedalWalkerHardcore-v2':
        dummy_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. '
                        'Please choose an environment with an environment wrapper defined, '
                        'or create a wrapper for this environment in utils.env_wrapper.py')

    STATE_DIMS = dummy_env.get_state_dims()
    STATE_BOUND_LOW, STATE_BOUND_HIGH = dummy_env.get_state_bounds()
    ACTION_DIMS = dummy_env.get_action_dims()
    ACTION_BOUND_LOW, ACTION_BOUND_HIGH = dummy_env.get_action_bounds()
    V_MIN = dummy_env.v_min
    V_MAX = dummy_env.v_max
    del dummy_env

    # Directories for the saved networks and recorded video frames
    ACTOR_MODEL_DIR = os.getcwd() + '/data/' + ENV + '/' + ALGO + '/eval/actor_' + CKPT
    CRITIC_MODEL_DIR = os.getcwd() + '/data/' + ENV + '/' + ALGO + '/eval/critic_' + CKPT
    RECORD_DIR = os.getcwd() + '/data/' + ENV + '/' + ALGO + '/eval/video_' + CKPT

    # Play parameters
    NUM_EPS_PLAY = 1                # Number of episodes to play for
    MAX_EP_LENGTH = 10000           # Maximum number of steps per episode
import os

import cv2
import imageio
import numpy as np
from tqdm import tqdm

def play():
    # Create the environment to play in
    if play_params.ENV == 'Pendulum-v0':
        play_env = PendulumWrapper()
    elif play_params.ENV == 'LunarLanderContinuous-v2':
        play_env = LunarLanderContinuousWrapper()
    elif play_params.ENV == 'BipedalWalker-v2':
        play_env = BipedalWalkerWrapper()
    elif play_params.ENV == 'BipedalWalkerHardcore-v2':
        play_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. '
                        'Please choose an environment with an environment wrapper defined, '
                        'or create a wrapper for this environment in utils.env_wrapper.py')

    # Build the actor and critic networks and load the trained weights
    actor_net = Actor(play_params.STATE_DIMS, play_params.ACTION_DIMS,
                      play_params.ACTION_BOUND_LOW, play_params.ACTION_BOUND_HIGH,
                      train_params.DENSE1_SIZE, train_params.DENSE2_SIZE,
                      train_params.FINAL_LAYER_INIT, name='actor_play')
    critic_net = Critic(play_params.STATE_DIMS, play_params.ACTION_DIMS,
                        train_params.DENSE1_SIZE, train_params.DENSE2_SIZE,
                        train_params.FINAL_LAYER_INIT, train_params.NUM_ATOMS,
                        train_params.V_MIN, train_params.V_MAX, name='critic_play')

    actor_net.load_weights(play_params.ACTOR_MODEL_DIR)
    critic_net.load_weights(play_params.CRITIC_MODEL_DIR)

    # Create the directory for recorded frames if it does not exist yet
    if not os.path.exists(play_params.RECORD_DIR):
        os.makedirs(play_params.RECORD_DIR)

    for ep in tqdm(range(1, play_params.NUM_EPS_PLAY + 1), desc='playing'):
        state = play_env.reset()
        state = play_env.normalise_state(state)
        step = 0
        ep_done = False

        while not ep_done:
            frame = play_env.render()
            if play_params.RECORD_DIR is not None:
                filepath = play_params.RECORD_DIR + '/Ep%03d_Step%04d.jpg' % (ep, step)
                cv2.imwrite(filepath, frame)

            action = actor_net(np.expand_dims(state.astype(np.float32), 0))[0]
            state, _, terminal = play_env.step(action)
            state = play_env.normalise_state(state)

            step += 1

            # Episode can finish either by reaching a terminal state or the max number of steps
            if terminal or step == play_params.MAX_EP_LENGTH:
                ep_done = True

    # Convert saved frames to gif
    if play_params.RECORD_DIR is not None:
        images = []
        for file in tqdm(sorted(os.listdir(play_params.RECORD_DIR)), desc='converting to gif'):
            # Load image
            filename = play_params.RECORD_DIR + '/' + file
            im = cv2.imread(filename)
            images.append(im)
            # Delete static image once loaded
            os.remove(filename)

        # Save as gif
        print("Saving to ", play_params.RECORD_DIR)
        imageio.mimsave(play_params.RECORD_DIR + '/%s.gif' % play_params.ENV, images[:-1], duration=0.01)

    play_env.close()
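# A minimal entry point, assuming play() is defined in a module that is run directly
# (this guard is illustrative and not shown in the excerpt above):
if __name__ == '__main__':
    play()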