def __init__(self, env, actor_model, critic_model, gamma=0.99, tau=1e-3,
             critic_lr=1e-3, actor_lr=1e-4, critic_decay=0.):
    # Changed this to use generic env instead of Task
    super().__init__(env)
    self.state_size = env.observation_space.shape[0]
    self.action_size = env.action_space.shape[0]
    self.action_low = env.action_space.low
    self.action_high = env.action_space.high

    # Algorithm parameters
    self.gamma = gamma  # discount factor
    self.tau = tau      # for soft update of target parameters
    self.critic_lr = critic_lr
    self.actor_lr = actor_lr

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high, self.actor_lr)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high, self.actor_lr)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr)
    self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
class DDPGAgent(Agent):
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, actor_model, tgt_actor_model, critic_model, tgt_critic_model,
                 action_limits, actor_lr=1e-4, critic_lr=1e-3, critic_decay=1e-2,
                 tau=1e-3, gamma=0.99, process=None, rb_size=1e6, minibatch_size=64,
                 warmup_episodes=0, episodes_trained=0, train_scores=None,
                 test_scores=None, best_train_score=-np.inf):
        super().__init__(warmup_episodes, episodes_trained, train_scores,
                         test_scores, best_train_score)
        self.actor = Actor(actor_model, critic_model, lr=actor_lr)
        self.tgt_actor = Actor(tgt_actor_model, tgt_critic_model, lr=actor_lr)
        self.tgt_actor.set_weights(self.actor.get_weights())
        self.critic = Critic(critic_model, lr=critic_lr, decay=critic_decay)
        self.tgt_critic = Critic(tgt_critic_model, lr=critic_lr, decay=critic_decay)
        self.tgt_critic.set_weights(self.critic.get_weights())
        self.action_limits = action_limits
        self.minibatch_size = minibatch_size
        self.buffer = ReplayBuffer(int(rb_size), self.minibatch_size)
        self.tau = tau
        self.gamma = gamma
        self.state_space = K.int_shape(critic_model.inputs[0])[1]
        self.action_space = K.int_shape(critic_model.inputs[1])[1]
        self.learning_phase = 1
        if process is None:
            self.process = OUNoise(size=self.action_space, theta=0.15, mu=0, sigma=0.2)
        else:
            self.process = process

    def sense(self, s, a, r, s_new, done):
        s = np.reshape(s, [-1, self.state_space])
        s_new = np.reshape(s_new, [-1, self.state_space])
        self.buffer.add(s, a, r, s_new, done)

    def act(self, s):
        s = np.reshape(s, [-1, self.state_space])
        a = self.tgt_actor(s)
        # Cache.
        self.last_state = np.copy(s)
        self.last_action = np.copy(a)
        if self.learning_phase:
            a += self.process.sample()
            a = np.clip(a, self.action_limits[0], self.action_limits[1])
        self.last_action_noisy = np.copy(a)
        return a

    def new_episode(self):
        self.process.reset()

    def train_step(self):
        if len(self.buffer.memory) < self.minibatch_size:
            return

        minibatch = self.buffer.sample(self.minibatch_size)
        states = np.zeros([len(minibatch), self.state_space])
        states_new = np.zeros([len(minibatch), self.state_space])
        actions = np.zeros([len(minibatch), self.action_space])
        r = np.zeros([len(minibatch), 1])
        dones = np.zeros([len(minibatch), 1])
        for i in range(len(minibatch)):
            states[i], actions[i], r[i], states_new[i], dones[i] = minibatch[i]

        # Estimate Q_values
        critic_out = self.critic(states_new, self.actor(states_new))
        tgt_critic_out = self.tgt_critic(states_new, self.tgt_actor(states_new))

        # Q-value targets using tgt_critic; terminal states get no bootstrap term
        ys = r + self.gamma * tgt_critic_out * (1 - dones)

        # Train local critic and actor
        self.critic.step(states, actions, ys)
        self.actor.step(states)

        # Soft weight updates for target critic and actor
        critic_weights = self.critic.get_weights()
        tgt_critic_weights = self.tgt_critic.get_weights()
        for i in range(len(critic_weights)):
            tgt_critic_weights[i] = (1 - self.tau) * tgt_critic_weights[i] + \
                self.tau * critic_weights[i]
        self.tgt_critic.set_weights(tgt_critic_weights)

        actor_weights = self.actor.get_weights()
        tgt_actor_weights = self.tgt_actor.get_weights()
        for i in range(len(actor_weights)):
            tgt_actor_weights[i] = (1 - self.tau) * tgt_actor_weights[i] + \
                self.tau * actor_weights[i]
        self.tgt_actor.set_weights(tgt_actor_weights)
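# Usage sketch: a minimal example of driving the DDPGAgent above with a Gym-style
# environment. Everything here is illustrative and assumed, not part of the class:
# the env API (reset() -> state, step(a) -> (state, reward, done, info)) and the
# names run_episode / max_steps.
import numpy as np

def run_episode(agent, env, max_steps=1000, train=True):
    agent.new_episode()                    # reset the OU noise process
    s = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        a = agent.act(s)                   # noisy action while agent.learning_phase == 1
        s_new, r, done, _ = env.step(np.ravel(a))
        agent.sense(s, a, r, s_new, done)  # store the transition in the replay buffer
        if train:
            agent.train_step()             # one minibatch update + soft target update
        total_reward += r
        s = s_new
        if done:
            break
    return total_reward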
class DDPG(Agent):
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, env):
        # Changed this to use generic env instead of Task
        super().__init__(env)
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 1e-2    # for soft update of target parameters

        # Critic params
        self.critic_lr = 1e-3
        self.critic_decay = 1e-2

        # Actor params
        self.actor_lr = 1e-4
        self.actor_decay = 0

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low,
                                 self.action_high, self.actor_lr, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low,
                                  self.action_high, self.actor_lr, self.actor_decay)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_lr, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_lr, self.critic_decay)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def reset_episode(self):
        self.noise.reset()
        state = self.env.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done, training=True):
        # Since DDPG is an off-policy learner, add a training flag
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if training and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
            self.steps_trained += 1

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, training=True):  # training flag decides whether to explore
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if training:
            # add some noise for exploration
            return list(action + self.noise.sample())
        else:
            return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def save_model(self, filename):
        # Keras models cannot be pickled directly, so detach them, pickle the rest
        # of the agent, and save the models separately.
        al = self.actor_local
        at = self.actor_target
        cl = self.critic_local
        ct = self.critic_target
        self.actor_local = None
        self.actor_target = None
        self.critic_local = None
        self.critic_target = None
        with open(filename + '.ddpg_agent', 'wb') as f:  # binary mode required by pickle
            pickle.dump(self, f)
        al.save(filename + '.actor_local')
        at.save(filename + '.actor_target')
        cl.save(filename + '.critic_local')
        ct.save(filename + '.critic_target')
        self.actor_local = al
        self.actor_target = at
        self.critic_local = cl
        self.critic_target = ct

    @classmethod
    def load_model(cls, filename):
        with open(filename + '.ddpg_agent', 'rb') as f:  # binary mode required by pickle
            m = pickle.load(f)
        m.actor_local = load_model(filename + '.actor_local')
        m.actor_target = load_model(filename + '.actor_target')
        m.critic_local = load_model(filename + '.critic_local')
        m.critic_target = load_model(filename + '.critic_target')
        return m
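# Supporting sketch: the agents in this file construct OUNoise(size, mu, theta, sigma)
# with reset() and sample() methods, but the class itself is not shown. The version
# below is a minimal Ornstein-Uhlenbeck process compatible with those calls; it is an
# assumption, and the original implementation may differ in detail.
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean (typically at the start of each episode)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Drift toward the mean plus Gaussian diffusion, then return the new state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state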
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, env_reset, state_size, action_size, action_low, action_high):
        """Params:
            env_reset: callback function to reset environment at end of episode
            state_size: dimension of state space
            action_size: dimension of action space
            action_low: float - minimum action value
            action_high: float - maximum action value
        """
        self.training_steps = 0  # number of training steps run so far
        self.env_reset = env_reset
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high

        # Algorithm parameters
        self.gamma = 0.99         # discount factor
        self.tau = 1e-3           # for soft update of target parameters
        self.critic_decay = 1e-2  # L2 weight decay for critic (regularization)
        self.critic_lr = 1e-3     # learning rate for critic
        self.critic_alpha = 1e-2  # Leaky ReLU alpha for critic
        self.actor_lr = 1e-4      # learning rate for actor
        self.actor_alpha = 1e-2   # Leaky ReLU alpha for actor

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low,
                                 self.action_high, self.actor_lr, self.actor_alpha)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low,
                                  self.action_high, self.actor_lr, self.actor_alpha)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr,
                                   self.critic_decay, self.critic_alpha)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr,
                                    self.critic_decay, self.critic_alpha)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = int(1e6)
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def reset_episode(self):
        self.noise.reset()
        state = self.env_reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done, training=True):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if training and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
            self.training_steps += 1

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, training=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if training:
            # add some noise for exploration
            return list(action + self.noise.sample())
        else:
            return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
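# Supporting sketch: the ReplayBuffer used by these agents is also not shown. The
# version below is a minimal, assumed implementation that matches how it is called:
# ReplayBuffer(buffer_size, batch_size), add(...), len(...), and sample() returning
# experience tuples with .state/.action/.reward/.next_state/.done fields (which also
# unpack as 5-tuples, as the train_step() loop further up expects).
import random
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are discarded first
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)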
class DDPGAgent():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
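# Usage sketch: a minimal training loop for the task-based DDPGAgent above. It assumes
# a Task object whose step(action) returns (next_state, reward, done); that signature
# and the name num_episodes are assumptions for illustration only.
def train(agent, num_episodes=1000):
    scores = []
    for _ in range(num_episodes):
        state = agent.reset_episode()  # resets task and OU noise, returns initial state
        score, done = 0.0, False
        while not done:
            action = agent.act(state)  # noisy action from the local actor
            next_state, reward, done = agent.task.step(action)
            agent.step(action, reward, next_state, done)  # store, learn, roll over state
            state = next_state
            score += reward
        scores.append(score)
    return scores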