def __init__(self, config):
    """Initialize an Agent object.

    Params
    ======
        config: configuration object providing state_size, action_size,
            seed, lr_actor and lr_critic
    """
    self.config = config
    self.seed = self.config.seed

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(self.config.state_size, self.config.action_size, self.seed).to(device)
    self.actor_target = Actor(self.config.state_size, self.config.action_size, self.seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.config.lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(self.config.state_size, self.config.action_size, self.seed).to(device)
    self.critic_target = Critic(self.config.state_size, self.config.action_size, self.seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.config.lr_critic)

    # ----------------------- initialize target networks ----------------------- #
    self.soft_update(self.critic_local, self.critic_target, 1.0)
    self.soft_update(self.actor_local, self.actor_target, 1.0)

    # Noise process
    self.noise = OUNoise(self.config.action_size, self.seed)
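# The soft_update helper called above with tau=1.0 (a hard copy of local weights into
# the target network) is not shown in this snippet. Below is a minimal sketch of the
# usual DDPG-style method it presumably corresponds to; the signature is an assumption.
def soft_update(self, local_model, target_model, tau):
    """theta_target <- tau * theta_local + (1 - tau) * theta_target"""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)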
class DDPGAgent():
    """ Agent that interacts with and learns from the environment. """

    def __init__(self, state_size, action_size, agent_num, random_seed):
        """ Initialize an Agent object.
        :param state_size (int): dimension of each state
        :param action_size (int): dimension of each action
        :param agent_num (int): number of agents sharing the critic
        :param random_seed (int): random seed
        """
        # Actor Networks
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size, agent_num, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, agent_num, random_seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale=0.1)

    def act(self, obs, noise=0.0):
        """Return an action from the local policy, plus scaled OU exploration noise."""
        obs = obs.to(device)
        action = self.actor_local(obs) + noise * self.noise.sample()
        return action

    def target_act(self, obs, noise=0.0):
        """Return an action from the target policy, plus scaled OU exploration noise."""
        obs = obs.to(device)
        action = self.actor_target(obs) + noise * self.noise.sample()
        return action
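# Minimal usage sketch for the class above. The concrete sizes are placeholders, and
# Actor/Critic/device/LR_* come from the surrounding project, so this only illustrates
# the calling convention; actions are clamped on the assumption of a [-1, 1] range.
import torch

agent = DDPGAgent(state_size=24, action_size=2, agent_num=2, random_seed=0)
obs = torch.randn(1, 24)                                     # placeholder observation batch
with torch.no_grad():
    noisy_action = agent.act(obs, noise=1.0).clamp(-1, 1)    # exploration action
    greedy_action = agent.target_act(obs, noise=0.0)         # deterministic target action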
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99   # discount factor
    self.tau = 0.001    # for soft update of target parameters
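# ReplayBuffer is referenced above but not defined in these snippets. A minimal sketch,
# assuming the common deque/namedtuple pattern implied by how experiences are consumed
# later in this section (e.state, e.action, e.reward, e.next_state, e.done):
import random
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and returns random batches."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)   # oldest experiences are dropped when full
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store a single experience tuple."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random batch of experiences."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)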
""" Visualization of OU noise """ from OUnoise import OUNoise import tensorflow as tf import numpy as np import matplotlib.pyplot as plt x = 250 noise = OUNoise(size=(1,), seed=0, mu=0.1, theta=0.7, sigma=0.7) noise_list_ou = [] noise_list_tf = [] for i in range(x): noise_list_ou.append(noise.sample().clip(-0.5, 0.5)) noise_list_tf.append(tf.random.normal(shape=(1,), stddev=0.3, mean=0.0)) noise_list_ou = np.asarray(noise_list_ou).clip(-0.5, 0.5) noise_list_tf = np.asarray(noise_list_tf).clip(-0.5, 0.5) plt.style.use('seaborn') f, (ax1, ax2) = plt.subplots(1, 2) ax1.plot(noise_list_tf) ax2.plot(noise_list_ou) plt.show()
# Parameters:
# =============================================================== #
agent = Agent(
    lr_actor=0.0001,        # Learning rate of actor
    lr_critic=0.0003,       # Learning rate of critic
    num_actions=2,          # Number of actions the agent can perform
    num_states=8,           # Number of state inputs
    gamma=0.99,             # Gamma coefficient / discount factor
    tau=0.001,              # Target network update parameter
    delay_frequency=2,      # Delay rate of actor update
    batch_size=64)          # Batch size for networks / buffer

noise_ = OUNoise(
    size=(1, 2),            # Size of noise output - matches action
    seed=2,                 # Seed for noise
    mu=0,                   # Parameters of OU-noise
    theta=0.15,
    sigma=0.2)

buffer = Buffer(buffer_size=1000000,
                batch_size=agent.batch_size,
                num_action=agent.num_actions,
                num_states=agent.num_states)

env = gym.make('LunarLanderContinuous-v2')
env.seed(88)

num_episodes = 2500         # Number of episodes the agent runs
tf.random.set_seed(88)      # Global TensorFlow seed (network init, tf ops)
start_timestep = 100        # Number of time steps the agent behaves randomly
total_timestep = 0          # Total time step counter
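# A hedged sketch of how these objects might be wired together. agent.act() and
# buffer.add() are assumed method names for illustration only - the Agent and Buffer
# classes themselves are not shown in this snippet.
import numpy as np

for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        if total_timestep < start_timestep:
            # Warm-up phase: act randomly to fill the replay buffer
            action = env.action_space.sample()
        else:
            # Policy action plus OU exploration noise, clipped to the action bounds
            action = np.clip(agent.act(state) + noise_.sample(), -1.0, 1.0).reshape(-1)
        next_state, reward, done, _ = env.step(action)
        buffer.add(state, action, reward, next_state, done)
        state = next_state
        total_timestep += 1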
class DDPG_Mountain_Car():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task

        # For mountain car task
        self.state_size = 2
        self.action_size = 1
        self.action_low = task.action_space.low
        self.action_high = task.action_space.high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99   # discount factor
        self.tau = 0.01     # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.best_w_actor = None
        self.best_w_critic = None
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Keep track of score, best score, and best weights
        self.score += reward
        if self.score > self.best_score:
            self.best_score = self.score
            # Keep a copy of the best weights (save_weights returns None,
            # so the weights themselves are captured via get_weights) and persist them
            self.best_w_actor = self.actor_local.model.get_weights()
            self.best_w_critic = self.critic_local.model.get_weights()
            self.actor_local.model.save_weights('actor_weights.h5')
            self.critic_local.model.save_weights('critic_weights.h5')

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
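# Example driver loop for the class above. It only calls methods defined in
# DDPG_Mountain_Car (reset_episode, act, step), but still depends on the project's
# Actor/Critic/OUNoise/ReplayBuffer modules; the environment name and episode count
# are illustrative assumptions.
import gym

env = gym.make('MountainCarContinuous-v0')
agent = DDPG_Mountain_Car(env)

for i_episode in range(1000):
    state = agent.reset_episode()           # resets noise, environment and score
    done = False
    while not done:
        action = agent.act(state)           # policy action + OU exploration noise
        next_state, reward, done, _ = env.step(action)
        agent.step(action, reward, next_state, done)   # store experience and learn
        state = next_state
    print("Episode {:4d} | score: {:8.2f} | best: {:8.2f}".format(
        i_episode, agent.score, agent.best_score))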
""" Visualization of OU noise """ from OUnoise import OUNoise import numpy as np import matplotlib.pyplot as plt x = 250 noise = OUNoise(size=(1, 4), seed=0, mu=0, theta=0.3, sigma=0.4) noise_list = [] for i in range(x): noise_list.append(noise.sample()) noise_list = np.reshape(noise_list, newshape=(x, 4)) # noise_list = noise_list.clip(-1, 1) # print(np.shape(noise_list)) # print(noise_list[:, 1]) plt.style.use('seaborn') plt.plot(noise_list[:, 1]) # plt.plot(noise_list[:, 2]) plt.show()