import numpy as np
from numpy.random import choice


class Agent:
    def __init__(self, state_size, batch_size, is_eval=False):
        self.state_size = state_size
        self.action_size = 3
        self.buffer_size = 1000000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.inventory = []
        self.is_eval = is_eval
        self.gamma = 0.99
        self.tau = 0.001
        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    def act(self, state):
        options = self.actor_local.model.predict(state)
        self.last_state = state
        if not self.is_eval:
            return choice(range(3), p=options[0])
        return np.argmax(options[0])

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
        self.last_state = next_state

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.state_size)
        actions = np.vstack([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.state_size)

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # soft-update both target networks toward their local counterparts
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
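The ReplayBuffer this agent depends on is not shown. Below is one plausible implementation, a minimal sketch matching only the interface used here (add, sample, len, and named-tuple experiences); the deque storage and namedtuple fields are assumptions, not the original class.

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest transitions drop out when full
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        # uniform random batch; the argument is optional because some agents in
        # this section call sample() and others call sample(batch_size)
        return random.sample(self.memory, batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)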
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.3 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters self.score = 0 self.best_score = -np.inf self.noise_scale = 0.1 def reset_episode(self): self.noise.reset() self.total_reward = 0.0 self.count = 0 state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) self.total_reward += reward self.count += 1 # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) self.score = self.total_reward / float( self.count) if self.count else 0.0 if self.score > self.best_score: self.best_score = self.score self.noise_scale = max(0.5 * self.noise_scale, 0.01) else: self.noise_scale = min(2.0 * self.noise_scale, 3.2) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
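Several of these agents construct OUNoise(size, mu, theta, sigma) and call reset() and sample() on it, but the class itself is missing. A minimal sketch follows: the constructor signature is taken from the calls above, while the internals are a standard discretized Ornstein-Uhlenbeck process and therefore an assumption.

import numpy as np

class OUNoise:
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta    # pull strength toward the mean mu
        self.sigma = sigma    # scale of the random kicks
        self.reset()

    def reset(self):
        # restart the process at its long-run mean
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated noise,
        # which suits exploration in physical control tasks
        x = self.state
        dx = self.theta * (self.mu - x) \
            + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state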
class AgentDDPG():
    def __init__(self, env):
        """
        :param env: (class instance) environment defining the goal and reward
        """
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        self.score = 0.0
        self.best = 0.0

        # Instances of the policy function (actor) and the value function (critic),
        # each with a local and a target network
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)

        # Save the actor model architecture for future use
        actor_local_model_yaml = self.actor_local.model.to_yaml()
        with open("actor_local_model.yaml", "w") as yaml_file:
            yaml_file.write(actor_local_model_yaml)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic local and target networks
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize the target models with the local models' weights
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # soft update factor for the target parameters

    # The agent can reset the episode
    def reset_episode(self):
        # Total reward and step count go back to 0
        self.total_reward = 0.0
        self.count = 0
        # Reset the noise process
        self.noise.reset()
        # Get a new state from the environment and protect it
        # by storing it as the last state
        state = self.env.reset()
        self.last_state = state
        # Return the state obtained from the environment
        return state

    # The agent interacts with the environment
    def step(self, action, reward, next_state, done):
        # Add this time step's reward to the episode total
        self.total_reward += reward
        # Increase the count of rewards received in the episode
        self.count += 1
        # Store the transition starting from the previous state in the replay buffer
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Check whether there is enough stored experience to produce a batch,
        # and learn from it
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            # Train the networks using the experiences
            self.learn(experiences)
        # Roll over the last state
        self.last_state = next_state

    # The actor determines what to do based on the policy
    def act(self, state):
        # Reshape the state to fit the Keras model input
        state = np.reshape(state, newshape=[-1, self.state_size])
        # Pass the state to the local actor model to get the action
        # recommended by the policy in that state
        action = self.actor_local.model.predict(state)[0]
        # Because we are exploring, we add some noise to the action vector
        return list(action + self.noise.sample())

    # The learning logic, called whenever the agent takes a step
    def learn(self, experiences):
        """
        Learning means the network parameters are updated using the
        sampled experience batch. The networks learn from stored
        experiences, not from direct interaction with the environment.
        """
        # Reshape the experience tuples into separate arrays of states, actions,
        # rewards, next_states and dones; every member of the tuple becomes a
        # column vector
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # First we pass a batch of next states to the actor so it tells us which
        # actions to execute; we use the actor target network instead of the
        # actor local network to keep the training targets stable
        actions_next = self.actor_target.model.predict_on_batch(next_states)

        # The target critic evaluates the actions chosen by the target actor and
        # produces the Q(s, a) values of those actions. These (state, action)
        # pairs come from the replay buffer, not from interacting with the
        # environment. Remember the critic's inputs are (states, actions).
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # With Q_targets_next, a vector of action values Q(s, a) for the sampled
        # next_states, we compute the one-step TD target Q(s, a). Terminal states
        # contribute only the reward; non-terminal states add the discounted
        # Q_targets_next value. This lets us train the critic in a supervised
        # learning fashion.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the actor
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update the target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def get_episode_score(self):
        """
        Calculate the episode score and track the best score.
        :return: None
        """
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self, actor_model):
        actor_model.model.save_weights('weights.h5')
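The Keras agents above rely on two hooks that stock Keras models do not expose: critic.get_action_gradients([states, actions, learning_phase]), which returns dQ/da, and actor.train_fn([states, action_gradients, learning_phase]), which nudges the policy up that gradient. Below is a minimal sketch of how these are commonly wired with the Keras 2.x functional API on the TF1 backend; the layer widths, learning rates, and sigmoid range-scaling are illustrative assumptions, not the original models.

from keras import layers, models, optimizers, backend as K

class Actor:
    def __init__(self, state_size, action_size, action_low, action_high):
        states = layers.Input(shape=(state_size,), name='states')
        net = layers.Dense(32, activation='relu')(states)
        raw_actions = layers.Dense(action_size, activation='sigmoid')(net)
        # rescale the [0, 1] sigmoid output to the task's action range
        actions = layers.Lambda(
            lambda x: x * (action_high - action_low) + action_low)(raw_actions)
        self.model = models.Model(inputs=states, outputs=actions)

        # custom training function: minimizing -dQ/da * a ascends the critic's Q
        action_gradients = layers.Input(shape=(action_size,))
        loss = K.mean(-action_gradients * actions)
        updates = optimizers.Adam(lr=1e-4).get_updates(
            params=self.model.trainable_weights, loss=loss)
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[], updates=updates)

class Critic:
    def __init__(self, state_size, action_size):
        states = layers.Input(shape=(state_size,), name='states')
        actions = layers.Input(shape=(action_size,), name='actions')
        net = layers.Concatenate()([states, actions])
        net = layers.Dense(64, activation='relu')(net)
        Q_values = layers.Dense(1, name='q_values')(net)
        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        self.model.compile(optimizer=optimizers.Adam(lr=1e-3), loss='mse')

        # dQ/da hook, called as get_action_gradients([states, actions, phase])
        action_gradients = K.gradients(Q_values, actions)
        self.get_action_gradients = K.function(
            inputs=[*self.model.input, K.learning_phase()],
            outputs=action_gradients)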
import numpy as np
import tensorflow as tf  # TensorFlow 1.x graph-mode API


class Agent(object):
    def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
                 max_size=10000, layer1_size=400, layer2_size=300,
                 batch_size=64):
        n_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.sess = tf.Session()
        self.actor = Actor(alpha, n_actions, 'Actor', input_dims, self.sess,
                           layer1_size, layer2_size, env.action_space.high,
                           self.batch_size, ckpt_dir='tmp/ddpg/actor')
        self.critic = Critic(beta, n_actions, 'Critic', input_dims, self.sess,
                             layer1_size, layer2_size, self.batch_size,
                             ckpt_dir='tmp/ddpg/critic')
        self.target_actor = Actor(alpha, n_actions, 'TargetActor', input_dims,
                                  self.sess, layer1_size, layer2_size,
                                  env.action_space.high, self.batch_size,
                                  ckpt_dir='tmp/ddpg/target_actor')
        self.target_critic = Critic(beta, n_actions, 'TargetCritic',
                                    input_dims, self.sess, layer1_size,
                                    layer2_size, self.batch_size,
                                    ckpt_dir='tmp/ddpg/target_critic')
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # Soft-update ops: target <- tau * local + (1 - tau) * target.
        # tau is baked into the graph at construction time, so these ops
        # always perform the soft update regardless of later changes to self.tau.
        self.update_actor = [
            self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau)
                + tf.multiply(self.target_actor.params[i], 1. - self.tau))
            for i in range(len(self.target_actor.params))]
        self.update_critic = [
            self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau)
                + tf.multiply(self.target_critic.params[i], 1. - self.tau))
            for i in range(len(self.target_critic.params))]
        # Hard-copy ops, used once at startup so the targets start as exact copies
        self.init_actor = [
            self.target_actor.params[i].assign(self.actor.params[i])
            for i in range(len(self.target_actor.params))]
        self.init_critic = [
            self.target_critic.params[i].assign(self.critic.params[i])
            for i in range(len(self.target_critic.params))]

        self.sess.run(tf.global_variables_initializer())
        self.update_target_network_parameters(first=True)

    def update_target_network_parameters(self, first=False):
        if first:
            # Hard copy on the first call so local and target networks start equal
            self.sess.run(self.init_actor)
            self.sess.run(self.init_critic)
        else:
            self.sess.run(self.update_critic)
            self.sess.run(self.update_actor)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        # The state is a pair of arrays; add a batch dimension to each part
        state1 = state[0][np.newaxis, :]
        state2 = state[1][np.newaxis, :]
        state = [state1, state2]
        mu = self.actor.predict(state)  # deterministic policy output
        noise = self.noise()            # OU exploration noise
        mu_prime = mu + noise
        return mu_prime[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        # Target Q-value of the new state, using the target actor's bounded action
        critic_value_ = self.target_critic.predict(
            new_state, self.target_actor.predict(new_state))

        # One-step TD targets y_i; this assumes the buffer stores the terminal
        # flag as (1 - done), so terminal transitions contribute only the reward
        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = np.reshape(target, (self.batch_size, 1))

        # Train the critic on (s_i, a_i, y_i)
        _ = self.critic.train(state, action, target)

        # a = mu(s_i), then the gradients of Q w.r.t. those actions
        a_outs = self.actor.predict(state)
        grads = self.critic.get_action_gradients(state, a_outs)
        self.actor.train(state, grads[0])

        # Soft-update the target networks
        self.update_target_network_parameters()

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
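The session-based agent above calls its noise object directly, as self.noise(), via an OUActionNoise class that is not shown. A minimal sketch matching that callable interface follows; the sigma/theta/dt defaults and the x0 restart option are assumptions drawn from the common Euler-Maruyama parameterization of the OU process.

import numpy as np

class OUActionNoise:
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu, self.sigma, self.theta = mu, sigma, theta
        self.dt, self.x0 = dt, x0
        self.reset()

    def __call__(self):
        # one Euler-Maruyama step of an Ornstein-Uhlenbeck process
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt)
             * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        # restart at x0 if given, otherwise at zero
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)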
class Agent:
    def __init__(self, state_size, batch_size, is_eval=False):
        self.state_size = state_size
        self.action_size = 3  # buy, sell, hold
        # Define the replay memory size
        self.buffer_size = 1000000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.inventory = []
        # Define whether or not training is going on
        self.is_eval = is_eval
        # Discount factor
        self.gamma = 0.99
        # Soft update factor for the actor-critic models
        self.tau = 0.001
        # Instantiate the local and target actor models for soft updates
        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)
        # Critic model mapping state-action pairs to Q-values;
        # instantiate the local and target critic models for soft updates
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        # Set the target model parameters to the local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Returns an action, given a state, using the policy (actor) network
    def act(self, state):
        options = self.actor_local.model.predict(state)  # probabilities of each action
        self.last_state = state
        if not self.is_eval:
            return choice(range(3), p=options[0])
        return np.argmax(options[0])

    # Called at every step of the episode with the transition the agent observed
    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            # Sample a random batch from memory to train on
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
        self.last_state = next_state

    def learn(self, experiences):
        # Extract the states, actions, etc. from all the experience tuples
        states = np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.state_size)
        actions = np.vstack([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.state_size)

        # Reshape all the arrays into 3-D tensors to be fed into the LSTM architecture
        states = np.reshape(states, (states.shape[0], states.shape[1], 1))
        next_states = np.reshape(next_states, (next_states.shape[0], next_states.shape[1], 1))
        rewards = np.reshape(rewards, (rewards.shape[0], rewards.shape[1], 1))
        dones = np.reshape(dones, (dones.shape[0], dones.shape[1], 1))
        actions = np.reshape(actions, (actions.shape[0], actions.shape[1], 1))

        # Predict actions for the next states with the target actor
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        actions_next = np.reshape(actions_next, (actions_next.shape[0], actions_next.shape[1], 1))

        # Predict Q-values for the actor output on the next states
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Temporal-difference target Q-value, which serves as the label for the critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        # Fit the critic model to the TD targets
        Q_targets = np.reshape(Q_targets, (Q_targets.shape[0], Q_targets.shape[1], 1))
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update both target networks toward their local counterparts
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
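A quick shape check for the LSTM reshaping done in learn() above: a batch of flat states (batch, state_size) becomes (batch, state_size, 1), so each feature is treated as one timestep for the recurrent layers. The numbers here are illustrative.

import numpy as np

states = np.zeros((64, 10), dtype=np.float32)  # batch of 64, state_size of 10
states = np.reshape(states, (states.shape[0], states.shape[1], 1))
print(states.shape)  # (64, 10, 1)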
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task, buffer_size, batch_size, gamma, tau, actor_dropout, critic_dropout, exploration_theta, exploration_sigma, actor_lr, critic_lr): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.actor_dropout = actor_dropout self.critic_dropout = critic_dropout self.actor_lr = actor_lr self.critic_lr = critic_lr # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_dropout, self.actor_lr) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_dropout, self.actor_lr) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size, self.critic_dropout, self.critic_lr) self.critic_target = Critic(self.state_size, self.action_size, self.critic_dropout, self.critic_lr) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 5 self.exploration_theta = exploration_theta self.exploration_sigma = exploration_sigma self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = buffer_size self.batch_size = batch_size self.memory = PrioritizedReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = gamma # discount factor self.tau = tau # for soft update of target parameters self.best_score = -np.inf def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.total_reward = 0.0 self.count = 0 return state def step(self, action, reward, next_state, done): # Save experience / reward #self.memory.add(self.last_state, action, reward, next_state, done) #Generate the parameters in order to calculate the TD error next_state_predict = np.reshape(next_state, [-1, self.state_size]) last_state_predict = np.reshape(self.last_state, [-1, self.state_size]) action_predict = np.reshape(action, [-1, self.action_size]) #next_state_action = np.concatenate([next_state, action]) Q_target_next = self.critic_target.model.predict( [next_state_predict, action_predict])[0] Q_local = self.critic_local.model.predict( [last_state_predict, action_predict])[0] #Calculate the TD error in order to generate the priority value of the experience td_error = reward + self.gamma * Q_target_next - Q_local #Normalize the TD error with TANH as advised by Google's DeepMind paper "Prioritized Experience Replay": https://arxiv.org/pdf/1511.05952.pdf #td_error = math.tanh(td_error[0]) self.memory.add(self.last_state, action, reward, next_state, done, abs(td_error[0])) self.total_reward += reward self.count += 1 # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences, idx_sample, is_weights = self.memory.sample_priority() self.learn(experiences, idx_sample, is_weights) # Roll over last state and action self.last_state = next_state def act(self, state, test=False): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] if test == False: return list(action + self.noise.sample()) # add some noise for exploration else: return 
list(action) def learn(self, experiences, idx_sample, is_weights): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) is_weights = is_weights.reshape(-1, 1) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * ( 1 - dones) * is_weights self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) #Generate the new TD error value and update the priority value within the Replay Buffer td_error = rewards + self.gamma * Q_targets_next * (1 - dones) - Q_targets #Normalize the TD error with TANH as advised by Google's DeepMind paper "Prioritized Experience Replay": https://arxiv.org/pdf/1511.05952.pdf #td_error = np.tanh(td_error) self.memory.update_priority(idx=idx_sample, error=td_error) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) def test_control(self, file_output='data.txt'): state = self.reset_episode() done = False #Results with the conditions of the quadcopter labels = [ 'time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity', 'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity', 'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4' ] results = {x: [] for x in labels} # Run the simulation, and save the results. 
with open(file_output, 'w') as csvfile: writer = csv.writer(csvfile) writer.writerow(labels) while True: action = self.act(state, test=True) #action = self.act(state, test=False) next_state, reward, done = self.task.step(action) state = next_state to_write = [self.task.sim.time] + list( self.task.sim.pose) + list(self.task.sim.v) + list( self.task.sim.angular_v) + list(action) for ii in range(len(labels)): results[labels[ii]].append(to_write[ii]) writer.writerow(to_write) if done: break #Shows the results of the control control_results(results) #Useful for testing def update_score(self): self.score = self.total_reward / float( self.count) if self.count else 0.0 if self.score > self.best_score: self.best_score = self.score
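The PrioritizedReplayBuffer this agent expects is not shown. Below is a minimal proportional-prioritization sketch matching only the interface used above (add with a priority, sample_priority, update_priority, len); the alpha/beta/eps values and the flat-list storage are simplifying assumptions, and a real implementation would use a sum-tree for O(log n) sampling.

import numpy as np
from collections import namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class PrioritizedReplayBuffer:
    def __init__(self, buffer_size, batch_size, alpha=0.6, beta=0.4, eps=1e-5):
        self.buffer_size, self.batch_size = buffer_size, batch_size
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.memory, self.priorities = [], []

    def add(self, state, action, reward, next_state, done, error):
        if len(self.memory) >= self.buffer_size:
            self.memory.pop(0)      # drop the oldest transition when full
            self.priorities.pop(0)
        self.memory.append(Experience(state, action, reward, next_state, done))
        self.priorities.append((abs(error) + self.eps) ** self.alpha)

    def sample_priority(self):
        # sample proportionally to priority, then compute the
        # importance-sampling weights, normalized by the largest weight
        probs = np.array(self.priorities) / np.sum(self.priorities)
        idx = np.random.choice(len(self.memory), self.batch_size, p=probs)
        experiences = [self.memory[i] for i in idx]
        weights = (len(self.memory) * probs[idx]) ** (-self.beta)
        weights /= weights.max()
        return experiences, idx, weights.astype(np.float32)

    def update_priority(self, idx, error):
        for i, e in zip(idx, np.ravel(error)):
            self.priorities[i] = (abs(e) + self.eps) ** self.alpha

    def __len__(self):
        return len(self.memory)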
class Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # actor (policy) model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # critic (value) model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.25
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.9  # discount rate
        self.tau = 0.1    # soft update parameter

        self.total_reward = 0
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
        self.reset_episode()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # keep track of rewards
        self.total_reward += reward
        self.count += 1
        # save experience/reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # if there are enough experiences, learn from them
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last_state = next_state

    def act(self, states):
        # returns the action for a given state(s) as per the current policy
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        # update the policy and value parameters using the batch of experience tuples
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # get predicted next-state actions and Q values from the target models
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, next_actions])

        # compute Q targets for the current states and train the local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # train the local actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom train function

        # soft update the target models
        self.soft_update(self.actor_local.model, self.actor_target.model)
        self.soft_update(self.critic_local.model, self.critic_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
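For intuition, the soft update used by every agent in this section blends the weight arrays elementwise: with tau = 0.1 the target network moves 10% of the way toward the local network on each call. The values below are illustrative.

import numpy as np

tau = 0.1
local_w = np.array([1.0, 2.0])
target_w = np.array([0.0, 0.0])
target_w = tau * local_w + (1 - tau) * target_w
print(target_w)  # [0.1 0.2] -- the target drifts 10% toward the local weights per call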
import logging

actor = Actor(env.action_space, env.observation_space)
critic = Critic(env.action_space, env.observation_space, actor.sess)

for ep in range(1000):  # batch train
    total_reward = 0
    env.reset()
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)
    for _ in range(1000):  # training
        if len(memory) >= 20:  # only train once enough transitions are stored
            states, actions, rewards, next_states = memory.sample(20)
            next_actions = actor.get_actions(next_states)
            next_qs = critic.get_qs(next_states, next_actions)
            loss, q = critic.train(states, actions, rewards, next_qs)
            action_gradients = critic.get_action_gradients(states, actions)
            actor.train(states, action_gradients[0])
        env.render()
        action = actor.get_action_for_train(state, ep)
        next_state, reward, done, _ = env.step(action)
        memory.add((state, action, reward, next_state))
        total_reward += reward
        state = next_state
        if done:
            break
    # if ep % 10 == 0:
    #     critic.update_network_params()
    # the original log message is truncated here; total_reward is a plausible completion
    logging.info('Episode: {}'.format(ep) +
                 ' total reward: {}'.format(total_reward))
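The loop above assumes a memory object storing 4-tuples (no done flag) whose sample() returns column-stacked arrays. A minimal sketch of that assumption, with a __len__ so the training guard above works:

import random
import numpy as np

class Memory:
    def __init__(self, maxlen=100000):
        self.buf = []
        self.maxlen = maxlen

    def add(self, transition):  # transition = (state, action, reward, next_state)
        if len(self.buf) >= self.maxlen:
            self.buf.pop(0)     # drop the oldest transition when full
        self.buf.append(transition)

    def sample(self, n):
        batch = random.sample(self.buf, min(n, len(self.buf)))
        states, actions, rewards, next_states = map(np.array, zip(*batch))
        return states, actions, rewards, next_states

    def __len__(self):
        return len(self.buf)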
class PolicySearch_Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # random noise process
        self.mu = 0
        self.theta = 0.2
        self.sigma = 0.005
        self.noise = Noise(self.action_size, self.mu, self.theta, self.sigma)

        self.gamma = 0.9
        self.tau = 0.1
        self.best_score = -np.inf
        self.score = 0

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last_state = next_state
        self.score += reward
        if done and self.score > self.best_score:
            self.best_score = self.score

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_values_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_values = rewards + self.gamma * Q_values_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_values)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)

    def update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
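A hedged driver loop for PolicySearch_Agent, assuming the Udacity-style Task API used elsewhere in this section (reset() returns a state, step(action) returns (next_state, reward, done)); the Task() instantiation, episode count, and printing are illustrative.

task = Task()  # hypothetical task instance
agent = PolicySearch_Agent(task)
for ep in range(1, 501):
    state = agent.reset_episode()
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        if done:
            print("Episode {:4d}  score {:8.3f}  best {:8.3f}".format(
                ep, agent.score, agent.best_score))
            break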