Code Example #1
class Agent:
    def __init__(self, state_size, batch_size, is_eval = False):
        self.state_size = state_size
        self.action_size = 3  # buy, sell, hold
        self.buffer_size = 1000000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.inventory = []
        self.is_eval = is_eval    
        self.gamma = 0.99 
        self.tau = 0.001 
        self.actor_local = Actor(self.state_size, self.action_size) 
        self.actor_target = Actor(self.state_size, self.action_size)
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)    
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        
    def act(self, state):
        options = self.actor_local.model.predict(state)  # action probabilities from the actor
        self.last_state = state
        if not self.is_eval:
            return choice(range(3), p=options[0])  # sample an action while training
        return np.argmax(options[0])  # greedy action during evaluation
    
    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
        # roll over the last state even when no learning step happens
        self.last_state = next_state
            
    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.state_size)
        actions = np.vstack([e.action for e in experiences if e is not None]
                            ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.float32).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None]
                                ).astype(np.float32).reshape(-1, self.state_size)
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        # soft-update both target networks toward their local counterparts
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
        
    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
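
All of the Keras-based agents on this page (examples #1, #2, #3, #5, #7 and #9) construct their experience memory as ReplayBuffer(buffer_size, batch_size) and then call add(), sample() and len() on it, but the class itself is not reproduced here. A minimal sketch of a buffer matching that interface, assuming a deque of namedtuples; the optional batch_size argument to sample() covers both the calls that pass an explicit size (example #1) and those that do not (example #2):

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are dropped when full
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        # uniform random sample; falls back to the size given at construction
        return random.sample(self.memory, batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)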
Code Example #2
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        self.score = 0
        self.best_score = -np.inf
        self.noise_scale = 0.1

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0.0
        self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        else:
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
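
These agents also call two non-standard methods, self.critic_local.get_action_gradients([states, actions, 0]) and self.actor_local.train_fn([states, action_gradients, 1]), which must be provided by Actor and Critic wrapper classes that are not shown on this page. A minimal sketch of what such wrappers could look like, assuming the older Keras API on a TF1 backend and following example #2's constructor signatures; the layer sizes and optimizers are illustrative placeholders, not the authors' actual architectures:

from keras import layers, models, optimizers
from keras import backend as K


class Actor:
    """Policy model: maps states to bounded continuous actions."""

    def __init__(self, state_size, action_size, action_low, action_high):
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_range = action_high - action_low
        self.build_model()

    def build_model(self):
        states = layers.Input(shape=(self.state_size,), name='states')
        net = layers.Dense(units=32, activation='relu')(states)  # illustrative sizes
        net = layers.Dense(units=64, activation='relu')(net)
        raw_actions = layers.Dense(units=self.action_size, activation='sigmoid',
                                   name='raw_actions')(net)
        # rescale the [0, 1] outputs to the task's action range
        actions = layers.Lambda(lambda x: (x * self.action_range) + self.action_low,
                                name='actions')(raw_actions)
        self.model = models.Model(inputs=states, outputs=actions)

        # Loss: follow the action-value gradients supplied by the critic
        action_gradients = layers.Input(shape=(self.action_size,))
        loss = K.mean(-action_gradients * actions)
        optimizer = optimizers.Adam()
        updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
        # train_fn([states, action_gradients, learning_phase])
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[], updates=updates_op)


class Critic:
    """Value model: maps (state, action) pairs to Q-values."""

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.build_model()

    def build_model(self):
        states = layers.Input(shape=(self.state_size,), name='states')
        actions = layers.Input(shape=(self.action_size,), name='actions')
        net_states = layers.Dense(units=32, activation='relu')(states)
        net_states = layers.Dense(units=64, activation='relu')(net_states)
        net_actions = layers.Dense(units=32, activation='relu')(actions)
        net_actions = layers.Dense(units=64, activation='relu')(net_actions)
        net = layers.Add()([net_states, net_actions])
        net = layers.Activation('relu')(net)
        Q_values = layers.Dense(units=1, name='q_values')(net)
        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        self.model.compile(optimizer=optimizers.Adam(), loss='mse')

        # Gradient of Q with respect to the actions, consumed by Actor.train_fn
        action_gradients = K.gradients(Q_values, actions)
        # get_action_gradients([states, actions, learning_phase])
        self.get_action_gradients = K.function(
            inputs=[*self.model.input, K.learning_phase()],
            outputs=action_gradients)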
Code Example #3
class AgentDDPG():
    def __init__(self, env):
        """

        :param task: (class instance) Instructions about the goal and reward
        """

        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        self.score = 0.0
        self.best = 0.0

        # Instances of the policy network (actor) and the value network (critic)

        # Actor local and target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)

        # Save actor model for future use
        actor_local_model_yaml = self.actor_local.model.to_yaml()
        with open("actor_local_model.yaml", "w") as yaml_file:
            yaml_file.write(actor_local_model_yaml)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model with local model
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Initialize the Ornstein-Uhlenbeck (OU) noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64  # original 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for the Algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.01  # Soft update rate for the target network parameters

    # The agent can reset the episode
    def reset_episode(self):
        # Reset the total reward and the step count
        self.total_reward = 0.0
        self.count = 0
        # Reset the OU noise process
        self.noise.reset()
        # Get a new initial state from the environment
        state = self.env.reset()
        # Keep the state obtained from the environment
        # by storing it as last_state
        self.last_state = state
        # Return the initial state
        return state

    # The agent interacts with the environment
    def step(self, action, reward, next_state, done):
        # Add this time step's reward to the total reward
        self.total_reward += reward
        # Increment the step count for the episode
        self.count += 1
        # Store the previous state's transition in the replay buffer
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Check to see if you have enough to produce a batch
        # and learn from it
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            # Train the networks using the experiences
            self.learn(experiences)

        # Roll over last state action
        self.last_state = next_state

    # Actor determines what to do based on the policy
    def act(self, state):
        # Given a state return the action recommended by the policy
        # Reshape the state to fit the keras model input
        state = np.reshape(state, newshape=[-1, self.state_size])
        # Pass the state to the local actor model to get the action
        # recommended by the current policy for this state
        action = self.actor_local.model.predict(state)[0]
        # Because we are exploring we add some noise to the
        # action vector
        return list(action + self.noise.sample())

    # This is the learning logic, called when the agent
    # takes a step and enough samples are available
    def learn(self, experiences):
        """
        Learning means that the network parameters need to be updated
        using the batch of experiences.
        The networks learn from replayed experiences, not directly from
        interaction with the environment.
        """

        # Reshape the experience tuples into separate arrays of states, actions,
        # rewards, next_states, dones
        # Every member of the tuple is converted into a column vector
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # First we pass a batch of next states to the actor so it tells us which
        # actions to execute; we use the actor target network instead of the actor
        # local network so that the bootstrapped targets stay stable
        actions_next = self.actor_target.model.predict_on_batch(next_states)

        # The critic evaluates the actions taken by the actor and generates the
        # Q(s, a) values of those actions. These (state, action) pairs come from the
        # ReplayBuffer, not from interacting with the environment.
        # Remember that the critic (value function) takes (states, actions) as inputs
        Q_targets_next = self.critic_target.model.predict_on_batch(
            ([next_states, actions_next]))

        # Q_targets_next is a vector of action values Q(s, a) for the randomly
        # sampled next_states from the replay buffer. From it we compute the
        # one-step TD target Q(s, a).
        # For terminal states the bootstrap term is zeroed out by (1 - dones), so only
        # the reward remains; this lets the critic be trained in a supervised-learning fashion.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train the actor
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # Custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self, actor_model):
        actor_model.model.save_weights('weights.h5')
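
Examples #2, #3, #6 and #7 build their exploration noise as OUNoise(size, mu, theta, sigma) and call reset() and sample() on it; the class is not included on this page. A minimal sketch of an Ornstein-Uhlenbeck process with that interface, offered as an assumption rather than the authors' exact code:

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise for exploration."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)  # long-running mean
        self.theta = theta            # speed of mean reversion
        self.sigma = sigma            # scale of the random kicks
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state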
Code Example #4
class Agent(object):
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 gamma=0.99,
                 max_size=10000,
                 layer1_size=400,
                 layer2_size=300,
                 batch_size=64):
        n_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.sess = tf.Session()

        self.actor = Actor(alpha,
                           n_actions,
                           'Actor',
                           input_dims,
                           self.sess,
                           layer1_size,
                           layer2_size,
                           env.action_space.high,
                           self.batch_size,
                           ckpt_dir='tmp/ddpg/actor')

        self.critic = Critic(beta,
                             n_actions,
                             'Critic',
                             input_dims,
                             self.sess,
                             layer1_size,
                             layer2_size,
                             self.batch_size,
                             ckpt_dir='tmp/ddpg/critic')

        self.target_actor = Actor(alpha,
                                  n_actions,
                                  'TargetActor',
                                  input_dims,
                                  self.sess,
                                  layer1_size,
                                  layer2_size,
                                  env.action_space.high,
                                  self.batch_size,
                                  ckpt_dir='tmp/ddpg/target_actor')

        self.target_critic = Critic(beta,
                                    n_actions,
                                    'TargetCritic',
                                    input_dims,
                                    self.sess,
                                    layer1_size,
                                    layer2_size,
                                    self.batch_size,
                                    ckpt_dir='tmp/ddpg/target_critic')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.update_actor = [
            self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau) +
                tf.multiply(self.target_actor.params[i], 1. - self.tau))
            for i in range(len(self.target_actor.params))
        ]

        self.update_critic = [
            self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau) +
                tf.multiply(self.target_critic.params[i], 1. - self.tau))
            for i in range(len(self.target_critic.params))
        ]

        self.sess.run(tf.global_variables_initializer())

        self.update_target_network_parameters(first=True)

    def update_target_network_parameters(self, first=False):
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                if first:
                    old_tau = self.tau
                    self.tau = 1.0
                    self.target_actor.sess.run(self.update_actor)
                    self.target_critic.sess.run(self.update_critic)
                    self.tau = old_tau
                else:
                    self.target_critic.sess.run(self.update_critic)
                    self.target_actor.sess.run(self.update_actor)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        # print("State[0]: ",state[0].shape)
        # print("State[1]: ",state[1].shape)
        state1 = state[0][np.newaxis, :]
        state2 = state[1][np.newaxis, :]
        state = [state1, state2]
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                mu = self.actor.predict(state)
        noise = self.noise()
        mu_prime = mu + noise

        return mu_prime[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                state, action, reward, new_state, done = \
                                            self.memory.sample_buffer(self.batch_size)
                #target q-value(new_state) with actor's bounded action forward pass
                critic_value_ = self.target_critic.predict(
                    new_state, self.target_actor.predict(new_state))

                target = []
                for j in range(self.batch_size):
                    # note: multiplying by done[j] assumes the replay buffer already
                    # stores the terminal flag as (1 - done), so terminal transitions
                    # zero out the bootstrap term
                    target.append(reward[j] +
                                  self.gamma * critic_value_[j] * done[j])

                target = np.reshape(target, (self.batch_size, 1))

                _ = self.critic.train(state, action, target)  #s_i, a_i and y_i

                # a = mu(s_i)
                a_outs = self.actor.predict(state)
                # gradients of Q w.r.t actions
                grads = self.critic.get_action_gradients(state, a_outs)

                self.actor.train(state, grads[0])

                # note: tau is baked into the assign ops at graph-construction time,
                # so changing self.tau via first=True does not alter the update that runs
                self.update_target_network_parameters(first=True)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
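
Example #4 instead constructs OUActionNoise(mu=np.zeros(n_actions)) and then calls the object itself (self.noise()) to draw a sample. That class is also missing here; a minimal sketch of a callable Ornstein-Uhlenbeck noise class matching that usage (the default sigma, theta and dt values are assumptions):

import numpy as np


class OUActionNoise:
    """Callable Ornstein-Uhlenbeck noise for continuous-action exploration."""

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # one Euler-Maruyama step of the OU process
        x = (self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt +
             self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)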
Code Example #5
class Agent:
    def __init__(self, state_size, batch_size, is_eval=False):
        self.state_size = state_size
        self.action_size = 3  #buy,sell,hold

        #defining replay memory size
        self.buffer_size = 1000000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.inventory = []

        #define whether or not training is going on
        self.is_eval = is_eval
        #Discount factor
        self.gamma = 0.99
        # soft update for AC model
        self.tau = 0.001

        #instantiate the local and target actor models for soft updates
        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)

        #critic model mapping state-action pairs to Q-values
        self.critic_local = Critic(self.state_size, self.action_size)

        #instantiate the local and target critic models for soft updates
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        #set target model parameter to local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

    #Returns an action given a state using the policy (actor) network
    def act(self, state):
        options = self.actor_local.model.predict(
            state)  #returns probabilities of each action
        self.last_state = state
        if not self.is_eval:
            return choice(range(3), p=options[0])
        return np.argmax(options[0])

    #take one agent step in the episode: store the experience and learn when enough samples exist
    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(
                self.batch_size)  #sampling a random batch from memory to train on
            self.learn(experiences)
        #roll over the last state even when no learning step happens
        self.last_state = next_state

    def learn(self, experiences):
        #Extracting the states,actions,etc from all the experience tuples
        states = np.vstack([e.state for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, self.state_size)
        actions = np.vstack([e.action for e in experiences
                             if e is not None]).astype(np.float32).reshape(
                                 -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.float32).reshape(-1, 1)
        next_states = np.vstack([
            e.next_state for e in experiences if e is not None
        ]).astype(np.float32).reshape(-1, self.state_size)

        #Reshaping all the arrays into 3-dimensional tensors to be fed into the LSTM architecture
        states = np.reshape(states, (states.shape[0], states.shape[1], 1))
        next_states = np.reshape(
            next_states, (next_states.shape[0], next_states.shape[1], 1))
        rewards = np.reshape(rewards, (rewards.shape[0], rewards.shape[1], 1))
        dones = np.reshape(dones, (dones.shape[0], dones.shape[1], 1))
        actions = np.reshape(actions, (actions.shape[0], actions.shape[1], 1))

        #predict the next actions from the next states using the target actor
        actions_next = self.actor_target.model.predict_on_batch(next_states)

        #Reshaping the vector
        actions_next = np.reshape(
            actions_next, (actions_next.shape[0], actions_next.shape[1], 1))

        #predict Q-values of the target actor's output for the next states
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        #compute the TD-target Q-value to serve as the label for the critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        #fit the critic model to the TD targets
        Q_targets = np.reshape(Q_targets,
                               (Q_targets.shape[0], Q_targets.shape[1], 1))
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        #soft-update both target networks
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
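
Because the actor and critic in example #5 are meant to sit on an LSTM architecture (learn() reshapes every batch to (batch, state_size, 1)), the state handed to act() also has to be a 3-D tensor. A small usage sketch under that assumption; the random price window and the batch size are purely illustrative:

import numpy as np

state_size = 10
# hypothetical raw state: a window of the last `state_size` normalized prices
raw_state = np.random.rand(state_size).astype(np.float32)

# reshape to (batch=1, timesteps=state_size, features=1) for the LSTM models
lstm_state = raw_state.reshape(1, state_size, 1)

agent = Agent(state_size=state_size, batch_size=32)
action = agent.act(lstm_state)  # one of the three discrete actions: buy, sell or hold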
Code Example #6
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, buffer_size, batch_size, gamma, tau,
                 actor_dropout, critic_dropout, exploration_theta,
                 exploration_sigma, actor_lr, critic_lr):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_dropout = actor_dropout
        self.critic_dropout = critic_dropout
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_dropout, self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_dropout, self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_dropout, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_dropout, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 5
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = PrioritizedReplayBuffer(self.buffer_size,
                                              self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state

        self.total_reward = 0.0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        #self.memory.add(self.last_state, action, reward, next_state, done)
        #Generate the parameters in order to calculate the TD error
        next_state_predict = np.reshape(next_state, [-1, self.state_size])
        last_state_predict = np.reshape(self.last_state, [-1, self.state_size])
        action_predict = np.reshape(action, [-1, self.action_size])
        #next_state_action = np.concatenate([next_state, action])
        Q_target_next = self.critic_target.model.predict(
            [next_state_predict, action_predict])[0]
        Q_local = self.critic_local.model.predict(
            [last_state_predict, action_predict])[0]

        #Calculate the TD error in order to generate the priority value of the experience
        td_error = reward + self.gamma * Q_target_next - Q_local

        #Normalize the TD error with TANH as advised by Google's DeepMind paper "Prioritized Experience Replay": https://arxiv.org/pdf/1511.05952.pdf
        #td_error = math.tanh(td_error[0])

        self.memory.add(self.last_state, action, reward, next_state, done,
                        abs(td_error[0]))

        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences, idx_sample, is_weights = self.memory.sample_priority()
            self.learn(experiences, idx_sample, is_weights)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, test=False):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if test == False:
            return list(action +
                        self.noise.sample())  # add some noise for exploration
        else:
            return list(action)

    def learn(self, experiences, idx_sample, is_weights):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        is_weights = is_weights.reshape(-1, 1)

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        # (note: this implementation folds the importance-sampling weights into the
        # target itself rather than using them to weight the critic loss)
        Q_targets = rewards + self.gamma * Q_targets_next * (
            1 - dones) * is_weights
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        #Generate the new TD error value and update the priority value within the Replay Buffer
        td_error = rewards + self.gamma * Q_targets_next * (1 -
                                                            dones) - Q_targets

        #Normalize the TD error with TANH as advised by Google's DeepMind paper "Prioritized Experience Replay": https://arxiv.org/pdf/1511.05952.pdf
        #td_error = np.tanh(td_error)

        self.memory.update_priority(idx=idx_sample, error=td_error)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def test_control(self, file_output='data.txt'):
        state = self.reset_episode()
        done = False
        #Results with the conditions of the quadcopter
        labels = [
            'time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity',
            'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity',
            'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3',
            'rotor_speed4'
        ]
        results = {x: [] for x in labels}

        # Run the simulation, and save the results.
        with open(file_output, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(labels)
            while True:
                action = self.act(state, test=True)
                #action = self.act(state, test=False)
                next_state, reward, done = self.task.step(action)
                state = next_state
                to_write = [self.task.sim.time] + list(
                    self.task.sim.pose) + list(self.task.sim.v) + list(
                        self.task.sim.angular_v) + list(action)
                for ii in range(len(labels)):
                    results[labels[ii]].append(to_write[ii])
                writer.writerow(to_write)
                if done:
                    break
        #Shows the results of the control
        control_results(results)

    #Useful for testing
    def update_score(self):
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
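
Example #6 relies on a PrioritizedReplayBuffer with add(..., priority), sample_priority() returning (experiences, indices, importance-sampling weights), and update_priority(idx, error); that class is not included on this page. A minimal proportional-prioritization sketch matching that interface (the alpha/beta values and the namedtuple layout are assumptions, and a production version would normally use a sum-tree instead of recomputing the probabilities on every call):

from collections import namedtuple

import numpy as np


class PrioritizedReplayBuffer:
    def __init__(self, buffer_size, batch_size, alpha=0.6, beta=0.4, eps=1e-5):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.alpha = alpha  # how strongly priorities skew the sampling
        self.beta = beta    # importance-sampling correction strength
        self.eps = eps      # keeps every priority strictly positive
        self.memory = []
        self.priorities = []
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done, priority):
        if len(self.memory) >= self.buffer_size:  # drop the oldest entry when full
            self.memory.pop(0)
            self.priorities.pop(0)
        self.memory.append(self.experience(state, action, reward, next_state, done))
        self.priorities.append(abs(priority) + self.eps)

    def sample_priority(self):
        p = np.array(self.priorities) ** self.alpha
        p /= p.sum()
        idx = np.random.choice(len(self.memory), self.batch_size, p=p)
        experiences = [self.memory[i] for i in idx]
        # importance-sampling weights, normalized so the largest weight is 1
        is_weights = (len(self.memory) * p[idx]) ** (-self.beta)
        is_weights /= is_weights.max()
        return experiences, idx, is_weights

    def update_priority(self, idx, error):
        for i, e in zip(idx, np.abs(np.asarray(error)).flatten()):
            self.priorities[i] = float(e) + self.eps

    def __len__(self):
        return len(self.memory)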
Code Example #7
File: agent.py  Project: rflores5/Quadcopter
class Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # actor policy model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # critic value model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.25
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.9  # discount rate
        self.tau = 0.1  # soft update parameter

        self.total_reward = 0
        self.count = 0
        self.score = 0
        self.best_score = -np.inf

        self.reset_episode()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # keep track of rewards
        self.total_reward += reward
        self.count += 1
        # save experience/reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # if there are enough experiences, learn from them
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, states):
        # returns action for a given state(s) as per the current policy
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0

        # update the policy and value parameters given batch of experience tuples
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # get predicted next state and Q values from target models
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # compute Q targets for current state and train local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # train local actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom train function

        # soft update target models
        self.soft_update(self.actor_local.model, self.actor_target.model)
        self.soft_update(self.critic_local.model, self.critic_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
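
Examples #2, #6 and #7 all assume a task object exposing state_size, action_size, action_low/action_high, reset() and step(action) returning (next_state, reward, done), in the style of the Udacity quadcopter project. A sketch of the episode loop that could drive such an agent; Task and the episode count are placeholders for whatever environment is actually used:

num_episodes = 500               # illustrative
task = Task()                    # placeholder with the interface described above
agent = Agent(task)              # e.g. the example #7 agent

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # also resets the OU noise
    episode_reward = 0.0
    while True:
        action = agent.act(state)                      # noisy action from the local actor
        next_state, reward, done = task.step(action)   # task returns (state, reward, done)
        agent.step(action, reward, next_state, done)   # store the experience and learn
        state = next_state
        episode_reward += reward
        if done:
            break
    print("Episode {:4d}  reward: {:8.3f}".format(i_episode, episode_reward))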
Code Example #8
actor = Actor(env.action_space, env.observation_space)
critic = Critic(env.action_space, env.observation_space, actor.sess)
for ep in range(1000):
    # batch train
    total_reward = 0
    env.reset()
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)
    for _ in range(1000):
        # training step (assumes `memory` already holds at least 20 transitions,
        # e.g. from a warm-up phase, before the first sample)
        states, actions, rewards, next_states = memory.sample(20)
        next_actions = actor.get_actions(next_states)
        next_qs = critic.get_qs(next_states, next_actions)
        loss, q = critic.train(states, actions, rewards, next_qs)
        action_gradients = critic.get_action_gradients(states, actions)
        actor.train(states, action_gradients[0])

        env.render()
        action = actor.get_action_for_train(state, ep)
        next_state, reward, done, _ = env.step(action)
        memory.add((state, action, reward, next_state))
        # print(state, action, reward, next_state)
        total_reward += reward
        # print(action, reward, total_reward)
        state = next_state
        if done:
            break
    # if ep % 10 == 0:
    # critic.update_network_params()
    logging.info('Episode: {}'.format(ep) +
                 ' Total reward: {}'.format(total_reward))
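
Example #8 relies on a memory object created elsewhere whose add() takes one (state, action, reward, next_state) tuple and whose sample(n) returns four separate arrays. A minimal sketch of such a buffer, as an assumption about the missing code:

import random
from collections import deque

import numpy as np


class Memory:
    def __init__(self, max_size=100000):
        self.buffer = deque(maxlen=max_size)

    def add(self, transition):
        # transition is a (state, action, reward, next_state) tuple
        self.buffer.append(transition)

    def sample(self, n):
        batch = random.sample(self.buffer, n)
        states, actions, rewards, next_states = map(np.array, zip(*batch))
        return states, actions, rewards, next_states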
Code Example #9
class PolicySearch_Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.mu = 0
        self.theta = 0.2
        self.sigma = 0.005  # random noise
        self.noise = Noise(self.action_size, self.mu, self.theta, self.sigma)
        self.gamma = 0.9
        self.tau = 0.1
        self.best_score = -np.inf
        self.score = 0

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last_state = next_state
        self.score += reward
        if done:
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_values_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        Q_values = rewards + self.gamma * Q_values_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_values)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)

    def update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
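
Every example ends with the same soft-update rule, target_weights = tau * local_weights + (1 - tau) * target_weights. A tiny numerical sketch (the toy weights are not taken from any of the snippets) showing that repeated soft updates move the target weights exponentially toward the local weights, which is why a small tau gives slowly moving, stable targets:

import numpy as np

tau = 0.1
local = np.array([1.0, 2.0, 3.0])   # stand-in for the local network weights
target = np.zeros_like(local)       # target network starts elsewhere

for step in range(1, 51):
    target = tau * local + (1 - tau) * target
    if step in (1, 10, 50):
        print("after {:2d} updates: {}".format(step, np.round(target, 4)))
# the remaining gap shrinks by a factor of (1 - tau) per update, so after n updates
# the target has closed 1 - (1 - tau)**n of the distance to the local weights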