    def __init__(self, cfg):
        # Replay memory
        self.memory = ReplayBuffer(**cfg['agent']['memory'])

        # Environment configuration

        self.action_shape = cfg['env']['action_shape']

        # Algorithm parameters
        self.exploration_mu, self.exploration_theta, self.exploration_sigma = cfg['agent']['noise']
        self.noise = OUNoise(self.action_shape, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        self.gamma = cfg['agent']['gamma']
        self.tau = cfg['agent']['tau']

        state_flatten_shape = [np.prod(self.memory.flatten_state_shape)]
        # Actor Model
        self.actor = Actor(state_flatten_shape, self.action_shape, cfg['env']['action_range'],
                           self.tau, self.memory.batch_size, cfg['actor'])

        # Critic Model
        self.critic = Critic(state_flatten_shape, self.action_shape, self.tau, cfg['critic'])

        # Flag & Counter
        self.add_noise = True
        self.episode = 0
        self.max_episode_explore = 100
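
For reference, a hypothetical cfg dictionary with the keys this constructor reads; only the keys visible above are grounded in the code, and the concrete values and sub-dict schemas are illustrative assumptions.

# Hypothetical configuration matching the keys accessed above; the concrete
# values and the memory/actor/critic sub-dicts are illustrative assumptions.
cfg = {
    'env': {
        'action_shape': 4,            # number of action dimensions
        'action_range': (-1.0, 1.0),  # passed straight to Actor
    },
    'agent': {
        'memory': {'buffer_size': 100000, 'batch_size': 64},  # ReplayBuffer kwargs
        'noise': (0.0, 0.15, 0.2),    # (mu, theta, sigma) for OUNoise
        'gamma': 0.99,
        'tau': 0.001,
    },
    'actor': {},   # Actor-specific hyperparameters
    'critic': {},  # Critic-specific hyperparameters
}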
Example 2
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.gamma = 0.99
        self.tau = 0.001
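
The ReplayBuffer(buffer_size, batch_size) class used throughout these examples is not shown; a minimal sketch consistent with how it is called (add, sample, len) might look like the following. This is an assumption, not the original implementation.

# Minimal ReplayBuffer sketch, assumed from how the examples call it
# (.add(...), .sample(), len(...)); not the original implementation.
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # bounded FIFO storage
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        # uniform random minibatch
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)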
Example 3
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
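
OUNoise(size, mu, theta, sigma) is likewise assumed by every example; a standard Ornstein-Uhlenbeck process with the same constructor signature would look like this sketch (not necessarily the original class).

# Standard Ornstein-Uhlenbeck noise process with the signature used above;
# a sketch of the assumed class, not necessarily the original code.
import numpy as np

class OUNoise:
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # return the internal state to the mean
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state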
Example 4
    def __init__(self, task, sess):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Algorithm parameters
        self.gamma = 0.9  # discount factor
        self.tau = 2e-3  # for soft update of target parameters
        self.actor_lr = 2e-3
        self.critic_lr = 2e-3
        # END
        self.reward_variance = RunningVariance(1)
        self.q_values_variance = RunningVariance(1)

        # Actor (Policy) Model
        self.actor_local = Actor(sess, self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr)
        self.actor_target = Actor(sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.8
        self.exploration_sigma = 0.05 * self.action_range
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 100
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # initialize
        sess.run(tf.global_variables_initializer())
Example 5
    def __init__(self, task, verbose=False):
        self.verbose = verbose

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        #log_path = '/tmp/logs'
        #self.callback = callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
        #                        write_images=False, write_grads=True, write_graph=False)
        #self.callback.set_model(self.critic_local.model)

        #log_path = '/tmp/logs'
        #self.writer = tf.summary.FileWriter(log_path)

        #self.learn_counter = 0

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.1
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 512
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.015  # for soft update of target parameters
Example 6
    def __init__(self, task, learning_rate_actor=0.0002,
            learning_rate_critic=0.00003, gamma=0.99, tau=0.01):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 learning_rate=learning_rate_actor)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  learning_rate=learning_rate_actor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   learning_rate=learning_rate_critic)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    learning_rate=learning_rate_critic)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(size=self.action_size, mu=0, theta=0.15, sigma=0.5)

        # Replay memory
        self.batch_size = 64
        self.memory = ReplayBuffer(batch_size=self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

        self.last_state = None

        self.reset_episode()
Example 7
    def reset_learning(self):
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.set_weights(self.critic_local.get_weights())
        self.actor_target.set_weights(self.actor_local.get_weights())
        self.memory = GoodBadReplayBuffer(self.buffer_size, self.batch_size)
        self.noise       = OUNoise(self.action_size, mu=self.exploration_mu,\
                                   theta=self.exploration_theta, sigma=self.exploration_sigma)
        self.noise_scale = self.noise.calc_scale()
        self.best_score = -np.inf
        self.__evaluate()
        return self.reset_episode()
Example 8
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Exploration noise
        self.exploration_mu = 0.1
        self.exploration_sigma = 0.1
        self.exploration_theta = 0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Experience
        self.buffer_size = 100000000
        self.batch_size = 64
        self.buffer = ReplayBuffer(self.buffer_size)

        # Parameters
        self.gamma = 0.99
        self.tau = 0.001
Example 9
    def __init__(self, task, prioritized_replay=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  #0.15 #0.1
        self.exploration_sigma = 0.2  #0.2 #0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 64  # 64

        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = 0.6
        self.prioritized_replay_beta0 = 0.4
        self.prioritized_replay_beta_iters = None
        self.prioritized_replay_eps = 1e-6
        self.max_timesteps = 100000

        # Replay buffer
        if self.prioritized_replay:
            self.memory = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                self.prioritized_replay_beta_iters = self.max_timesteps
            self.beta_schedule = LinearSchedule(
                self.prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
        #self.tau = 0.001 # 0.001 per paper

        self.td_errors_list = []
        self.actor_loss_list = []
        self.critic_loss_list = []
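
This example relies on PrioritizedReplayBuffer and LinearSchedule helpers (Baselines-style) that are not shown; a beta schedule consistent with the call LinearSchedule(iters, initial_p=..., final_p=...) would be the sketch below, an assumption rather than the original code.

# Linear interpolation schedule for the importance-sampling exponent beta,
# sketched from how it is called above; assumed, not the original helper.
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # fraction of the schedule completed, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)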
Example 10
class DDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Exploration noise
        self.exploration_mu = 0.1
        self.exploration_sigma = 0.1
        self.exploration_theta = 0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Experience
        self.buffer_size = 100000000
        self.batch_size = 64
        self.buffer = ReplayBuffer(self.buffer_size)

        # Parameters
        self.gamma = 0.99
        self.tau = 0.001

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self):
        # Sample
        states, actions, rewards, dones, next_states = self.buffer.sample(
            self.batch_size, self.action_size, self.state_size)

        # Predict
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        # Train Critic
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train Actor
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Update weights
        self.update_target_weights(self.critic_local.model,
                                   self.critic_target.model)
        self.update_target_weights(self.actor_local.model,
                                   self.actor_target.model)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.buffer.add(self.last_state, action, reward, next_state, done)
        self.learn()
        self.last_state = next_state

    def update_target_weights(self, local_model, target_model):
        target_model.set_weights(
            self.tau * np.array(local_model.get_weights()) +
            (1 - self.tau) * np.array(target_model.get_weights()))
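
A typical way to drive an agent like the one above; the task interface with reset() and step(action) returning (next_state, reward, done) is an assumption based on how the agent methods are called, not something defined in these examples.

# Hypothetical training loop; `task` is any object exposing reset() and
# step(action) -> (next_state, reward, done), as the agent methods assume.
def train(agent, task, num_episodes=500):
    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()
        total_reward, done = 0.0, False
        while not done:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            total_reward += reward
        print("Episode {}: reward = {:.2f}".format(i_episode, total_reward))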
Example 11
class Agent():
    def __init__(self, cfg):
        # Replay memory
        self.memory = ReplayBuffer(**cfg['agent']['memory'])

        # Environment configuration

        self.action_shape = cfg['env']['action_shape']

        # Algorithm parameters
        self.exploration_mu, self.exploration_theta, self.exploration_sigma = cfg['agent']['noise']
        self.noise = OUNoise(self.action_shape, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        self.gamma = cfg['agent']['gamma']
        self.tau = cfg['agent']['tau']

        state_flatten_shape = [np.prod(self.memory.flatten_state_shape)]
        # Actor Model
        self.actor = Actor(state_flatten_shape, self.action_shape, cfg['env']['action_range'],
                           self.tau, self.memory.batch_size, cfg['actor'])

        # Critic Model
        self.critic = Critic(state_flatten_shape, self.action_shape, self.tau, cfg['critic'])

        # Flag & Counter
        self.add_noise = True
        self.episode = 0
        self.max_episode_explore = 100

    def init_actor_critic(self):
        # Initialize target model
        self.critic.copy_local_in_target()
        self.actor.copy_local_in_target()

    def reset(self):
        self.memory.reset_past()
        self.noise = OUNoise(self.action_shape, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward,
                        next_state, done)
        if done:
            self.reset()


    def act(self, state):
        self.last_state = state

        window_states = self.memory.get_state_vector(state).reshape(1, -1)
        action = self.actor.predict(window_states)

        if self.add_noise and self.episode < self.max_episode_explore:
            p = self.episode / self.max_episode_explore
            action = np.clip(action*p + (1-p)*self.noise.sample(), a_max=1, a_min=-1)

        return action

    def learn(self):
        if self.memory.is_sufficient():
            experiences = self.memory.sample()

            states = experiences['state'][:, 0].reshape(self.memory.batch_size, -1)
            actions = experiences['action']
            rewards = experiences['reward']
            dones = experiences['done']
            next_states = experiences['next_state'][:, 0].reshape(self.memory.batch_size, -1)

            # get predicted next state action and Q values from target models
            actions_next = self.actor.get_targets(next_states)
            Q_targets_next = self.critic.get_targets(next_states, actions_next)

            # Compute Q targets for current states and train critic model
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            critic_summaries = self.critic.fit(states, actions, Q_targets)

            # Train actor model
            action_gradients = self.critic.get_actions_grad(states, actions)[0]
            actor_summaries = self.actor.fit(states, action_gradients)

            # Soft-update target models
            self.critic.soft_update()
            self.actor.soft_update()

            summary_reward = summary('sample_rewards', rewards)

            return critic_summaries, actor_summaries, summary_reward
Example 12
    def reset(self):
        self.memory.reset_past()
        self.noise = OUNoise(self.action_shape, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
Example 13
class MyAgent:
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
        
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        
        self.gamma = 0.99
        self.tau = 0.001

        
    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())
    
    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        
        self.last_state = next_state
    
    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])
    
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        
        Q_targets = rewards + self.gamma * Q_targets_next * (1-dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)
        
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
        
    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state
    

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        
        assert len(local_weights) == len(target_weights)
        
        new_weights = self.tau * local_weights + (1-self.tau) * target_weights
        target_model.set_weights(new_weights)
        
        
Example 14
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, verbose=False):
        self.verbose = verbose

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        #log_path = '/tmp/logs'
        #self.callback = callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
        #                        write_images=False, write_grads=True, write_graph=False)
        #self.callback.set_model(self.critic_local.model)

        #log_path = '/tmp/logs'
        #self.writer = tf.summary.FileWriter(log_path)

        #self.learn_counter = 0

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.1
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 512
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.015  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        #self.learn_counter = 0
        return state

    def mimic(self, experience_to_mimic):
        print("ready to mimic")
        self.memory.memory = experience_to_mimic

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        def save_grads(writer, model):
            for layer in model.layers:
                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf.summary.histogram(mapped_weight_name, weight)

                    grads = model.optimizer.get_gradients(
                        model.total_loss, weight)

                    def is_indexed_slices(grad):
                        return type(grad).__name__ == 'IndexedSlices'

                    grads = [
                        grad.values if is_indexed_slices(grad) else grad
                        for grad in grads
                    ]
                    tf.summary.histogram('{}_grad'.format(mapped_weight_name),
                                         grads)
                    merged = tf.summary.merge_all()
                    writer.flush()
                    writer.close()

        #save_grads(self.writer, self.critic_local.model)
        #def write_log(callback, names, logs, batch_no):
        #    for name, value in zip(names, logs):
        #        summary = tf.Summary()
        #        summary_value = summary.value.add()
        #        summary_value.simple_value = value
        #        summary_value.tag = name
        #        callback.writer.add_summary(summary, batch_no)
        #        callback.writer.flush()

        #train_names = ['train_loss', 'train_mae']
        #print("about to write log")
        #write_log(self.callback, train_names, logs, self.learn_counter)
        #trainable_weights = critic_local.model.trainable_weights
        #gradients = critic_local.model.optimizer.get_gradients(critic_local.model.total_loss, trainable_weights)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        #self.learn_counter += 1

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def _save_weight(self, model, directory_name, file_name):
        cwd = os.getcwd()
        directory_path = os.path.join(cwd, directory_name)
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)

        file_path = os.path.join(directory_path, file_name)

        mv_file_to_dir_with_date(file_path, directory_path)

        model.save_weights(file_path)

    def save_weights(self, location='weights_backup'):
        if self.verbose:
            print("start save_weights")

        self._save_weight(self.critic_local.model, location, "critic_local.h5")
        self._save_weight(self.critic_target.model, location,
                          "critic_target.h5")
        self._save_weight(self.actor_local.model, location, "actor_local.h5")
        self._save_weight(self.actor_target.model, location, "actor_target.h5")

        if self.verbose:
            print("done save_weights")

    def _h5(self, model, file_path):
        if os.path.exists(file_path):
            model.load_weights(file_path)
        else:
            print(f'could not find weight to load from [{file_path}]')

    def load_weights(self, location='weights_backup'):
        if self.verbose:
            print("start load_weights")

        cwd = os.getcwd()
        directory_path = os.path.join(cwd, location)

        self._h5(self.critic_local.model,
                 os.path.join(directory_path, "critic_local.h5"))
        self._h5(self.critic_target.model,
                 os.path.join(directory_path, "critic_target.h5"))
        self._h5(self.actor_local.model,
                 os.path.join(directory_path, "actor_local.h5"))
        self._h5(self.actor_target.model,
                 os.path.join(directory_path, "actor_target.h5"))

        if self.verbose:
            print("done load_weights")
Example 15
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, sess):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Algorithm parameters
        self.gamma = 0.9  # discount factor
        self.tau = 2e-3  # for soft update of target parameters
        self.actor_lr = 2e-3
        self.critic_lr = 2e-3
        # END
        self.reward_variance = RunningVariance(1)
        self.q_values_variance = RunningVariance(1)

        # Actor (Policy) Model
        self.actor_local = Actor(sess, self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr)
        self.actor_target = Actor(sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.8
        self.exploration_sigma = 0.05 * self.action_range
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 100
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # initialize
        sess.run(tf.global_variables_initializer())

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.last_reward = self.task.get_reward()
        self.reward_variance.update(self.last_reward)
        return state

    def step(self, action, reward, next_state, done):
        self.reward_variance.update(reward)
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done,
                        self.last_reward)

        # Learn, if enough samples are available in memory
        experiences = self.memory.sample()
        self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        self.last_reward = reward

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_target.model.predict(state)[0]
        noise = self.noise.sample()
        action += noise
        return list(action)  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([
            self.reward_variance.normalize(e.reward) for e in experiences
            if e is not None
        ]).astype(np.float32).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        last_rewards = np.array([
            self.reward_variance.normalize(e.prev_reward) for e in experiences
            if e is not None
        ]).astype(np.float32).reshape(-1, 1)

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        Q_targets_prev = self.critic_target.model.predict_on_batch(
            [states, actions])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (
            1 - dones) - last_rewards

        for q in Q_targets:
            self.q_values_variance.update(q)
        Q_targets = np.array(
            [self.q_values_variance.normalize(q) for q in Q_targets])
        #print("\n", Q_targets, "\n")

        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        #action_gradients *= Q_targets
        #action_gradients /= self.batch_size
        #self.actor_local.train_fn([states, action_gradients, 1])  # custom training function
        self.actor_local.train(states, action_gradients)

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example 16
class DDPG():
    """
    Reinforcement Learning agent that learns using DDPG.
    Deep DPG as described by Lillicrap et al. (2015)
    """
    def __init__(self, task, prioritized_replay=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  #0.15 #0.1
        self.exploration_sigma = 0.2  #0.2 #0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 64  # 64

        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = 0.6
        self.prioritized_replay_beta0 = 0.4
        self.prioritized_replay_beta_iters = None
        self.prioritized_replay_eps = 1e-6
        self.max_timesteps = 100000

        # Replay buffer
        if self.prioritized_replay:
            self.memory = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                self.prioritized_replay_beta_iters = self.max_timesteps
            self.beta_schedule = LinearSchedule(
                self.prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
        #self.tau = 0.001 # 0.001 per paper

        self.td_errors_list = []
        self.actor_loss_list = []
        self.critic_loss_list = []

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            if self.prioritized_replay:
                samples = self.memory.sample(self.batch_size,
                                             beta=self.beta_schedule.value(
                                                 len(self.memory)))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = samples
                experiences = []
                for i in range(len(obses_t)):
                    experiences.append(
                        namedtuple("PrioritizedExperience",
                                   field_names=[
                                       "state", "action", "reward",
                                       "next_state", "done", "weight",
                                       "batch_idx"
                                   ])(obses_t[i:i + 1], actions[i:i + 1],
                                      rewards[i:i + 1], obses_tp1[i:i + 1],
                                      dones[i:i + 1], weights[i:i + 1],
                                      batch_idxes[i:i + 1]))
                self.learn(experiences)
            else:
                experiences = self.memory.sample()
                self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        #actions = list(action + self.noise.sample())
        #print("act {}".format(actions))
        #return actions  # add some noise for exploration

        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        critic_loss = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local) using action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn([states, action_gradients,
                                                1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.prioritized_replay:
            # Update replay buffer priorities
            batch_idxes = np.vstack(
                [e.batch_idx[0] for e in experiences if e is not None])
            new_priorities = np.abs(Q_targets) + self.prioritized_replay_eps
            self.memory.update_priorities(batch_idxes, new_priorities)

        self.td_errors_list.append(Q_targets.T)
        self.actor_loss_list.append(actor_loss[0])
        self.critic_loss_list.append(critic_loss)

        #print("states {} next states {} critic_loss {} actor_loss {}".format(states, actions_next, critic_loss, actor_loss))

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def save_weights(self):
        self.actor_local.model.save_weights("DDPG_actor_weights.h5")
        self.critic_local.model.save_weights("DDPG_critic_weights.h5")

    def save_td_errors(self, i_episode):
        with open("DDPG_agent_td_errors_episode_{}.csv".format(i_episode),
                  'w') as csvfile:
            writer = csv.writer(csvfile)
            for td_errors in self.td_errors_list:
                writer.writerow([td_errors])
        self.td_errors_list.clear()

    def save_losses(self, i_episode):
        with open(
                "DDPG_agent_actor_critic_loss_episode_{}.csv".format(
                    i_episode), 'w') as csvfile:
            writer = csv.writer(csvfile)
            for actor_loss, critic_loss in zip(self.actor_loss_list,
                                               self.critic_loss_list):
                writer.writerow([actor_loss, critic_loss])

        self.actor_loss_list.clear()
        self.critic_loss_list.clear()
Example 17
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor
        self.tau = 0.002  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example 18
class DDPG():
    """Reinforcementing agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.0005  # for soft update of target parameters
        self.noise_scale = 0.1

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.total_reward += reward

        self.memory.add(self.last_state, action, reward, next_state, done)
        #print(len(self.memory), self.batch_size)
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"
        new_weights = (self.tau * local_weights +
                       (1 - self.tau) * target_weights)
        target_model.set_weights(new_weights)
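
As a usage sketch only (not part of this example), the DDPG agent above would typically be driven by an episode loop like the one below; the task object, its step() return signature, and the episode count are assumptions:

# Hypothetical training loop for the agent above (task API assumed).
num_episodes = 500
agent = DDPG(task)
for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d} | total reward: {:8.3f}".format(
        i_episode, agent.total_reward))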
Example n. 19
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        # Noise process
        self.max_unsuccessful_episodes_in_a_row = 10
        self.exploration_mu = 0
        # theta units are proportional to the action range, so I set it to 1% instead of the original 15%
        self.exploration_theta = 0.01
        self.exploration_sigma = 0.05
        # Replay memory
        self.buffer_size = 10000
        self.batch_size = 64

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1  # for soft update of target parameters
        self.best_learning = -np.inf
        self.reset_learning()

    def __evaluate(self):
        state = self.task.reset()
        score = 0.
        count = 0
        done = False
        while not done:
            action = self.actor_local.act(state)
            state, reward, done = self.task.step(action)
            score += reward
            count += 1
        score *= self.task.action_repeat * self.task.sim.dt / self.task.sim.runtime
        self.score = score
        self.count = count
        if self.score > self.best_score:
            self.__save_best()
            return True
        return False

    def __save_best(self):
        self.__best_actor_local = self.actor_local.get_weights()
        self.__best_actor_target = self.actor_target.get_weights()
        self.__best_critic_local = self.critic_local.get_weights()
        self.__best_critic_target = self.critic_target.get_weights()
        self.count_unsuccessful_in_a_row = 0
        self.best_score = self.score
        self.best_score_count = self.count
        if self.best_score > self.best_learning:
            # save best learning
            self.__best_learning_actor_local = np.copy(self.__best_actor_local)
            self.__best_learning_actor_target = np.copy(
                self.__best_actor_target)
            self.__best_learning_critic_local = np.copy(
                self.__best_critic_local)
            self.__best_learning_critic_target = np.copy(
                self.__best_critic_target)
            self.best_learning = self.best_score
            self.best_learning_count = self.best_score_count

    def restore_best(self):
        self.actor_local.set_weights(self.__best_actor_local)
        self.actor_target.set_weights(self.__best_actor_target)
        self.critic_local.set_weights(self.__best_critic_local)
        self.critic_target.set_weights(self.__best_critic_target)
        self.count_unsuccessful_in_a_row = 0
        self.score = self.best_score
        self.count = self.best_score_count

    def restore_learning(self):
        self.actor_local.set_weights(self.__best_learning_actor_local)
        self.actor_target.set_weights(self.__best_learning_actor_target)
        self.critic_local.set_weights(self.__best_learning_critic_local)
        self.critic_target.set_weights(self.__best_learning_critic_target)
        self.count_unsuccessful_in_a_row = 0
        self.score = self.best_learning
        self.count = self.best_learning_count

    def __update_noise(self):
        if self.__evaluate():
            # improvement in score --> less exploration
            self.noise_scale = self.noise.multiply(0.5)
            self.count_unsuccessful_in_a_row = 0
            return
        self.count_unsuccessful_in_a_row += 1
        if self.count_unsuccessful_in_a_row < self.max_unsuccessful_episodes_in_a_row:
            return
        curr_noise_scale = self.noise_scale
        # increasing the action noise, more exploration
        self.noise_scale = self.noise.multiply(1.5)
        if (self.noise_scale > 1.0) and (self.noise_scale <
                                         curr_noise_scale + 1e-6):
            # could not increase action noise
            self.reset_learning()
        else:
            self.restore_best()

    def reset_learning(self):
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.set_weights(self.critic_local.get_weights())
        self.actor_target.set_weights(self.actor_local.get_weights())
        self.memory = GoodBadReplayBuffer(self.buffer_size, self.batch_size)
        self.noise = OUNoise(self.action_size, mu=self.exploration_mu,
                             theta=self.exploration_theta,
                             sigma=self.exploration_sigma)
        self.noise_scale = self.noise.calc_scale()
        self.best_score = -np.inf
        self.__evaluate()
        return self.reset_episode()

    def reset_episode(self, use_noise=True):
        self.noise.reset()
        self.use_noise = use_noise
        self.last_state = self.task.reset()
        self.__last_state_reward = self.task.curr_score
        return self.last_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        action = self.actor_local.act(state)
        if self.use_noise:
            ret = list(action +
                       self.noise.sample())  # add some noise for exploration
        else:
            ret = list(action)
        return np.clip(ret, self.task.action_low,
                       self.task.action_high).tolist()

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        if self.use_noise:
            diff_reward = reward - self.__last_state_reward
            self.memory.add(self.last_state, action, diff_reward, reward,
                            next_state, done)
            if self.memory.has_sample():
                # single learn at each step
                self.learn(self.memory.sample())
                if done:
                    self.__update_noise()

        # Roll over last state and action
        self.last_state = next_state
        self.__last_state_reward = reward

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.task.action_size)
        rewards = np.array([
            e.step_reward for e in experiences if e is not None
        ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.task.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = (self.tau * local_weights +
                       (1 - self.tau) * target_weights)
        target_model.set_weights(new_weights)
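
OUNoise is instantiated by every agent on this page but its definition is not shown; a minimal sketch of the usual Ornstein-Uhlenbeck process, with the constructor, sample(), and reset() calls used above, could look like the following. The multiply() and calc_scale() helpers called in Example n. 19 are assumptions about that variant, outlined here only for completeness.

import copy
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch matching the calls made above)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

    def calc_scale(self):
        """Assumed helper: report the current exploration scale (sigma)."""
        return self.sigma

    def multiply(self, factor):
        """Assumed helper: rescale sigma and return the new scale."""
        self.sigma *= factor
        return self.sigma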