Example 1
import numpy as np

# Actor, Critic, OUNoise, and ReplayBuffer are assumed to be provided by companion modules.


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self,
                 task,
                 explore_mu=0,
                 explore_theta=0.15,
                 explore_sigma=0.2,
                 gamma=0.99,
                 tau=0.01):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = explore_mu  # 0
        self.exploration_theta = explore_theta  # 0.15
        self.exploration_sigma = explore_sigma  # 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
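
Example 1 (like most of the snippets below) relies on helper classes that are not shown: OUNoise, ReplayBuffer, Actor, and Critic. As a point of reference, here is a minimal sketch of the kind of Ornstein-Uhlenbeck noise process the OUNoise(size, mu, theta, sigma) constructor above suggests; the exact implementation behind each example may differ.

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise (sketch)."""
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean mu."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state with dx = theta * (mu - x) + sigma * N(0, 1) and return it."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
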
Example 2
class agentDDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.rotor_high = task.action_high
        self.rotor_low = task.action_low

        # We update the local networks continuously and intermittently soft-copy their weights into the target networks
        self.actor_local = actor(self.state_size,
                                 self.action_size,
                                 h1=64,
                                 h2=32,
                                 lr=0.001,
                                 r_h=self.rotor_high,
                                 r_l=self.rotor_low)
        self.actor_target = actor(self.state_size,
                                  self.action_size,
                                  h1=64,
                                  h2=32,
                                  lr=0.001,
                                  r_h=self.rotor_high,
                                  r_l=self.rotor_low)

        self.critic_local = critic(self.state_size,
                                   self.action_size,
                                   h1=32,
                                   h2=24,
                                   lr=0.001)
        self.critic_target = critic(self.state_size,
                                    self.action_size,
                                    h1=32,
                                    h2=24,
                                    lr=0.001)

        # Make the weights of both local and target agent same
        self.actor_target.actorModel.set_weights(
            self.actor_local.actorModel.get_weights())
        self.critic_target.criticModel.set_weights(
            self.critic_local.criticModel.get_weights())

        self.mu = 0
        self.sigma = 0.15
        self.theta = 0.2
        self.OUNoise = OUNoise(self.action_size, self.mu, self.sigma,
                               self.theta)

        self.bufferSize = 100000
        self.batch_size = 64
        self.memory = memoryBuffer(self.bufferSize, self.batch_size)

        self.gamma = 0.99
        self.tau = 0.01

    def reset_episode(self):
        self.OUNoise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        if self.memory.len() > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def learn(self, experience):
        states = np.vstack([e.state for e in experience if e is not None])
        actions = np.array([e.action for e in experience
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experience if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experience if e is not None])
        done = np.array([e.done for e in experience
                         if e is not None]).astype(np.uint8).reshape(-1, 1)

        # Build the TD target with the *target* networks, train the local critic on the
        # TD error, then train the local actor on the critic's action gradients
        actions_next = self.actor_target.actorModel.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.criticModel.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - done)
        self.critic_local.criticModel.train_on_batch(x=[states, actions],
                                                     y=Q_targets)

        action_gradients = self.critic_local.get_action_gradients(
            inputs=[states, actions, 0])
        action_gradients = np.reshape(action_gradients, (-1, self.action_size))

        self.actor_local.train_actor(inputs=[states, action_gradients, 1])

        self.soft_update(self.actor_local.actorModel,
                         self.actor_target.actorModel)
        self.soft_update(self.critic_local.criticModel,
                         self.critic_target.criticModel)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = local_weights * self.tau + target_weights * (1 -
                                                                   self.tau)
        target_model.set_weights(new_weights)

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.actorModel.predict(state)[0]
        return list(action + self.OUNoise.sample())
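
Neither example shows how the agent is driven. Below is a hypothetical episode loop for Example 2's interface (reset_episode, act, step), assuming a task object whose step(action) returns (next_state, reward, done); the task and num_episodes names are placeholders, not part of the original code.

# Hypothetical driver loop; `task` and `num_episodes` are assumptions.
num_episodes = 500
agent = agentDDPG(task)

for episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # also resets the OU noise process
    episode_reward, done = 0.0, False
    while not done:
        action = agent.act(state)                             # policy output + exploration noise
        next_state, reward, done = task.step(action)          # assumed task API
        agent.step(state, action, reward, next_state, done)   # store experience and (maybe) learn
        state = next_state
        episode_reward += reward
    print("Episode {:4d} | reward {:8.2f}".format(episode, episode_reward))
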
Example 3
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        print('loaded DDPG ')
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
        
        # Score tracker and random-search parameters (used by score_update below)
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        # Weights for a simple linear policy (state_space x action_space),
        # scaled so the initial actions fall in a reasonable range.
        self.w = np.random.normal(
            size=(self.state_size, self.action_size),
            scale=(self.action_range / (2 * self.state_size)))

        # TODO: load saved weights if they exist.

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise_scale = 0.1
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        
        if done:
            self.score_update()

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
        
        
    def score_update(self):
        # Learn by random policy search on self.w, using a reward-based score.
        # (Note: self.w is not used by act(), which relies on the actor network.)
        self.score = self.total_reward / float(self.count) if self.count else 0.0

        if self.score > self.best_score:
            self.best_score = self.score
            self.best_w = self.w
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        else:
            self.w = self.best_w
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape)  # equal noise in all directions
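
Examples 1 and 3 iterate over sampled experiences as namedtuple-like objects (e.state, e.action, ...) and call len(self.memory). Here is a minimal ReplayBuffer sketch consistent with that usage; the field names and uniform random sampling are assumptions.

import random
from collections import deque, namedtuple

Experience = namedtuple(
    "Experience", field_names=["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling (sketch)."""
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are evicted first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)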
        
Example 4
class DDPG():
    '''reinforcement learning agent that learns using Deep Deterministic Policy Gradient'''
    def __init__(self, task):
        '''
        Params
        ======
        task (object) : environment

        References
        ==========
        Continuous Control With Deep Reinforcement Learning (2016);
        "Playing CartPole through Asynchronous Advantage Actor Critic (A3C) with tf.keras"

        Hyperparameters
        ===============
        gamma   : 0.99
        tau     : 0.001
        buffer_size (ReplayBuffer)          : 1e6
        batch_size (ReplayBuffer)           : 64
        theta (Ornstein-Uhlenbeck process)  : 0.15
        sigma (Ornstein-Uhlenbeck process)  : 0.2
        '''

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # actor (policy) model - use two copies of model for updating model and producing target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # critic (value) model - use two copies of model for updating model and producing target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # reward history
        self.best_avg_score = -np.inf
        self.accumulated_reward = 0
        self.count = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.accumulated_reward = 0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        # save experience and reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # roll over last state and action
        self.last_state = next_state

        # accumulate reward
        self.accumulated_reward += reward
        self.count += 1

        # record best average score
        if done:
            avg_score = self.accumulated_reward / float(self.count)
            if avg_score > self.best_avg_score:
                self.best_avg_score = avg_score

    def act(self, state):
        '''returns actions for given state(s) as per current policy'''
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration
        # both action and self.noise.sample() are numpy object, + means sum up both,
        # instead of concatenation

    def learn(self, experiences):
        '''update policy and value parameters using given batch of experience tuples'''
        # convert experience tuples to separate arrays for each element(states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).\
                           astype(np.float32).reshape(-1,self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).\
                           astype(np.float32).reshape(-1,1)
        dones = np.array([e.done for e in experiences if e is not None]).\
                           astype(np.uint8).reshape(-1,1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # get predicted next-state actions and Q-values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # compute Q targets for current states and train critic model (local)
        # Critic (value) loss: L = (1/N) * sum_i (r_i + gamma * Q'(s'_i, a'_i) - Q(s_i, a_i))^2
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # train actor model (local)
        # Policy objective: maximize J = (1/N) * sum_i Q(s_i, mu(s_i)),
        # trained here via the critic's action gradients dQ/da (deterministic policy gradient)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function
        # The learning phase flag is a bool tensor (0 = test, 1 = train)
        # to be passed as input to any Keras function
        # that uses a different behavior at train time and test time.

        # soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        '''soft update model parameters'''
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights),\
            'Local and target model parameters must have the same size'

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
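
Examples 1, 3, and 4 all train the actor through a custom train_fn built from the critic's action gradients, and Example 4's comments mention the Keras learning-phase flag passed as the last input. None of the snippets show how that function might be constructed, so here is a minimal Actor sketch along those lines. It assumes the Keras 2.x / TensorFlow 1.x backend these snippets appear to target; the layer sizes, learning rate, and exact get_updates signature are illustrative and vary across Keras versions.

from keras import layers, models, optimizers
from keras import backend as K

class Actor:
    """Deterministic policy mu(s), trained from the critic's action gradients (sketch)."""
    def __init__(self, state_size, action_size, action_low, action_high):
        self.action_range = action_high - action_low
        states = layers.Input(shape=(state_size,), name='states')
        net = layers.Dense(units=32, activation='relu')(states)
        net = layers.Dense(units=64, activation='relu')(net)
        raw = layers.Dense(units=action_size, activation='sigmoid', name='raw_actions')(net)
        # Rescale the sigmoid output [0, 1] into the task's action range.
        actions = layers.Lambda(lambda x: x * self.action_range + action_low,
                                name='actions')(raw)
        self.model = models.Model(inputs=states, outputs=actions)

        # Policy "loss": follow dQ/da supplied by the critic, i.e. minimize mean(-dQ/da * a).
        action_gradients = layers.Input(shape=(action_size,))
        loss = K.mean(-action_gradients * actions)

        optimizer = optimizers.Adam(lr=1e-4)
        # Note: the get_updates argument order differs slightly across Keras 2.x versions.
        updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
        # train_fn([states, action_gradients, 1]) performs one actor update (1 = training phase).
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[],
            updates=updates_op)
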
Example 5
class DDPG_Agent:
    def __init__(self, task, noise, memory, rl_param, nn_hidden, actor_lr,
                 critic_lr, q_lambda):
        # Adapted for this gym
        self.task = task
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.state_space = task.state_size
        self.action_space = task.action_size
        self.q_lambda = q_lambda

        # Instantiate Actors and Critics.
        self.actor = Actor(self.state_space,
                           self.action_space,
                           self.action_low,
                           self.action_high,
                           hidden_units=nn_hidden[0],
                           learning_rate=actor_lr,
                           q_lambda=q_lambda)
        self.actor_target = Actor(self.state_space,
                                  self.action_space,
                                  self.action_low,
                                  self.action_high,
                                  hidden_units=nn_hidden[0],
                                  learning_rate=actor_lr,
                                  q_lambda=q_lambda)

        self.critic = Critic(self.state_space,
                             self.action_space,
                             hidden_units=nn_hidden[1],
                             learning_rate=critic_lr,
                             q_lambda=q_lambda)
        self.critic_target = Critic(self.state_space,
                                    self.action_space,
                                    hidden_units=nn_hidden[1],
                                    learning_rate=critic_lr,
                                    q_lambda=q_lambda)

        # Set same weights in target.
        self.actor_target.model.set_weights(self.actor.model.get_weights())
        self.critic_target.model.set_weights(self.critic.model.get_weights())

        # Noise for exploration.
        self.mean = noise[0]
        self.sigma = noise[1]
        self.theta = noise[2]
        self.ounoise = OUNoise(self.action_space, self.mean, self.sigma,
                               self.theta)

        # Experience Replay memory.
        self.capacity = memory[0]
        self.batch_size = memory[1]
        self.er_buffer = ExperienceReplayBuffer(capacity=self.capacity,
                                                batch_size=self.batch_size)

        # RL parameters.
        self.gamma = rl_param[0]
        self.t = rl_param[1]  # soft-update rate (tau)

        # Keeping track of learning.
        self.learning_rewards = list()
        self.total_reward = None
        self.best_reward = -np.inf
        self.losses = list()

    def restart_task(self):
        if self.total_reward is not None:
            self.learning_rewards.append(self.total_reward)
            if self.total_reward > self.best_reward:
                self.best_reward = self.total_reward
        self.total_reward = 0
        state = self.task.reset()
        self.state = state
        self.ounoise.restart()
        return state

    def act(self, state, epsilon):
        self.action_wo_noise = self.actor.model.predict(
            np.reshape(state, newshape=(-1, self.state_space)))
        self.step_noise = self.ounoise.sample() * epsilon
        action = np.array(self.action_wo_noise[0] +
                          self.step_noise[0]).reshape(-1, self.action_space)
        action_clipped = np.clip(a=action,
                                 a_min=self.action_low,
                                 a_max=self.action_high)
        return action_clipped

    # Saves experience into memory and updates actor-critic weights.
    def store_learn(self, state, action, reward, done, next_state):

        # Store experience into exp replay memory.
        self.er_buffer.add_env_reaction(
            (state, action, reward, done, next_state))

        # Learn if agent has enough experiences.
        if len(self.er_buffer.mem) > self.batch_size:
            self.learn()

        self.total_reward += reward
        # Update to the current state of the environment.
        self.state = next_state

    def soft_update(self):
        actor_current = np.array(self.actor.model.get_weights())
        critic_current = np.array(self.critic.model.get_weights())

        actor_target = np.array(self.actor_target.model.get_weights())
        critic_target = np.array(self.critic_target.model.get_weights())

        self.actor_target.model.set_weights(actor_target * (1 - self.t) +
                                            self.t * actor_current)
        self.critic_target.model.set_weights(critic_target * (1 - self.t) +
                                             self.t * critic_current)

    # Learn step of the agent, update weights of actor-critic and actor-critic target NN.
    def learn(self):
        states, actions, rewards, dones, next_states = self.er_buffer.sample_batch(
        )
        states = np.vstack(states)
        actions = np.array(actions,
                           dtype=np.float32).reshape(-1, self.action_space)
        rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1)
        dones = np.array(dones, dtype=np.uint8).reshape(-1, 1)
        next_states = np.vstack(next_states)

        # Get action for deterministic policy.
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        next_q_values = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # Need to handle the done case.
        targets = rewards + self.gamma * next_q_values * (1 - dones)
        loss = self.critic.model.train_on_batch(x=[states, actions], y=targets)
        self.losses.append(loss)

        # Fetch dQ/da from the critic (learning phase 0: inference only, no weight update).
        action_gradients = self.critic.get_action_gradients(
            [states, actions, 0])
        action_gradients = np.reshape(action_gradients[0],
                                      (-1, self.action_space))

        # Train the actor on those gradients (learning phase 1: training mode).
        self.actor.train_fn([states, action_gradients, 1])

        # Do soft update on weights.
        self.soft_update()
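
Example 5 (like the others) pulls dQ/da out of the critic with get_action_gradients([states, actions, 0]), where the trailing 0 is the Keras learning-phase flag. A minimal Critic sketch exposing that hook is shown below, under the same Keras 2.x / TF 1.x assumption as the Actor sketch above; the hidden sizes and learning rate are placeholders.

from keras import layers, models, optimizers
from keras import backend as K

class Critic:
    """Action-value network Q(s, a) that also exposes dQ/da for the actor (sketch)."""
    def __init__(self, state_size, action_size):
        states = layers.Input(shape=(state_size,), name='states')
        actions = layers.Input(shape=(action_size,), name='actions')

        # Separate state and action pathways merged into a single Q-value head.
        net_states = layers.Dense(units=32, activation='relu')(states)
        net_actions = layers.Dense(units=32, activation='relu')(actions)
        net = layers.Activation('relu')(layers.Add()([net_states, net_actions]))
        Q_values = layers.Dense(units=1, name='q_values')(net)

        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        self.model.compile(optimizer=optimizers.Adam(lr=1e-3), loss='mse')

        # dQ/da, evaluated as get_action_gradients([states, actions, learning_phase]).
        action_gradients = K.gradients(Q_values, actions)
        self.get_action_gradients = K.function(
            inputs=[*self.model.input, K.learning_phase()],
            outputs=action_gradients)
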
Example 6
class DDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        #Policy Model & Value Model
        self.actorLocal = Actor(self.state_size, self.action_size,
                                self.action_low, self.action_high)
        self.criticLocal = Critic(self.state_size, self.action_size)
        self.actorTarget = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.criticTarget = Critic(self.state_size, self.action_size)

        #Initializing target model with local model params
        self.criticTarget.model.set_weights(
            self.criticLocal.model.get_weights())
        self.actorTarget.model.set_weights(self.actorLocal.model.get_weights())

        #Replay Buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.noise = OUNoise(self.action_size, 0, 0.1, 0.25)
        self.discountGamma = 0.9

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            exp = self.memory.sample()
            self.learn(exp)
        self.last_state = next_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actorLocal.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, exp):
        """
            https://docs.scipy.org/doc/numpy/reference/generated/numpy.vstack.html
            Vertical Stacking of arrays
            This took a long time to get in place :). Thanks to some other references in github too for examples. 
        """
        state = np.vstack([ex.state for ex in exp if ex is not None])
        action = np.array([ex.action for ex in exp
                           if ex is not None]).reshape(-1, self.action_size)
        reward = np.array([ex.reward for ex in exp
                           if ex is not None]).reshape(-1, 1)
        done = np.array([ex.done for ex in exp
                         if ex is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [ex.next_state for ex in exp if ex is not None])

        actions_next = self.actorTarget.model.predict_on_batch(next_states)
        QTargets_next = self.criticTarget.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = reward + self.discountGamma * QTargets_next * (1 - done)
        self.criticLocal.model.train_on_batch(x=[state, action], y=Q_targets)

        actionGradients = np.reshape(
            self.criticLocal.get_action_gradients([state, action, 0]),
            (-1, self.action_size))
        self.actorLocal.train_fn([state, actionGradients, 1])

        # Soft-update target models (soft-update rate tau = 0.01)
        tau = 0.01
        self.criticTarget.model.set_weights(
            tau * np.array(self.criticLocal.model.get_weights()) +
            (1 - tau) * np.array(self.criticTarget.model.get_weights()))
        self.actorTarget.model.set_weights(
            tau * np.array(self.actorLocal.model.get_weights()) +
            (1 - tau) * np.array(self.actorTarget.model.get_weights()))
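
The soft update used throughout these examples blends each local weight array into its target counterpart. A standalone helper, purely illustrative (the function name and default tau are assumptions, not part of any example):

import numpy as np

def soft_update_weights(local_weights, target_weights, tau=0.01):
    """Return tau * local + (1 - tau) * target for each pair of weight arrays."""
    return [tau * lw + (1.0 - tau) * tw
            for lw, tw in zip(local_weights, target_weights)]

# With tau = 0.01 each target parameter moves only 1% of the way toward the local
# parameter per update, so the target networks change slowly and keep the TD targets stable.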