Python OUNoise.sample Examples

Programming Language: Python

Namespace/Package Name: agents.OUNoise

Class/Type: OUNoise

Method/Function: sample

Examples at hotexamples.com: 6

Python OUNoise.sample - 6 examples found. These are the top-rated real-world Python examples of agents.OUNoise.OUNoise.sample, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

OUNoise(9)

reset(6)

sample(6)

get_noise_sample(1)

restart(1)

Example #1

Show file

File: agent.py Project: PhysCoder/MOOC-projects

class DDPG():
    """Reinforcement Learning agent that learns using DDPG.

    The agent keeps a local and a target copy of both the actor and the
    critic, an exploration-noise process and a replay buffer.
    """

    def __init__(self,
                 task,
                 explore_mu=0,
                 explore_theta=0.15,
                 explore_sigma=0.2,
                 gamma=0.99,
                 tau=0.01):
        """Build the networks, the noise process and the replay memory.

        Args:
            task: Environment object; must expose ``state_size``,
                ``action_size``, ``action_low``, ``action_high`` and
                ``reset()``.
            explore_mu: Forwarded to ``OUNoise`` as its 2nd argument.
            explore_theta: Forwarded to ``OUNoise`` as its 3rd argument.
            explore_sigma: Forwarded to ``OUNoise`` as its 4th argument.
            gamma: Discount factor used when building the Q targets.
            tau: Interpolation factor used by ``soft_update``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = explore_mu
        self.exploration_theta = explore_theta
        self.exploration_sigma = explore_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        # Was `self.tau = gamma`, which silently ignored the `tau` argument.
        self.tau = tau  # for soft update of target parameters

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition and learn once enough samples are stored."""
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Add some noise for exploration.
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Drop empty slots once, then split the tuples into per-field arrays.
        batch = [e for e in experiences if e is not None]
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term for terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local).
        # NOTE(review): the trailing 0 / 1 is presumably the Keras
        # learning-phase flag (0 = test, 1 = train) -- confirm in Actor/Critic.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target weight array to
        ``tau * local + (1 - tau) * target``.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        # Blend layer by layer: the weight arrays generally have different
        # shapes, and np.array() on such a ragged list raises ValueError on
        # NumPy >= 1.24.
        new_weights = [
            self.tau * np.asarray(local_w) +
            (1 - self.tau) * np.asarray(target_w)
            for local_w, target_w in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)

Example #2

Show file

class agentDDPG():
    """DDPG agent with local/target actor-critic pairs and a replay buffer."""

    def __init__(self, task):
        """Build networks, exploration noise and replay memory for ``task``.

        Args:
            task: Environment object; must expose ``state_size``,
                ``action_size``, ``action_high``, ``action_low`` and
                ``reset()``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.rotor_high = task.action_high
        self.rotor_low = task.action_low

        # The local networks are trained on every learning step; the target
        # networks track them slowly through soft_update().
        self.actor_local = actor(self.state_size,
                                 self.action_size,
                                 h1=64,
                                 h2=32,
                                 lr=0.001,
                                 r_h=self.rotor_high,
                                 r_l=self.rotor_low)
        self.actor_target = actor(self.state_size,
                                  self.action_size,
                                  h1=64,
                                  h2=32,
                                  lr=0.001,
                                  r_h=self.rotor_high,
                                  r_l=self.rotor_low)

        self.critic_local = critic(self.state_size,
                                   self.action_size,
                                   h1=32,
                                   h2=24,
                                   lr=0.001)
        self.critic_target = critic(self.state_size,
                                    self.action_size,
                                    h1=32,
                                    h2=24,
                                    lr=0.001)

        # Start the target networks from the same weights as the local ones.
        self.actor_target.actorModel.set_weights(
            self.actor_local.actorModel.get_weights())
        self.critic_target.criticModel.set_weights(
            self.critic_local.criticModel.get_weights())

        # Exploration noise.
        # NOTE(review): the arguments are passed as (size, mu, sigma, theta);
        # confirm this matches the OUNoise signature -- if it expects
        # (size, mu, theta, sigma) the two attribute names are swapped.
        self.mu = 0
        self.sigma = 0.15
        self.theta = 0.2
        self.OUNoise = OUNoise(self.action_size, self.mu, self.sigma,
                               self.theta)

        # Replay memory.
        self.bufferSize = 100000
        self.batch_size = 64
        self.memory = memoryBuffer(self.bufferSize, self.batch_size)

        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # soft-update interpolation factor

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.OUNoise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn once enough samples are stored."""
        self.memory.add(state, action, reward, next_state, done)

        if self.memory.len() > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def learn(self, experience):
        """Run one actor/critic update from a batch of experience tuples.

        Args:
            experience: Iterable of records with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        batch = [e for e in experience if e is not None]
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])
        done = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)

        # Bootstrap the TD target from the *target* networks. The previous
        # code queried the local networks here, which left the target
        # networks maintained but never used.
        actions_next = self.actor_target.actorModel.predict_on_batch(
            next_states)
        Q_targets_next = self.critic_target.criticModel.predict_on_batch(
            [next_states, actions_next])

        # Train the local critic on the TD target; (1 - done) removes the
        # bootstrap term for terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - done)
        self.critic_local.criticModel.train_on_batch(x=[states, actions],
                                                     y=Q_targets)

        # Train the local actor with the action gradients from the critic.
        action_gradients = self.critic_local.get_action_gradients(
            inputs=[states, actions, 0])
        action_gradients = np.reshape(action_gradients, (-1, self.action_size))

        self.actor_local.train_actor(inputs=[states, action_gradients, 1])

        self.soft_update(self.actor_local.actorModel,
                         self.actor_target.actorModel)
        self.soft_update(self.critic_local.criticModel,
                         self.critic_target.criticModel)

    def soft_update(self, local_model, target_model):
        """Set each target weight to ``tau * local + (1 - tau) * target``."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights)

        # Blend layer by layer: np.array() on a list of differently shaped
        # weight arrays raises ValueError on NumPy >= 1.24.
        new_weights = [
            np.asarray(local_w) * self.tau +
            np.asarray(target_w) * (1 - self.tau)
            for local_w, target_w in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)

    def act(self, state):
        """Return the local actor's action for ``state`` plus noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.actorModel.predict(state)[0]
        return list(action + self.OUNoise.sample())

Example #3

Show file

class DDPG():
    """Reinforcement Learning agent that learns using DDPG.

    Besides the actor/critic machinery the agent tracks a per-episode score
    and a linear weight matrix ``w`` that is perturbed by ``score_update``.
    """

    def __init__(self, task):
        """Build networks, noise, replay memory and score-tracking state.

        Args:
            task: Environment object; must expose ``state_size``,
                ``action_size``, ``action_low``, ``action_high`` and
                ``reset()``.
        """
        print('loaded DDPG ')
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        # Weights for a simple linear policy (state_space x action_space),
        # scaled so that it starts producing actions in a decent range.
        self.w = np.random.normal(
            size=(self.state_size, self.action_size),
            scale=(self.action_range / (2 * self.state_size)))

    def reset_episode(self):
        """Reset per-episode counters, the noise and the task; return the state."""
        self.total_reward = 0.0
        self.count = 0
        self.noise_scale = 0.1
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition, learn if possible, and score finished episodes."""
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

        if done:
            self.score_update()

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Drop empty slots once, then split the tuples into per-field arrays.
        batch = [e for e in experiences if e is not None]
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term for terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target weight array to
        ``tau * local + (1 - tau) * target``.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        # Blend layer by layer: np.array() on a list of differently shaped
        # weight arrays raises ValueError on NumPy >= 1.24.
        new_weights = [
            self.tau * np.asarray(local_w) + (1 - self.tau) * np.asarray(target_w)
            for local_w, target_w in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)

    def score_update(self):
        """Score the finished episode and perturb ``w`` by random search.

        The score is the mean reward per step. A new best score halves
        ``noise_scale`` (floor 0.01); otherwise ``w`` is restored to the best
        known weights and ``noise_scale`` is doubled (cap 3.2).
        """
        # Learn by random policy search, using a reward-based score
        self.score = self.total_reward / float(self.count) if self.count else 0.0

        if self.score > self.best_score:
            self.best_score = self.score
            self.best_w = self.w
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        else:
            # best_w is still None if no episode has beaten -inf yet (e.g. a
            # NaN score); keep the current weights instead of crashing below.
            if self.best_w is not None:
                self.w = self.best_w
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape)  # equal noise in all directions

Example #4

Show file

class DDPG():
    '''reinforcement learning agent that learns using Deep Deterministic Policy Gradient'''

    def __init__(self, task):
        '''build networks, noise process, replay memory and reward history

        Params
        ======
        task (object)   : environment; must expose state_size, action_size,
                          action_low, action_high and reset()

        Reference: Continuous Control With Deep Reinforcement Learning(2016)
        Playing CartPole through Asynchronous Advantage Actor Critic (A3C) with tf.keras
        =========
        gamma   : 0.99
        tau     : 0.001
        buffer_size (ReplayBuffer)  : 1e6
        batch_size (ReplayBuffer)   : 64
        theta (Ornstein-Uhlenbeck process)  : 0.15
        sigma (Ornstein-Uhlenbeck process)  : 0.2
        '''
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # actor (policy) model - use two copies of model for updating model and producing target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # critic (value) model - use two copies of model for updating model and producing target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # reward history
        self.best_avg_score = -np.inf
        self.accumulated_reward = 0
        self.count = 0

    def reset_episode(self):
        '''reset noise, task and per-episode reward counters; return the initial state'''
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.accumulated_reward = 0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        '''store one transition, learn if possible and track the best average score'''
        # save experience and reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # roll over last state
        self.last_state = next_state

        # accumulate reward
        self.accumulated_reward += reward
        self.count += 1

        # record best average score (count was just incremented, so it is >= 1)
        if done:
            avg_score = float(self.accumulated_reward / self.count)
            if avg_score > self.best_avg_score:
                self.best_avg_score = avg_score

    def act(self, state):
        '''returns actions for given state(s) as per current policy'''
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # both action and self.noise.sample() are numpy objects, so + sums
        # them element-wise instead of concatenating
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        '''update policy and value parameters using given batch of experience tuples'''
        # drop empty slots once, then split the tuples into per-field arrays
        batch = [e for e in experiences if e is not None]
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # get predicted next-state actions and Q-values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term for terminal transitions
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # train actor model (local)
        # The learning phase flag (0 = test, 1 = train) is passed as the last
        # input to any Keras function that behaves differently at train time
        # and test time.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        '''soft update model parameters: target <- tau * local + (1 - tau) * target'''
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights),\
            'Local and target model parameters must have the same size'

        # blend layer by layer: np.array() on a list of differently shaped
        # weight arrays raises ValueError on NumPy >= 1.24
        new_weights = [
            self.tau * np.asarray(local_w) +
            (1 - self.tau) * np.asarray(target_w)
            for local_w, target_w in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)

Example #5

Show file

File: agent.py Project: Messan/Deep-Learning-Nanodegree-Udacity

class DDPG_Agent:
    """DDPG agent with configurable noise, replay memory and network sizes."""

    def __init__(self, task, noise, memory, rl_param, nn_hidden, actor_lr,
                 critic_lr, q_lambda):
        """Build actor/critic pairs, exploration noise and replay memory.

        Args:
            task: Environment object; must expose ``action_low``,
                ``action_high``, ``state_size``, ``action_size`` and
                ``reset()``.
            noise: Indexable; items 0, 1, 2 are stored as mean, sigma, theta
                and forwarded to ``OUNoise`` in that order.
            memory: Indexable; item 0 is the buffer capacity, item 1 the
                batch size.
            rl_param: Indexable; item 0 is the discount factor, item 1 the
                soft-update factor.
            nn_hidden: Indexable; item 0 is passed to the actors and item 1
                to the critics as ``hidden_units``.
            actor_lr: Learning rate passed to both actors.
            critic_lr: Learning rate passed to both critics.
            q_lambda: Passed through to every actor and critic.
        """
        # Adapted for this gym
        self.task = task
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.state_space = task.state_size
        self.action_space = task.action_size
        self.q_lambda = q_lambda

        # Instantiate Actors and Critics.
        self.actor = Actor(self.state_space,
                           self.action_space,
                           self.action_low,
                           self.action_high,
                           hidden_units=nn_hidden[0],
                           learning_rate=actor_lr,
                           q_lambda=q_lambda)
        self.actor_target = Actor(self.state_space,
                                  self.action_space,
                                  self.action_low,
                                  self.action_high,
                                  hidden_units=nn_hidden[0],
                                  learning_rate=actor_lr,
                                  q_lambda=q_lambda)

        self.critic = Critic(self.state_space,
                             self.action_space,
                             hidden_units=nn_hidden[1],
                             learning_rate=critic_lr,
                             q_lambda=q_lambda)
        self.critic_target = Critic(self.state_space,
                                    self.action_space,
                                    hidden_units=nn_hidden[1],
                                    learning_rate=critic_lr,
                                    q_lambda=q_lambda)

        # Set same weights in target.
        self.actor_target.model.set_weights(self.actor.model.get_weights())
        self.critic_target.model.set_weights(self.critic.model.get_weights())

        # Noise for exploration.
        self.mean = noise[0]
        self.sigma = noise[1]
        self.theta = noise[2]
        self.ounoise = OUNoise(self.action_space, self.mean, self.sigma,
                               self.theta)

        # Experience Replay memory.
        self.capacity = memory[0]
        self.batch_size = memory[1]
        self.er_buffer = ExperienceReplayBuffer(capacity=self.capacity,
                                                batch_size=self.batch_size)

        # RL parameters.
        self.gamma = rl_param[0]  # discount factor
        self.t = rl_param[1]  # soft-update interpolation factor

        # Keeping track of learning.
        self.learning_rewards = list()
        self.total_reward = None
        self.best_reward = -np.inf
        self.losses = list()

    def restart_task(self):
        """Record the finished episode's reward and reset task and noise.

        Returns:
            The initial state returned by ``task.reset()``.
        """
        # total_reward is None only before the first episode.
        if self.total_reward is not None:
            self.learning_rewards.append(self.total_reward)
            if self.total_reward > self.best_reward:
                self.best_reward = self.total_reward
        self.total_reward = 0
        state = self.task.reset()
        self.state = state
        self.ounoise.restart()
        return state

    def act(self, state, epsilon):
        """Return the actor's action plus epsilon-scaled noise, clipped.

        The result is reshaped to ``(-1, action_space)`` and clipped to
        ``[action_low, action_high]``.
        """
        self.action_wo_noise = self.actor.model.predict(
            np.reshape(state, newshape=(-1, self.state_space)))
        self.step_noise = self.ounoise.sample() * epsilon
        # NOTE(review): only element 0 of the noise sample is used; this
        # presumably assumes sample() returns a 2-D array -- confirm.
        action = np.array(self.action_wo_noise[0] +
                          self.step_noise[0]).reshape(-1, self.action_space)
        action_clipped = np.clip(a=action,
                                 a_min=self.action_low,
                                 a_max=self.action_high)
        return action_clipped

    # Saves experience into memory and updates actor-critic weights.
    def store_learn(self, state, action, reward, done, next_state):
        """Store one transition, learn if possible and accumulate the reward."""
        # Store experience into exp replay memory.
        self.er_buffer.add_env_reaction(
            (state, action, reward, done, next_state))

        # Learn if agent has enough experiences.
        if len(self.er_buffer.mem) > self.batch_size:
            self.learn()

        self.total_reward += reward
        # Update to the current state of the environment.
        self.state = next_state

    def soft_update(self):
        """Move both target networks towards the current networks.

        Each target weight array becomes ``target * (1 - t) + t * current``.
        """
        pairs = ((self.actor.model, self.actor_target.model),
                 (self.critic.model, self.critic_target.model))
        for current_model, target_model in pairs:
            # Blend layer by layer: np.array() on a list of differently
            # shaped weight arrays raises ValueError on NumPy >= 1.24.
            blended = [
                np.asarray(target_w) * (1 - self.t) +
                self.t * np.asarray(current_w)
                for current_w, target_w in zip(current_model.get_weights(),
                                               target_model.get_weights())
            ]
            target_model.set_weights(blended)

    # Learn step of the agent, update weights of actor-critic and actor-critic target NN.
    def learn(self):
        """Sample a batch and run one critic update, actor update and soft update."""
        states, actions, rewards, dones, next_states = (
            self.er_buffer.sample_batch())
        states = np.vstack(states)
        actions = np.array(actions,
                           dtype=np.float32).reshape(-1, self.action_space)
        rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1)
        dones = np.array(dones, dtype=np.uint8).reshape(-1, 1)
        next_states = np.vstack(next_states)

        # Get action for deterministic policy.
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        next_q_values = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # (1 - dones) removes the bootstrap term for terminal transitions.
        targets = rewards + self.gamma * next_q_values * (1 - dones)
        loss = self.critic.model.train_on_batch(x=[states, actions], y=targets)
        self.losses.append(loss)

        # Action gradients from the critic; the trailing 0 is presumably the
        # learning-phase flag (test), so no weights change here -- confirm.
        action_gradients = self.critic.get_action_gradients(
            [states, actions, 0])
        action_gradients = np.reshape(action_gradients[0],
                                      (-1, self.action_space))

        # The actor is trained with a trailing flag of 1.
        self.actor.train_fn([states, action_gradients, 1])

        # Do soft update on weights.
        self.soft_update()

Example #6

Show file

class DDPG():
    """DDPG agent with local/target actor-critic pairs and a replay buffer."""

    # Soft-update interpolation factor (previously hard-coded in learn()).
    tau = 0.01

    def __init__(self, task):
        """Build networks, replay buffer and exploration noise for ``task``.

        Args:
            task: Environment object; must expose ``state_size``,
                ``action_size``, ``action_low``, ``action_high`` and
                ``reset()``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Policy Model & Value Model
        self.actorLocal = Actor(self.state_size, self.action_size,
                                self.action_low, self.action_high)
        self.criticLocal = Critic(self.state_size, self.action_size)
        self.actorTarget = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.criticTarget = Critic(self.state_size, self.action_size)

        # Initializing target model with local model params
        self.criticTarget.model.set_weights(
            self.criticLocal.model.get_weights())
        self.actorTarget.model.set_weights(self.actorLocal.model.get_weights())

        # Replay Buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.noise = OUNoise(self.action_size, 0, 0.1, 0.25)
        self.discountGamma = 0.9

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition and learn once enough samples are stored."""
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            exp = self.memory.sample()
            self.learn(exp)
        self.last_state = next_state

    def act(self, state):
        """Return the local actor's action for ``state`` plus noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actorLocal.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, exp):
        """Run one critic update, actor update and soft target update.

        Args:
            exp: Iterable of records with ``state``, ``action``, ``reward``,
                ``done`` and ``next_state`` attributes; ``None`` entries are
                skipped.
        """
        # np.vstack stacks the per-sample state vectors into one 2-D batch.
        batch = [ex for ex in exp if ex is not None]
        state = np.vstack([ex.state for ex in batch])
        action = np.array([ex.action for ex in batch]).reshape(
            -1, self.action_size)
        reward = np.array([ex.reward for ex in batch]).reshape(-1, 1)
        done = np.array([ex.done for ex in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([ex.next_state for ex in batch])

        actions_next = self.actorTarget.model.predict_on_batch(next_states)
        QTargets_next = self.criticTarget.model.predict_on_batch(
            [next_states, actions_next])

        # (1 - done) removes the bootstrap term for terminal transitions.
        Q_targets = reward + self.discountGamma * QTargets_next * (1 - done)
        self.criticLocal.model.train_on_batch(x=[state, action], y=Q_targets)

        actionGradients = np.reshape(
            self.criticLocal.get_action_gradients([state, action, 0]),
            (-1, self.action_size))
        self.actorLocal.train_fn([state, actionGradients, 1])

        # Soft-update target models
        self._soft_update(self.criticLocal.model, self.criticTarget.model)
        self._soft_update(self.actorLocal.model, self.actorTarget.model)

    def _soft_update(self, local_model, target_model):
        """Set each target weight to ``tau * local + (1 - tau) * target``."""
        # Blend layer by layer: np.array() on a list of differently shaped
        # weight arrays raises ValueError on NumPy >= 1.24.
        blended = [
            self.tau * np.asarray(local_w) +
            (1 - self.tau) * np.asarray(target_w)
            for local_w, target_w in zip(local_model.get_weights(),
                                         target_model.get_weights())
        ]
        target_model.set_weights(blended)