Example #1
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 fc1_units,
                 fc2_units,
                 weighted=False,
                 individual=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)  # random.seed() returns None, so there is nothing useful to store
        self.epsilon = EPSILON_MAX

        # Actor Network (w/ Target Network)
        if weighted:
            self.actor_local = Weight_adapter(state_size,
                                              action_size).to(device)
            self.actor_target = Weight_adapter(state_size,
                                               action_size).to(device)
        elif individual:
            self.actor_local = IndividualModel(state_size, action_size,
                                               random_seed,
                                               fc1_units).to(device)
            self.actor_target = IndividualModel(state_size, action_size,
                                                random_seed,
                                                fc1_units).to(device)
        else:
            self.actor_local = Actor(state_size, action_size, random_seed,
                                     fc1_units, fc2_units).to(device)
            self.actor_target = Actor(state_size, action_size, random_seed,
                                      fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size,
                             random_seed,
                             mu=0,
                             theta=0.15,
                             sigma=0.2)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Make sure the target networks start with the same weights as the local (source) networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > LEARN_START:
            # Learn every UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                # Learn, if enough samples are available in memory
                if len(self.memory) > BATCH_SIZE:
                    for _ in range(UPDATES_PER_STEP):
                        experiences = self.memory.sample()
                        self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        #print(action)
        self.actor_local.train()

        if add_noise:
            tem_noise = self.noise.sample()
            action += self.epsilon * tem_noise
        # print(tem_noise, np.clip(action, -1, 1))
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ----------------------- decay exploration epsilon -------------------- #
        if self.epsilon - EPSILON_DECAY > EPSILON_MIN:
            self.epsilon -= EPSILON_DECAY
        else:
            self.epsilon = EPSILON_MIN

        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        """Copy the source network's weights into the target network."""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
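Example #1 depends on module-level imports and hyperparameter constants that are not shown in the snippet. The block below is a minimal sketch of that setup; the values are illustrative placeholders, not the author's original settings.

# Illustrative module-level setup assumed by Example #1 (placeholder values, not the author's).
import random
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e6)    # replay buffer size
BATCH_SIZE = 128          # minibatch size
GAMMA = 0.99              # discount factor
TAU = 1e-3                # soft-update interpolation factor
LR_ACTOR = 1e-4           # actor learning rate
LR_CRITIC = 1e-3          # critic learning rate
WEIGHT_DECAY = 0          # L2 weight decay for the critic optimizer
LEARN_START = 1000        # minimum stored experiences before learning begins
UPDATE_EVERY = 20         # learn every N environment steps
UPDATES_PER_STEP = 10     # gradient updates per learning step
EPSILON_MAX = 1.0         # initial noise scale
EPSILON_MIN = 0.01        # final noise scale
EPSILON_DECAY = 1e-6      # per-update decay of the noise scale

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")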
Example #2
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        #self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward

        if self.total_reward > self.best_total_reward:
            self.best_total_reward = self.total_reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
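Examples #2 and #4 build their exploration noise with OUNoise(size, mu, theta, sigma), whose implementation is not included. The sketch below is a minimal Ornstein-Uhlenbeck process matching that constructor; it is an assumption, not the original class.

# Minimal Ornstein-Uhlenbeck noise sketch matching the OUNoise(size, mu, theta, sigma) calls above (an assumption).
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Drift the state toward the mean, add Gaussian noise, and return the new state."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state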
Example #3
class Agent:
    """
    Interacts with and learns from the environment.
    """
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """
        Initialize an Agent

        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            num_agents (int): simultaneous running agents
            random_seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise object
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, device,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in replay memory, and use prioritized sample from buffer to learn.
        """

        # Save memory
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY

        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # Learn from memory if enough samples exist
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, states, add_noise=True):
        """
        Returns actions for given state as per current policy.
        """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[i, :] = action

        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices = experiences

        # update Critic
        # Get predicted next-state actions and Q values from the target networks
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # Update priorities
        delta = abs(Q_targets - Q_expected).detach().cpu().numpy()
        self.memory.update_priorities(delta, indices)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """

        for target_model_param, local_model_param in zip(
                target_model.parameters(), local_model.parameters()):
            target_model_param.data.copy_(tau * local_model_param.data +
                                          (1. - tau) * target_model_param.data)
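Example #3 references prioritized-replay constants and a multi-agent environment that are not shown. The constants below are illustrative placeholders, and the commented loop sketches how the per-agent act/step interface is typically driven; the env object is hypothetical.

# Illustrative constants assumed by Example #3 (placeholder values).
UPDATE_NN_EVERY = 1             # learn every N environment steps
UPDATE_MEM_EVERY = 20           # refresh the prioritized sample every N steps
UPDATE_MEM_PAR_EVERY = 3000     # update prioritization parameters every N steps
EXPERIENCES_PER_SAMPLING = 640  # experiences drawn per sampling pass
BUFFER_SIZE = int(1e6)
BATCH_SIZE = 128
GAMMA = 0.99
TAU = 1e-3
LR_ACTOR = 1e-4
LR_CRITIC = 1e-3
WEIGHT_DECAY = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Hypothetical driver loop: states, rewards, dones carry one row per agent.
# agent = Agent(state_size, action_size, num_agents, random_seed=0)
# for episode in range(n_episodes):
#     states = env.reset()                     # shape (num_agents, state_size)
#     agent.reset()
#     while True:
#         actions = agent.act(states)          # shape (num_agents, action_size)
#         next_states, rewards, dones = env.step(actions)
#         agent.step(states, actions, rewards, next_states, dones)
#         states = next_states
#         if np.any(dones):
#             break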
Example #4
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 1e-5  #.0001
        self.critic_lr = 1e-4  #0.0000001

        self.network = [128, 256, 128]

        self.train = train
        network = self.network
        actor_lr = self.actor_lr
        critic_lr = self.critic_lr

        if (self.train):
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size,
                                     self.action_low, self.action_high,
                                     actor_lr, network)
            self.actor_target = Actor(self.state_size, self.action_size,
                                      self.action_low, self.action_high,
                                      actor_lr, network)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size,
                                       critic_lr, network)
            self.critic_target = Critic(self.state_size, self.action_size,
                                        critic_lr, network)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())

            # Noise process
            self.exploration_mu = 0  # Mean of the OU process
            self.exploration_theta = 0.15  # How quickly the process reverts to the mean
            self.exploration_sigma = 0.2  # Degree of volatility
            self.noise = OUNoise(self.action_size, self.exploration_mu,
                                 self.exploration_theta,
                                 self.exploration_sigma)

            # Replay memory
            self.buffer_size = 5000
            self.batch_size = 16
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
            self.targets = ReplayBuffer(self.buffer_size, self.batch_size)

            # Algorithm parameters
            self.gamma = 0.99  # discount factor
            self.tau = 0.01  # for soft update of target parameters

            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            print(self.actor_local.model.summary())
            print(self.critic_local.model.summary())

            # https://stackoverflow.com/questions/44861149/keras-use-tensorboard-with-train-on-batch?rq=1
            # Create the TensorBoard callback,
            # which we will drive manually
            self.tensorboard = keras.callbacks.TensorBoard(
                log_dir='logdir',
                histogram_freq=0,
                batch_size=self.batch_size,
                write_graph=True,
                write_grads=True)

            self.tensorboard.set_model(self.critic_local.model)
            self.summary_writer = tf.summary.FileWriter("scores")

            self.batch_id = 0

    def reset_episode(self):
        if (self.train):
            self.noise.reset()
            self.noise_arr = []
            self.noise_matrix = [0., 0., 0., 0.]

        state = self.task.reset()
        self.last_state = state
        return state

    def save_initial_weights(self):
        self.actor_local.model.save_weights('actor_local.h5')
        self.actor_target.model.save_weights('actor_target.h5')
        self.critic_local.model.save_weights('critic_local.h5')
        self.critic_target.model.save_weights('critic_target.h5')

    def load_initial_weights(self):
        self.actor_local.model.load_weights('actor_local.h5')
        self.actor_target.model.load_weights('actor_target.h5')
        self.critic_local.model.load_weights('critic_local.h5')
        self.critic_target.model.load_weights('critic_target.h5')

    def save_model(self):
        # Save the weights
        self.actor_local.model.save_weights('model_weights.h5')

    def load_weights(self, option=None):
        if option is None:
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('model_weights.h5')
        else:
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('weights-best.hdf5')
            print(self.trained.model.summary())

    def predict(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.trained.model.predict(state)[0]
        return action

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size * 2):
            experiences = self.memory.sample()
            self.learn(experiences)

        if (len(self.memory) == self.buffer_size):
            self.memory.memory.clear()
            print("buffer cleared")

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        noise = self.noise.sample()
        action = list(self.actor_local.model.predict(state)[0] + noise)

        return action, noise  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        '''
        print("States", states.shape)
        print("actions", actions.shape)
        print("rewards", rewards.shape)
        print("dones", dones.shape)
        print("Next states", next_states.shape)
        '''
        # keep training actor local and critic local
        # use values from target model to update and train local
        # don't train target models, we soft update target

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))

        actions_next = self.actor_target.model.predict_on_batch(
            next_states)  #target

        # Q values predicted by the target critic
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])  #target

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        critic_loss = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn([states, action_gradients,
                                                1])  # custom training function

        self.tensorboard.on_epoch_end(
            self.batch_id, named_logs(self.critic_local.model, [critic_loss]))
        self.batch_id += 1

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
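Example #4 passes its TensorBoard callback the result of a named_logs helper that is not shown above. A common version of that helper (an assumption based on the Stack Overflow pattern linked in the code) simply zips the model's metric names with the logged values.

# Hypothetical named_logs helper assumed by Example #4's TensorBoard logging.
def named_logs(model, logs):
    """Map a Keras model's metrics_names onto the scalar values returned by train_on_batch."""
    return {name: value for name, value in zip(model.metrics_names, logs)}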
Example #5
class Agent:
    '''The most perfect DDPG Agent you have ever seen'''
    # Parameters taken from various sources
    epsilon = 0
    epsilon_min = 0
    decay = 0.9

    learn_start = 1000
    gamma = 0.99
    alpha = 0.002
    tau = 0.005

    mem_len = 1e5
    memory = deque(maxlen=int(mem_len))

    def __init__(self, env, seed):

        self.env = env
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)
        self.env.seed(seed)
        self.actor = self.createModel()
        self.target_actor = self.createModel()
        self.noise = OUNoise(
            self.env.action_space.shape[0], seed, theta=0.2, sigma=0.5
        )  # noise is actually OpenAI baselines OU Noise wrapped in another OUNoise function
        self.critic = self.createModel((self.env.observation_space.shape[0],
                                        self.env.action_space.shape[0]))
        self.target_critic = self.createModel(
            (self.env.observation_space.shape[0],
             self.env.action_space.shape[0]))
        self.target_critic.set_weights(self.critic.get_weights())  # ensure initial weights are equal across networks
        self.target_actor.set_weights(self.actor.get_weights())
        self.reset()
        # return self.actor

    def createModel(self, input=None):
        '''Generate neural network models based on inputs, defaults to Actor model'''
        last_init = tf.random_uniform_initializer(
            minval=-0.003, maxval=0.003
        )  # To prevent actor network from causing steep gradients
        if input is None:
            input = self.env.observation_space.shape[0]  # Actor
            inputs = keras.layers.Input(shape=(input, ))
            hidden = keras.layers.Dense(256, activation="relu")(inputs)
            hidden = keras.layers.Dense(256, activation="relu")(hidden)
            outputs = keras.layers.Dense(1,
                                         activation="tanh",
                                         kernel_initializer=last_init)(hidden)  # single output unit: assumes a 1-D action space
            model = Actor(inputs, outputs)
            lr_schedule = keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=self.alpha / 2,
                decay_steps=1e9,
                decay_rate=1
            )  #This could allow us to use decaying learning rate
            model.compile(
                loss="huber_loss", optimizer=Adam(learning_rate=lr_schedule)
            )  #Compile model with optimizer so we can apply tape.gradient later
        else:  # Critic
            input_o, input_a = input
            input1 = keras.layers.Input(shape=(input_o, ))
            input2 = keras.layers.Input(shape=(input_a, ))
            input11 = keras.layers.Dense(16, activation="relu")(input1)
            input11 = keras.layers.Dense(32, activation="relu")(input11)
            input21 = keras.layers.Dense(32, activation="relu")(input2)
            cat = keras.layers.Concatenate()([input11, input21])
            hidden = keras.layers.Dense(256, activation="relu")(cat)
            hidden = keras.layers.Dense(256, activation="relu")(hidden)
            outputs = keras.layers.Dense(1,
                                         activation="linear",
                                         kernel_initializer=last_init)(hidden)
            lr_schedule = keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=self.alpha / 1,
                decay_steps=1e9,
                decay_rate=1)
            model = Critic([input1, input2], outputs)
            model.compile(loss="mean_squared_error",
                          optimizer=Adam(
                              learning_rate=lr_schedule))  # mean_squared_error
        return model

    def replayBuffer(self, state, action, reward, next_state, terminal):
        ##TODO Implement prioritised buffer
        self.memory.append([state, action, reward, next_state, terminal])

    @tf.function  # compiled into a TensorFlow graph for speed
    def replay(self, states, actions, rewards,
               next_states):  #, actor, target_actor, critic, target_critic):
        '''tf function that replays sampled experience to update actor and critic networks using gradient'''
        # Very much inspired by Keras tutorial: https://keras.io/examples/rl/ddpg_pendulum/
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(next_states, training=True)
            q_target = rewards + self.gamma * self.target_critic(
                [next_states, target_actions], training=True)
            q_current = self.critic([states, actions], training=True)
            critic_loss = tf.math.reduce_mean(
                tf.math.square(q_target - q_current))

        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions_pred = self.actor(states, training=True)
            q_current = self.critic([states, actions_pred], training=True)
            actor_loss = -tf.math.reduce_mean(q_current)

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

    @tf.function
    def update_weight(self, target_weights, weights, tau):
        '''tf function for updating the weights of selected target network'''
        for (a, b) in zip(target_weights, weights):
            a.assign(b * tau + a * (1 - tau))

    def trainTarget(self):
        '''Standard function to update target networks by tau'''
        self.update_weight(self.target_actor.variables, self.actor.variables,
                           self.tau)
        self.update_weight(self.target_critic.variables, self.critic.variables,
                           self.tau)

    def sample2batch(self, batch_size=64):
        '''Return a set of Tensor samples from the memory buffer of batch_size, default is 64'''
        # batch_size = 64

        if len(self.memory) < batch_size:
            # Return nothing if not enough experiences are available
            return
        # Generate a batch and empty arrays
        samples = random.sample(self.memory, batch_size)
        next_states = np.zeros(
            (batch_size, self.env.observation_space.shape[0]))
        states = np.zeros((batch_size, self.env.observation_space.shape[0]))
        rewards = np.zeros((batch_size, 1))
        actions = np.zeros((batch_size, self.env.action_space.shape[0]))

        # Separate batch into arrays
        for idx, sample in enumerate(samples):
            state, action, reward, next_state, terminal = sample
            states[idx] = state
            actions[idx] = action
            rewards[idx] = reward
            next_states[idx] = next_state

        # Convert arrays to tensors so we can use replay as a callable TensorFlow graph
        states = tf.convert_to_tensor((states))
        rewards = tf.convert_to_tensor((rewards))
        rewards = tf.cast(rewards, dtype=tf.float32)
        actions = tf.convert_to_tensor((actions))
        next_states = tf.convert_to_tensor((next_states))

        return (states, actions, rewards, next_states)

    def train(self, state, action, reward, next_state, terminal, steps):
        '''Function call to update buffer and networks at predetermined intervals'''
        self.replayBuffer(state, action, reward, next_state,
                          terminal)  # Add new data to buffer
        if steps % 1 == 0 and len(self.memory) > self.learn_start:
            # Sample and learn every step (increase the modulus to learn less often)
            samples = self.sample2batch()
            states, actions, rewards, next_states = samples
            self.replay(states, actions, rewards, next_states)
        if steps % 1 == 0:  # Update target networks every step (increase the modulus to update less often)
            self.trainTarget()

    def reset(self):
        self.epsilon *= self.decay
        self.epsilon = max(self.epsilon_min, self.epsilon)

    def chooseAction(self, state, scale=False):
        '''Choose action based on policy and noise function. Scale option used to limit maximum actions'''
        # self.epsilon *= self.decay
        # self.epsilon = round(max(self.epsilon / 1000, self.epsilon), 5)
        # print(state[0])
        state = tf.expand_dims(tf.convert_to_tensor(state), 0)  # add a batch dimension
        if np.random.random() < self.epsilon:  # epsilon-greedy exploration instead of OU noise
            return random.uniform(-1, 1)
        if scale:
            return np.clip(0.33 * self.actor(state) + self.noise.sample(), -1, 1)
        return np.clip(tf.squeeze(self.actor(state)).numpy() + self.noise.sample(), -1, 1)
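Example #5's Agent expects an external loop to call chooseAction, train, and reset. The sketch below is a hypothetical driver using the older Gym step/reset API implied by the env.seed(seed) call; the environment name and episode count are placeholders.

# Hypothetical driver loop for Example #5 (environment name and episode count are placeholders).
import gym
import numpy as np

env = gym.make("Pendulum-v1")
agent = Agent(env, seed=0)

for episode in range(200):
    state = env.reset()
    agent.reset()
    episode_reward, steps, done = 0.0, 0, False
    while not done:
        action = np.reshape(agent.chooseAction(state), (env.action_space.shape[0],))
        next_state, reward, done, _ = env.step(action)
        agent.train(state, action, reward, next_state, done, steps)
        state = next_state
        episode_reward += reward
        steps += 1
    print(f"Episode {episode}: reward {episode_reward:.1f}")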