class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_lr = 1e-5
        self.critic_lr = 1e-4
        self.network = [128, 256, 128]
        self.train = train

        network = self.network
        actor_lr = self.actor_lr
        critic_lr = self.critic_lr

        if self.train:
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size,
                                     self.action_low, self.action_high,
                                     actor_lr, network)
            self.actor_target = Actor(self.state_size, self.action_size,
                                      self.action_low, self.action_high,
                                      actor_lr, network)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size,
                                       critic_lr, network)
            self.critic_target = Critic(self.state_size, self.action_size,
                                        critic_lr, network)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())

            # Noise process
            self.exploration_mu = 0        # mean
            self.exploration_theta = 0.15  # how fast the variable reverts to the mean
            self.exploration_sigma = 0.2   # degree of volatility
            self.noise = OUNoise(self.action_size, self.exploration_mu,
                                 self.exploration_theta, self.exploration_sigma)

            # Replay memory
            self.buffer_size = 5000
            self.batch_size = 16
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
            self.targets = ReplayBuffer(self.buffer_size, self.batch_size)

            # Algorithm parameters
            self.gamma = 0.99  # discount factor
            self.tau = 0.01    # for soft update of target parameters

            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            self.actor_local.model.summary()
            self.critic_local.model.summary()

            # https://stackoverflow.com/questions/44861149/keras-use-tensorboard-with-train-on-batch?rq=1
            # Create the TensorBoard callback, which we will drive manually
            self.tensorboard = keras.callbacks.TensorBoard(
                log_dir='logdir',
                histogram_freq=0,
                batch_size=self.batch_size,
                write_graph=True,
                write_grads=True)
            self.tensorboard.set_model(self.critic_local.model)
            self.summary_writer = tf.summary.FileWriter("scores")
            self.batch_id = 0

    def reset_episode(self):
        if self.train:
            self.noise.reset()
        self.noise_arr = []
        self.noise_matrix = [0., 0., 0., 0.]
        state = self.task.reset()
        self.last_state = state
        return state

    def save_initial_weights(self):
        self.actor_local.model.save_weights('actor_local.h5')
        self.actor_target.model.save_weights('actor_target.h5')
        self.critic_local.model.save_weights('critic_local.h5')
        self.critic_target.model.save_weights('critic_target.h5')

    def load_initial_weights(self):
        self.actor_local.model.load_weights('actor_local.h5')
        self.actor_target.model.load_weights('actor_target.h5')
        self.critic_local.model.load_weights('critic_local.h5')
        self.critic_target.model.load_weights('critic_target.h5')

    def save_model(self):
        # Save the actor weights; they are all that is needed for inference
        self.actor_local.model.save_weights('model_weights.h5')

    def load_weights(self, option=None):
        if option is None:
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('model_weights.h5')
        else:
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('weights-best.hdf5')
        self.trained.model.summary()

    def predict(self, state):
        """Returns actions for given state(s) as per the loaded policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.trained.model.predict(state)[0]
        return action

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size * 2:
            experiences = self.memory.sample()
            self.learn(experiences)

        if len(self.memory) == self.buffer_size:
            self.memory.memory.clear()
            print("buffer cleared")

        # Roll over last state
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy, plus exploration noise."""
        state = np.reshape(state, [-1, self.state_size])
        noise = self.noise.sample()
        action = list(self.actor_local.model.predict(state)[0] + noise)
        return action, noise

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Debugging shapes (disabled):
        # print("States", states.shape); print("actions", actions.shape)
        # print("rewards", rewards.shape); print("dones", dones.shape)
        # print("Next states", next_states.shape)

        # Keep training actor_local and critic_local; use values from the
        # target models to build the training targets. The target models are
        # never trained directly, only soft-updated.

        # Get predicted next-state actions and Q values from target models:
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(
            next_states)  # actions predicted by the target actor
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train the local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        critic_loss = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train the local actor model using the critic's action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn(
            [states, action_gradients, 1])  # custom training function

        self.tensorboard.on_epoch_end(
            self.batch_id, named_logs(self.critic_local.model, [critic_loss]))
        self.batch_id += 1

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
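# The DDPG class above relies on two helpers defined elsewhere in the project:
# an Ornstein-Uhlenbeck noise process and a named_logs() utility for the
# manually driven TensorBoard callback. The sketches below are assumptions
# about their shape (names and signatures inferred from how they are called
# above), not the project's actual implementations.

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch; signature inferred from DDPG.__init__)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta  # how fast the state reverts to the mean
        self.sigma = sigma  # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state


def named_logs(model, logs):
    """Pair a Keras model's metric names with train_on_batch outputs (assumed helper)."""
    return dict(zip(model.metrics_names, logs))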
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed,
                 fc1_units, fc2_units, weighted=False, individual=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON_MAX

        # Actor Network (w/ Target Network)
        if weighted:
            self.actor_local = Weight_adapter(state_size, action_size).to(device)
            self.actor_target = Weight_adapter(state_size, action_size).to(device)
        elif individual:
            self.actor_local = IndividualModel(state_size, action_size,
                                               random_seed, fc1_units).to(device)
            self.actor_target = IndividualModel(state_size, action_size,
                                                random_seed, fc1_units).to(device)
        else:
            self.actor_local = Actor(state_size, action_size, random_seed,
                                     fc1_units, fc2_units).to(device)
            self.actor_target = Actor(state_size, action_size, random_seed,
                                      fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, mu=0., theta=0.15, sigma=0.2)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Make sure the target networks start with the same weights as the local networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > LEARN_START:
            # Learn every UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                # Learn, if enough samples are available in memory
                if len(self.memory) > BATCH_SIZE:
                    for _ in range(UPDATES_PER_STEP):
                        experiences = self.memory.sample()
                        self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ----------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ------------------------ #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ------------------------------ #
        if self.epsilon - EPSILON_DECAY > EPSILON_MIN:
            self.epsilon -= EPSILON_DECAY
        else:
            self.epsilon = EPSILON_MIN
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
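# Usage sketch for the PyTorch Agent above: a minimal training loop assuming a
# gym-style environment (env.reset() -> state, env.step(action) ->
# (next_state, reward, done, info)) and that the module-level constants the
# class references (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC,
# WEIGHT_DECAY, EPSILON_*, LEARN_START, UPDATE_EVERY, UPDATES_PER_STEP) are
# defined elsewhere. Episode and step counts here are illustrative, not the
# project's settings.

def train(agent, env, n_episodes=500, max_t=1000):
    scores = []
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()  # reset the OU noise process each episode
        score = 0.0
        for t in range(max_t):
            action = agent.act(state, add_noise=True)
            next_state, reward, done, _ = env.step(action)
            # Store the transition and (possibly) trigger learning updates
            agent.step(state, action, reward, next_state, done, t)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores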
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent.

        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            num_agents (int): number of simultaneously running agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise object
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, device, random_seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use a prioritized sample from the buffer to learn."""
        # Save memory for each agent
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY
        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # Learn from memory if enough samples exist
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[i, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices = experiences

        # Update Critic
        # Get predicted next-state actions and Q values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # Update priorities from the TD errors
        delta = abs(Q_targets - Q_expected).detach().cpu().numpy()
        self.memory.update_priorities(delta, indices)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_model_param, local_model_param in zip(
                target_model.parameters(), local_model.parameters()):
            target_model_param.data.copy_(tau * local_model_param.data +
                                          (1. - tau) * target_model_param.data)
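# The prioritized-replay Agent above depends on a ReplayBuffer with a richer
# interface than the uniform buffer used earlier. The stub below only documents
# the contract the agent calls against (names and argument order inferred from
# the calls above); the actual prioritized sampling, priority exponents, and
# importance weights live in the project's real implementation.

class ReplayBuffer:
    """Prioritized experience replay buffer (interface sketch, not the real implementation)."""

    def __init__(self, action_size, buffer_size, batch_size,
                 experiences_per_sampling, device, seed):
        self.experience_count = 0  # number of experiences added so far

    def add(self, state, action, reward, next_state, done):
        """Store one transition and give it an initial priority."""
        self.experience_count += 1

    def update_memory_sampling(self):
        """Refresh the pre-drawn sample batches (called every UPDATE_MEM_EVERY steps)."""

    def update_parameters(self):
        """Anneal the priority/importance-sampling parameters (called every UPDATE_MEM_PAR_EVERY steps)."""

    def sample(self):
        """Return (states, actions, rewards, next_states, dones, indices) as torch tensors."""

    def update_priorities(self, deltas, indices):
        """Update stored priorities from the TD errors of the sampled transitions."""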