class AgentResourcePool():
    def __init__(self, state_size, action_size, random_seed):

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example 2
class Agent():
    """Interacts with and learns from the environment."""
    memory = None
    actor_local = None
    actor_target = None
    actor_optimizer = None

    critic_local = None
    critic_target = None
    critic_optimizer = None

    instances = []

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # initialize Class level Actor Network
        #         if Agent.actor_local is None:
        #             Agent.actor_local = Actor(state_size, action_size, random_seed).to(device)
        #         if Agent.actor_target is None:
        #             Agent.actor_target = Actor(state_size, action_size, random_seed).to(device)
        #         if Agent.actor_optimizer is None:
        #             Agent.actor_optimizer = optim.Adam(Agent.actor_local.parameters(), lr=LR_ACTOR)
        #         self.actor_local = Agent.actor_local
        #         self.actor_target = Agent.actor_target
        #         self.actor_optimizer = Agent.actor_optimizer

        # Critic Network (w/ Target Network)
        #         self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        #         self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        #         self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Initialise the class-level Critic Network
        if Agent.critic_local is None:
            Agent.critic_local = Critic(state_size, action_size,
                                        random_seed).to(device)
        if Agent.critic_target is None:
            Agent.critic_target = Critic(state_size, action_size,
                                         random_seed).to(device)
        if Agent.critic_optimizer is None:
            Agent.critic_optimizer = optim.Adam(
                Agent.critic_local.parameters(),
                lr=LR_CRITIC,
                weight_decay=WEIGHT_DECAY)
        self.critic_local = Agent.critic_local
        self.critic_target = Agent.critic_target
        self.critic_optimizer = Agent.critic_optimizer

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory - only initialise once per class
        if Agent.memory is None:
            print("Initialising ReplayBuffer")
            Agent.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                        random_seed)
#         else:
#             print("Sharing ReplayBuffer %s", Agent.memory)

        # Track this instance: we need access to every agent's state while learning
        self.agent_num = len(Agent.instances)
        Agent.instances.append(self)
        print("Appended to Agent.instances agent {}".format(self.agent_num))

    def step(self, time_step, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        Agent.memory.add(state, action, reward, next_state, done)

        # only learn every n_time_steps
        if time_step % N_TIME_STEPS != 0:
            return

        # Learn, if enough samples are available in memory
        if len(Agent.memory) > BATCH_SIZE:
            for i in range(N_LEARN_UPDATES):
                experiences = Agent.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True, noise_amplitude=0.0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * noise_amplitude
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
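All of the agents construct `Actor` and `Critic` networks that are defined elsewhere. A minimal sketch of what such DDPG networks typically look like, matching the constructor signatures used in these examples; the hidden-layer sizes and the seeding behaviour are assumptions, not the original architectures.

import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    """Deterministic policy network: state -> action in [-1, 1]."""
    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))

class Critic(nn.Module):
    """Q-network: (state, action) -> scalar Q-value."""
    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, state, action):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(torch.cat([x, action], dim=1)))
        return self.fc3(x)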
Example 3
class DDPG_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
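The `OUNoise` exploration process is also external to the listing. A minimal sketch of a typical Ornstein-Uhlenbeck noise class compatible with the constructor calls above, including the tuple-shaped `size` used in Example 5; the default mu/theta/sigma values are assumptions.

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)          # `size` may be an int or a shape tuple
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state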
Example 4
class DDPG_trainer(object):
    def __init__(self, nb_state, nb_action):
        self.nb_state = nb_state
        self.nb_action = nb_action

        self.actor = Actor(self.nb_state, self.nb_action)
        self.actor_target = Actor(self.nb_state, self.nb_action)
        self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

        self.critic = Critic(self.nb_state, self.nb_action)
        self.critic_target = Critic(self.nb_state, self.nb_action)
        self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_action,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        self.is_training = True
        self.epsilon = 1.0
        self.a_t = None
        self.s_t = None

        if USE_CUDA: self.cuda()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def select_action(self, s_t, decay_epsilon=True):

        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon,
                                         0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= DELTA_EPSILON

        self.a_t = action
        return action

    def reset(self, observation):
        self.s_t = observation  # store the initial state so observe() records a valid transition
        self.random_process.reset_states()

    def observe(self, r_t, s_t1, done):

        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_all(self):
        # Help Warm Up
        if self.memory.nb_entries < BATCH_SIZE * 2:
            return

        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(BATCH_SIZE)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
                         DISCOUNT * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        for state in state_batch:
            if state.shape[0] <= 2:
                # print("Error sampled memory!")
                return

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])
        value_loss = CRITERION(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, TAU)
        soft_update(self.critic_target, self.critic, TAU)
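`DDPG_trainer` (and the `DDPG` class in Example 6) relies on module-level `hard_update` and `soft_update` helpers that are not shown. A minimal sketch following the argument order used at the call sites (target first, source second):

def hard_update(target, source):
    """Copy the source parameters into the target verbatim."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    """Polyak-average the source into the target: theta_target <- tau*theta_source + (1 - tau)*theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)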
Example 5
class DDPGAgent():
    """DDPG agent that interacts with and learns from the environment.

    The agents model is implemented in 'ddpg_model.py'. It consists of two
    neural networks; one for the actor, and one for the critic.

    The DDPGAgent class makes use of two other classes: ReplayBuffer, OUNoise
    """
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Arguments:
            state_size (int) -- dimension of each state
            action_size (int) -- dimension of each action
            num_agents (int) -- number of agents (brains)
            random_seed (int) -- random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        ### Make neural networks (local and target) for both actor and critic, and set optimizers
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Initialize replay memory ###
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience in memory
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn every UPDATE_EVERY time steps
        if timestep % UPDATE_EVERY == 0:
            # If we have collected enough experience in our memory i.e. more
            # than the mini-batch size, then call the self.learn() function
            if len(self.memory) > BATCH_SIZE:
                # Number of updates per timestep
                for _ in range(NUM_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        Arguments:
            state {[type]} -- Current state
            add_noise {bool} -- Add noise (exploration) to the actions (default: {True})

        Returns:
            [float] -- Actions
        """

        # Convert 'state' numpy array to pytorch tensor using the current device
        # i.e. GPU or CPU.
        state = torch.from_numpy(state).float().to(device)

        # Set the module in evaluation mode.
        self.actor_local.eval()
        with torch.no_grad():
            # Evaluate the network with the current state
            action = self.actor_local(state).cpu().data.numpy()

        # Set the module in training mode.
        self.actor_local.train()
        if add_noise:
            # Add noise to the actions to add exploration
            action += self.noise.sample()

        # Return the clipped actions
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Arguments:
            experiences {Tuple[torch.Tensor]} -- tuple of (s, a, r, s', done) tuples
            gamma {float} -- discount factor
        """

        # Experiences, mini-batch of 128
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- Update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the gradients
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        # Take one step with the optimizer
        self.critic_optimizer.step()

        # ---------------------------- Update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- Update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Arguments:
            local_model -- PyTorch model (weights will be copied from)
            target_model -- PyTorch model (weights will be copied to)
            tau (float) -- interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
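Most of the agents share a `ReplayBuffer(action_size, buffer_size, batch_size, seed)` class that is not part of the listing. A minimal sketch of a uniform replay buffer with that interface; the namedtuple layout, the optional `device` argument (used in Example 9) and the `get_batch_size()` accessor (used in Example 8) are assumptions about the original class.

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer storing experience tuples and sampling uniform minibatches."""
    def __init__(self, action_size, buffer_size, batch_size, seed, device="cpu"):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        self.device = device
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a uniformly sampled minibatch as float tensors on self.device."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

    def get_batch_size(self):
        return self.batch_size

    def __len__(self):
        return len(self.memory)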
Example 6
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        # self.cuda = USE_CUDA #args.cuda
        self.cuda = args.cuda

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        #Init models
        #actor_kwargs = {'n_inp':self.nb_states, 'n_feature_list':[args.hidden1,args.hidden2], 'n_class':self.nb_actions}
        #self.actor = MLP(**actor_kwargs)
        #self.actor_target = MLP(**actor_kwargs)
        #self.critic = MLP(**actor_kwargs)  #TODO: actor and critic has same structure for now.
        #self.critic_target = MLP(**actor_kwargs)

        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)

        self.criterion = nn.MSELoss()
        if self.cuda:
            self.actor = self.actor.cuda()  # torch.nn.DataParallel(self.model).cuda()  # TODO: DataParallel not working
            self.critic = self.critic.cuda()
            self.actor_target = self.actor_target.cuda()
            self.critic_target = self.critic_target.cuda()
            self.criterion = self.criterion.cuda()

        # Set optimizer
        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=args.prate)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=args.rate)
        # Loss function
        self.loss_fn = torch.nn.MSELoss(reduction='sum')

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_and_split(
            self.batch_size)

        # Prepare for the target q batch (no_grad replaces the removed pre-0.4 `volatile` flag)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = self.criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon,
                                         0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_wts(self, modelfile):
        checkpoint_path = modelfile + 'model.pth.tar'
        if os.path.isfile(checkpoint_path):
            checkpoint = torch.load(checkpoint_path)
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.actor_optim.load_state_dict(checkpoint['actor_optim'])
            self.critic_optim.load_state_dict(checkpoint['critic_optim'])
            return checkpoint['step']
        else:
            return 0

    def save_wts(self, savefile, step):
        saveme = {  #TODO save other stuff too, like epoch etc
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optim': self.actor_optim.state_dict(),
            'critic_optim': self.critic_optim.state_dict(),
            'step': step
        }
        torch.save(saveme, savefile + 'model.pth.tar')
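Examples 4 and 6 use `to_tensor` and `to_numpy` conversion helpers from their original repository. A minimal modern-PyTorch sketch; the original helpers also accepted a `volatile` flag from the pre-0.4 API, which is dropped here.

import numpy as np
import torch

USE_CUDA = torch.cuda.is_available()

def to_tensor(ndarray, dtype=torch.float32, requires_grad=False):
    """Convert a numpy array (or nested list) into a torch tensor on the active device."""
    device = torch.device("cuda:0" if USE_CUDA else "cpu")
    return torch.tensor(np.asarray(ndarray), dtype=dtype, device=device, requires_grad=requires_grad)

def to_numpy(tensor):
    """Detach a tensor from the graph and return it as a CPU numpy array."""
    return tensor.detach().cpu().numpy()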
Example 7
class DDPGPolicy(object):
	def __init__(self,env_name, policy_config,device = 'cpu'):
		self.device = device
		self.env = gym.make(env_name)  # only used to read the observation/action space specs
		self.obs_dim = self.env.observation_space.shape[0]
		if isinstance(self.env.action_space, gym.spaces.Box):
			self.action_dim = self.env.action_space.shape[0]
		elif isinstance(self.env.action_space, gym.spaces.Discrete):
			raise TypeError('Unsupported action type')
		else:
			raise ValueError('unsupported action space type:', type(self.env.action_space))

		self.action_limit = self.env.action_space.high[0]
		self.lr = policy_config['lr']
		self.actor = Actor(self.obs_dim, self.action_dim).to(device)
		self.critic = Critic(self.obs_dim, self.action_dim).to(device)
		self.actor_target = deepcopy(self.actor)
		self.critic_target = deepcopy(self.critic)

		hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
		hard_update(self.critic_target, self.critic)

		self.actor_optim =  torch.optim.Adam(params=self.actor.parameters(), lr=0.001)
		self.critic_optim =  torch.optim.Adam(params=self.critic.parameters(), lr=0.001)
		self.discount_factor = policy_config['discount_factor']
		self.tau = 0.005


	def train_on_batch(self,rollouts_batch):
		# critic target: y = r + gamma * Q_target(s_{t+1}, pi_target(s_{t+1})); minimise the squared TD error
		obs, acs, next_obs, dones, r, un_r, summed_r = convert_listofrollouts(paths=rollouts_batch)
		acs = torch.tensor(acs).float().to(self.device)
		obs = torch.FloatTensor(obs).to(self.device)
		next_obs = torch.FloatTensor(next_obs).to(self.device)
		#acs_one_hot = torch.eye(2).to(self.device).index_select(0,acs)# to one hot discrete action space
		dones = torch.IntTensor(dones).to(self.device)
		r = torch.FloatTensor(r).to(self.device)

		# update critic
		self.critic_optim.zero_grad()
		act_target = self.actor_target(next_obs).to(self.device)
		q_target = (r + self.discount_factor * self.critic_target(next_obs, act_target) * (1 - dones)).detach()  # detach: no gradients into the target networks
		q_pred = self.critic(obs,acs)
		critic_loss = torch.nn.functional.mse_loss(q_pred,q_target)
		critic_loss.backward()
		self.critic_optim.step()

		#update actor
		self.actor_optim.zero_grad()
		actor_loss = -torch.mean(self.critic(obs,self.actor(obs)))
		actor_loss.backward()
		self.actor_optim.step()

		info = {'loss': actor_loss.cpu().detach().numpy(),  # scale
				'model_out': q_target,  # torch.tensor [sum(batch), ac_dim],
				}
		return info

	def update_target_network(self):
		soft_update(self.actor_target, self.actor, self.tau)
		soft_update(self.critic_target, self.critic, self.tau)

	def get_weights(self):
		#TODO: actor and critic parameters
		return {k:v for k,v in self.actor.state_dict().items()}

	def set_weights(self,weights):
		self.actor.load_state_dict(weights)

	def compute_actions(self, obs, noise_scale):  # noise_scale controls exploration noise; pass 0 during evaluation
		obs = obs.to(self.device)
		actions = self.actor(obs).cpu().detach().numpy()
		actions += noise_scale * np.random.rand(self.action_dim)
		actions = np.clip(actions, -self.action_limit, self.action_limit)

		return actions

	def reset(self):  # reset the exploration noise state whenever env.reset() is called
		if hasattr(self, 'random_process'):  # no noise process is created in __init__ above; guard against an AttributeError
			self.random_process.reset_states()
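A hypothetical usage sketch for `DDPGPolicy`. The environment name and the config values are placeholders, but the two config keys shown are the ones the constructor actually reads.

import gym
import torch

policy_config = {'lr': 1e-3, 'discount_factor': 0.99}
policy = DDPGPolicy('Pendulum-v1', policy_config, device='cpu')

env = gym.make('Pendulum-v1')
reset_out = env.reset()
obs = reset_out[0] if isinstance(reset_out, tuple) else reset_out  # old vs. new gym reset API
obs = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)

noisy_action = policy.compute_actions(obs, noise_scale=0.1)   # exploration during training
greedy_action = policy.compute_actions(obs, noise_scale=0.0)  # evaluation

# After each call to train_on_batch(...), Polyak-average the target networks.
policy.update_target_network()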
Example 8
class DDPGAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 memory,
                 device='cpu',
                 params=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): Memory buffer to sample
            device (str): device string between cuda:0 and cpu
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.seed = random.seed(params['seed'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0],
                                 params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0],
                                  params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, params['seed'],
                                   params['critic_units'][0],
                                   params['critic_units'][1]).to(device)
        self.critic_target = Critic(state_size, action_size, params['seed'],
                                    params['critic_units'][0],
                                    params['critic_units'][1]).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size,
                             params['seed'],
                             theta=params['noise_theta'],
                             sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory

    def store_weights(self, filenames):
        """Store weights of Actor/Critic

        Params
        ======
            filenames (list): string of filename to store weights of actor and critic
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        torch.save(self.actor_local.state_dict(), filenames[0])
        torch.save(self.critic_local.state_dict(), filenames[1])

    def load_weights(self, filenames):
        """Load weights of Actor/Critic

        Params
        ======
            filenames (list): string of filename to load weights of actor and critic
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        self.actor_local.load_state_dict(torch.load(filenames[0]))
        self.critic_local.load_state_dict(torch.load(filenames[1]))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.step_t = (self.step_t + 1) % self.update_every

        # Learn, if enough samples are available in memory
        if self.step_t == 0 and len(
                self.memory) > self.memory.get_batch_size():
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
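This `DDPGAgent` is driven entirely by a `params` dictionary. A sketch of the keys it reads, with placeholder values; the key names come from the constructor above, the values are assumptions.

params = {
    'seed': 0,
    'gamma': 0.99,               # discount factor
    'tau': 1e-3,                 # soft-update factor
    'update_every': 20,          # learn every N calls to step()
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'weight_decay': 0.0,         # critic L2 regularisation
    'actor_units': (256, 128),   # hidden layer sizes of the actor
    'critic_units': (256, 128),  # hidden layer sizes of the critic
    'noise_theta': 0.15,         # OU noise mean-reversion rate
    'noise_sigma': 0.2,          # OU noise volatility
}
# agent = DDPGAgent(state_size, action_size, memory=some_replay_buffer, device='cpu', params=params)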
Example 9
class Agent():
    def __init__(self, n_state, n_action, n_agents, random_seed, device="cpu"):
        """Initialize an Agent object.
        
        Params
        ------
            n_state : int
                dimension of each state
            n_action : int
                dimension of each action
            random_seed : int
                random seed
            device :
                which device is used, cpu or cuda.
        """
        self.n_state = n_state
        self.n_action = n_action
        self.n_agents = n_agents
        np.random.seed(random_seed)
        self.random_seed = random_seed
        self.device = device

        # Networks for the first agent
        # Local Actor, Local Critic, Target Actor, Target Critic
        self.actor_local1 = Actor(self.n_state, self.n_action,
                                  self.random_seed).to(self.device)
        self.actor_local1.apply(initialize_weights)
        self.critic_local1 = Critic(self.n_state * self.n_agents,
                                    self.n_action * self.n_agents,
                                    self.random_seed).to(self.device)
        self.critic_local1.apply(initialize_weights)
        self.actor_target1 = Actor(self.n_state, self.n_action,
                                   self.random_seed).to(self.device)
        self.actor_target1.apply(initialize_weights)
        self.actor_target1.eval()
        self.critic_target1 = Critic(self.n_state * self.n_agents,
                                     self.n_action * self.n_agents,
                                     self.random_seed).to(self.device)
        self.critic_target1.apply(initialize_weights)
        self.critic_target1.eval()

        # Networks for the second agent
        # Local Actor, Local Critic, Target Actor, Target Critic
        self.actor_local2 = Actor(self.n_state, self.n_action,
                                  self.random_seed).to(self.device)
        self.actor_local2.apply(initialize_weights)
        self.critic_local2 = Critic(self.n_state * self.n_agents,
                                    self.n_action * self.n_agents,
                                    self.random_seed).to(self.device)
        self.critic_local2.apply(initialize_weights)
        self.actor_target2 = Actor(self.n_state, self.n_action,
                                   self.random_seed).to(self.device)
        self.actor_target2.apply(initialize_weights)
        self.actor_target2.eval()
        self.critic_target2 = Critic(self.n_state * self.n_agents,
                                     self.n_action * self.n_agents,
                                     self.random_seed).to(self.device)
        self.critic_target2.apply(initialize_weights)
        self.critic_target2.eval()

        # optimizers
        self.actor_optimizer1 = optim.Adam(self.actor_local1.parameters(),
                                           lr=LR_ACTOR)
        self.actor_optimizer2 = optim.Adam(self.actor_local2.parameters(),
                                           lr=LR_ACTOR)
        self.critic_optimizer1 = optim.Adam(self.critic_local1.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)
        self.critic_optimizer2 = optim.Adam(self.critic_local2.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(n_action * 2,
                             random_seed + 1,
                             mu=0.,
                             theta=THETA,
                             sigma=SIGMA)

        # Replay Buffer
        self.memory = ReplayBuffer(n_action, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed + 2, self.device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # Learn, if enough samples are available in memory
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(N_LEARNING):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        state0 = torch.from_numpy(state[0]).unsqueeze(dim=0).float().to(
            self.device)
        state1 = torch.from_numpy(state[1]).unsqueeze(dim=0).float().to(
            self.device)

        self.actor_local1.eval()
        self.actor_local2.eval()
        with torch.no_grad():
            action0 = self.actor_local1(state0).cpu().data.numpy()
            action1 = self.actor_local2(state1).cpu().data.numpy()

        action = np.vstack([action0, action1])
        self.actor_local1.train()
        self.actor_local2.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next1 = self.actor_target1(next_states[:, 0:24])
            actions_next2 = self.actor_target2(next_states[:, 24:])

            actions_next = torch.cat((actions_next1, actions_next2), dim=1)
            Q_targets_next1 = self.critic_target1(next_states, actions_next)
            Q_targets_next2 = self.critic_target2(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets1 = rewards[:, 0].unsqueeze(
            dim=1) + (gamma * Q_targets_next1 *
                      (1 - dones[:, 0].unsqueeze(dim=1)))
        Q_targets2 = rewards[:, 1].unsqueeze(
            dim=1) + (gamma * Q_targets_next2 *
                      (1 - dones[:, 1].unsqueeze(dim=1)))

        # Compute critic loss
        Q_expected1 = self.critic_local1(states, actions)
        Q_expected2 = self.critic_local2(states, actions)

        critic_loss1 = F.mse_loss(Q_expected1, Q_targets1.detach())
        critic_loss2 = F.mse_loss(Q_expected2, Q_targets2.detach())
        # Minimize the loss
        self.critic_optimizer1.zero_grad()
        critic_loss1.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local1.parameters(), 1)
        self.critic_optimizer1.step()

        self.critic_optimizer2.zero_grad()
        critic_loss2.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local2.parameters(), 1)
        self.critic_optimizer2.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred1 = self.actor_local1(states[:, 0:24])
        actions_pred2 = self.actor_local2(states[:, 24:])
        actions_pred = torch.cat((actions_pred1, actions_pred2), dim=1)

        actor_loss1 = -self.critic_local1(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer1.zero_grad()
        actor_loss1.backward(retain_graph=True)
        self.actor_optimizer1.step()

        actor_loss2 = -self.critic_local2(states, actions_pred).mean()
        self.actor_optimizer2.zero_grad()
        actor_loss2.backward(retain_graph=True)
        self.actor_optimizer2.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local1, self.critic_target1, TAU)
        self.soft_update(self.actor_local1, self.actor_target1, TAU)
        self.soft_update(self.critic_local2, self.critic_target2, TAU)
        self.soft_update(self.actor_local2, self.actor_target2, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters

        Arguments
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
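The 0:24 / 24: slicing in `learn()` implies that each of the two agents observes a 24-dimensional state, as in the Unity Tennis environment. A hypothetical interaction sketch under that assumption:

import numpy as np

# Hypothetical setup: two agents, 24-dimensional observations, 2-dimensional actions.
agent = Agent(n_state=24, n_action=2, n_agents=2, random_seed=0, device="cpu")

states = np.random.randn(2, 24).astype(np.float32)    # one row per agent
actions = agent.act(states, add_noise=False)           # shape (2, 2), clipped to [-1, 1]

# learn() expects minibatches in which the joint state is the concatenation of both
# agents' observations (48 columns) and rewards/dones carry one column per agent.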
Example 10
class Agent():
    '''Interact with and learn from environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Prioritized replay memory
        self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE,
                                                    seed)

    def act(self, state, mode):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): train or test
            epsilon (float): for epsilon-greedy action selection
        '''
        state = torch.from_numpy(state).unsqueeze(0).float().to(
            device)  # shape of state (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'test':
            return np.clip(action, -1, 1)

        elif mode == 'train':  # if train, then add OUNoise in action
            action += self.noise.sample()
            return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        # add new experience in memory
        self.prioritized_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.prioritized_memory) >= BUFFER_SIZE:
                for _ in range(10):  # update 10 times per learning
                    idxes, experiences, is_weights = self.prioritized_memory.sample(
                        device)
                    self.learn(experiences,
                               GAMMA,
                               is_weights=is_weights,
                               leaf_idxes=idxes)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, is_weights, leaf_idxes):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Policy loss = (1/n)*Q_local(s,a) -> for deterministic policy (no log prob)

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            is_weights (tensor array): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes for update priorities in SumTree
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # TODO: clip rewards to [-1, 1]; they are currently used unmodified

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss

        Q_expected = self.critic_local(states, actions)
        td_errors = (Q_targets - Q_expected).tanh()  # squash TD-errors into (-1, 1)
        abs_errors = td_errors.abs().cpu().data.numpy()  # pull back to cpu
        self.prioritized_memory.batch_update(
            leaf_idxes, abs_errors)  # update priorities in SumTree

        # weight the squared TD loss by the importance-sampling weights
        c_loss = (is_weights * (td_errors**2)).mean()
        self.running_c_loss += float(c_loss.cpu().data.numpy())
        self.training_cnt += 1

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        c_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       1)  # clip gradient to max 1
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        a_loss = self.critic_local(states, actions_pred)
        a_loss = -a_loss.mean()
        self.running_a_loss += float(a_loss.cpu().data.numpy())

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        a_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(),
                                       1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
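The agent above depends on a SumTree-backed `PrioritizedMemory` that is not included in the listing. A sketch of the interface it relies on; the method bodies are deliberately left as stubs.

class PrioritizedMemory:
    """Interface assumed by the agent above; the SumTree-backed implementation is not shown."""

    def __init__(self, batch_size, buffer_size, seed):
        ...

    def add(self, state, action, reward, next_state, done):
        """Store a transition with maximal priority so it is sampled at least once."""
        ...

    def sample(self, device):
        """Return (leaf_idxes, (states, actions, rewards, next_states, dones), is_weights),
        where the experiences are torch tensors on `device` and is_weights are the
        importance-sampling weights."""
        ...

    def batch_update(self, leaf_idxes, abs_errors):
        """Update the priorities of the sampled leaves from the new |TD-error| values."""
        ...

    def __len__(self):
        """Number of transitions currently stored."""
        ...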
Example 11
class Agent():
    '''Interact with and learn from environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.TAU = 1e-2
        self.gamma = 0.99
        self.BUFFER_SIZE = int(1e6)
        self.BATCH_SIZE = 1024
        self.LR_CRITIC = 1e-3
        self.LR_ACTOR = 1e-3
        self.WEIGHT_DECAY = 0.0
        self.EPSILON = 1.0
        self.EPSILON_DECAY = 0.99

        # Actor network (w/ target network)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(self.action_size, self.seed)

    def act(self, state, add_noise=True):
        """ Given a state choose an action
        Params
        ======
            state (float ndarray): state of the environment        
        """

        state = torch.from_numpy(state).unsqueeze(0).float().to(device)
        # set the network to eval mode; this only affects certain modules (Dropout, BatchNorm, etc.)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().squeeze(0).data.numpy()

        self.actor_local.train()  # set the network back to train mode
        if add_noise:
            action += self.noise.sample() * self.EPSILON

        return np.clip(action, -1, 1)

    def reset(self):

        self.noise.reset()

    def learn(self, experiences):
        """
        Update policy and value parameters using the given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Policy loss = -(1/n) * Σ Q_local(s, actor_local(s))  (deterministic policy, so no log-prob term)
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

        self.EPSILON *= self.EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
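This agent draws exploration noise from OUNoise, which is never defined in this listing. A minimal Ornstein-Uhlenbeck process consistent with how it is used here (constructed as OUNoise(action_size, seed), then sample() and reset()) could look like the sketch below; the mu, theta and sigma values are assumptions, not taken from the original code:

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state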
Example no. 12
class Agent():
    '''Interact with and learn from the environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, mode):
        '''Returns actions for the given state as per the current policy.

        Params
        ======
            state (array): current state
            mode (string): 'train' (adds OU noise for exploration) or 'test'
        '''
        # shape of state: (1, state_size)
        state = torch.from_numpy(state).unsqueeze(0).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'train':  # in train mode, add OU noise to encourage exploration
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(10):  # update 10 times per learning
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.running_c_loss += float(critic_loss.cpu().data.numpy())
        self.training_cnt += 1
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.running_a_loss += float(actor_loss.cpu().data.numpy())
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
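This example additionally samples minibatches from a ReplayBuffer that the listing never shows. Judging from its call sites (ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed), add(...), sample() and len(...)), it is presumably a plain uniform-sampling buffer; a sketch under that assumption, with device standing in for the module-level torch device the agents use:

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward",
                                                  "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a single experience tuple to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a uniform random minibatch and stack it into device tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)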
Example no. 13
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, num_agents, state_size, action_size, random_seed,
                 buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic,
                 weight_decay, a_hidden_sizes, c_hidden_sizes):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.ACTOR_HL_SIZE = a_hidden_sizes
        self.CRITIC_HL_SIZE = c_hidden_sizes
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local_1 = Actor(state_size, action_size, random_seed,
                                   self.ACTOR_HL_SIZE).to(device)
        self.actor_target_1 = Actor(state_size, action_size, random_seed,
                                    self.ACTOR_HL_SIZE).to(device)
        self.actor_optimizer_1 = optim.Adam(self.actor_local_1.parameters(),
                                            lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local_1 = Critic(state_size, action_size, random_seed,
                                     self.CRITIC_HL_SIZE).to(device)
        self.critic_target_1 = Critic(state_size, action_size, random_seed,
                                      self.CRITIC_HL_SIZE).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_local_1.parameters(),
                                             lr=self.LR_CRITIC,
                                             weight_decay=self.WEIGHT_DECAY)

        # Actor Network (w/ Target Network)
        self.actor_local_2 = Actor(state_size, action_size, random_seed,
                                   self.ACTOR_HL_SIZE).to(device)
        self.actor_target_2 = Actor(state_size, action_size, random_seed,
                                    self.ACTOR_HL_SIZE).to(device)
        self.actor_optimizer_2 = optim.Adam(self.actor_local_2.parameters(),
                                            lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local_2 = Critic(state_size, action_size, random_seed,
                                     self.CRITIC_HL_SIZE).to(device)
        self.critic_target_2 = Critic(state_size, action_size, random_seed,
                                      self.CRITIC_HL_SIZE).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(),
                                             lr=self.LR_CRITIC,
                                             weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(states.shape[0]):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, self.GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per the current policy (one row per agent)."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local_1.eval()
        self.actor_local_2.eval()
        # one action row per agent; an ndarray lets the noise broadcast below
        action_values = np.zeros((states.shape[0], self.action_size))
        with torch.no_grad():
            action_values[0] = self.actor_local_1(states[0]).cpu().data.numpy()
            action_values[1] = self.actor_local_2(states[1]).cpu().data.numpy()
        self.actor_local_1.train()
        self.actor_local_2.train()

        if add_noise:
            action_values += self.noise.sample()
        return np.clip(action_values, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next_1 = self.actor_target_1(next_states)
        actions_next_2 = self.actor_target_2(next_states)
        Q_targets_next_1 = self.critic_target_1(next_states,
                                                actions_next_1.detach())
        Q_targets_next_2 = self.critic_target_2(next_states,
                                                actions_next_2.detach())
        # Compute Q targets for current states (y_i)
        Q_targets_1 = rewards + (gamma * Q_targets_next_1 * (1 - dones))
        Q_targets_2 = rewards + (gamma * Q_targets_next_2 * (1 - dones))
        # Compute critic loss
        Q_expected_1 = self.critic_local_1(states, actions)
        Q_expected_2 = self.critic_local_2(states, actions)
        critic_loss_1 = F.mse_loss(Q_expected_1, Q_targets_1.detach())
        critic_loss_2 = F.mse_loss(Q_expected_2, Q_targets_2.detach())
        # Minimize the loss
        self.critic_optimizer_1.zero_grad()
        self.critic_optimizer_2.zero_grad()
        critic_loss_1.backward()
        critic_loss_2.backward()
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # adds gradient clipping to stabilize learning
        self.critic_optimizer_1.step()
        self.critic_optimizer_2.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred_1 = self.actor_local_1(states)
        actions_pred_2 = self.actor_local_2(states)
        actor_loss_1 = -self.critic_local_1(states, actions_pred_1).mean()
        actor_loss_2 = -self.critic_local_2(states, actions_pred_2).mean()
        # Minimize the loss
        self.actor_optimizer_1.zero_grad()
        self.actor_optimizer_2.zero_grad()
        actor_loss_1.backward()
        actor_loss_2.backward()
        self.actor_optimizer_1.step()
        self.actor_optimizer_2.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local_1, self.critic_target_1, self.TAU)
        self.soft_update(self.critic_local_2, self.critic_target_2, self.TAU)
        self.soft_update(self.actor_local_1, self.actor_target_1, self.TAU)
        self.soft_update(self.actor_local_2, self.actor_target_2, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
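None of these examples include the Actor and Critic network definitions they instantiate; this one also passes hidden-layer sizes through a_hidden_sizes / c_hidden_sizes. A minimal pair of fully connected networks matching that constructor shape is sketched below; the layer sizes, tanh output and the point where the action is concatenated are assumptions in line with common DDPG practice, not the original architecture:

import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    """Deterministic policy: maps states to actions in [-1, 1]."""

    def __init__(self, state_size, action_size, seed, hidden_sizes=(400, 300)):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.fc3 = nn.Linear(hidden_sizes[1], action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))


class Critic(nn.Module):
    """Action-value function: maps (state, action) pairs to scalar Q-values."""

    def __init__(self, state_size, action_size, seed, hidden_sizes=(400, 300)):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fcs1 = nn.Linear(state_size, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0] + action_size, hidden_sizes[1])
        self.fc3 = nn.Linear(hidden_sizes[1], 1)

    def forward(self, state, action):
        xs = F.relu(self.fcs1(state))
        x = torch.cat((xs, action), dim=1)
        x = F.relu(self.fc2(x))
        return self.fc3(x)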
Example no. 14
class Agent:
    def __init__(self, state_size=OBS_DIM, action_size=ACT_DIM, random_seed=0):
        """Initialize an Agent object.
        Params
        =====
            state_size (int): dimension of all observation
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        self.noise = OUNoise(action_size, random_seed)
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state):
        """Save an experience in replay buffer and use random samples from buffer to learn."""
        self.memory.add(state, action, reward, next_state)

        if len(self.memory) > BATCH_SIZE:  # learn once enough samples are in the buffer
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return actions for given state as per current policy."""
        state = state[None, :]
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples
        Q_target = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q_value

        Params
        =====
            experiences (Tuple[torch.Tensor]): tuple of (s,a,r,s',done)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states = experiences

        # ----------------- update critic network weights ---------------- #
        # get predicted next_state actions and Q_values from target models
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        # compute Q targets for current states
        q_targets = rewards + gamma * q_targets_next
        # compute critic loss
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------- update actor network weights ---------------- #
        # compute the loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------- update target networks ------------------ #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ----------------- update noise -------------------- #
        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters
        θ_target = τ * θ_local + (1 - τ) * θ_target
        Params
        =====
            local_model: Network weights to be copied from
            target_model: Network weights to be copied to
            tau(float): interpolation parameter
        """
        for local_param, target_param in zip(local_model.parameters(),
                                             target_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def restore(self, save_path):
        actor_checkpoint = torch.load(save_path + '/checkpoint_actor.pth')
        self.actor_local.load_state_dict(actor_checkpoint)
        critic_checkpoint = torch.load(save_path + '/checkpoint_critic.pth')
        self.critic_local.load_state_dict(critic_checkpoint)
        print('Successfully loaded network weights!')
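restore() expects checkpoint_actor.pth and checkpoint_critic.pth to exist, but no save routine appears in the listing. A matching helper (a sketch; save_checkpoints is a hypothetical name, not part of the original agent) could be:

import os

import torch


def save_checkpoints(agent, save_path):
    """Persist the agent's local actor and critic weights so restore() can reload them."""
    os.makedirs(save_path, exist_ok=True)
    torch.save(agent.actor_local.state_dict(), save_path + '/checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), save_path + '/checkpoint_critic.pth')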
class DDPG_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 brain_name,
                 seed,
                 params=default_params,
                 device=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        params = self._fill_params(params)

        # implementation and identity
        self.device = device if device is not None else torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.name = params['name']
        self.brain_name = brain_name

        # set environment information
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size,
                                 action_size,
                                 seed,
                                 fc1_units=params['layers_actor'][0],
                                 fc2_units=params['layers_actor'][1]).to(
                                     self.device)

        self.actor_target = Actor(state_size,
                                  action_size,
                                  seed,
                                  fc1_units=params['layers_actor'][0],
                                  fc2_units=params['layers_actor'][1]).to(
                                      self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   seed,
                                   fcs1_units=params['layers_critic'][0],
                                   fc2_units=params['layers_critic'][1]).to(
                                       self.device)
        self.critic_target = Critic(state_size,
                                    action_size,
                                    seed,
                                    fcs1_units=params['layers_critic'][0],
                                    fc2_units=params['layers_critic'][1]).to(
                                        self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   params['buffer_size'],
                                   params['batch_size'],
                                   seed,
                                   device=self.device)

        # save params
        self.params = params

    def _fill_params(self, src_params):
        keys = ('name', 'buffer_size', 'batch_size', 'layers_actor',
                'layers_critic', 'lr_actor', 'lr_critic', 'gamma', 'tau',
                'weight_decay')
        params = {
            key: self._get_param_or_default(key, src_params, default_params)
            for key in keys
        }
        return params

    def display_params(self, force_print=False):
        if force_print:
            print(self.params)
        return self.params

    def _get_param_or_default(self, key, src_params, default_params):
        if key in src_params:
            return src_params[key]
        else:
            return default_params[key]

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def start_learn(self):
        # Learn, if enough samples are available in memory
        # decoupled from step method to allow multiple steps per learning pass
        if len(self.memory) > self.params['batch_size']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['gamma'])

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.params['tau'])
        self.soft_update(self.actor_local, self.actor_target,
                         self.params['tau'])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
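_fill_params reads every hyperparameter from a default_params dict defined outside this listing. A plausible shape for that dict is sketched below; every value here is a placeholder assumption, not the original configuration:

default_params = {
    'name': 'ddpg_agent',
    'buffer_size': int(1e6),       # replay buffer capacity
    'batch_size': 128,             # minibatch size
    'layers_actor': (400, 300),    # fc1_units, fc2_units for the actor
    'layers_critic': (400, 300),   # fcs1_units, fc2_units for the critic
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'gamma': 0.99,                 # discount factor
    'tau': 1e-3,                   # soft-update interpolation factor
    'weight_decay': 0.0,           # L2 penalty for the critic optimizer
}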
Example no. 16
class DDPGAgent:
    def __init__(self,
                 action_size=4,
                 state_size=33,
                 num_agents=20,
                 max_steps=1000,
                 seed=0,
                 train_mode=True):
        self.train_mode = train_mode
        self.action_size = action_size
        self.state_size = state_size
        self.num_agents = num_agents
        self.max_steps = max_steps

        self.step_count = 0
        self.scores = np.zeros(self.num_agents)
        self.states, self.actions, self.rewards, self.next_states, self.dones = None, None, None, None, None

        self.noise = OUNoise(self.action_size, seed)
        self.memory = AgentMemory(batch_size=BATCH_SIZE,
                                  buffer_size=MEMORY_BUFFER,
                                  seed=seed)

        self.actor = Actor(self.state_size, self.action_size, seed)
        self.critic = Critic(self.state_size, self.action_size, seed)

        self.target_actor = Actor(self.state_size, self.action_size, seed)
        self.target_critic = Critic(self.state_size, self.action_size, seed)

        self.actor_opt = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        hard_update(self.actor, self.target_actor)
        hard_update(self.critic, self.target_critic)

    def reset(self):
        self.noise.reset()
        self.step_count = 0
        self.scores = np.zeros(self.num_agents)
        self.states, self.actions, self.rewards, self.next_states, self.dones = None, None, None, None, None

    def step(self):
        self.scores += np.array(self.rewards)
        self.step_count += 1
        self.memory.add(self.states, self.actions, self.rewards,
                        self.next_states, self.dones)

        if self.memory.has_enough_memory():
            for i in range(UPDATE_FREQUENCY_PER_STEP):
                states, actions, rewards, next_states, dones = self.memory.sample()
                self.learn(states, actions, rewards, next_states, dones)
                self.soft_update()

    def act(self, add_noise=True):
        states = array_to_tensor(self.states)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(states)
            actions = actions.cpu().data.numpy()
        self.actor.train()

        if add_noise:
            noise = self.noise.sample()
            actions += noise

        actions = np.clip(actions, -1, 1)
        return actions

    def learn(self, states, actions, rewards, next_states, dones):
        # Update critic
        self.critic_opt.zero_grad()
        critic_loss = ddpg_compute_critic_loss(states, actions, rewards,
                                               next_states, dones,
                                               self.target_actor,
                                               self.target_critic, self.critic)
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_opt.step()

        # Update actor
        self.actor_opt.zero_grad()
        actor_loss = ddpg_compute_actor_loss(states, self.actor, self.critic)
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
        self.actor_opt.step()

        # Update target nets
        self.soft_update()

    def soft_update(self):
        soft_update(self.actor, self.target_actor, TAU)
        soft_update(self.critic, self.target_critic, TAU)
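This last agent delegates its loss computation to ddpg_compute_critic_loss / ddpg_compute_actor_loss, converts states with array_to_tensor, and copies weights with hard_update / soft_update (sketched after Example 10 above); none of these helpers appear in the listing. The sketches below are consistent with how they are called and with the learn() methods of the earlier examples; GAMMA and device are assumed to be module-level constants:

import numpy as np
import torch
import torch.nn.functional as F

GAMMA = 0.99  # assumed discount factor
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def array_to_tensor(array):
    """Convert a NumPy array (e.g. the per-agent states) to a float tensor on the device."""
    return torch.from_numpy(np.asarray(array)).float().to(device)


def ddpg_compute_critic_loss(states, actions, rewards, next_states, dones,
                             target_actor, target_critic, critic):
    """TD-target MSE loss for the critic: y = r + γ * Q'(s', μ'(s')) * (1 - done)."""
    with torch.no_grad():
        actions_next = target_actor(next_states)
        q_targets_next = target_critic(next_states, actions_next)
        q_targets = rewards + GAMMA * q_targets_next * (1 - dones)
    q_expected = critic(states, actions)
    return F.mse_loss(q_expected, q_targets)


def ddpg_compute_actor_loss(states, actor, critic):
    """Deterministic policy-gradient loss: minimize the negative mean of Q(s, μ(s))."""
    actions_pred = actor(states)
    return -critic(states, actions_pred).mean()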