class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO in the original exercise: compute and minimize the loss.
        ## One standard completion (vanilla DQN, mirroring the complete examples later in this collection):
        # Max predicted Q values (for next states) from the target network
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Expected Q values from the local network for the actions actually taken
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Minimize the mean-squared TD error
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
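
The agents in this collection rely on module-level imports and hyperparameters that the snippets don't show (QNetwork, ReplayBuffer, device, LR, BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, UPDATE_EVERY). A minimal sketch of the assumed preamble, following the common Udacity-style DQN template (the module names and exact values are assumptions, not taken from the original sources):

import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from model import QNetwork              # assumed: network architecture defined in model.py
from replay_buffer import ReplayBuffer  # assumed: uniform experience replay buffer

BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 64          # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation parameter
LR = 5e-4                # learning rate
UPDATE_EVERY = 4         # how often to update the network

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")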
class soft_actor_critic_agent(object):
    def __init__(self, num_inputs, action_space, \
                 device, hidden_size, seed, lr, gamma, tau, alpha):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha

        self.device = device 
        self.seed = seed
        torch.manual_seed(seed)
        
        torch.cuda.manual_seed(seed)
        #torch.cuda.manual_seed_all(seed)
        #torch.backends.cudnn.deterministic=True

        self.critic = QNetwork(seed, num_inputs, action_space.shape[0], hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=lr)

        self.critic_target = QNetwork(seed, num_inputs, action_space.shape[0], hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)
        
        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=lr)
        self.policy = GaussianPolicy(seed, num_inputs, action_space.shape[0], \
                                         hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=lr)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if eval == False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            Q1_next_target, Q2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_q_next_target = torch.min(Q1_next_target, Q2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (min_q_next_target)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        Q1, Q2 = self.critic(state_batch, action_batch) 
        Q1_loss = F.mse_loss(Q1, next_q_value) 
        Q2_loss = F.mse_loss(Q2, next_q_value) 
  
        # Optimize both Q-functions with a single backward pass; stepping the critic
        # between two separate backward calls (as in the original ordering) fails on
        # recent PyTorch because of the in-place parameter update.
        Q_loss = Q1_loss + Q2_loss
        self.critic_optim.zero_grad()
        Q_loss.backward()
        self.critic_optim.step()

        # Policy update uses the freshly updated critic
        action_batch_pi, log_pi, _ = self.policy.sample(state_batch)

        Q1_pi, Q2_pi = self.critic(state_batch, action_batch_pi)
        min_q_pi = torch.min(Q1_pi, Q2_pi)

        policy_loss = ((self.alpha * log_pi) - min_q_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()

        self.alpha = self.log_alpha.exp()
        alpha_tlogs = self.alpha.clone() # For TensorboardX logs
        
        soft_update(self.critic_target, self.critic, self.tau)    
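
The SAC agent above calls hard_update and soft_update helpers that are defined outside the snippet. A minimal sketch of the usual definitions, matching the call signatures used above (the standard implementation, assumed rather than copied from the author's utilities):

def hard_update(target, source):
    # Copy the source network parameters into the target network verbatim
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)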
Example #4
class AbstractAgent(metaclass=ABCMeta):
    """Abstract Base Agent"""
    def __init__(self, state_size, action_size, memory, seed, configs):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.lr = configs['LR']
        self.update_every = configs['UPDATE_EVERY']
        self.batch_size = configs['BATCH_SIZE']
        self.gamma = configs['GAMMA']
        self.tau = configs['TAU']

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        # ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.memory = memory
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Loss
        self.criterion = nn.MSELoss()

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self._learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    @abstractmethod
    def _learn(self, experiences, gamma):
        raise NotImplementedError
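
Because _learn is abstract, a concrete subclass has to supply the actual update rule. A minimal illustrative subclass implementing the vanilla DQN update with pieces the base class already provides (this subclass is a sketch, not part of the original source):

class DQNAgent(AbstractAgent):
    """Concrete agent using the vanilla DQN update."""

    def _learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # Max predicted Q values for the next states, from the target network
        q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # TD targets for the current states
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        # Q values predicted by the local network for the actions actually taken
        q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = self.criterion(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Slowly move the target network toward the local network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)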
Example #5
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, evaluate=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]

        # Optimize both Q-functions with a single backward pass before touching the
        # policy; stepping the critic between two backward calls through the same
        # forward pass breaks on recent PyTorch because of the in-place parameter update.
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
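
The constructor above reads everything from an args object. A minimal sketch of the attributes it expects, filled with typical SAC defaults (the values are assumptions, not the original configuration):

from argparse import Namespace

args = Namespace(
    gamma=0.99,                     # discount factor
    tau=0.005,                      # target smoothing coefficient
    alpha=0.2,                      # initial entropy temperature
    policy="Gaussian",              # "Gaussian" or "Deterministic"
    target_update_interval=1,       # how often to soft-update the critic target
    automatic_entropy_tuning=True,  # learn alpha instead of keeping it fixed
    cuda=False,                     # set True to train on GPU
    hidden_size=256,                # width of the hidden layers
    lr=3e-4,                        # learning rate shared by all optimizers
)

An agent is then built as SAC(num_inputs, action_space, args), with num_inputs and action_space taken from the environment.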
Example #6
class SAC(object):
    def __init__(self, input_space, action_space, args):

        self.use_expert = args.use_expert
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.action_range = [action_space.low, action_space.high]
        self.policy_type = args.policy

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        # self.device = torch.device("cuda" if args.cuda else "cpu")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # print(torch.cuda.is_available())
        # print(torch.cuda.current_device())
        # print(torch.cuda.device(0))
        # print(torch.cuda.device_count())
        # print(torch.cuda.get_device_name())
        # print(torch.backends.cudnn.version())
        # print(torch.backends.cudnn.is_available())

        self.critic = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(input_space, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            raise ValueError("Policy types other than Gaussian are not supported yet.")

        # SAC_V
        # self.value = ValueNetwork(input_space).to(device=self.device)
        # self.value_target = ValueNetwork(input_space).to(self.device)
        # self.value_optim = Adam(self.value.parameters(), lr=args.lr)
        # hard_update(self.value_target, self.value)

        # self.policy = GaussianPolicy(input_space, action_space.shape[0], args.hidden_size).to(self.device)
        # self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, eval=False):
        state_map = torch.FloatTensor(state['map']).to(self.device).unsqueeze(0)
        state_lidar = torch.FloatTensor(state['lidar']).to(self.device).unsqueeze(0)
        state_goal = torch.FloatTensor(state['goal']).to(self.device).unsqueeze(0)
        state_plan_len = torch.FloatTensor(state['plan_len']).to(self.device).unsqueeze(0)
        state_robot_info= torch.FloatTensor(state['robot_info']).to(self.device).unsqueeze(0)
        state = {'map': state_map, 'lidar': state_lidar, 'goal': state_goal, 'plan_len': state_plan_len, 'robot_info': state_robot_info}
        if eval is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        action = action.detach().cpu().numpy()[0]
        # return self.rescale_action(action)
        return action

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
                (self.action_range[1] + self.action_range[0]) / 2.0

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        if not self.use_expert:
            state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)
        else:
            state_batch, action_batch, reward_batch, next_state_batch, mask_batch, s_e_batch, a_e_batch = memory.sample(batch_size=batch_size, use_expert=True)

        # Each state batch is a list of dictionaries like [{"map": value, "lidar": value, "goal": value}, ...],
        # so convert the list of dicts into a dict of batched values below:
        _state_batch = {'map':[], 'lidar':[], 'goal':[], 'plan_len':[], 'robot_info':[]}
        _next_state_batch = {'map':[], 'lidar':[], 'goal':[], 'plan_len':[], 'robot_info':[]}
        _state_expert_batch = {'map':[], 'lidar':[], 'goal':[], 'plan_len':[], 'robot_info':[]}
        for s in state_batch:
            _state_batch['map'].append(s['map'])
            _state_batch['lidar'].append(s['lidar'])
            _state_batch['goal'].append(s['goal'])
            _state_batch['plan_len'].append(s['plan_len'])
            _state_batch['robot_info'].append(s['robot_info'])

        for s in next_state_batch:
            _next_state_batch['map'].append(s['map'])
            _next_state_batch['lidar'].append(s['lidar'])
            _next_state_batch['goal'].append(s['goal'])
            _next_state_batch['plan_len'].append(s['plan_len'])
            _next_state_batch['robot_info'].append(s['robot_info'])

        if self.use_expert:
            for s in s_e_batch:
                _state_expert_batch['map'].append(s['map'])
                _state_expert_batch['lidar'].append(s['lidar'])
                _state_expert_batch['goal'].append(s['goal'])
                _state_expert_batch['plan_len'].append(s['plan_len'])
                _state_expert_batch['robot_info'].append(s['robot_info'])

        _state_batch['map'] = torch.FloatTensor(_state_batch['map']).to(self.device)
        _state_batch['lidar'] = torch.FloatTensor(_state_batch['lidar']).to(self.device)
        _state_batch['goal'] = torch.FloatTensor(_state_batch['goal']).to(self.device)
        _state_batch['plan_len'] = torch.FloatTensor(_state_batch['plan_len']).to(self.device)
        _state_batch['robot_info'] = torch.FloatTensor(_state_batch['robot_info']).to(self.device)
        _next_state_batch['map'] = torch.FloatTensor(_next_state_batch['map']).to(self.device)
        _next_state_batch['lidar'] = torch.FloatTensor(_next_state_batch['lidar']).to(self.device)
        _next_state_batch['goal'] = torch.FloatTensor(_next_state_batch['goal']).to(self.device)
        _next_state_batch['plan_len'] = torch.FloatTensor(_next_state_batch['plan_len']).to(self.device)
        _next_state_batch['robot_info'] = torch.FloatTensor(_next_state_batch['robot_info']).to(self.device)
        if self.use_expert:
            _state_expert_batch['map'] = torch.FloatTensor(_state_expert_batch['map']).to(self.device)
            _state_expert_batch['lidar'] = torch.FloatTensor(_state_expert_batch['lidar']).to(self.device)
            _state_expert_batch['goal'] = torch.FloatTensor(_state_expert_batch['goal']).to(self.device)
            _state_expert_batch['plan_len'] = torch.FloatTensor(_state_expert_batch['plan_len']).to(self.device)
            _state_expert_batch['robot_info'] = torch.FloatTensor(_state_expert_batch['robot_info']).to(self.device)
            _action_expert_batch = torch.FloatTensor(a_e_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # SAC_V
        # with torch.no_grad():
        #     vf_next_target = self.value_target(_new_next_state_batch)
        #     next_q_value = reward_batch + mask_batch * self.gamma * (vf_next_target)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(_next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(_next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)
        qf1, qf2 = self.critic(_state_batch, action_batch)  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(qf1, next_q_value)  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(qf2, next_q_value)  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Update Policy
        if not self.use_expert:
            pi, log_pi, _ = self.policy.sample(_state_batch)

            qf1_pi, qf2_pi = self.critic(_state_batch, pi)
            min_qf_pi = torch.min(qf1_pi, qf2_pi)
        else:
            pi, log_pi, _ = self.policy.sample(_state_expert_batch)

            qf1_pi, qf2_pi = self.critic(_state_expert_batch, pi)
            min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone() # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha) # For TensorboardX logs


        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()

        # # SAC_V
        # # Regularization Loss
        # reg_loss = 0.001 * (mean.pow(2).mean() + log_std.pow(2).mean())
        # policy_loss += reg_loss

        # self.policy_optim.zero_grad()
        # policy_loss.backward()
        # self.policy_optim.step()

        # # Update Value
        # if not self.use_expert:
        #     vf = self.value(_new_state_batch)
        # else:
        #     vf = self.value(_new_s_e_batch)
        
        # with torch.no_grad():
        #     vf_target = min_qf_pi - (self.alpha * log_pi)

        # vf_loss = F.mse_loss(vf, vf_target) # JV = 𝔼(st)~D[0.5(V(st) - (𝔼at~π[Q(st,at) - α * logπ(at|st)]))^2]

        # self.value_optim.zero_grad()
        # vf_loss.backward()
        # self.value_optim.step()

        # if updates % self.target_update_interval == 0:
        #     soft_update(self.value_target, self.value, self.tau)

        # return vf_loss.item(), qf1_loss.item(), qf2_loss.item(), policy_loss.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to\n {}\n, {}\n'.format(actor_path, critic_path))

        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from\n {}\n, {}\n'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
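
The three per-key collation loops in update_parameters can be expressed more compactly. A sketch of an equivalent helper, assuming every state dictionary carries the same five keys used above (illustrative only, not part of the original source):

import torch

STATE_KEYS = ('map', 'lidar', 'goal', 'plan_len', 'robot_info')

def collate_state_batch(state_dicts, device):
    """Turn a list of per-step state dicts into a dict of batched FloatTensors."""
    return {
        key: torch.FloatTensor([s[key] for s in state_dicts]).to(device)
        for key in STATE_KEYS
    }

With this helper, _state_batch, _next_state_batch, and the expert batch can each be built with a single call instead of a hand-written loop plus per-key tensor conversions.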
Example #7
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, compute_weights=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.compute_weights = compute_weights

        # Algorithms to enable during training
        self.PrioritizedReplayBuffer = True  # Use False to enable uniform sampling
        self.HardTargetUpdate = True  # Use False to enable soft target update

        # Build the policy and target Q-networks for the agent; the target Q-network is held
        # (nearly) frozen and updated only slowly to avoid training instability.
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)  # main policy network
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)  # target network
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.criterion = nn.MSELoss()

        # Replay memory
        # building the experience replay memory used to avoid training instability issues
        # Below: PER
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, seed,
                                   compute_weights)

        # Below: Uniform by method defined in this script
        #self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY
        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                sampling = self.memory.sample()
                self.learn(sampling, GAMMA)
        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, state, eps=0.):
        """A function to select an action based on the Epsilon greedy policy. Epislon percent of times the agent will select a random
        action while 1-Epsilon percent of the time the agent will select the action with the highest Q value as predicted by the
        neural network.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Calculate action values (Q values)
        self.qnetwork_local.eval()  # eval() disables dropout, batch-norm updates, etc.
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # switch the network back to training mode

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, sampling, gamma):
        """Update value parameters using given batch of experience tuples.
        Function for training the neural network; it updates the weights of the network.

        Params
        ======
            sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, weights, indices = sampling

        # Max predicted Q values for the next states, from the target network
        q_target = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # TD targets for the current states
        expected_values = rewards + gamma * q_target * (1 - dones)
        # Q values predicted by the local network for the actions actually taken
        output = self.qnetwork_local(states).gather(1, actions)
        # Loss function: mean squared error between predictions and TD targets
        loss = F.mse_loss(output, expected_values)
        if self.compute_weights:
            with torch.no_grad():
                weight = sum(np.multiply(weights, loss.data.cpu().numpy()))
            loss *= weight
        # Minimizing the loss by optimizer
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # ------------------- update priorities ------------------- #
        delta = abs(expected_values - output.detach()).cpu().numpy()
        #print("delta", delta)
        self.memory.update_priorities(delta, indices)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self):
        """Copy the local network weights into the target network directly
        (intended for use when self.HardTargetUpdate is enabled)."""
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def load_models(self, policy_net_filename, target_net_filename):
        """ Function to load the parameters of the policy and target models """
        print('Loading model...')
        self.qnetwork_local.load_model(policy_net_filename)
        self.qnetwork_target.load_model(target_net_filename)
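
For reference, prioritized experience replay is usually combined with per-sample importance-sampling weights applied before the loss is reduced, rather than rescaling the already-reduced loss as the compute_weights branch above does. A sketch of that standard formulation (a common alternative, not the author's code):

import torch
import torch.nn.functional as F

def weighted_td_loss(q_expected, q_targets, is_weights):
    """Importance-sampling-weighted TD loss for prioritized replay.

    q_expected, q_targets: tensors of shape (batch_size, 1)
    is_weights: per-sample weights w_i = (1 / (N * P(i)))**beta, normalized by max w_i,
                shaped (batch_size, 1)
    """
    elementwise_loss = F.mse_loss(q_expected, q_targets, reduction='none')
    return (is_weights * elementwise_loss).mean()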
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        # use helper calc_loss function & not this directly
        self.criterion = nn.MSELoss()

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # first unsqueeze to make it a batch with one sample in it
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        # set the mode back to training mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            # model output must be Q values for actions 0..(n-1), with output[ind_x] corresponding to action_x
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    # helper for learn func
    # y_j does not depend on the weight parameters that gradient descent will be training
    def calc_y_j(self, r_j, dones, gamma, target_out):
        # gamma-or-0 flag: if the episode doesn't terminate at j+1 (done == False) the entry is gamma,
        # so the y_j product below already includes the discount factor; if done, the entry is 0
        # use [[x] for x in y] kind of list comprehension because need them to be batch_size by 1 like r_j
        # use .to(device) to move to gpu (not just setting device arg when creating a tensor)
        dones_flags = torch.Tensor([[0] if done == True else [gamma] for done in dones]).float().to(device)
        max_q_target_out = torch.Tensor([[torch.max(q_for_all_actions)] for q_for_all_actions in target_out]).float().to(device)
        
        #  RuntimeError: Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead.
        #dones_flags = torch.from_numpy(np.vstack([0 if done == True else gamma for done in dones])).float().to(device)
        #max_q_target_out = torch.from_numpy(np.vstack([torch.max(q_for_all_actions) for q_for_all_actions in target_out])).float().to(device)
        
        y_j = r_j + dones_flags * max_q_target_out
        return y_j
    
    # helper for learn func
    def calc_loss(self, y_j, pred_out, actions):
        # need pred_out_actions_taken to be a tensor & built by combining (concatenating) other tensors to maintain gradients
        # actions is batch_size by 1- only have to iterate through rows
        for i in range(actions.shape[0]):
            # action taken. is an index for which col to look at in pred_out (pred_out is batch_size by n_actions]
            action_ind = actions[i, 0].item()
            if i == 0:
                # need to take h from 0 dimensional to 2 dimensional
                pred_out_actions_taken = pred_out[i, action_ind].unsqueeze(0).unsqueeze(0)
            else: 
                # concat along dim 0 -> vertically stack rows
                pred_out_actions_taken = torch.cat((pred_out_actions_taken, pred_out[i, action_ind].unsqueeze(0).unsqueeze(0)), dim=0)
            
        # loss is MSE between pred_out_actions_taken (input) and y_j (target) 
        return self.criterion(pred_out_actions_taken, y_j)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        # torch
        states, actions, rewards, next_states, dones = experiences

        "*** YOUR CODE HERE ***"
        ## TODO: compute and minimize the loss
        # vstack takes one argument (sequence)- stacks the pieces of that argument vertically
        # SELF NOTE: think about what (if anything additional) needs .todevice()
        
        # make sure to zero the gradients
        self.optimizer.zero_grad()
        
        # q_network_local model output from forward pass
        pred_out = self.qnetwork_local(states)
        target_out = self.qnetwork_target(next_states)
        
        # compute the loss for q_network_local vs q_network_target 
        y_j = self.calc_y_j(rewards, dones, gamma, target_out)
        # calc gradient & take step down the gradient
        loss = self.calc_loss(y_j, pred_out, actions)
        loss.backward()
        self.optimizer.step()
                      

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
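
The calc_loss helper above concatenates one prediction per row to keep gradients intact; torch.gather performs the same selection in a single call, as the other examples in this collection do. A drop-in sketch of an equivalent method (illustrative, not part of the original source):

    def calc_loss_gather(self, y_j, pred_out, actions):
        # Select, for each row, the predicted Q value of the action that was taken;
        # actions is a (batch_size, 1) tensor of int64 indices, so gather keeps the (batch_size, 1) shape
        pred_out_actions_taken = pred_out.gather(1, actions)
        return self.criterion(pred_out_actions_taken, y_j)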
Example #9
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    # save sample (error,<s,a,r,s'>) to the replay memory
    def _append_sample(self, state, action, reward, next_state, done):

        state_on_device = torch.from_numpy(state).float().unsqueeze(0).to(
            device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state_on_device)
            expected_Q = action_values[(0, action)]
        self.qnetwork_local.train()

        next_state_on_device = torch.from_numpy(next_state).float().unsqueeze(
            0).to(device)
        self.qnetwork_target.eval()
        with torch.no_grad():
            predicted_Q = self.qnetwork_target(next_state_on_device)
            max_predicted_Q = predicted_Q.max(1)[0]
        self.qnetwork_target.train()

        target_Q = reward + (GAMMA * max_predicted_Q * (1 - done))
        error = expected_Q - target_Q
        error = abs(error.cpu().data.numpy()[0])

        self.memory.add(state, action, reward, next_state, done)

        # Simplified Prioritized Experience Replay
        # add additional samples if the error is large
        if error > 1.:
            error = pow(error, 0.6)
        count = int(round(error, 0))
        for _ in range(count):
            self.memory.add(state, action, reward, next_state, done)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        if USE_SPER:
            self._append_sample(state, action, reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # get the expected Q value using the local model
        expected_Q = self.qnetwork_local(states).gather(1, actions)

        if USE_DDQN:
            # ----- DDQN
            local_next_state_max_action = self.qnetwork_local(
                next_states).detach().argmax(1).unsqueeze(1)
            max_predicted_Q = self.qnetwork_target(
                next_states).detach().gather(1, local_next_state_max_action)
        else:
            # ---- Vanilla DQN
            max_predicted_Q = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # get the target Q using current state
        target_Q = rewards + (gamma * max_predicted_Q * (1 - dones))

        # get the loss
        loss = F.mse_loss(expected_Q, target_Q)

        # minimise the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
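
This agent branches on two module-level flags, USE_SPER and USE_DDQN, that are defined outside the snippet. A minimal sketch of the assumed configuration, alongside the usual BUFFER_SIZE/BATCH_SIZE/GAMMA/TAU/LR/UPDATE_EVERY constants sketched earlier:

USE_SPER = True   # duplicate high-TD-error transitions in the buffer (simplified prioritized replay)
USE_DDQN = True   # pick next-state actions with the local network, evaluate them with the target network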
Example #10
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3 , lr=5e-4, double_dqn=True, huber_loss=False, update_every=4):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn

        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every

        if huber_loss:
            self.loss_function = F.smooth_l1_loss
        else:
            self.loss_function = F.mse_loss

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
        """
        states, actions, rewards, next_states, dones = experiences
        if self.double_dqn:
            # Get predicted Q values from target model using the arg_max predicted by a local model
            argmax_actions = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, argmax_actions)
        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss =  self.loss_function(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)                     

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
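
A short sketch of how an agent like the one above is typically driven with decaying epsilon-greedy exploration (env is assumed to be a Gym-style environment with a discrete action space, and the sizes and schedule below are placeholders, not values from the original source):

agent = Agent(state_size=8, action_size=4, seed=0, double_dqn=True, huber_loss=True)

eps, eps_end, eps_decay = 1.0, 0.01, 0.995
for episode in range(2000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)                        # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store transition and learn periodically
        state = next_state
    eps = max(eps_end, eps_decay * eps)                       # decay exploration over time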
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())  #exploitation
        else:
            return random.choice(np.arange(self.action_size))  #exploration

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        """
        for i in range(BATCH_SIZE):
            if not dones[i]:
                max_val = self.qnetwork_target(next_states[i])
                best_val = max_val.argmax()
                target = rewards[i] + gamma*(max_val[best_val])
            else:
                target = rewards[i]
            current = self.qnetwork_local(states[i])[actions[i]]
            #current = self.qnetwork_local(states).gather(-1, actions.reshape(actions.size()[0], 1))
            self.loss = F.mse_loss(target, current)
            #self.loss.requires_grad = True
            self.optimizer.zero_grad()
            self.loss.backward()
            self.optimizer.step()
        """
        # Max predicted Q values for the next states from the (frozen) target network
        max_val = self.qnetwork_target(next_states).detach()
        best_val = max_val.argmax(dim=-1, keepdim=True)
        max_val = max_val.gather(-1, best_val)
        target = rewards + gamma * max_val * (1 - dones)
        current = self.qnetwork_local(states).gather(
            -1, actions.reshape(actions.size()[0], 1))
        self.loss = F.mse_loss(current, target)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
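# ---------------------------------------------------------------------------
# A small standalone sketch (illustrative, with made-up _demo_* names): it
# checks on toy tensors that the vectorized TD target used in learn() above,
#     target = rewards + gamma * max_a' Q_target(s', a') * (1 - done),
# matches the per-sample loop kept in the commented-out block.
import torch


def _demo_td_targets():
    torch.manual_seed(0)
    gamma = 0.99
    q_next = torch.randn(4, 3)                      # stand-in for qnetwork_target(next_states)
    rewards = torch.randn(4, 1)
    dones = torch.tensor([[0.], [1.], [0.], [0.]])

    # vectorized form, as in learn() above
    vec = rewards + gamma * q_next.max(dim=1, keepdim=True)[0] * (1 - dones)

    # equivalent per-sample loop
    loop = torch.empty(4, 1)
    for i in range(4):
        loop[i] = rewards[i] if dones[i] else rewards[i] + gamma * q_next[i].max()

    assert torch.allclose(vec, loop)
    return vec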
Example #12
class Agent():

    """
    Initialize Agent, including:
        DQN Hyperparameters
        Local and Target State-Action Policy Networks
        Replay Memory Buffer from the ReplayBuffer class (defined below)
    """
    def __init__(self, state_size, action_size, dqn_type='DQN', replay_memory_size=1e5, batch_size=64, gamma=0.99,
                 learning_rate=1e-3, target_tau=2e-3, update_rate=4, seed=0):
        
        """
        DQN Agent Parameters
        ====== 
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): can be either 'DQN' for vanillia dqn learning (default) or 'DDQN' for double-DQN.
            replay_memory size (int): size of the replay memory buffer (typically 5e4 to 5e6)
            batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128)
            gamma (float): paramete for setting the discoun ted value of future rewards (typically .95 to .995)
            learning_rate (float): specifies the rate of model learing (typically 1e-4 to 1e-3))
            seed (int): random seed for initializing training point.
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)

        """
        # DQN Agent Q-Network
        # For DQN training, two nerual network models are employed;
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate.
        # The slower modulation of the target network weights operates to stablize learning.
        """
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0


    ########################################################
    # STEP() method
    #
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)


    ########################################################
    # ACT() method
    #
    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))


    ########################################################
    # LEARN() method
    # Update value parameters using given batch of experience tuples.
    def learn(self, experiences, gamma, DQN=True):
        
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get Q values for the current (state, action) pairs from the local (online) network
        Qsa = self.network(states).gather(1, actions)


        if (self.dqn_type == 'DDQN'):
            # Double DQN
            # ************************
            # Select the best next action with the online network, evaluate it with the target network
            Qsa_prime_actions = self.network(next_states).detach().max(1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(next_states).detach().gather(1, Qsa_prime_actions)

        else:
            # Regular (Vanilla) DQN
            # ************************
            # Get max Q values for (s', a') from the target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        
        # Compute Q targets for current states 
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))
        
        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)


    ########################################################
    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """
    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
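# ---------------------------------------------------------------------------
# A small standalone sketch (illustrative, not taken from the source): the only
# difference between the 'DQN' and 'DDQN' branches of learn() above is how the
# next-state value is formed. With toy Q rows this shows the decoupling: double
# DQN picks the action with the online network and evaluates it with the target
# network, which is what curbs the max-operator overestimation bias.
import torch


def _demo_ddqn_target():
    q_online_next = torch.tensor([[1.0, 5.0, 2.0]])   # online net, next state
    q_target_next = torch.tensor([[4.0, 0.5, 3.0]])   # target net, next state

    # vanilla DQN: max over the target network itself
    vanilla = q_target_next.max(1)[0].unsqueeze(1)     # -> 4.0

    # double DQN: argmax from the online network, value from the target network
    a_prime = q_online_next.max(1)[1].unsqueeze(1)     # action index 1
    double = q_target_next.gather(1, a_prime)          # -> 0.5

    return vanilla, double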
class Agent():
    """ Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """ Initialize an Agent object.
        
            INPUTS: 
            ------------
                state_size - (int) dimension of each state
                action_size - (int) dimension of each action
                seed - (int) random seed
            
            OUTPUTS:
            ------------
                no direct
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.
        
            INPUTS: 
            ------------
                state - (array_like) the previous state of the environment (8,)
                action - (int) the agent's previous choice of action
                reward - (float) last reward received
                next_state - (torch tensor) the current state of the environment
                done - (bool) whether the episode is complete (True or False)
            
            OUTPUTS:
            ------------
                no direct
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                #print('experiences')
                #print(experiences)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """ Returns actions for given state as per current policy.
        
            INPUTS:
            ------------
                state - (numpy array_like) current state
                eps - (float) epsilon, for epsilon-greedy action selection

            OUTPUTS:
            ------------
                act_select - (int) next epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            act_select = np.argmax(action_values.cpu().data.numpy())
            return act_select
        else:
            act_select = random.choice(np.arange(self.action_size))
            return act_select

    def learn(self, experiences, gamma):
        """ Update value parameters using given batch of experience tuples.

            INPUTS:
            ------------
                experiences - (Tuple[torch.Variable]) tuple of (s, a, r, s', done) tuples 
                gamma - (float) discount factor

            OUTPUTS:
            ------------
        """
        states, actions, rewards, next_states, dones = experiences

        ## Compute and minimize the loss

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        #print(Q_targets_next)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        #print(Q_targets)

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        #print(Q_expected)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters.
            θ_target = τ*θ_local + (1 - τ)*θ_target

            INPUTS:
            ------------
                local_model - (PyTorch model) weights will be copied from
                target_model - (PyTorch model) weights will be copied to
                tau - (float) interpolation parameter 
                
            OUTPUTS:
            ------------
                no direct
                
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #14
class Agent():
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, random_seed)
        self.qnetwork_target = QNetwork(state_size, action_size, random_seed)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Move to GPU if CUDA is available
        if train_on_gpu:
            self.qnetwork_local.cuda()
            self.qnetwork_target.cuda()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0)
        if train_on_gpu:
            state = state.cuda()
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        max_action = np.argmax(action_values.cpu().data.numpy())
        policy_s = np.ones(self.action_size) * eps / self.action_size
        policy_s[max_action] = 1 - eps + (eps / self.action_size)
        action = np.random.choice(np.arange(self.action_size), p=policy_s)
        return action

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute loss
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----------------------- update target network ----------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
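# ---------------------------------------------------------------------------
# A small standalone sketch (illustrative): the act() method above samples from
# an explicit epsilon-soft distribution instead of the hard if/else used by the
# other examples. For a given eps and greedy action this rebuilds the same
# probability vector and checks that it is a valid distribution.
import numpy as np


def _demo_eps_soft_policy(action_size=4, eps=0.1, max_action=2):
    policy_s = np.ones(action_size) * eps / action_size
    policy_s[max_action] = 1 - eps + (eps / action_size)
    assert np.isclose(policy_s.sum(), 1.0)
    # every action keeps probability eps/|A|; the greedy action gets the rest
    return policy_s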
Example #15
class Double_DQNAgent(object):
    """Interacts with and learns from the environment."""
    def __init__(self, args, state_size, action_size):
        """Initialize an Agent object.

        Args:
            param1  (args)  : command line arguments
            param2  (int) : state size environment
            param3  (int)   : dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(args.seed)
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.update_every = args.update_every
        self.memory_size = args.buffer_size
        self.gamma = args.discount
        self.lr = args.lr
        self.device = args.device
        # Q-Network (built on the configured device)
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       args.hidden_size_1,
                                       args.hidden_size_2,
                                       args.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        args.hidden_size_1,
                                        args.hidden_size_2,
                                        args.seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def act_e_greedy(self, state, epsilon=0.001):
        """ acts with epsilon greedy policy
            epsilon exploration vs exploitation traide off
        Args:
           param1(int): state
           param2(float): epsilon
        Return : action int number between 0 and 4
        """
        return random.choice(np.arange(
            self.action_size)) if np.random.random() < epsilon else self.act(
                state)

    def act(self, state):
        """
        acts greedy(max) based on a single state
        Args:
            param1 (int) : state
        """
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, mem):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        idxs, states, actions, rewards, next_states, nonterminals, weights = mem.sample(
            self.batch_size)

        states = states.squeeze(1)
        q_values = self.qnetwork_local(states)
        next_q_values = self.qnetwork_target(next_states)

        next_q_values = next_q_values.squeeze(1)
        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]

        nonterminals = nonterminals.squeeze(1)

        expected_q_value = rewards + (self.gamma * next_q_value * nonterminals)
        loss = F.mse_loss(q_value, expected_q_value)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        loss = (q_value - expected_q_value.detach()).pow(2) * weights
        prios = loss + 1e-5
        mem.update_priorities(idxs,
                              prios.detach().cpu().numpy()
                              )  # Update priorities of sampled transitions

        # ------------------- update target network ------------------- #
        self.soft_update()

    def soft_update(self, tau=1e-3):
        """ swaps the network weights from the online to the target

        Args:
            param1 (float): tau
        """

        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_target_net(self):
        """ copy the model weights from the online to the target network """
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def save(self, path):
        """ save the model weights to a file
        Args:
            param1 (string): pathname
        """
        torch.save(self.qnetwork_local.state_dict(),
                   os.path.join(path, 'model.pth'))

    def train(self):
        """     activates the backprob. layers for the online network """
        self.qnetwork_local.train()

    def eval(self):
        """ invoke the eval from the online network
            deactivates the backprob
            layers like dropout will work in eval model instead
        """
        self.qnetwork_local.eval()
Example #16
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 behavior_name,
                 index_player,
                 replay_memory_size=1e4,
                 batch_size=100,
                 gamma=0.99,
                 learning_rate=1e-4,
                 target_tau=1e-3,
                 update_rate=10,
                 seed=0):
        self.state_size = state_size
        self.current_state = []
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        self.behavior_name = behavior_name
        self.index_player = index_player
        self.close_ball_reward = 0
        self.touch_ball_reward = 0
        """
        Now we define two models: 
        (a) one netwoek will be updated every (step % update_rate == 0),
        (b) A target network, with weights updated to equal to equal to the network (a) at a slower (target_tau) rate.
        """

        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step ( for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def load_model(self, path_model, path_target=None):
        self.network.load_state_dict(torch.load(path_model))
        if path_target is not None:
            self.target_network.load_state_dict(torch.load(path_target))

    def choose_action(self, state, eps=0.0):
        eps = -1  # overrides the argument: this method always acts greedily here
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()
                             )  # return a number from 0 to action_size
        else:
            return random.choice(np.arange(
                self.action_size))  # return a number from 0 to action_size

    def Read(self):
        decision_steps, terminal_steps = env.get_steps(self.behavior_name)
        try:
            signal_front = np.array(
                sensor_front_sig(
                    decision_steps.obs[0][self.index_player, :]))  # 3 x 11 x 8
            signal_back = np.array(
                sensor_back_sig(
                    decision_steps.obs[1][self.index_player, :]))  # 3 x 3 x 8
            # concatenate the front (3 x 11 x 8) and back (3 x 3 x 8) sensor grids
            r = np.concatenate((signal_front, signal_back), axis=1)
            self.current_state = r
            count_close_to_ball = 0
            count_touch_ball = 0
            count_back_touch = 0
            count_back_close = 0
            self.rew_d_to_our_post = 0
            self.rew_for_ball_dist = -0.1
            # Front Observation
            for i in range(len(signal_front[0])):
                if signal_front[0][i][0] == 1.0:
                    count_close_to_ball += 1
                    self.rew_for_ball_dist = max(
                        0.3 * (1 - signal_front[0][i][7]),
                        self.rew_for_ball_dist)

                    # Kicked the ball at the front
                    if signal_front[0][i][7] <= 0.03:
                        count_touch_ball += 1

                if signal_front[0][i][1] == 1.0:
                    self.rew_d_to_our_post = -0.1
                if signal_front[0][i][2] == 1.0:
                    self.rew_d_to_our_post = 0.1

            # Back observation
            for i in range(len(signal_back[0])):
                if signal_back[0][i][0] == 1.0:
                    count_back_close += 1

                    # Touches the ball at the back
                    if signal_back[0][i][7] <= 0.03:
                        count_back_touch += 1

            self.back_touch = 1 if count_back_touch > 0 else 0
            self.back_close = 1 if count_back_close > 0 else 0

            # add reward if kick the ball
            self.touch_ball_reward = 3 if count_touch_ball > 0 else 0
            # Penalize for back touching the ball
            if count_back_touch > 0:
                self.touch_ball_reward = -1.5

            # Penalize if the ball is not in view
            self.close_ball_reward = 0.1 if count_close_to_ball > 0 else -0.1
            # Penalize if the ball is behind the agent
            if count_back_close > 0:
                self.close_ball_reward = -0.1

            return self.current_state
        except Exception:
            self.touch_ball_reward = 0
            self.close_ball_reward = 0

        return self.current_state
Example #17
    class Agent():
        """Interacts with and learns from the environment."""
        def __init__(self, state_size, action_size, seed):
            """Initialize an Agent object.
            
            Params
            ======
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                seed (int): random seed
            """
            self.state_size = state_size
            self.action_size = action_size
            self.seed = random.seed(seed)

            # Q-Network
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)

            # Replay memory
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed, ALPHA)

            # Initialize time step (for updating every UPDATE_EVERY steps)
            self.t_step = 0

            # Initialize learning step for updating beta
            self.learn_step = 0

        def step(self, state, action, reward, next_state, done):
            # Save experience in replay memory
            self.memory.add(state, action, reward, next_state, done)

            # Learn every UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                # If enough samples are available in memory, get prioritized subset and learn
                if len(self.memory) > BATCH_SIZE:
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA, BETA)

        def act(self, state, eps=0.):
            """Returns actions for given state as per current policy.
            
            Params
            ======
                state (array_like): current state
                eps (float): epsilon, for epsilon-greedy action selection
            """
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()

            # Epsilon-greedy action selection
            if random.random() > eps:
                return np.argmax(action_values.cpu().data.numpy())
            else:
                return random.choice(np.arange(self.action_size))

        def learn(self, experiences, gamma, beta):
            """Update value parameters using given batch of experience tuples.

            Params
            ======
                experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
                gamma (float): discount factor
                beta (float): initial value for beta, which controls how much importance weights affect learning
            """
            states, actions, rewards, next_states, dones, probabilities, indices = experiences

            if double_dqn:
                # Get the Q values for each next_state, action pair from the
                # local/online/behavior Q network:
                Q_targets_next_local = self.qnetwork_local(
                    next_states).detach()
                # Get the corresponding best action for those next_states:
                _, a_prime = Q_targets_next_local.max(1)

                # Get the Q values from the target Q network but following a_prime,
                # which belongs to the local network, not the target network:
                Q_targets_next = self.qnetwork_target(next_states).detach()
                Q_targets_next = Q_targets_next.gather(1, a_prime.unsqueeze(1))

            else:
                # Get max predicted Q values (for next states) from target model
                Q_targets_next = self.qnetwork_target(
                    next_states).detach().max(1)[0].unsqueeze(1)

            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

            # Get expected Q values from local model
            Q_expected = self.qnetwork_local(states).gather(1, actions)

            # Compute and update new priorities
            new_priorities = (abs(Q_expected - Q_targets) +
                              EPSILON_PER).detach()
            self.memory.update_priority(new_priorities, indices)

            # Update beta parameter (b). By default beta will reach 1 after
            # 25,000 training steps (~325 episodes in the Banana environment):
            b = min(1.0, beta + self.learn_step * (1.0 - beta) / BETA_ITERS)
            self.learn_step += 1

            # Compute and apply importance sampling weights to TD Errors
            ISweights = (((1 / len(self.memory)) * (1 / probabilities))**b)
            max_ISweight = torch.max(ISweights)
            ISweights /= max_ISweight
            Q_targets *= ISweights
            Q_expected *= ISweights

            # Compute loss
            loss = F.mse_loss(Q_expected, Q_targets)
            # Minimize the loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # ------------------- update target network ------------------- #
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        def soft_update(self, local_model, target_model, tau):
            """Soft update model parameters.
            θ_target = τ*θ_local + (1 - τ)*θ_target

            Params
            ======
                local_model (PyTorch model): weights will be copied from
                target_model (PyTorch model): weights will be copied to
                tau (float): interpolation parameter 
            """
            for target_param, local_param in zip(target_model.parameters(),
                                                 local_model.parameters()):
                target_param.data.copy_(tau * local_param.data +
                                        (1.0 - tau) * target_param.data)
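# ---------------------------------------------------------------------------
# A small standalone sketch (illustrative, with toy probabilities standing in
# for the ones the prioritized buffer would return): the importance-sampling
# correction used in learn() above, w_i = (1 / (N * P(i)))**beta, normalized by
# the largest weight so the update scale never grows as beta anneals toward 1.
import numpy as np


def _demo_is_weights(beta=0.6):
    probabilities = np.array([0.5, 0.3, 0.15, 0.05])   # P(i) from priorities
    n = 10_000                                          # replay buffer size
    weights = (1.0 / (n * probabilities)) ** beta
    weights /= weights.max()                            # rarest sample gets weight 1
    return weights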
Example #18
def save_model(model: QNetwork, output_file: Path) -> None:
    """Save the weights of the trained agent"""

    torch.save(model.state_dict(), output_file)
Example #19
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, 64,
                                       64).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, 64,
                                        64).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.loss_fn = torch.nn.MSELoss()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0


    # torch.nn.utils.clip_grad_value_(self.qnetwork_local.parameters(), clip_value=1)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## Compute and minimize the loss

        #         # Get max predicted Q values (for next states) from target model
        #         Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        #         # Compute Q targets for current states
        #         Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        #         # Get expected Q values from local model
        #         Q_expected = self.qnetwork_local(states).gather(1, actions)

        #         # Compute loss
        #         loss = F.mse_loss(Q_expected, Q_targets)
        #         # Minimize the loss
        #         self.optimizer.zero_grad()
        #         loss.backward()
        #         self.optimizer.step()

        optimizer = self.optimizer
        loss_fn = self.loss_fn

        ## this is required as we're not learning qnetwork_targets weights
        #         with torch.no_grad():
        #             Q_target = rewards + gamma * (torch.max(self.qnetwork_target(next_states), dim=1)[0].view(64,1))*(1 - dones)
        #             Q_target[dones == True] = rewards[dones == True]
        #         Q_pred = torch.max(self.qnetwork_local(states), dim=1)[0].view(64,1)

        ## Double DQN
        # argmax over the local (online) network's next-state values
        best_actions_by_local_nn = torch.max(
            self.qnetwork_local(next_states).detach(), dim=1)[1].unsqueeze(1)
        action_values_by_target_nn = self.qnetwork_target(
            next_states).detach().gather(1, best_actions_by_local_nn)
        Q_target = rewards + gamma * action_values_by_target_nn * (1 - dones)

        Q_pred = self.qnetwork_local(states).gather(1, actions)

        optimizer.zero_grad()
        loss = loss_fn(Q_pred, Q_target)
        loss.backward()
        optimizer.step()

        #         print("Loss=", loss.item())
        #         print("Loss=", loss,
        #               "Local params L2=", torch.norm(self.qnetwork_local.parameters(), 2),
        #               "Local params grad L2=", torch.norm(self.qnetwork_local.parameters().grad, 2))

        #         with torch.no_grad():
        #             for param in self.qnetwork_local.parameters():
        #                 param -= learning_rate * param.grad

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDQNAgentPrioExpReplay:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=PARAM.LR)

        # Replay memory
        self.memory = PrioritizedReplayBuffer(action_size, 20000,
                                              PARAM.BATCH_SIZE, 0,
                                              PARAM.PROBABILITY_EXPONENT)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.eps = 1

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % PARAM.UPDATE_EVERY
        if self.t_step == 0:

            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > PARAM.BATCH_SIZE:
                experiences, experience_indices, importance_weights = self.memory.sample(
                )
                self.learn(experiences, experience_indices, importance_weights,
                           PARAM.GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        self.eps = eps
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def get_ddqn_targets(self, next_states, rewards, gamma, dones):
        # get best action according to online value function approximation
        q_online = self.qnetwork_local(next_states).detach()
        best_actions = q_online.argmax(1, keepdim=True)

        # get value of target function at the position of the best online action
        q_target = self.qnetwork_target(next_states).detach()
        q_target = q_target.gather(1, best_actions)

        # bootstrap the target q-value from the current reward
        Q_targets = rewards + (gamma * q_target * (1 - dones))

        return Q_targets

    def learn(self, experiences, experience_indices, importance_weights,
              gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences
        Q_targets = self.get_ddqn_targets(next_states, rewards, gamma, dones)

        # Get expected Q values
        q_exp = self.qnetwork_local(states)
        # print(q_exp)

        # gets the q values along dimension 1 according to the actions, which is used as index
        # >>> t = torch.tensor([[1,2],[3,4]])
        # >>> torch.gather(t, 1, torch.tensor([[0],[1]]))
        # tensor([[ 1],
        #        [ 4]])
        q_exp = q_exp.gather(1, actions)
        # print(q_exp)

        error = torch.abs(q_exp - Q_targets)

        with torch.no_grad():
            # update priority
            # we need ".cpu()" here because the values need to be copied to memory before converting them to numpy,
            # else they are just present in the GPU
            errors = np.squeeze(error.cpu().data.numpy())
            self.memory.set_priorities(experience_indices, errors)

        # compute loss
        squared_error = torch.mul(error, error)
        with torch.no_grad():
            w = torch.from_numpy(
                importance_weights**(1 - self.eps)).float().to(device)
            w = w.detach()

        squared_error = torch.squeeze(squared_error)
        weighted_squared_error = torch.mul(squared_error, w)
        loss = torch.mean(weighted_squared_error)
        #loss = F.mse_loss(q_exp, Q_targets)

        # reset optimizer gradient
        self.optimizer.zero_grad()
        # do backpropagation
        loss.backward()
        # do optimize step
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        # according to the algorithm in
        # https://proceedings.neurips.cc/paper/2010/file/091d584fced301b442654dd8c23b3fc9-Paper.pdf
        # one should update randomly in ether direction
        #update_direction = np.random.binomial(1, 0.5)
        #if update_direction == 0:
        #    self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU)
        #else:
        #    self.soft_update(self.qnetwork_target, self.qnetwork_local, PARAM.TAU)

        self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
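# ---------------------------------------------------------------------------
# A small standalone sketch (illustrative; the PrioritizedReplayBuffer used
# above is not shown, so the exact formula is an assumption): how absolute TD
# errors stored via set_priorities() typically become sampling probabilities,
# P(i) = p_i**alpha / sum_j p_j**alpha, with a small constant so no transition
# ever becomes unsampleable. alpha plays the role of PROBABILITY_EXPONENT.
import numpy as np


def _demo_priorities_to_probs(alpha=0.6, eps=1e-5):
    td_errors = np.array([0.9, 0.1, 0.4, 0.0])
    priorities = np.abs(td_errors) + eps
    probs = priorities ** alpha / np.sum(priorities ** alpha)
    assert np.isclose(probs.sum(), 1.0)
    return probs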
Example #21
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, buffer_size=int(1e5), batch_size=64,
                 gamma=0.99, tau=1e-3, lr=5e-4, update_every=4, 
                 double_dqn=False, dueling_dqn=False, prioritized_replay=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            double_dqn (bool): use double Q-network when 'True'
            dueling_dqn (bool): use dueling Q-network when 'True'
            prioritized_replay (bool): use prioritized replay buffer when 'True'
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.double_dqn = double_dqn
        self.dueling_dqn = dueling_dqn
        self.prioritized_replay = prioritized_replay

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, dueling_dqn=dueling_dqn).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, dueling_dqn=dueling_dqn).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, prioritized_replay=prioritized_replay)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for a given state as per the current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using a given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        if self.prioritized_replay:
            states, actions, rewards, next_states, dones, indices, weights = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q-values (for next states) from target model
        if self.double_dqn:
            # Use local model to choose an action, and target model to evaluate that action
            Q_local_max = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_local_max)
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q-targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q-values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        
        # Compute loss
        if self.prioritized_replay:
            # per-sample errors feed both the updated priorities and the
            # importance-sampling weighted loss
            elementwise_loss = F.mse_loss(Q_expected, Q_targets, reduction='none')
            priorities = np.sqrt(elementwise_loss.detach().cpu().numpy()).squeeze()
            self.memory.update_priorities(indices, priorities)
            is_weights = torch.as_tensor(weights, dtype=Q_expected.dtype,
                                         device=Q_expected.device).view(-1, 1)
            loss = (elementwise_loss * is_weights).mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)
            
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): where weights will be copied from
            target_model (PyTorch model): where weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
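# ---------------------------------------------------------------------------
# A small standalone sketch (illustrative): a direct check of the soft-update
# rule shared by every agent above, θ_target = τ*θ_local + (1 - τ)*θ_target,
# applied to two toy linear layers.
import torch
import torch.nn as nn


def _demo_soft_update(tau=1e-3):
    local, target = nn.Linear(4, 2), nn.Linear(4, 2)
    expected = tau * local.weight.data + (1.0 - tau) * target.weight.data
    for t_param, l_param in zip(target.parameters(), local.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)
    assert torch.allclose(target.weight.data, expected)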
Example #22
class SAC(object):
    def __init__(self,
                 num_inputs,
                 action_space,
                 args,
                 process_obs=None,
                 opt_level='O1'):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.device = torch.device("cuda" if args.cuda else "cpu")
        self.dtype = torch.float

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.process_obs = process_obs.to(self.device).to(self.dtype)
        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device).to(
                                   self.dtype)
        self.critic_optim = Adam(list(self.critic.parameters()) +
                                 list(process_obs.parameters()),
                                 lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device).to(
                                          self.dtype)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device,
                                             dtype=self.dtype)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size, action_space).to(
                                             self.device).to(self.dtype)
            self.policy_optim = Adam(list(self.policy.parameters()) +
                                     list(process_obs.parameters()),
                                     lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(
                num_inputs, action_space.shape[0], args.hidden_size,
                action_space).to(self.device).to(self.dtype)
            self.policy_optim = Adam(list(self.policy.parameters()) +
                                     list(process_obs.parameters()),
                                     lr=args.lr)

        if opt_level is not None:
            model, optimizer = amp.initialize([
                self.policy, self.process_obs, self.critic, self.critic_target
            ], [self.policy_optim, self.critic_optim],
                                              opt_level=opt_level)

    def select_action(self, obs, evaluate=False):
        with torch.no_grad():
            obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0).to(
                self.dtype)
            state = self.process_obs(obs)
            if evaluate is False:
                action, _, _ = self.policy.sample(state)
            else:
                _, _, action = self.policy.sample(state)
            action = action.detach().cpu().numpy()[0]
        return action

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        obs_batch, action_batch, reward_batch, next_obs_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        obs_batch = torch.FloatTensor(obs_batch).to(self.device).to(self.dtype)
        next_obs_batch = torch.FloatTensor(next_obs_batch).to(self.device).to(
            self.dtype)
        action_batch = torch.FloatTensor(action_batch).to(self.device).to(
            self.dtype)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1).to(self.dtype)
        mask_batch = torch.FloatTensor(mask_batch).to(
            self.device).unsqueeze(1).to(self.dtype)

        state_batch = self.process_obs(obs_batch)
        with torch.no_grad():
            next_state_batch = self.process_obs(next_obs_batch)
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        assert torch.isfinite(qf_loss).all()
        with amp.scale_loss(qf_loss, self.critic_optim) as qf_loss:
            qf_loss.backward()
        self.critic_optim.step()

        state_batch = self.process_obs(obs_batch)
        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch.detach(), pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        self.policy_optim.zero_grad()
        assert torch.isfinite(policy_loss).all()
        with amp.scale_loss(policy_loss, self.policy_optim) as policy_loss:
            policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device).to(self.dtype)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self,
                   actor_path=None,
                   critic_path=None,
                   process_obs_path=None):
        logger.debug(
            f'saving models to {actor_path} and {critic_path} and {process_obs_path}'
        )
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)
        torch.save(self.process_obs.state_dict(), process_obs_path)

    # Load model parameters
    def load_model(self,
                   actor_path=None,
                   critic_path=None,
                   process_obs_path=None):
        logger.info(
            f'Loading models from {actor_path} and {critic_path} and {process_obs_path}'
        )
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if process_obs_path is not None:
            self.process_obs.load_state_dict(torch.load(process_obs_path))
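The agent above calls `hard_update` and `soft_update`, which are not defined in this excerpt. A minimal sketch of what such helpers conventionally look like (Polyak averaging for the soft case) is given below; treat it as an assumption about the missing utilities, not as the original code.

def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)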
Example No. 23
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, lr_decay=0.9999):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        if USE_DUELING_NETWORK:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed, [128, 32],
                                                  [64, 32]).to(device)

            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed, [128, 32],
                                                   [64, 32]).to(device)
            self.qnetwork_target.eval()

        else:
            self.qnetwork_local = QNetwork(state_size,
                                           action_size,
                                           seed,
                                           fc1_units=128,
                                           fc2_units=32).to(device)

            self.qnetwork_target = QNetwork(state_size,
                                            action_size,
                                            seed,
                                            fc1_units=128,
                                            fc2_units=32).to(device)
            self.qnetwork_target.eval()

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, lr_decay)

        # Replay memory
        if USE_PRIORITIZED_REPLAY_BUFFER:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, w) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, w = experiences

        if USE_DOUBLE_DQN:
            # Double DQN: the local network selects the greedy actions,
            # the target network evaluates them.
            self.qnetwork_local.eval()
            with torch.no_grad():
                Q_local = self.qnetwork_local(next_states)
            greedy_actions = torch.argmax(Q_local, dim=1).unsqueeze(1)
            self.qnetwork_local.train()

            Q_targets_next = self.qnetwork_target(next_states).detach()
            Q_targets_next = Q_targets_next.gather(1, greedy_actions)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        if USE_PRIORITIZED_REPLAY_BUFFER:
            # TD error per sample; used both for the new priorities and for the loss
            td_error = Q_targets - Q_expected

            with torch.no_grad():
                # absolute TD error becomes the priority of each sampled transition
                self.memory.update_priorities(td_error.abs().squeeze())

            # importance-sampling-weighted squared TD error
            loss = (w * td_error.squeeze().pow(2)).mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Back-propagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
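For context, a typical interaction loop for an agent like the one above might look as follows. The environment, episode count, and epsilon schedule are illustrative assumptions (LunarLander-v2 happens to match the 8-dimensional state and 4 discrete actions used here), not part of the original example.

import gym

env = gym.make('LunarLander-v2')          # assumed environment (older Gym step API)
agent = Agent(state_size=8, action_size=4, seed=0)

eps = 1.0
for episode in range(1000):               # hypothetical number of episodes
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)           # hypothetical epsilon decay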
Example No. 24
    def __init__(self,
                 num_inputs,
                 action_space,
                 args,
                 process_obs=None,
                 opt_level='O1'):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.device = torch.device("cuda" if args.cuda else "cpu")
        self.dtype = torch.float

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.process_obs = process_obs.to(self.device).to(self.dtype)
        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device).to(
                                   self.dtype)
        self.critic_optim = Adam(list(self.critic.parameters()) +
                                 list(process_obs.parameters()),
                                 lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device).to(
                                          self.dtype)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device,
                                             dtype=self.dtype)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size, action_space).to(
                                             self.device).to(self.dtype)
            self.policy_optim = Adam(list(self.policy.parameters()) +
                                     list(process_obs.parameters()),
                                     lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(
                num_inputs, action_space.shape[0], args.hidden_size,
                action_space).to(self.device).to(self.dtype)
            self.policy_optim = Adam(list(self.policy.parameters()) +
                                     list(process_obs.parameters()),
                                     lr=args.lr)

        if opt_level is not None:
            model, optimizer = amp.initialize([
                self.policy, self.process_obs, self.critic, self.critic_target
            ], [self.policy_optim, self.critic_optim],
                                              opt_level=opt_level)
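Note that `amp.initialize` returns the patched models and optimizers, which the snippet above discards. A hedged sketch of how the end of this `__init__` could read instead, assuming Apex AMP's list-based API, is:

        if opt_level is not None:
            # Sketch only: reassign the objects patched by Apex AMP.
            models = [self.policy, self.process_obs, self.critic, self.critic_target]
            optimizers = [self.policy_optim, self.critic_optim]
            models, optimizers = amp.initialize(models, optimizers, opt_level=opt_level)
            self.policy, self.process_obs, self.critic, self.critic_target = models
            self.policy_optim, self.critic_optim = optimizers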
Example No. 25
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()  # switch the local net to eval mode
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # switch the local net back to train mode

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # Get max predicted Q values (for next states) from target model
        target_q_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        """
        # disregard action, get best value! 
        # why so many next states? answer: the qnetwork will return each corresponding next states action, the max will pick from each the           best action
        
        # explanation on detach (https://discuss.pytorch.org/t/detach-no-grad-and-requires-grad/16915/7)
        """
        # Compute Q targets for current states
        target_q = rewards + (gamma * target_q_next * (1 - dones))

        # Get expected Q values from local model
        expected_q = self.qnetwork_local(states).gather(1, actions)
        """
        this uses gather instead of detach like target since it only give a s*** to action taken
        # explanation on gather (https://stackoverflow.com/questions/50999977/what-does-the-gather-function-do-in-pytorch-in-layman-terms)
        """
        # Compute loss
        loss = F.mse_loss(expected_q, target_q)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
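Since the comments in `learn` lean on `torch.gather`, a tiny standalone illustration (with arbitrary values) of how it picks per-state Q values may help:

import torch

q = torch.tensor([[1.0, 5.0, 3.0],
                  [2.0, 0.5, 4.0]])       # Q values for 2 states and 3 actions
actions = torch.tensor([[1], [2]])        # action taken in each state
print(q.gather(1, actions))               # tensor([[5.0], [4.0]]) -> Q of the taken actions
print(q.max(1)[0].unsqueeze(1))           # tensor([[5.0], [4.0]]) -> max Q per state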
Example No. 26
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            # dim = 0: a batch of [30, ...] is split into [10, ...] chunks across 3 GPUs
            self.qnetwork_local = nn.DataParallel(self.qnetwork_local)

        if torch.cuda.is_available():
            print("using GPUs!")
            self.qnetwork_local.cuda()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # Double DQN: the local (online) network chooses the greedy action for
        # each next state, and the target network evaluates that choice.
        best_actions = self.qnetwork_local(next_states).detach().argmax(
            dim=1, keepdim=True)
        # Q(S_t+1, argmax_a Q(S_t+1, a, w), w^-)
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(
            1, best_actions)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        #logger.info('mse: {}'.format(delta))

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()  # apply the gradient update
        #logger.info('avg reward: {} mse:{}'.format(delta, np.mean(experiences.rewards())))

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example No. 27
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example No. 28
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(
                action_space.shape)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        if self.policy_type == "Gaussian":
            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).unsqueeze(0)
        if not eval:
            self.policy.train()
            action, _, _, _, _ = self.policy.evaluate(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.evaluate(state)

        #action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch)
        mask_batch = torch.FloatTensor(np.float32(mask_batch))

        reward_batch = reward_batch.unsqueeze(
            1)  # reward_batch = [batch_size, 1]
        mask_batch = mask_batch.unsqueeze(1)  # mask_batch = [batch_size, 1]
        """
        Use two Q-functions to mitigate positive bias in the policy improvement step that is known
        to degrade performance of value based methods. Two Q-functions also significantly speed
        up training, especially on harder task.
        """
        expected_q1_value, expected_q2_value = self.critic(
            state_batch, action_batch)
        new_action, log_prob, _, mean, log_std = self.policy.evaluate(
            state_batch)

        if self.automatic_entropy_tuning:
            """
            Alpha Loss
            """
            alpha_loss = -(self.log_alpha *
                           (log_prob + self.target_entropy).detach()).mean()
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
            self.alpha = self.log_alpha.exp()
            alpha_logs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.)
            alpha_logs = self.alpha  # For TensorboardX logs

        if self.policy_type == "Gaussian":
            """
            Including a separate function approximator for the soft value can stabilize training.
            """
            expected_value = self.value(state_batch)
            target_value = self.value_target(next_state_batch)
            next_q_value = reward_batch + mask_batch * self.gamma * target_value
        else:
            """
            There is no need in principle to include a separate function approximator for the state value.
            We use a target critic network for the deterministic policy and drop the value network entirely.
            """
            next_state_action, _, _, _, _, = self.policy.evaluate(
                next_state_batch)
            target_critic_1, target_critic_2 = self.critic_target(
                next_state_batch, next_state_action)
            target_critic = torch.min(target_critic_1, target_critic_2)
            next_q_value = reward_batch + mask_batch * self.gamma * target_critic
        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = F.mse_loss(expected_q1_value, next_q_value.detach())
        q2_value_loss = F.mse_loss(expected_q2_value, next_q_value.detach())
        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        if self.policy_type == "Gaussian":
            """
            Including a separate function approximator for the soft value can stabilize training and is convenient to 
            train simultaneously with the other networks
            Update the V towards the min of two Q-functions in order to reduce overestimation bias from function approximation error.
            JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            ∇JV = ∇V(st)(V(st) - Q(st,at) + (α * logπ(at|st)))
            """
            next_value = expected_new_q_value - (self.alpha * log_prob)
            value_loss = F.mse_loss(expected_value, next_value.detach())
        else:
            pass
        """
        Reparameterization trick is used to get a low variance estimator
        f(εt;st) = action sampled from the policy
        εt is an input noise vector, sampled from some fixed distribution
        Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        ∇Jπ = ∇log π + ([∇at (α * logπ(at|st)) − ∇at Q(st,at)])∇f(εt;st)
        """
        policy_loss = ((self.alpha * log_prob) - expected_new_q_value).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()

        policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        """
        We update the target weights to match the current value function weights periodically
        Update target parameter after every n(args.target_update_interval) updates
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)

        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)
        return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(
        ), policy_loss.item(), alpha_loss.item(), alpha_logs

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None,
                   value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
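A rough usage sketch for this SAC variant follows. Every concrete value, the environment, and the batch variables are assumptions made for illustration; they are not part of the original example.

from argparse import Namespace
import gym

# Hypothetical hyperparameters, roughly in the range used by SAC implementations.
args = Namespace(gamma=0.99, tau=0.005, alpha=0.2, lr=3e-4, hidden_size=256,
                 policy="Gaussian", target_update_interval=1,
                 automatic_entropy_tuning=False)

env = gym.make('Pendulum-v0')             # assumed continuous-control environment
agent = SAC(env.observation_space.shape[0], env.action_space, args)

state = env.reset()
action = agent.select_action(state)       # stochastic action during training

# state_batch, action_batch, ... are assumed to come from some replay buffer:
# losses = agent.update_parameters(state_batch, action_batch, reward_batch,
#                                  next_state_batch, mask_batch, updates=0)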
Example No. 29
class BEARQL(object):
    def __init__(self, num_inputs, action_space, args):
        self.gamma = args.gamma
        self.tau = args.tau

        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device)
        hard_update(self.critic_target, self.critic)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.policy_target = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(device)
        hard_update(self.policy_target, self.policy)

        # dual_lambda
        self.dual_lambda = args.init_dual_lambda
        self.dual_step_size = args.dual_step_size
        self.cost_epsilon = args.cost_epsilon

        # coefficient_weight assigned to ensemble variance term
        self.coefficient_weight = args.coefficient_weight

        self.dual_grad_times = args.dual_grad_times

    # used in evaluation
    def select_action(self, state):
        # sample multiple policies and perform a greedy maximization of Q over these policies
        with torch.no_grad():
            state = torch.FloatTensor(state.reshape(1, -1)).repeat(10, 1).to(device)
            # state = torch.FloatTensor(state.reshape(1, -1)).to(device)
            action, _, mean = self.policy.sample(state)
            # q1, q2 = self.critic(state, action)
            q1, q2, q3 = self.critic(state, action)
            ind = q1.max(0)[1]
        return action[ind].cpu().data.numpy().flatten()
        # return action.cpu().data.numpy().flatten()

    # MMD functions
    def compute_kernel(self, x, y, sigma):
        batch_size = x.shape[0]
        x_size = x.shape[1]
        y_size = y.shape[1]
        dim = x.shape[2]
        tiled_x = x.view(batch_size, x_size, 1, dim).repeat([1, 1, y_size, 1])
        tiled_y = y.view(batch_size, 1, y_size, dim).repeat([1, x_size, 1, 1])
        return torch.exp(-(tiled_x - tiled_y).pow(2).sum(dim=3) / (2 * sigma))

    def compute_mmd(self, x, y, sigma=20.):
        x_kernel = self.compute_kernel(x, x, sigma)
        y_kernel = self.compute_kernel(y, y, sigma)
        xy_kernel = self.compute_kernel(x, y, sigma)
        square_mmd = x_kernel.mean((1, 2)) + y_kernel.mean((1, 2)) - 2 * xy_kernel.mean((1, 2))
        return square_mmd

    def train(self, prior, memory, batch_size, m=4, n=4):

        # Sample replay buffer / batch
        state_np, action_np, reward_np, next_state_np, mask_np = memory.sample(batch_size=batch_size)
        state_batch = torch.FloatTensor(state_np).to(device)
        next_state_batch = torch.FloatTensor(next_state_np).to(device)
        action_batch = torch.FloatTensor(action_np).to(device)
        reward_batch = torch.FloatTensor(reward_np).to(device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_np).to(device).unsqueeze(1)

        # Critic Training
        with torch.no_grad():

            # Duplicate state 10 times
            next_state_rep = torch.FloatTensor(np.repeat(next_state_np, 10, axis=0)).to(device)

            # Soft Clipped Double Q-learning
            next_state_action, _, _ = self.policy_target.sample(next_state_rep)
            target_Q1, target_Q2, target_Q3 = self.critic_target(next_state_rep, next_state_action)
            target_cat = torch.cat([target_Q1, target_Q2, target_Q3], 1)
            target_Q = 0.75 * target_cat.min(1)[0] + 0.25 * target_cat.max(1)[0]
            target_Q = target_Q.view(batch_size, -1).max(1)[0].view(-1, 1)

            next_q_value = reward_batch + mask_batch * self.gamma * target_Q

        qf1, qf2, qf3 = self.critic(state_batch, action_batch)  # ensemble of k Q-functions
        q_loss = F.mse_loss(qf1, next_q_value) + F.mse_loss(qf2, next_q_value) + F.mse_loss(qf3, next_q_value)

        self.critic_optim.zero_grad()
        q_loss.backward()
        self.critic_optim.step()

        # Actor Training
        with torch.no_grad():
            state_rep_m = torch.FloatTensor(np.repeat(state_np, m, axis=0)).to(device)
            state_rep_n = torch.FloatTensor(np.repeat(state_np, n, axis=0)).to(device)

        for i in range(self.dual_grad_times):
            prior_a_rep, _, _ = prior.sample(state_rep_n)
            prior_a_rep = prior_a_rep.view(batch_size, n, -1)
            pi_rep, _, _ = self.policy.sample(state_rep_m)
            pi_rep = pi_rep.view(batch_size, m, -1)
            mmd_dist = self.compute_mmd(prior_a_rep, pi_rep)

            pi, _, _ = self.policy.sample(state_batch)
            qf1_pi, qf2_pi, qf3_pi = self.critic(state_batch, pi)
            qf_cat = torch.cat([qf1_pi, qf2_pi, qf3_pi], 1)
            # min_qf_pi = torch.min(qf1_pi, qf2_pi)  # used in TD3
            # use conservative estimate of Q as used in BEAR
            qf_mean = qf_cat.mean(1)
            qf_var = qf_cat.var(1)
            min_qf_pi = qf_mean - self.coefficient_weight * qf_var.sqrt()  # used in BEAR

            policy_loss = -(min_qf_pi - self.dual_lambda*mmd_dist).mean()

            self.policy_optim.zero_grad()
            policy_loss.backward()
            self.policy_optim.step()

            # Dual Lambda Training
            self.dual_gradients = mmd_dist.mean().item() - self.cost_epsilon
            self.dual_lambda += self.dual_step_size * self.dual_gradients
            self.dual_lambda = np.clip(self.dual_lambda, np.power(np.e, -5), np.power(np.e, 10))

        # Update Target Networks
        soft_update(self.critic_target, self.critic, self.tau)
        soft_update(self.policy_target, self.policy, self.tau)

        return q_loss.item(), policy_loss.item(), self.dual_lambda, mmd_dist.mean().item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/BEAR_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/BEAR_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
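To make the tensor shapes in `compute_mmd` concrete, here is a small check with arbitrary sizes; `agent` is assumed to be an instance of the BEARQL class above.

import torch

batch_size, n, m, act_dim = 8, 4, 4, 6
prior_actions = torch.randn(batch_size, n, act_dim)    # n prior actions per state
policy_actions = torch.randn(batch_size, m, act_dim)   # m policy actions per state

mmd = agent.compute_mmd(prior_actions, policy_actions, sigma=20.)
print(mmd.shape)   # torch.Size([8]) -> one squared-MMD estimate per state in the batch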
Example No. 30
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, checkpoint_path=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)

        # Load the model only when the checkpoint is available
        if checkpoint_path is not None:
            self.qnetwork_local.load_state_dict(
                torch.load(checkpoint_path, map_location=device))
            print("Checkpoint loaded successfully")

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        self.optimizer.zero_grad()
        #target
        with torch.no_grad():
            #Double DQN
            ddqn_max_indices = self.qnetwork_local(next_states).max(dim=1)[1]
            target_op = self.qnetwork_target(next_states)
            target_op = target_op.gather(1, ddqn_max_indices.view(-1, 1))
            '''
            # DQN
                target_op = self.qnetwork_target(next_states).max(dim=1)[0].view(-1,1)
            '''
            targets = rewards + target_op * (1 - dones) * gamma

        predictions = self.qnetwork_local(states)
        predictions = predictions.gather(1, actions.view(-1, 1))
        loss = torch.nn.MSELoss()(predictions, targets)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example No. 31
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 hidden_layers,
                 buffer_size=int(1e6),
                 batch_size=32,
                 gamma=.99,
                 tau=1,
                 lr=2.5e-4,
                 update_local=4,
                 update_target=10000,
                 ddqn=False,
                 seed=1):
        """Initialize Agent object

        Params
        ======
            state_size (int): Dimension of states
            action_size (int): Dimension of actions
            hidden_layers (list of ints): number of nodes in the hidden layers
            buffer_size (int): size of replay buffer
            batch_size (int): size of sample
            gamma (float): discount factor
            tau (float): (soft) update of target parameters
            lr (float): learning rate
            update_local (int): update local after every x steps
            update_target (int): update target after every x steps
            ddqn (boolean): Double Deep Q-Learning
            seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Hyperparameters
        self.buffer_size = buffer_size  # replay buffer
        self.batch_size = batch_size  # minibatch size
        self.gamma = gamma  # discount factor
        self.tau = tau  # (soft) update of target parameters
        self.lr = lr  # learning rate
        self.update_local = update_local  # update local network after every x steps
        self.update_target = update_target  # update target network with local network weights

        # Q Network
        self.qnet_local = \
            QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.qnet_target = \
            QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=lr)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

        # Initialize time step
        self.t_step = 0

        # Double Deep Q-Learning flag
        self.ddqn = ddqn

    def step(self, state, action, reward, next_state, done):

        # Save experience in replay buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE LOCAL time steps
        self.t_step += 1
        if self.t_step % self.update_local == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                sample = self.memory.sample()
                if self.t_step % self.update_target == 0:
                    do_target_update = True
                else:
                    do_target_update = False
                self.__learn(sample, self.gamma, do_target_update)

    def act(self, state, epsilon=0):
        """Returns action given a state according to local Q Network (current policy)

        Params
        ======
            state (array_like): current state
            epsilon (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnet_local.eval()
        with torch.no_grad():
            action_values = self.qnet_local(state)
        self.qnet_local.train()

        # Epsilon greedy action selection
        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def __learn(self, sample, gamma, do_target_update):
        """Update value parameters using given batch of sampled experiences tuples

        Params
        ======
            sample (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = sample

        if not self.ddqn:

            # Get max predicted Q values (for next states) from target model
            Q_targets_next = \
                self.qnet_target(next_states).detach().max(1)[0].unsqueeze(1)

        else:
            # Get actions (for next states) with max Q values from local net
            next_actions = \
                self.qnet_local(next_states).detach().max(1)[1].unsqueeze(1)

            # Get predicted Q values from target model
            Q_targets_next = \
                self.qnet_target(next_states).gather(1, next_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnet_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        if do_target_update:
            self.__target_net_update(self.qnet_local, self.qnet_target,
                                     self.tau)

    def __target_net_update(self, local_net, target_net, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param \
            in zip(target_net.parameters(), local_net.parameters()):
            target_param.data.\
                copy_(tau*local_param.data + (1.0 - tau)*target_param.data)

    def get_info(self):
        output = """
            Replay Buffer size: {} \n
            Batch size: {} \n
            Discount factor: {} \n
            tau: {} \n
            Learning Rate: {} \n
            Update local network after every {} steps \n
            Update target network with local network parameters after every {} steps \n
            DDQN: {}
        """
        print(
            output.format(self.buffer_size, self.batch_size, self.gamma,
                          self.tau, self.lr, self.update_local,
                          self.update_target, self.ddqn))
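A brief construction example for this agent; all of the sizes and hyperparameter values below are illustrative assumptions.

agent = Agent(state_size=37, action_size=4, hidden_layers=[64, 64],
              batch_size=64, gamma=0.99, tau=1e-3, lr=5e-4,
              update_local=4, update_target=1000, ddqn=True, seed=0)
agent.get_info()   # prints the hyperparameters summarized above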
Example No. 32
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        
        for i in range(len(states)):      # loop through every experience in the batch
            self.optimizer.zero_grad()    # zero out the gradient
            if dones[i]:
                y = rewards[i]            # terminal transition: the target is just the reward
            else:
                # otherwise y = immediate reward + gamma * (value of the best action in the next state)
                with torch.no_grad():
                    y = rewards[i] + gamma * self.qnetwork_target(next_states[i]).max()
            q = self.qnetwork_local(states[i])  # Q values of the current state from the local model
            z = (y - q[actions[i]])**2          # squared TD error for this experience
            z.backward()                        # compute the gradient
            self.optimizer.step()               # apply the update

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
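The per-sample loop in `learn` above corresponds to the batched update used by the other examples in this file. A hedged sketch of how the same body could be vectorized (one optimizer step over the whole batch instead of one per sample) is:

        # Vectorized sketch of learn(); assumes the same tensors unpacked above.
        with torch.no_grad():
            Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
            Q_targets = rewards + gamma * Q_targets_next * (1 - dones)
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()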