Example #1
 def __init__(self,
              env,
              lr,
              critic_lr,
              gamma,
              n,
              policy_path,
              critic_path,
              load=False):
     # Initializes A2C.
     # Args:
     # - env: The environment to train on.
     # - lr: Learning rate for the actor model.
     # - critic_lr: Learning rate for the critic model.
     # - gamma: Discount factor.
     # - n: The value of N in N-step A2C.
     # - policy_path: Path used to save/load the actor model.
     # - critic_path: Path used to save/load the critic model.
     # - load: Whether to load previously saved weights.
     Reinforce.__init__(self,
                        env,
                        lr,
                        gamma=gamma,
                        save_path=policy_path,
                        load=load)
     self.critic_path = critic_path
     s_len = self.env.observation_space_shape[0]
     self.critic = CriticNet(critic_lr, s_len=s_len)
     self.n = n
     if load:
         self.critic.load(self.critic_path)
     print(
         "Hyperparameters:\nPolicy LR = {} Critic LR = {} Gamma = {} N = {} \nPolicy Path = {} \nCritic Path = {} \nLoad = {}"
         .format(lr, critic_lr, gamma, n, policy_path, critic_path, load))
     return
Example #2
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 seed,
                 LR=5e-4,
                 gamma=0.95,
                 entropy_weight=0.02,
                 actor_network_max_grad_norm=5,
                 critic_network_max_grad_norm=5,
                 nstepqlearning_size=5,
                 gae_lambda=1.0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            LR (float): learning rate
            GAMMA (float): factor used to discount future values
            entropy_weight (float): weight of the entropy value using with the entropy_loss
            actor_network_max_grad_norm (float): threshold value used in gradient clipping in the actor model 
            critic_network_max_grad_norm (float): threshold value used in gradient clipping in the critic model 
            nstepqlearning_size (int): the number of steps used for the N-step bootstrapping algorithm
            gae_lambda (float): lambda used in GAE algorithm to use as discount factor in getting a mixture of every available estimated N-step bootstrapping results (from 1-5). 

        """
        self.state_size = state_size
        self.action_size = action_size
        self.entropy_weight = entropy_weight
        random.seed(seed)
        self.gamma = gamma
        self.actor_network_max_grad_norm = actor_network_max_grad_norm
        self.critic_network_max_grad_norm = critic_network_max_grad_norm
        self.nstepqlearning_size = nstepqlearning_size
        self.gae_lambda = gae_lambda
        self.device = device

        print("----Dumping agent hyperparameters---- ")
        print("LR: ", LR)
        print("gamma: ", gamma)
        print("actor_network_max_grad_norm: ",
              self.actor_network_max_grad_norm)
        print("critic_network_max_grad_norm: ",
              self.critic_network_max_grad_norm)
        print("nstepqlearning_size: ", self.nstepqlearning_size)
        print("gae_lambda: ", self.gae_lambda)
        print("entropy_weight: ", self.entropy_weight)
        print("------------------------------------- ")

        self.actor_net = ActorNet(state_size, action_size, device,
                                  seed).to(self.device)  # Theta
        self.critic_net = CriticNet(state_size, action_size,
                                    seed).to(self.device)  # Thetav
        self.actor_optimizer = optim.RMSprop(self.actor_net.parameters(),
                                             lr=LR)
        self.critic_optimizer = optim.RMSprop(self.critic_net.parameters(),
                                              lr=LR)
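
The entropy_weight hyperparameter above is later combined with the policy loss (see Example #9, where the same class appears in full). A minimal, self-contained sketch of that combination on synthetic tensors, offered only as an illustration:

# Illustrative sketch of the entropy-regularized actor objective controlled by
# entropy_weight; the tensors below are synthetic stand-ins, not agent outputs.
import torch

entropy_weight = 0.02
log_probs = torch.log(torch.tensor([0.30, 0.45, 0.25]))  # log pi(a|s) of sampled actions
advantages = torch.tensor([0.8, -0.2, 1.1])              # advantage estimates
entropy = torch.tensor([1.05, 0.98, 1.10])               # per-step policy entropy

policy_loss = -(log_probs * advantages).mean()
entropy_loss = entropy.mean()
loss = policy_loss - entropy_weight * entropy_loss       # same form as in Example #9
print(loss.item())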
Example #3
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Critic Network (w/ Target Network)
        self.critic_local = CriticNet(state_size, action_size, random_seed).to(device)
        self.critic_target = CriticNet(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
Example #4
    def __init__(self, state_size, action_size, fc1_units, fc2_units, seed,
                 gamma, lr_actor, lr_critic, tau, buffer_size, batch_size,
                 weight_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int):                           dimension of each state
            action_size (int):                          dimension of each action
            fc1_units (int):                            number of nodes in layer 1 of neural network
            fc2_units (int):                            number of nodes in layer 2 of neural network
            seed (int):                                 seed
            gamma (float):                              discount parameter
            lr_actor (float):                           learning rate for Actor
            lr_critic (float):                          learning rate for Critic
            tau (float):                                interpolation parameter
            buffer_size (int):                          size of memory buffer
            batch_size (int):                           number of experiences to sample during learning
            weight_decay (float):                       weight decay parameter
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.tau = tau
        self.batch_size = batch_size

        # Neural Network Params
        self.actor_target = ActorNet(state_size, action_size, fc1_units,
                                     fc2_units, seed).to(device)
        self.actor_local = ActorNet(state_size, action_size, fc1_units,
                                    fc2_units, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_target = CriticNet(state_size, action_size, fc1_units,
                                       fc2_units, seed).to(device)
        self.critic_local = CriticNet(state_size, action_size, fc1_units,
                                      fc2_units, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Memory buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
Example #5
    def __init__(self, state_size, action_size, num_agents, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents  #2
        self.seed = random.seed(seed)

        # Actor Network
        self.actor_local = ActorNet(state_size, action_size, seed).to(device)
        self.actor_target = ActorNet(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network
        self.critic_local = CriticNet(state_size, action_size, seed).to(device)
        self.critic_target = CriticNet(state_size, action_size,
                                       seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # Q-Network
        #self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        #self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        #self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Noise process (Instead of epsilon in DQN) - taken from example
        self.noise = G_Noise((num_agents, action_size),
                             seed,
                             sigma=NOISE_SIGMA)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
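
The G_Noise class used above is not shown in this example. A hypothetical sketch of a Gaussian noise helper with the same constructor shape follows; treat the interface and defaults as assumptions rather than the repository's actual implementation:

# Hypothetical sketch of a Gaussian noise helper like G_Noise above; the real
# class is not shown here, so the interface and behavior are assumptions.
import numpy as np

class GaussianNoise:
    def __init__(self, shape, seed, mu=0.0, sigma=0.1):
        self.shape = shape          # e.g. (num_agents, action_size)
        self.mu = mu
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)

    def sample(self):
        # Independent Gaussian noise per agent and per action dimension.
        return self.mu + self.sigma * self.rng.standard_normal(self.shape)

noise = GaussianNoise((2, 4), seed=0, sigma=0.2)
print(noise.sample().shape)  # (2, 4)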
Example #6
def train(train_env_id: str,
          eval_env_id: str,
          logdir: str,
          cfg: ExperimentConfig,
          save_path: str,
          pretrain_path: Optional[str] = None) -> DDPGAgent:
    pretrain = torch.load(pretrain_path) \
               if pretrain_path is not None \
               else None
    env = set_env_metadata(train_env_id, cfg)
    train_env = make_vec_env(train_env_id,
                             num_envs=cfg.episodes_per_cycle,
                             no_timeout=True,
                             seed=cfg.seed)
    eval_env = make_vec_env(eval_env_id,
                            num_envs=cfg.num_eval_envs,
                            no_timeout=True,
                            seed=cfg.seed + 100)
    replay = HERReplayBuffer(cfg=cfg)
    tf_logger = TensorboardLogger(logdir)
    actor = ActorNet(obs_dim=cfg.obs_dim,
                     goal_dim=cfg.goal_dim,
                     action_dim=cfg.action_dim,
                     action_range=cfg.action_range,
                     zero_last=(pretrain_path is not None))
    critic = CriticNet(obs_dim=cfg.obs_dim,
                       goal_dim=cfg.goal_dim,
                       action_dim=cfg.action_dim,
                       action_range=cfg.action_range)
    normalizer = Normalizer(cfg.obs_dim+cfg.goal_dim) \
                 if pretrain is None                  \
                 else pretrain.normalizer
    agent = DDPGAgent(cfg=cfg,
                      actor=actor,
                      critic=critic,
                      normalizer=normalizer,
                      reward_fn=env.compute_reward,
                      pretrain=getattr(pretrain, 'actor', None))
    engine = DDPGEngine(cfg=cfg,
                        agent=agent,
                        train_env=train_env,
                        eval_env=eval_env,
                        replay=replay,
                        tf_logger=tf_logger)
    engine.train()

    env.close()
    train_env.close()
    eval_env.close()
    torch.save(agent, save_path)
    return agent
Example #7
    def __init__(self, state_size, action_size, seed):
        """
        
        Initializes a DDPG Agent.

        params:
            - state_size (int)  : dimension of each state.
            - action_size (int) : dimension of each action.
            - seed (int)        : random seed.

        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.eps = EPSILON_START

        # Setup Actor Network
        self.actor_net = ActorNet(self.state_size, self.action_size, seed).to(device)
        self.target_actor_net = ActorNet(self.state_size, self.action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=LR_ACTOR)

        # Setup Critic Network
        self.critic_net = CriticNet(self.state_size, self.action_size, seed).to(device)
        self.target_critic_net = CriticNet(self.state_size, self.action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), lr=LR_CRITIC)

        # noise process
        self.noise = OUNoise(self.action_size, seed)

        # create replay buffer
        self.buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        
        # timestep counter
        self.tstep = 0
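
OUNoise is referenced but not defined in this snippet. The sketch below shows a typical Ornstein-Uhlenbeck process as commonly used in DDPG; the parameter defaults are conventional values, not taken from this repository:

# Sketch of a typical Ornstein-Uhlenbeck noise process as used in DDPG; the
# actual OUNoise class in this repository may differ in its details.
import numpy as np

class OrnsteinUhlenbeckNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # Start each episode from the long-run mean.
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated noise.
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(len(self.mu))
        self.state = self.state + dx
        return self.state

noise = OrnsteinUhlenbeckNoise(size=4, seed=2)
print(noise.sample())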
Example #8
    def __init__(self,
                 env,
                 batch_size,
                 replay_capacity,
                 episodes_before_train,
                 device='cpu'):

        self.env = env
        self.n_agents = env.n
        self.memory = memory.ReplayMemory(replay_capacity)

        self.actors = [
            ActorNet(env.observation_space[i].shape[0], env.action_space[i].n)
            for i in range(self.n_agents)
        ]
        self.critics = [
            CriticNet(env.observation_space[i].shape[0], env.n)
            for i in range(self.n_agents)
        ]

        self.critic_optimizers = [
            Adam(x.parameters(), lr=0.01) for x in self.critics
        ]
        self.actor_optimizers = [
            Adam(x.parameters(), lr=0.01) for x in self.actors
        ]

        self.actor_targets = deepcopy(self.actors)
        self.critic_targets = deepcopy(self.critics)

        self.device = device
        self.episodes_before_train = episodes_before_train
        self.batch_size = batch_size

        self.GAMMA = 0.95
        self.epsilon = 0.3

        for x in self.actors:
            x.to(device)
        for x in self.critics:
            x.to(device)
        for x in self.actor_targets:
            x.to(device)
        for x in self.critic_targets:
            x.to(device)
Example #9
class A2C_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 seed,
                 LR=5e-4,
                 gamma=0.95,
                 entropy_weight=0.02,
                 actor_network_max_grad_norm=5,
                 critic_network_max_grad_norm=5,
                 nstepqlearning_size=5,
                 gae_lambda=1.0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            LR (float): learning rate
            GAMMA (float): factor used to discount future values
            entropy_weight (float): weight of the entropy value using with the entropy_loss
            actor_network_max_grad_norm (float): threshold value used in gradient clipping in the actor model 
            critic_network_max_grad_norm (float): threshold value used in gradient clipping in the critic model 
            nstepqlearning_size (int): the number of steps used for the N-step bootstrapping algorithm
            gae_lambda (float): lambda used in GAE algorithm to use as discount factor in getting a mixture of every available estimated N-step bootstrapping results (from 1-5). 

        """
        self.state_size = state_size
        self.action_size = action_size
        self.entropy_weight = entropy_weight
        random.seed(seed)
        self.gamma = gamma
        self.actor_network_max_grad_norm = actor_network_max_grad_norm
        self.critic_network_max_grad_norm = critic_network_max_grad_norm
        self.nstepqlearning_size = nstepqlearning_size
        self.gae_lambda = gae_lambda
        self.device = device

        print("----Dumping agent hyperparameters---- ")
        print("LR: ", LR)
        print("gamma: ", gamma)
        print("actor_network_max_grad_norm: ",
              self.actor_network_max_grad_norm)
        print("critic_network_max_grad_norm: ",
              self.critic_network_max_grad_norm)
        print("nstepqlearning_size: ", self.nstepqlearning_size)
        print("gae_lambda: ", self.gae_lambda)
        print("entropy_weight: ", self.entropy_weight)
        print("------------------------------------- ")

        self.actor_net = ActorNet(state_size, action_size, device,
                                  seed).to(self.device)  # Theta
        self.critic_net = CriticNet(state_size, action_size,
                                    seed).to(self.device)  # Thetav
        self.actor_optimizer = optim.RMSprop(self.actor_net.parameters(),
                                             lr=LR)
        self.critic_optimizer = optim.RMSprop(self.critic_net.parameters(),
                                              lr=LR)

    def tensor(self, x):
        if isinstance(x, torch.Tensor):
            return x
        x = np.asarray(x, dtype=np.float32)
        x = torch.from_numpy(x).to(self.device)
        return x

    def act(self, state):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor_net.eval()
        self.critic_net.eval()
        (actor_values, log_prob, entropy) = self.actor_net(state)
        critic_values = self.critic_net(state)
        self.actor_net.train()
        self.critic_net.train()
        return actor_values, log_prob, entropy, critic_values

    def train_one_episode(self, env, brain_name):
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        num_agents = len(env_info.agents)

        states = env_info.vector_observations  # get the current state (for each agent)
        episode_terminated = False
        scores = np.zeros(num_agents)  # initialize the score (for each agent)

        while not episode_terminated:
            l_states = []
            l_actions = []
            l_rewards = []  #np.zeros(( nstepqlearning_size, num_agents ))
            l_masks = []
            l_next_states = []
            l_values = []
            l_log_probs = []
            l_entropy = []

            nstep_memory_size = self.nstepqlearning_size
            for i in range(self.nstepqlearning_size):

                # Get a(t) according to actor policy
                (actions, log_prob, entropy, values) = self.act(states)
                # Put all actions between -1 and 1. The actor's last activation is
                # tanh, which already maps into this range, but the subsequent
                # sampling can produce values outside of it.
                actions = np.clip(actions, -1, 1)

                # Perform a(t) in all environments
                # send all actions to the environment
                env_info = env.step(actions)[brain_name]

                # get s(t+1), r(t) and wasLastAction(t)
                next_states = env_info.vector_observations  # get next state (for each agent)
                rewards = env_info.rewards  # get reward (for each agent)
                dones = env_info.local_done  # see if episode finished

                masks = 1 - np.asarray(dones, dtype=np.int32)

                l_states.append(states)
                l_actions.append(actions)
                l_rewards.append(rewards)
                l_masks.append(masks)
                l_next_states.append(next_states)
                l_values.append(values)
                l_log_probs.append(log_prob)
                l_entropy.append(entropy)

                # update score
                scores += env_info.rewards  # update the score (for each agent)

                states = next_states  # roll over states to next time step
                if np.any(dones):  # exit loop if episode terminated
                    nstep_memory_size = i + 1
                    episode_terminated = True
                    break

            # get one prediction for the last estimated Q value
            (_, _, _, values) = self.act(states)
            l_values.append(values)  # Add to the list, GAE will use it

            advantages = self.tensor(torch.zeros(num_agents)).to(self.device)
            # last estimated value, V(s(t + nstep_memory_size))
            returns = values.reshape((num_agents, )).to(self.device)

            l_advantages = [None] * nstep_memory_size
            l_rets = [None] * nstep_memory_size
            l_masks = torch.tensor(np.array(l_masks)).to(self.device)
            l_rewards = torch.tensor(np.array(l_rewards)).to(self.device)

            for i in reversed(range(nstep_memory_size)):
                returns = l_rewards[i] + self.gamma * l_masks[i] * returns

                # Normal advantage calculation.
                #advantages = returns - l_values[i].detach().reshape((num_agents, ))

                # GAE
                td_error = (l_rewards[i] + self.gamma * l_masks[i] * l_values[i + 1]
                            - l_values[i])
                advantages = (advantages * self.gae_lambda * self.gamma * l_masks[i]
                              + td_error)
                # GAE end

                l_advantages[i] = advantages.detach()
                l_rets[i] = returns.detach()

            # flatten the log_probs list into a tensor of shape [nstep_memory_size * num_agents]
            logprobs = torch.cat(l_log_probs).squeeze()
            logprobs = logprobs.reshape(
                (nstep_memory_size * num_agents)).to(self.device)

            ents = torch.cat(l_entropy).squeeze()
            advantages_tensor = torch.cat(
                l_advantages, dim=0).squeeze().detach().to(self.device)

            policy_loss = -(logprobs * advantages_tensor).mean()

            # entropy: currently it's constant but I left it here, to make it possible to use different distribution parameters during the training process
            entropy_loss = ents.mean()

            # ==== train Critic ====
            self.critic_optimizer.zero_grad()
            l_rets = torch.cat(l_rets,
                               dim=0).squeeze().detach().to(self.device)
            l_values = torch.cat(l_values[:nstep_memory_size],
                                 dim=0).squeeze().to(self.device)
            value_loss = 0.5 * (l_rets - l_values).pow(2).mean()
            value_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic_net.parameters(),
                                           self.critic_network_max_grad_norm)
            self.critic_optimizer.step()

            # ==== train Actor ====
            self.actor_optimizer.zero_grad()
            # Add entropy term to the loss function to encourage having evenly distributed actions
            (policy_loss - self.entropy_weight * entropy_loss).backward()
            torch.nn.utils.clip_grad_norm_(self.actor_net.parameters(),
                                           self.actor_network_max_grad_norm)
            self.actor_optimizer.step()

        return scores
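
The GAE block in train_one_episode combines N-step bootstrapped estimates through the lambda-discounted backward recursion. A self-contained version of the same recursion on synthetic single-agent data (values are illustrative):

# Standalone sketch of the GAE recursion used above, on synthetic one-agent data;
# the numbers are illustrative, the logic mirrors the backward loop in the example.
import torch

gamma, gae_lambda = 0.95, 1.0
rewards = torch.tensor([0.0, 0.1, 0.0, 1.0])        # r(t) for 4 steps
masks   = torch.tensor([1.0, 1.0, 1.0, 0.0])        # 0 where the episode ended
values  = torch.tensor([0.5, 0.6, 0.7, 0.8, 0.0])   # V(s_t), plus the bootstrap V(s_T)

advantage = torch.tensor(0.0)
advantages = torch.zeros(4)
for t in reversed(range(4)):
    td_error = rewards[t] + gamma * masks[t] * values[t + 1] - values[t]
    advantage = advantage * gae_lambda * gamma * masks[t] + td_error
    advantages[t] = advantage
print(advantages)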
Example #10
class Agents(): # based on DQN
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)


        # Actor Network
        self.actor_local = ActorNet(state_size, action_size, seed).to(device)
        self.actor_target = ActorNet(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network
        self.critic_local = CriticNet(state_size, action_size, seed).to(device)
        self.critic_target = CriticNet(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

        # Q-Network
        #self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        #self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        #self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)


        # Noise process (Instead of epsilon in DQN) - taken from example
        self.noise = G_Noise((num_agents, action_size), seed, sigma=NOISE_SIGMA)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done, debugFlag=False):
        # Save experience in replay memory (For N agents)
        for i in range(self.num_agents):
            self.memory.add(state[i,:], action[i,:], reward[i], next_state[i,:], done[i])
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, debugF=debugFlag)


    def act(self, state, noiseFlag=True):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            noiseFlag (bool): whether to add exploration noise to the actions
        """

        states = torch.from_numpy(state).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))

        self.actor_local.eval()
        with torch.no_grad():
            for i in range(self.num_agents):
                actions[i, :] = self.actor_local(states[i,:]).cpu().numpy()
        self.actor_local.train()

        if noiseFlag:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

        # Epsilon-greedy action selection
        #if random.random() > eps:
        #    return np.argmax(action_values.cpu().data.numpy())
        #else:
        #    return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, debugF=False):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor


        The actor should learn argmax_a Q(s, a).
        The critic should learn to estimate Q(s, a), where a is the action chosen by the actor.
        """
        states, actions, rewards, next_states, dones = experiences

        #### CRITIC LEARN ####

        # calc a_next and Q(s,a)_next
        action_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, action_next)

        # calc estimated Q(s,a) (one-step boot straping)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        #calc Q(s,a) from critic local (expected)
        Q_local = self.critic_local(states, actions.float())
        critic_loss = F.mse_loss(Q_local, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        #### ACTOR LEARN ####
        actions_predict = self.actor_local(states)
        # The critic assigns high Q-values to good actions; negating the mean turns
        # loss minimization into gradient ascent on Q(s, actor(s)).
        actor_loss = -self.critic_local(states, actions_predict).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)


        '''
        # Get max predicted Q values (for next states) from target model
        act_next_local = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, act_next_local) 
        
        #Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        
        if debugF:
            #self.qnetwork_target.eval()
            #tmp = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            #self.qnetwork_target.train()
            print(Q_targets_next)
            
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)    
        '''

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
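
The critic update in learn() builds the one-step bootstrapped target Q_targets = r + gamma * Q_targets_next * (1 - dones). A tiny numeric check with synthetic batch tensors:

# Tiny numeric check of the one-step TD target used in learn() above; the
# tensors are synthetic stand-ins for a sampled mini-batch.
import torch

gamma = 0.99
rewards        = torch.tensor([[1.0], [0.0], [0.5]])
q_targets_next = torch.tensor([[2.0], [3.0], [1.0]])   # critic_target(s', actor_target(s'))
dones          = torch.tensor([[0.0], [0.0], [1.0]])   # terminal transitions contribute no bootstrap

q_targets = rewards + gamma * q_targets_next * (1 - dones)
print(q_targets)  # tensor([[2.9800], [2.9700], [0.5000]])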
Example #11
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, fc1_units, fc2_units, seed,
                 gamma, lr_actor, lr_critic, tau, buffer_size, batch_size,
                 weight_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int):                           dimension of each state
            action_size (int):                          dimension of each action
            fc1_units (int):                            number of nodes in layer 1 of neural network
            fc2_units (int):                            number of nodes in layer 2 of neural network
            seed (int):                                 seed
            gamma (float):                              discount parameter
            lr_actor (float):                           learning rate for Actor
            lr_critic (float):                          learning rate for Critic
            tau (float):                                interpolation parameter
            buffer_size (int):                          size of memory buffer
            batch_size (int):                           number of experiences to sample during learning
            weight_decay (float):                       weight decay parameter
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.tau = tau
        self.batch_size = batch_size

        # Neural Network Params
        self.actor_target = ActorNet(state_size, action_size, fc1_units,
                                     fc2_units, seed).to(device)
        self.actor_local = ActorNet(state_size, action_size, fc1_units,
                                    fc2_units, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_target = CriticNet(state_size, action_size, fc1_units,
                                       fc2_units, seed).to(device)
        self.critic_local = CriticNet(state_size, action_size, fc1_units,
                                      fc2_units, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Memory buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if there are enough samples for a batch
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state)
        self.actor_local.train()

        if add_noise:
            action += torch.as_tensor(self.noise.sample()).float().to(device)
        return torch.clamp(action, -1, 1).cpu().data.numpy().tolist()

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        ### UPDATE CRITIC ###
        # Get predicted Q values (for next states) from target model
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss (critic)
        Q_expected = self.critic_local(states, actions.float())
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimise the loss (critic)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Use grad clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ### UPDATE ACTOR ###
        # Compute loss (actor)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimise the loss (actor)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # Use grad clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #12
class Critic():

    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Critic Network (w/ Target Network)
        self.critic_local = CriticNet(state_size, action_size, random_seed).to(device)
        self.critic_target = CriticNet(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    def train(self, agent, memory, iterations=1, use_priority_mem=False):
        # Learn, if enough samples are available in memory
        if use_priority_mem and memory.priority_memory_filled():
            memory = memory.prioritized_memory
        else:
            memory = memory.memory

        if len(memory) > BATCH_SIZE:
            for _ in range(iterations):
                experiences = memory.sample()
                self.learn(agent, experiences, GAMMA)

    def learn(self, agent, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = agent.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = agent.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(agent.actor_local, agent.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
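
The soft update θ_target = τ*θ_local + (1 - τ)*θ_target can be exercised in isolation. A minimal runnable sketch with two small linear layers (the layer sizes and tau value are illustrative):

# Minimal runnable demonstration of the soft update documented above; the
# networks and tau value here are illustrative, not taken from the repository.
import torch
import torch.nn as nn

local_model = nn.Linear(4, 2)
target_model = nn.Linear(4, 2)
tau = 1e-3

for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
    target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

# The target weights have only moved a small step toward the local weights.
print(torch.allclose(target_model.weight, local_model.weight))  # False for tau << 1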
Example #13
class A2C(Reinforce):
    # Implementation of N-step Advantage Actor Critic.
    # This class inherits the Reinforce class, so for example, you can reuse
    # generate_episode() here.

    def __init__(self,
                 env,
                 lr,
                 critic_lr,
                 gamma,
                 n,
                 policy_path,
                 critic_path,
                 load=False):
        # Initializes A2C.
        # Args:
        # - env: The environment to train on.
        # - lr: Learning rate for the actor model.
        # - critic_lr: Learning rate for the critic model.
        # - gamma: Discount factor.
        # - n: The value of N in N-step A2C.
        # - policy_path: Path used to save/load the actor model.
        # - critic_path: Path used to save/load the critic model.
        # - load: Whether to load previously saved weights.
        Reinforce.__init__(self,
                           env,
                           lr,
                           gamma=gamma,
                           save_path=policy_path,
                           load=load)
        self.critic_path = critic_path
        s_len = self.env.observation_space_shape[0]
        self.critic = CriticNet(critic_lr, s_len=s_len)
        self.n = n
        if load:
            self.critic.load(self.critic_path)
        print(
            "Hyperparameters:\nPolicy LR = {} Critic LR = {} Gamma = {} N = {} \nPolicy Path = {} \nCritic Path = {} \nLoad = {}"
            .format(lr, critic_lr, gamma, n, policy_path, critic_path, load))
        return

    def train(self):
        # Trains the model on a single episode using A2C.
        K = 500
        print("pretrain test:")
        print('episode 0 ', end='')
        self.test()
        print("training")
        # generate an episode
        gamma_n_1 = self.gamma**(self.n - 1)
        gamma_n = gamma_n_1 * self.gamma
        for i in range(10000000):
            s, ava, a, r = self.generate_episode()
            s = np.array(s)
            r = np.array(r)
            r /= 100.0
            T = len(r)
            if self.n >= T:
                n = T - 1
            else:
                n = self.n
            sum_r = np.zeros(shape=(T, ), dtype=np.float32)
            sum_r[T - 1] = r[T - 1]
            for p in range(2, n + 1, 1):
                sum_r[T - p] = sum_r[T - p + 1] * self.gamma + r[T - p]
            for q in range(n + 1, T + 1, 1):
                sum_r[T - q] = (sum_r[T - q + 1]
                                - gamma_n_1 * r[T - q + n]) * self.gamma + r[T - q]

            V_end = np.zeros(shape=(T, ), dtype=np.float32)

            for j in range(6):
                V = self.critic.predict(s)
                V_end[0:T - n] = V[n:T]
                R = gamma_n * V_end + sum_r
                G = R - V
                self.model.fit(s, ava, a, G)
                self.critic.fit(s, R)

            if (i + 1) % K == 0:
                print('episode {} '.format(i + 1), end='')
                self.test()
                self.model.save(self.save_path)
                self.critic.save(self.critic_path)
        self.model.save(self.save_path)
        return
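
The rolling recursion in train() computes, for every timestep, the discounted sum of at most N upcoming rewards. A self-contained check that the recursion matches a direct computation (rewards, gamma, and n are illustrative values):

# Self-contained check that the rolling recursion above reproduces the direct
# N-step discounted reward sum; rewards and n are illustrative values.
import numpy as np

gamma, n = 0.99, 3
r = np.array([0.0, 1.0, 0.0, 0.5, 2.0, 0.0, 1.0], dtype=np.float32)
T = len(r)
gamma_n_1 = gamma ** (n - 1)

# Rolling recursion, as in the train() method above.
sum_r = np.zeros(T, dtype=np.float32)
sum_r[T - 1] = r[T - 1]
for p in range(2, n + 1):
    sum_r[T - p] = sum_r[T - p + 1] * gamma + r[T - p]
for q in range(n + 1, T + 1):
    sum_r[T - q] = (sum_r[T - q + 1] - gamma_n_1 * r[T - q + n]) * gamma + r[T - q]

# Direct computation: sum of at most n discounted rewards starting at t.
direct = np.array([sum(gamma ** k * r[t + k] for k in range(min(n, T - t)))
                   for t in range(T)], dtype=np.float32)
print(np.allclose(sum_r, direct))  # True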
Example #14
    model_fn = timestamp + "-checkpoint"
    log_dir = "./logs/"
    log_fn = timestamp + "-log.txt"
    tensorboard_dir = "./runs/"

    if not os.path.exists(tensorboard_dir):
        os.mkdir(tensorboard_dir) 
    if not os.path.exists(model_dir):
        os.mkdir(model_dir) 
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    cs_cnn = [16, 32, 64, 64]
    cs_fcn = [256, 64]
    actor  =         ActorNet(cs_cnn, size=STATE_SPACE_SIZE, c_in=HISTORY_LENGTH, c_out=ACTION_SPACE_SIZE[0]).to(device)                       # input is the state
    critic =        CriticNet(cs_cnn, cs_fcn, size_action=ACTION_SPACE_SIZE, size_state=(HISTORY_LENGTH, *STATE_SPACE_SIZE)).to(device)        # input is the concatenation of state and action
    actor_target  =  ActorNet(cs_cnn, size=STATE_SPACE_SIZE, c_in=HISTORY_LENGTH, c_out=ACTION_SPACE_SIZE[0]).to(device)                       # input is the state
    critic_target = CriticNet(cs_cnn, cs_fcn, size_action=ACTION_SPACE_SIZE, size_state=(HISTORY_LENGTH, *STATE_SPACE_SIZE)).to(device)        # input is the concatenation of state and action

    # actor_target.load_state_dict(actor.state_dict())
    # critic_target.load_state_dict(critic.state_dict())
    # for param in actor_target.parameters(): param.requires_grad_(False)
    # for param in critic_target.parameters(): param.requires_grad_(False)
    # actor_target.eval()
    # critic_target.eval()

    for param in actor_target.parameters(): param.requires_grad_(False)
    for param in critic_target.parameters(): param.requires_grad_(False)
    hard_update(actor_target, actor)
    hard_update(critic_target, critic)
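
hard_update is called above but not defined in this snippet; presumably it copies the online weights into the target network. A sketch under that assumption:

# Assumed sketch of the hard_update helper used above: copy every parameter of
# the source network into the target network. The real implementation may differ.
import torch.nn as nn

def hard_update(target: nn.Module, source: nn.Module) -> None:
    # load_state_dict copies weights, biases, and buffers in one step.
    target.load_state_dict(source.state_dict())

# Usage sketch with small stand-in networks.
src, tgt = nn.Linear(8, 2), nn.Linear(8, 2)
hard_update(tgt, src)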
Example #15
class DDPGAgent:
    """ A DDPG Agent which interacts and learns from the environment. """

    def __init__(self, state_size, action_size, seed):
        """
        
        Initializes a DDPG Agent.

        params:
            - state_size (int)  : dimension of each state.
            - action_size (int) : dimension of each action.
            - seed (int)        : random seed.

        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.eps = EPSILON_START

        # Setup Actor Network
        self.actor_net = ActorNet(self.state_size, self.action_size, seed).to(device)
        self.target_actor_net = ActorNet(self.state_size, self.action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=LR_ACTOR)

        # Setup Critic Network
        self.critic_net = CriticNet(self.state_size, self.action_size, seed).to(device)
        self.target_critic_net = CriticNet(self.state_size, self.action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), lr=LR_CRITIC)

        # noise process
        self.noise = OUNoise(self.action_size, seed)

        # create replay buffer
        self.buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        
        # timestep counter
        self.tstep = 0

    
    def step(self, states, actions, rewards, next_states, dones):
        # iterate through 20 agents
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            # save experiences in replay buffer
            self.buffer.push(state, action, reward, next_state, done)

        # Learn every C timesteps
        self.tstep = (self.tstep+1) % LEARN_EVERY

        if self.tstep == 0:
            # check if enough samples are available in buffer
            if len(self.buffer) > BATCH_SIZE:
                # Learn for a few iterations
                for _ in range(LEARN_FOR):
                    experiences = self.buffer.sample()
                    self.learn(experiences, GAMMA)
    
    def learn(self, experiences, gamma):
        """ 
        Updates policy and value params using given batch of experience tuples. 
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) = action
            critic_target(state, action) = Qvalue

        params:
            - experiences (Tuple([torch.Tensor])) : tuple of (s, a, r, s', done).
            - gamma (float)                       : discount factor.        
        """

        # unpack experiences
        s, a, r, ns, d = experiences

        #################### Update Critic ####################
        # get predicted next state actions from target models
        next_actions = self.target_actor_net(ns)
        # get predicted next state and Q values from target models
        next_Q_targets = self.target_critic_net(ns, next_actions)

        # Compute Q targets for current states 
        Q_targets = r + (gamma * next_Q_targets * (1 - d))

        # Compute critic loss
        Q_expected = self.critic_net(s, a)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_net.parameters(), 1.0)
        self.critic_optimizer.step()

        #######################################################

        #################### Update Actor ####################

        # compute actor loss
        predicted_actions = self.actor_net(s)
        actor_loss = - self.critic_net(s, predicted_actions).mean()

        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #######################################################

        #################### Update Target Networks ####################
        self.soft_update(self.critic_net, self.target_critic_net, TAU)
        self.soft_update(self.actor_net, self.target_actor_net, TAU)

        # decay epsilon
        if self.eps > EPSILON_END:
            self.eps *= EPSILON_DECAY
            self.noise.reset()
        else:
            self.eps = EPSILON_END
    
    def soft_update(self, local, target, tau):
        """
        Performs a soft update for the parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target
        
        params:
            - TAU (float) : interpolation parameter. 
        """

        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
    
    def reset(self):
        """ This function resets the noise. """
        self.noise.reset()
    
    def act(self, state, add_noise=True):
        """ 
        Returns actions for a given state as per current policy.

        params:
            - state (array like)  : current state.
            - add_noise (boolean) : flag for adding noise.
        """

        state = torch.from_numpy(state).float().to(device)

        # set actor to eval mode
        self.actor_net.eval()

        with torch.no_grad():
            # get action values
            act_vals = self.actor_net(state).cpu().data.numpy()

        # turn back to train mode
        self.actor_net.train()

        # add noise
        if add_noise:
            act_vals += self.noise.sample() * self.eps

        return np.clip(act_vals, -1, 1)
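
learn() decays the noise scale eps multiplicatively until it reaches a floor. A quick look at that schedule with assumed constants (EPSILON_START, EPSILON_END, and EPSILON_DECAY are defined elsewhere in the repository and may differ):

# Quick look at the multiplicative epsilon schedule used in learn() above;
# the constant values here are assumptions, the real ones are defined elsewhere.
EPSILON_START, EPSILON_END, EPSILON_DECAY = 1.0, 0.01, 0.999

eps, updates = EPSILON_START, 0
while eps > EPSILON_END:
    eps *= EPSILON_DECAY
    updates += 1
print(updates)  # ~4603 learn() calls to reach the floor with these values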