Example no. 1
    def train(self,
              transitions: int,
              sigma_max: float = 1.,
              sigma_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              progress_upd_step: int = None,
              start_training: int = 1000,
              shaping_coef: float = 300.):
        history = ReplayBuffer(buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "sigma_max": sigma_max,
            "sigma_min": sigma_min,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()
        t = tqdm(range(transitions))
        for i in t:
            sigma = sigma_max - (sigma_max - sigma_min) * i / transitions
            action = self.act(state)
            noise = np.random.normal(scale=sigma, size=action.shape)
            action = np.clip(action + noise, -1, 1)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) -
                                      np.abs(state[1]))
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                batch = history.sample(batch_size)
                self.update_critic(batch)
                self.update_actor(batch)

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log
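The reward bonus added inside this loop is potential-based shaping: with potential Phi(s) = shaping_coef * |velocity|, the bonus gamma * Phi(s') - Phi(s) telescopes over an episode, so it speeds up learning on MountainCar-style tasks (position threshold 0.5, velocity at index 1) without changing which policy is optimal. A minimal standalone sketch of that bonus, assuming the state layout implied by the code above:

import numpy as np


def shaping_bonus(state, next_state, gamma: float, shaping_coef: float = 300.0) -> float:
    """Potential-based bonus gamma * Phi(s') - Phi(s), with Phi(s) = shaping_coef * |velocity|."""
    phi = lambda s: shaping_coef * float(np.abs(s[1]))  # velocity assumed at index 1
    return gamma * phi(next_state) - phi(state)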
Example no. 2
    def train(self,
              transitions: int,
              eps_max: float = 0.5,
              eps_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              shaping_coef: float = 300.,
              progress_upd_step: int = None,
              start_training: int = 10000):
        history = ReplayBuffer(size=buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "eps_max": eps_max,
            "eps_min": eps_min,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()

        t = tqdm(range(transitions))
        for i in t:
            eps = eps_max - (eps_max - eps_min) * i / transitions
            if random() < eps:
                action = self.env.action_space.sample()
            else:
                action = self.act(state)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) -
                                      np.abs(state[1]))
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                self.update(history.sample(batch_size))

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log
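Both training loops push whole transition tuples into a ReplayBuffer that exposes add(transition) and sample(batch_size). The real class is not shown here; a minimal sketch compatible with those calls (the stacked-array return format is an assumption) could look like this:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """FIFO buffer of (state, action, next_state, reward, done) tuples."""

    def __init__(self, size: int):
        self.buffer = deque(maxlen=size)

    def add(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        batch = random.sample(self.buffer, batch_size)
        # regroup per component: states, actions, next_states, rewards, dones
        return [np.array(component) for component in zip(*batch)]

    def __len__(self):
        return len(self.buffer)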
Example no. 3
class TD3():
    """ Twin Delayed Deep Deterministic Policy Gradient Model """

    def __init__(self, state_size, action_size, random_seed):
        
                """ Initialize the model with arguments as follows:
                
                    ARGUMENTS
                    =========
                        - state_size (int) = dimension of input space
                        - action_size (int) = dimension of action space
                        - random_seed (int) = random seed

                    Returns 
                    =======
                        - best learned action to take after Actor-Critic Learning
                """
            
                self.state_size = state_size
                self.action_size = action_size
                self.seed = random.seed(random_seed)

                # create noise
                self.noise = OUNoise(action_size, random_seed)
                self.noise_decay = NOISE_DECAY
                
                # create memory
                self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)
                


                # Actor Networks (local online net + target net)
                self.actor_local = Actor(state_size, action_size, random_seed).to(device)
                self.actor_target = Actor(state_size, action_size, random_seed).to(device)
                self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = LR_ACTOR)

                # Critic Networks (local online net + target net)
                self.critic_local = Critic(state_size, action_size, random_seed).to(device)
                self.critic_target = Critic(state_size, action_size, random_seed).to(device)
                self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
                
                # instantiate online and target networks with same weights
                self.soft_update(self.actor_local, self.actor_target, 1)
                self.soft_update(self.critic_local, self.critic_target, 1)
                
                self.learn_counter = 0
                
                
    def act(self, state, add_noise=True):
        """ Choose an action while interacting and learning in the environment """

        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * self.noise_decay
            # decay the exploration noise scale multiplicatively
            self.noise_decay *= NOISE_DECAY
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, noise_clip=0.5, policy_freq=2):
        """ Sample from experiences and learn """

        # update the learn counter
        self.learn_counter += 1

        # get experience tuples
        states, actions, rewards, next_states, dones  = experiences
            
        # build noise on the action
        # NOTE: the reference TD3 implementation perturbs the target action with clipped
        # Gaussian noise, e.g.:
        #   noise = torch.FloatTensor(actions.cpu()).data.normal_(0, policy_noise).to(device)
        #   noise = noise.clamp(-noise_clip, noise_clip)
        # OU noise is used in act(), so the same process is reused here for consistency.

        noise = torch.FloatTensor([self.noise.sample() for _ in range(len(actions))]).to(device)
        noise = noise.clamp(-noise_clip, noise_clip)  
        # clip to the valid action range, since action + noise can fall outside [-1, 1]
        next_action = (self.actor_target(next_states) + noise).clamp(-1, 1)

        # compute the target Q value
        target_Q1, target_Q2 = self.critic_target(next_states, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        target_Q = rewards + (gamma * target_Q * (1-dones)).detach()

        # get current Q estimates
        current_Q1, current_Q2 = self.critic_local(states, actions)

        # compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # update the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # delayed policy update
        if self.learn_counter % policy_freq == 0:
            # compute the actor loss from the online critic's Q1 head
            actions_pred = self.actor_local(states)
            actor_loss = -self.critic_local.Q1(states, actions_pred).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # soft-update the target actor and critic
            self.soft_update(self.actor_local, self.actor_target, TAU)
            self.soft_update(self.critic_local, self.critic_target, TAU)


    def soft_update(self, local_model, target_model, tau):
        # Soft update of the target network:
        # keep (1 - tau) of the target weights and blend in a small
        # fraction (tau) of the online weights to prevent oscillation
        for local_param, target_param in zip(local_model.parameters(), target_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def step(self, state, action, reward, next_state, done):
        # at every step, add the new (s, a, r, s', done) transition to memory,
        # then learn from a sampled batch once enough samples are in the buffer
        
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
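The TD3 class above relies on an OUNoise helper and on module-level constants (BUFFER_SIZE, BATCH_SIZE, NOISE_DECAY, TAU, GAMMA, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, device) defined elsewhere. A minimal sketch of an Ornstein-Uhlenbeck noise process compatible with the calls made above; the mu/theta/sigma defaults are assumptions, not taken from the original:

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise value."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state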
Example no. 4
class DDPG_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers_actor=[32, 32],
                 hidden_layers_critic=[32, 32, 32],
                 buffer_size=int(1e5),
                 batch_size=128,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate_actor=1e-4,
                 learning_rate_critic=5e-4,
                 weight_decay=0.0001,
                 update_every=20,
                 num_batches=10,
                 add_noise=True,
                 head_name_actor='Actor',
                 head_name_critic="DuelingDQN",
                 head_scale_actor='max',
                 head_scale_critic="max"):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int ; optional): number of each layer nodes
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate_X (float; optional): learning rate for X=actor or critic
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = learning_rate_actor
        self.lr_critic = learning_rate_critic
        self.update_every = update_every
        self.num_batches = num_batches
        self.weight_decay_critic = weight_decay
        self.add_noise = add_noise

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        ### SET UP THE ACTOR NETWORK ###
        # Assign model parameters and assign device
        model_params_actor = [
            state_size, action_size, seed, hidden_layers_actor,
            head_name_actor, head_scale_actor
        ]

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(*model_params_actor).to(self.device)
        self.actor_target = Actor(*model_params_actor).to(self.device)

        # Set up optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        ### SET UP THE CRITIC NETWORK ###
        model_params_critic = [
            state_size, action_size, seed, hidden_layers_critic,
            head_name_critic, head_scale_critic
        ]

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(*model_params_critic).to(self.device)
        self.critic_target = Critic(*model_params_critic).to(self.device)

        # Set up optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.lr_critic,
            weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Initialize Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, timestep):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size and timestep % self.update_every == 0:
            for i in range(self.num_batches):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().to(self.device)

        # Go to evaluation mode and get Q values for current state
        self.actor_local.eval()
        with torch.no_grad():
            action_values = self.actor_local(state).cpu().data.numpy()

        # get back to train mode
        self.actor_local.train()

        # Add exploration noise to the deterministic action values
        if add_noise:
            action_values += self.noise.sample()
        return np.clip(action_values, -1.0, 1.0)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks using the local and target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        X_target = tau*X_local + (1 - tau)*X_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
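A hedged sketch of how such an agent is typically driven from an environment loop. It assumes the Actor, Critic, OUNoise and ReplayBuffer modules used by DDPG_Agent are importable, the classic Gym API (env.reset() returns an observation, env.step() returns a 4-tuple), and it ignores action scaling; none of this loop is part of the original class:

import gym

env = gym.make("Pendulum-v1")            # assumed continuous-control task
agent = DDPG_Agent(state_size=env.observation_space.shape[0],
                   action_size=env.action_space.shape[0],
                   seed=0)

for episode in range(10):
    state = env.reset()
    agent.reset()                        # reset the OU noise process between episodes
    episode_return = 0.0
    for t in range(200):
        action = agent.act(state, add_noise=True)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done, timestep=t)
        state = next_state
        episode_return += reward
        if done:
            break
    print(f"episode {episode}: return {episode_return:.1f}")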
Example no. 5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 drop_p=0.3,
                 with_dueling=False,
                 isDDQN=False):
        """Initialize an Agent object.
        
        Params  
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (array): Hidden number of nodes in each layer
            drop_p (float [0-1]) : Probability of dropping nodes (implementation of dropout)
            with_dueling (boolean) : If True, a dueling network architecture is used.
            isDDQN (boolean) : If True, Double DQN is used; otherwise vanilla DQN.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       hidden_layers=hidden_layers,
                                       drop_p=drop_p,
                                       dueling=with_dueling).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        hidden_layers=hidden_layers,
                                        drop_p=drop_p,
                                        dueling=with_dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Parameter instance of DDQN.
        self.isDDQN = isDDQN

    def step(self, state, action, reward, next_state, done):
        """Takes a step and with each time step sample from buffer and learn"""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        if self.isDDQN:
            # Double DQN: select the greedy action for the next states with the local model...
            best_local_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            double_dqn_targets = self.qnetwork_target(next_states).detach()
            # ...and evaluate that action with the target network
            Q_targets_next = torch.gather(double_dqn_targets, 1, best_local_actions)
        else:
            # Get max predicted Q values (for next states) from target model (without ddqn)
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
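The Double DQN branch above separates action selection (online network) from action evaluation (target network), which reduces the overestimation bias of the max operator. A minimal standalone sketch of that target computation, using the same batch-first tensor shapes as the class; the function name is only illustrative:

import torch


def double_dqn_targets(q_local_next: torch.Tensor,
                       q_target_next: torch.Tensor,
                       rewards: torch.Tensor,
                       dones: torch.Tensor,
                       gamma: float) -> torch.Tensor:
    """q_*_next: [B, num_actions]; rewards, dones: [B, 1]."""
    # pick the greedy action with the online network ...
    best_actions = q_local_next.argmax(dim=1, keepdim=True)   # [B, 1]
    # ... but evaluate it with the target network
    q_next = q_target_next.gather(1, best_actions)            # [B, 1]
    return rewards + gamma * q_next * (1 - dones)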
Example no. 6
class DDPG(Model):
    """ Interface """
    def __init__(self,
                 name,
                 args,
                 sess=None,
                 reuse=False,
                 log_tensorboard=True,
                 save=True):
        self.learn_steps = 0

        # hyperparameters
        self.gamma = args[name]['gamma']
        self.tau = args[name]['tau']
        self.init_noise_sigma = args[name]['init_noise_sigma']
        self.noise_decay = args[name]['noise_decay']

        # replay buffer
        self.buffer = ReplayBuffer(sample_size=args['batch_size'],
                                   max_len=args[name]['buffer_size'])

        super(DDPG, self).__init__(name,
                                   args,
                                   sess=sess,
                                   reuse=reuse,
                                   build_graph=True,
                                   log_tensorboard=log_tensorboard,
                                   save=save)

        self._initialize_target_net()

    @property
    def main_variables(self):
        return self.actor_critic.trainable_variables

    @property
    def _target_variables(self):
        return self._target_actor_critic.trainable_variables

    def act(self, state):
        self.sess.run(self.noise_op)
        state = state.reshape((-1, self.state_size))
        action = self.sess.run(self.actor_critic.actor_action,
                               feed_dict={self.actor_critic.state: state})
        self.sess.run(self.denoise_op)
        return np.squeeze(action)

    def step(self, state, action, reward, next_state, done):
        self.buffer.add(state, action, reward, next_state, done)

        if len(self.buffer) > self.buffer.sample_size + 100:
            self._learn()

    """ Implementation """

    def _build_graph(self):
        # env info
        self._setup_env()

        # main actor-critic
        self.actor_critic = self._create_actor_critic()
        # target actor-critic
        self._target_actor_critic = self._create_actor_critic(is_target=True)

        # losses
        self.actor_loss, self.critic_loss = self._loss()

        # optimization operation
        self.opt_op = self._optimize([self.actor_loss, self.critic_loss])

        # target net update operations
        self.init_target_op, self.update_target_op = self._targetnet_ops()

        # operations that add/remove noise from parameters
        self.noise_op, self.denoise_op = self._noise_params()

    def _setup_env(self):
        self.state_size = self._args[self.name]['state_size']
        self.action_size = self._args[self.name]['action_size']
        self.env_info = {}
        with tf.name_scope('placeholders'):
            self.env_info['state'] = tf.placeholder(tf.float32,
                                                    shape=(None,
                                                           self.state_size),
                                                    name='state')
            self.env_info['action'] = tf.placeholder(tf.float32,
                                                     shape=(None,
                                                            self.action_size),
                                                     name='action')
            self.env_info['next_state'] = tf.placeholder(
                tf.float32, shape=(None, self.state_size), name='next_state')
            self.env_info['reward'] = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='reward')
            self.env_info['done'] = tf.placeholder(tf.uint8,
                                                   shape=(None, 1),
                                                   name='done')

    def _create_actor_critic(self, is_target=False):
        name = 'target_actor_critic' if is_target else 'actor_critic'
        log_tensorboard = not is_target
        actor_critic = ActorCritic(name,
                                   self._args,
                                   self.env_info,
                                   self.action_size,
                                   reuse=self.reuse,
                                   log_tensorboard=log_tensorboard,
                                   is_target=is_target)

        return actor_critic

    def _loss(self):
        with tf.name_scope('loss'):
            with tf.name_scope('l2_loss'):
                encoder_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/state_encoder',
                    name='encoder_l2_loss')
                actor_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/actor',
                    name='actor_l2_loss')
                critic_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/critic',
                    name='critic_l2_loss')

            with tf.name_scope('actor_loss'):
                actor_loss = tf.negative(
                    tf.reduce_mean(self.actor_critic.Q_with_actor),
                    name='actor_loss') + encoder_l2_loss + actor_l2_loss

            with tf.name_scope('critic_loss'):
                target_Q = tf.stop_gradient(
                    self.env_info['reward'] +
                    self.gamma * tf.cast(1 - self.env_info['done'], tf.float32)
                    * self._target_actor_critic.Q_with_actor,
                    name='target_Q')
                critic_loss = tf.losses.mean_squared_error(
                    target_Q,
                    self.actor_critic.Q) + encoder_l2_loss + critic_l2_loss

            if self.log_tensorboard:
                tf.summary.scalar('actor_l2_loss_', actor_l2_loss)
                tf.summary.scalar('critic_l2_loss_', critic_l2_loss)
                tf.summary.scalar('encoder_l2_loss_', encoder_l2_loss)
                tf.summary.scalar('actor_loss_', actor_loss)
                tf.summary.scalar('critic_loss_', critic_loss)

        return actor_loss, critic_loss

    def _optimize(self, losses):
        with tf.variable_scope('optimizer'):
            actor_loss, critic_loss = losses
            actor_opt_op = self._optimize_objective(actor_loss, 'actor')
            critic_opt_op = self._optimize_objective(critic_loss, 'critic')

            opt_op = tf.group(actor_opt_op, critic_opt_op)

        return opt_op

    def _optimize_objective(self, loss, name):
        # optimizer hyperparameters, falling back to defaults when absent from the args dict
        net_args = self._args['actor_critic'][name]
        learning_rate = net_args.get('learning_rate', 1e-3)
        beta1 = net_args.get('beta1', .9)
        beta2 = net_args.get('beta2', .999)
        clip_norm = net_args.get('clip_norm', 5.)

        with tf.variable_scope(name + '_opt', reuse=self.reuse):
            # setup optimizer
            self._optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate, beta1=beta1, beta2=beta2)

            tvars = self.actor_critic.actor_trainable_variables if name == 'actor' else self.actor_critic.critic_trainable_variables
            grads, tvars = list(
                zip(*self._optimizer.compute_gradients(loss, var_list=tvars)))
            grads, _ = tf.clip_by_global_norm(grads, clip_norm)
            opt_op = self._optimizer.apply_gradients(zip(grads, tvars))

        if self.log_tensorboard:
            with tf.name_scope(name):
                with tf.name_scope('gradients_'):
                    for grad, var in zip(grads, tvars):
                        if grad is not None:
                            tf.summary.histogram(var.name.replace(':0', ''),
                                                 grad)
                with tf.name_scope('params_'):
                    for var in tvars:
                        tf.summary.histogram(var.name.replace(':0', ''), var)

        return opt_op

    def _targetnet_ops(self):
        with tf.name_scope('target_net_op'):
            target_main_var_pairs = list(
                zip(self._target_variables, self.main_variables))
            init_target_op = list(
                map(lambda v: tf.assign(v[0], v[1], name='init_target_op'),
                    target_main_var_pairs))
            update_target_op = list(
                map(
                    lambda v: tf.assign(v[0],
                                        self.tau * v[1] +
                                        (1. - self.tau) * v[0],
                                        name='update_target_op'),
                    target_main_var_pairs))

        return init_target_op, update_target_op

    def _learn(self):
        states, actions, rewards, next_states, dones = self.buffer.sample()

        feed_dict = {
            self.env_info['state']: states,
            self.env_info['action']: actions,
            self.env_info['reward']: rewards,
            self.env_info['next_state']: next_states,
            self.env_info['done']: dones,
        }

        # update the main networks
        if self.log_tensorboard:
            _, summary = self.sess.run([self.opt_op, self.merged_op],
                                       feed_dict=feed_dict)
            self.writer.add_summary(summary, self.learn_steps)
        else:
            _ = self.sess.run(self.opt_op, feed_dict=feed_dict)

        # update the target networks
        self.sess.run(self.update_target_op)

        self.learn_steps += 1

    def _noise_params(self):
        with tf.variable_scope('noise'):
            noise_sigma = tf.get_variable('noise_sigma',
                                          initializer=self.init_noise_sigma,
                                          trainable=False)

            noise_decay_op = tf.assign(noise_sigma,
                                       self.noise_decay * noise_sigma,
                                       name='noise_decay_op')

            param_noise_pairs = []
            for var in self.actor_critic.actor_perturbable_variables:
                noise = tf.truncated_normal(tf.shape(var), stddev=noise_sigma)
                param_noise_pairs.append((var, noise))

            with tf.control_dependencies([noise_decay_op]):
                noise_op = list(
                    map(
                        lambda v: tf.assign(v[0], v[0] + v[1], name='noise_op'
                                            ), param_noise_pairs))
                denoise_op = list(
                    map(
                        lambda v: tf.assign(
                            v[0], v[0] - v[1], name='denoise_op'),
                        param_noise_pairs))

        return noise_op, denoise_op

    def _initialize_target_net(self):
        self.sess.run(self.init_target_op)
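The init/update target ops built in _targetnet_ops implement a hard copy and Polyak averaging of the target variables. The same arithmetic written framework-free, as a sketch for clarity rather than part of the original graph code:

import numpy as np


def polyak_update(target_weights, main_weights, tau: float):
    """theta_target <- tau * theta_main + (1 - tau) * theta_target, element-wise per variable."""
    return [tau * w_main + (1.0 - tau) * w_target
            for w_main, w_target in zip(main_weights, target_weights)]


# hard initialisation of the target network is the tau = 1 special case
init_target = lambda target, main: polyak_update(target, main, tau=1.0)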
Example no. 7
class DrlAgent:
    def __init__(self,
                 sess,
                 is_train,
                 dim_state,
                 dim_action,
                 num_paths,
                 actor_learn_rate,
                 critic_learn_rate,
                 tau,
                 buffer_size,
                 mini_batch,
                 ep_begin,
                 epsilon_end,
                 gamma,
                 max_epoch,
                 seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch

        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)

        self.__replay = ReplayBuffer(buffer_size, seed)

        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)

        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(
            np.ones(dim_action))

        self.__episode = 0
        self.__step = 0

    def target_paras_init(self):
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()

    def predict(self, state, reward):
        action_original = self.__actor.predict([state])[0]
        if not self.__is_train:
            return action_original

        action = self.__explorer.get_act(action_original)
        self.__replay.add(self.__state_curt, self.__action_curt, reward, state)
        self.__state_curt = state
        self.__action_curt = action

        if len(self.__replay) > self.__mini_batch:
            self.train()

        self.__step += 1
        if self.__step >= self.__max_epoch:
            self.__step = 0
            self.__episode += 1
            self.__explorer.reset_ep(self.__ep_begin)
        return action

    def train(self):
        batch_state, batch_action, batch_reward, batch_state_next = self.__replay.sample_batch(
            self.__mini_batch)
        weights = [1.0] * self.__mini_batch
        weights = np.expand_dims(weights, axis=1)
        target_q = self.__critic.predict_target(
            batch_state_next, self.__actor.predict_target(batch_state_next))
        value_q = self.__critic.predict(batch_state, batch_action)

        batch_y = []
        batch_error = []
        for k in range(len(batch_reward)):
            target_y = batch_reward[k] + self.__gamma * target_q[k]
            batch_error.append(abs(target_y - value_q[k]))
            batch_y.append(target_y)

        predicted_q, _ = self.__critic.train(batch_state, batch_action,
                                             batch_y, weights)
        a_outs = self.__actor.predict(batch_state)
        grads = self.__critic.calculate_gradients(batch_state, a_outs)
        weighted_grads = weights * grads[0]
        self.__actor.train(batch_state, weighted_grads)
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()
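Note that DrlAgent.train builds its targets as r + gamma * Q_target(s', mu_target(s')) with no terminal mask, which may be intentional for the continuing (non-episodic) setting this agent appears to address. For episodic tasks the usual masked form is sketched below; the function is illustrative and assumes dones is a 0/1 array per sample:

import numpy as np


def td_targets(rewards: np.ndarray, target_q: np.ndarray, dones: np.ndarray, gamma: float) -> np.ndarray:
    """One-step bootstrapped targets, zeroing the bootstrap term on terminal transitions."""
    return rewards + gamma * (1.0 - dones) * target_q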
Example no. 8
class DQN_Agent():
    """ Interacts with and learns from the environment. """
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 GAMMA=GAMMA,
                 TAU=TAU,
                 LR=LR,
                 UPDATE_EVERY=UPDATE_EVERY,
                 BUFFER_SIZE=BUFFER_SIZE,
                 BATCH_SIZE=BATCH_SIZE):
        """ Initialize the agent.
        ==========
        PARAMETERS 
        ==========
            state_size (int) = observation dimension of the environment
            action_size (int) = dimension of each action
            seed (int) = random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.gamma = GAMMA
        self.tau = TAU
        self.lr = LR
        self.update_every = UPDATE_EVERY
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # instantiate online local and target network for weight updates
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        # create a replay buffer
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # time step counter, used to learn every update_every steps
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        ''' Append the (s, a, r, s', done) transition to memory, then learn from sampled experiences every update_every steps '''
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # in case enough samples are available in internal memory, sample and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """ Choose action from an epsilon-greedy policy
        ==========
        PARAMETERS
        ==========
            state (array) = current state space
            eps (float) = epsilon, for epsilon-greedy action choice """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """ Update the value parameters using experience tuples sampled from ReplayBuffer
        ==========
        PARAMETERS
        ==========
          experiences = Tuple of torch.Variable: SARS', done
          gamma (float) = discount factor to weight rewards
        """

        states, actions, rewards, next_states, dones = experiences

        # calculate max predicted Q values for the next states using target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # calculate expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # compute MSE Loss
        loss = F.mse_loss(Q_expected, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update for model parameters, every update steps as defined above
        theta_target = tau * theta_local + (1-tau)*theta_target 

        ==========
        PARAMETERS 
        ==========
          local_model, target_model = PyTorch Models, weights will be copied from-to
          tau = interpolation parameter, type=float 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
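A hedged usage sketch for this agent. It assumes the QNetwork and ReplayBuffer classes and the GAMMA, TAU, LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE constants the class references are defined, the classic 4-tuple Gym step API, and an arbitrary epsilon schedule; none of this loop comes from the original code:

import gym

env = gym.make("CartPole-v1")
agent = DQN_Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  seed=0)

eps, eps_min, eps_decay = 1.0, 0.01, 0.995
for episode in range(200):
    state = env.reset()
    episode_return = 0.0
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        episode_return += reward
    eps = max(eps_min, eps_decay * eps)
    print(f"episode {episode}: return {episode_return:.0f}, eps {eps:.3f}")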
Example no. 9
class DDPG:
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=2,
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(
            learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)

    def update_target(self):
        # Two methods to update the target actor
        # Method 1:
        self.target_actor.set_weights(
            np.array(self.actor.get_weights()) * self.TAU +
            np.array(self.target_actor.get_weights()) * (1 - self.TAU))
        self.target_critic.set_weights(
            np.array(self.critic.get_weights()) * self.TAU +
            np.array(self.target_critic.get_weights()) * (1 - self.TAU))
        """
        # Method 2:
        new_weights = []
        target_variables = self.target_critic.weights
        for i, variable in enumerate(self.critic.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))

        self.target_critic.set_weights(new_weights)
        new_weights = []
        target_variables = self.target_actor.weights
        for i, variable in enumerate(self.actor.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))
        self.target_actor.set_weights(new_weights)
        """

    def train_step(self):
        s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample_batch(
            self.minibatch_size)
        """
        mu_prime = self.target_actor(s2_batch)  # predictions by target actor
        Q_prime = self.target_critic([s2_batch, mu_prime])  # predictions by target critic
        y = np.zeros_like(Q_prime)
        for k in range(self.minibatch_size):
            if d_batch[k]:
                y[k] = r_batch[k]
            else:
                y[k] = r_batch[k] + self.GAMMA * Q_prime[k]
        # y = r_batch + gamma * Q_prime

        checkpoint_path = "training/cp_critic.ckpt"
        checkpoint_dir = os.path.dirname(checkpoint_path)
        # Create a callback that saves the model's weights
        cp_callback1 = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_dir,
                                                          save_weights_only=True,
                                                          verbose=1)
        self.critic.train_on_batch([s_batch, a_batch], y)
        # self.critic.fit([s_batch, a_batch], y, verbose=0, steps_per_epoch=8, callbacks=[cp_callback1])

        with tf.GradientTape(persistent=True) as tape:
            a = self.actor(s_batch)
            tape.watch(a)
            theta = self.actor.trainable_variables
            q = self.critic([s_batch, a])
        dq_da = tape.gradient(q, a)
        da_dtheta = tape.gradient(a, theta, output_gradients=-dq_da)
        self.actor_opt.apply_gradients(zip(da_dtheta, self.actor.trainable_variables))
        """

        with tf.GradientTape() as tape:
            target_actions = self.target_actor(s2_batch)
            y = r_batch + self.GAMMA * self.target_critic(
                [s2_batch, target_actions])
            critic_value = self.critic([s_batch, a_batch])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic_opt.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor(s_batch)
            q = self.critic([s_batch, actions])  # critic_value
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(q)
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))
        self.update_target()
        return np.mean(q)

    def policy(self, s):
        # scale the actor output to the environment's action bound, then add exploration noise
        if s.ndim == 1:
            s = s[None, :]
        action = self.actor(s) * self.upper_bound + self.ou_noise()
        action = np.clip(action, self.lower_bound, self.upper_bound)
        return action

    def train(self):
        # To store reward history of each episode
        ep_reward_list = []
        # To store average reward history of last few episodes
        avg_reward_list = []
        monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
        with Loop_handler(
        ) as interruption:  # to properly save even if ctrl+C is pressed
            for eps in range(self.EPISODES):
                episode_reward = 0
                s = self.env.reset()
                """
                if an env is created using the "gym.make" method, it will terminate after 200 steps
                """
                for t in range(self.MAX_TIME_STEPS):
                    # done = False
                    # while not done:
                    if self.render:
                        self.env.render()
                    a = self.policy(s)
                    s_, r, done, _ = self.env.step(a)
                    self.replay_buffer.add(np.reshape(s, (self.s_dim, )),
                                           np.reshape(a, (self.a_dim, )),
                                           r, done,
                                           np.reshape(s_, (self.s_dim, )))
                    episode_reward += r
                    if self.replay_buffer.size() > self.minibatch_size:
                        q = self.train_step()
                    s = s_.reshape(1, -1)
                    if interruption():
                        break
                ep_reward_list.append(episode_reward)
                # Mean of last 40 episodes
                avg_reward = np.mean(ep_reward_list[-40:])
                print("Episode * {} * Avg Reward is ==> {}".format(
                    eps, avg_reward))
                avg_reward_list.append(avg_reward)
                monitor.add_data(avg_reward, q)

            self.save_weights(
                save_name=self.save_name)  # if you want to save weights
            self.plot_results(avg_reward=avg_reward_list, train=True)

    def save_weights(self, save_name='final_weights'):
        self.actor.save_weights("training/%s_actor.h5" % save_name)
        self.critic.save_weights("training/%s_critic.h5" % save_name)
        self.target_actor.save_weights("training/%s_target_actor.h5" %
                                       save_name)
        self.target_critic.save_weights("training/%s_target_critic.h5" %
                                        save_name)

        # to save in other format
        self.target_actor.save_weights('training/%s_actor_weights' % save_name,
                                       save_format='tf')
        self.target_critic.save_weights('training/%s_critic_weights' %
                                        save_name,
                                        save_format='tf')
        print('Training completed and network weights saved')

    # For evaluation of the policy learned
    def collect_data(self, act_net, iterations=1000):
        a_all, states_all = [], []
        obs = self.env.reset()
        for t in range(iterations):
            obs = np.squeeze(obs)
            if obs.ndim == 1:
                a = act_net(obs[None, :])
            else:
                a = act_net(obs)
            obs, _, done, _ = self.env.step(a)
            states_all.append(obs)
            a_all.append(a)
            # self.env.render()  # Uncomment this to see the actor in action (But not in python notebook)
            # if done:
            #     break
        states = np.squeeze(
            np.array(states_all))  # cos(theta), sin(theta), theta_dot
        a_all = np.squeeze(np.array(a_all))
        return states, a_all

    def plot_results(self,
                     avg_reward=None,
                     actions=None,
                     states=None,
                     train=False,
                     title=None):
        # An additional way to visualize the avg episode rewards
        if train:
            plt.figure()
            plt.plot(avg_reward)
            plt.xlabel("Episode")
            plt.ylabel("Avg. Epsiodic Reward")
            plt.show()
        else:  # work only for Pendulum-v0 environment
            fig, ax = plt.subplots(3, sharex=True)
            theta = np.arctan2(states[:, 1], states[:, 0])
            ax[0].set_ylabel('u')
            ax[0].plot(np.squeeze(actions))
            ax[1].set_ylabel(r'$\theta$')
            ax[1].plot(theta)
            # ax[1].plot(states[:, 0])
            ax[2].set_ylabel(r'$\omega$')
            ax[2].plot(states[:, 2])  # ang velocity
            fig.canvas.set_window_title(title)
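This DDPG variant expects a callable OUNoise(mean=...) helper (invoked as self.ou_noise() inside policy). A minimal sketch matching that call signature; the std_deviation, theta and dt values are assumptions and the real helper may differ:

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process exposed as a callable, matching DDPG.policy above."""

    def __init__(self, mean, std_deviation=0.2, theta=0.15, dt=1e-2):
        self.mean = mean
        self.std_dev = std_deviation
        self.theta = theta
        self.dt = dt
        self.reset()

    def reset(self):
        self.x_prev = np.zeros_like(self.mean)

    def __call__(self):
        # one Euler step of dx = theta * (mu - x) dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mean - self.x_prev) * self.dt
             + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape))
        self.x_prev = x
        return x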
Example no. 10
class Agent:
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4):
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every

        # model settings
        self.qnet_local = Model(state_size, action_size).to(self.device)
        self.qnet_target = Model(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

        # replay buffer settings
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.update_step = 0

    def step(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)

        self.update_step = (self.update_step + 1) % self.update_every
        if (self.update_step
                == 0) and (len(self.replay_buffer) > self.batch_size):
            experiences = self.replay_buffer.sample()
            self.learn(experiences)

    def act(self, state, eps=0.0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        self.qnet_local.eval()
        with torch.no_grad():
            action_values = self.qnet_local(state)
        self.qnet_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return np.random.choice(self.action_size)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # convert to tensors and send to device
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).long().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        next_states = torch.from_numpy(next_states).float().to(self.device)
        dones = torch.from_numpy(dones).float().to(self.device)

        # max returns max values (0) and indices (1)
        # unsqueeze is needed to add batch dim B x 1
        q_max = self.qnet_target(next_states).detach().max(1)[0].unsqueeze(1)
        y = rewards + self.gamma * q_max * (1 - dones)

        # select action values corresponding to actions
        # this is what .gather does
        # note for the expected we pass states, not next_states
        q_expected = self.qnet_local(states).gather(1, actions)

        loss = F.mse_loss(q_expected, y)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update()

    def soft_update(self):
        for target_param, local_param in zip(self.qnet_target.parameters(),
                                             self.qnet_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1 - self.tau) * target_param.data)

    def train(self,
              env,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        scores = []
        scores_window = deque(maxlen=100)
        eps = eps_start

        brain_name = env.brain_names[0]

        for i_episode in range(1, n_episodes + 1):

            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]

            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]

                self.step(state, action, reward, next_state, done)

                state = next_state
                score += reward

                if done:
                    break
            scores_window.append(score)
            scores.append(score)
            avg_scores = np.mean(scores_window)
            eps = max(eps_end, eps_decay * eps)

            print(f'\rEpisode {i_episode}\tAverage Score: {avg_scores:.2f}',
                  end='')
            if i_episode % 100 == 0:
                print(
                    f'\rEpisode {i_episode}\tAverage Score: {avg_scores:.2f}')
            if avg_scores >= 13.0:
                print(f'\nEnvironment solved in {i_episode - 100} episodes!'
                      f'\tAverage Score: {np.mean(scores_window):.2f}')
                torch.save(self.qnet_local.state_dict(), 'checkpoint.pth')
                break

        return scores

    def evaluate(self, env):

        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]

        score = 0
        for i in range(2000):

            action = self.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            state = next_state

            score += reward
            if done:
                break

        print(f'Total score: {score:.2f}')
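A small standalone illustration (toy tensors, not part of the example above) of the .gather/.max bookkeeping that learn() uses to build the Q-targets and pick out the Q-values of the taken actions:

import torch

# batch of 3 transitions, 2 discrete actions (assumed toy values)
q_local = torch.tensor([[1.0, 2.0], [3.0, 0.5], [0.0, 4.0]])
actions = torch.tensor([[1], [0], [1]])              # chosen actions, shape B x 1
q_expected = q_local.gather(1, actions)              # Q(s, a) -> [[2.0], [3.0], [4.0]]

q_next_target = torch.tensor([[0.5, 1.5], [2.0, 1.0], [3.0, 0.0]])
q_max = q_next_target.max(1)[0].unsqueeze(1)         # max_a' Q_target(s', a'), shape B x 1
rewards = torch.tensor([[1.0], [0.0], [1.0]])
dones = torch.tensor([[0.0], [0.0], [1.0]])
y = rewards + 0.99 * q_max * (1 - dones)             # terminal transition keeps only the reward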
Esempio n. 11
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id, args):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Q-Network
        self.actor_network = ActorNetwork(state_size, action_size,
                                          args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(),
                                          lr=args['LR_ACTOR'])

        #Model takes too long to run --> load model weights from previous run (took > 24hours on my machine)
        if not agent_id:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p0_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p0_path']),
                                              strict=False)
        else:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p1_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p1_path']),
                                              strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):

        self.actor_network.eval()
        with torch.no_grad():
            input_state = torch.from_numpy(current_state).float().to(
                self.device)
            action = self.actor_network(input_state).cpu().data.numpy()
        self.actor_network.train()

        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def train(self, experiences):

        global states_
        global next_states_
        global actions_
        global max_min_actions_vector
        global max_min_states_vector

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
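A minimal sketch of the soft update (Polyak averaging) used by both examples above; note that tau = 1 reduces to a hard copy, which is how some agents initialise the target network:

import torch
import torch.nn as nn

def soft_update(local_model: nn.Module, target_model: nn.Module, tau: float) -> None:
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    with torch.no_grad():
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

local, target = nn.Linear(4, 2), nn.Linear(4, 2)
soft_update(local, target, tau=1.0)   # tau = 1 -> exact copy
assert all(torch.equal(p, q) for p, q in zip(local.parameters(), target.parameters()))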
Esempio n. 12
0
    def train(self, transitions: int, eps_max: float = 0.5, eps_min: float = 0., buffer_size: int = 10000,
              batch_size: int = 128, shaping_coef: float = 300., progress_upd_step: int = 0,
              start_training: int = 10000, to_sink: bool = False):
        history = ReplayBuffer(size=buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "eps_max": eps_max,
            "eps_min": eps_min,
            "bins": self.num_bins,
            "to_sink": to_sink,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()

        t = tqdm(range(transitions))
        for i in t:
            eps = eps_max - (eps_max - eps_min) * i / transitions
            if random() < eps:
                action = self.env.action_space.sample()
            else:
                action = self.act(state)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1]))
            done_ = next_state[0] > 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                self.update(history.sample(batch_size))

            # soft update
            with torch.no_grad():
                for param, param_target in zip(self.dqn.parameters(), self.dqn_target.parameters()):
                    param_target.data.mul_(1 - self.tau)
                    param_target.data.add_(self.tau * param.data)

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}")

                if to_sink and reward_mean >= 90 and self.evaluate_policy(episodes=100)[0] >= 90:
                    self.sink(history, start_training, eps, shaping_coef)
                    shaping_coef = 1
                    to_sink = False

        return log
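The epsilon schedule above decays linearly from eps_max to eps_min over the whole run; a tiny self-contained sketch of that schedule:

def linear_eps(step: int, total: int, eps_max: float = 0.5, eps_min: float = 0.0) -> float:
    # same formula as in the training loop: eps_max - (eps_max - eps_min) * step / total
    return eps_max - (eps_max - eps_min) * step / total

assert linear_eps(0, 1000) == 0.5      # fully exploratory at the start
assert linear_eps(1000, 1000) == 0.0   # purely greedy by the last transition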
Esempio n. 13
0
def main():

    ##########
    # CONFIG #
    ##########
    # Target Reward
    tgt_score = 0.5
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed
    seed = 7
    seeding(seed)
    # Model Architecture
    # Actor
    hidden_in_actor = 256
    hidden_out_actor = 128
    lr_actor = 1e-4
    # Critic
    hidden_in_critic = 256
    hidden_out_critic = 128
    lr_critic = 3e-4
    weight_decay_critic = 0
    # Episodes
    number_of_episodes = 10000
    episode_length = 2000
    # Buffer
    buffer_size = int(1e6)
    batchsize = 512
    # Agent Update Frequency
    episode_per_update = 1
    # Rewards Discounts Factor
    discount_factor = 0.95
    # Soft Update Weight
    tau = 1e-2
    # Noise Process
    noise_factor = 2
    noise_reduction = 0.9999
    noise_floor = 0.0
    # Window
    win_len = 100
    # Save Frequency
    save_interval = 200
    # Logger
    log_path = os.getcwd() + "/log"
    logger = SummaryWriter(log_dir=log_path)
    # Model Directory
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    # Load Saved Model
    load_model = False

    ####################
    # Load Environment #
    ####################
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")
    # Get brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print('Brain Name:', brain_name)
    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Number of Agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    ####################
    # Show Progressbar #
    ####################
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
    start = time.time()

    ###############
    # Multi Agent #
    ###############
    maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor,
                    hidden_out_actor, lr_actor, hidden_in_critic,
                    hidden_out_critic, lr_critic, weight_decay_critic,
                    discount_factor, tau, seed, device)

    if load_model:
        load_dict_list = torch.load(os.path.join(model_dir,
                                                 'episode-saved.pt'))
        for i in range(num_agents):
            maddpg.maddpg_agent[i].actor.load_state_dict(
                load_dict_list[i]['actor_params'])
            maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(
                load_dict_list[i]['actor_optim_params'])
            maddpg.maddpg_agent[i].critic.load_state_dict(
                load_dict_list[i]['critic_params'])
            maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(
                load_dict_list[i]['critic_optim_params'])

    #################
    # Replay Buffer #
    #################
    rebuffer = ReplayBuffer(buffer_size, seed, device)

    #################
    # TRAINING LOOP #
    #################
    # initialize scores
    scores_history = []
    scores_window = deque(maxlen=save_interval)

    # i_episode = 0
    for i_episode in range(number_of_episodes):
        timer.update(i_episode)

        # Reset Environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)

        # Reset Agent
        maddpg.reset()

        # episode_t = 0
        for episode_t in range(episode_length):

            # Explore with decaying noise factor
            actions = maddpg.act(states, noise_factor=noise_factor)
            env_info = env.step(actions)[brain_name]  # Environment reacts
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards  # get the rewards
            dones = env_info.local_done  # see if episode has finished

            ###################
            # Save Experience #
            ###################
            rebuffer.add(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states

            if any(dones):
                break

        scores_history.append(np.max(scores))  # save most recent score
        scores_window.append(np.max(scores))  # save most recent score
        avg_rewards = np.mean(scores_window)
        noise_factor = max(noise_floor, noise_factor *
                           noise_reduction)  # Reduce Noise Factor

        #########
        # LEARN #
        #########
        if len(rebuffer) > batchsize and i_episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = rebuffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # Soft Update
            maddpg.update_targets()

        ##################
        # Track Progress #
        ##################
        if i_episode % save_interval == 0 or i_episode == number_of_episodes - 1:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count,
                        episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.
                format(i_episode, avg_rewards, noise_factor),
                end="\n")

        ##############
        # Save Model #
        ##############
        save_info = (i_episode % save_interval == 0
                     or i_episode == number_of_episodes - 1)
        if save_info:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-Latest.pt'))

            pd.Series(scores_history).to_csv(
                os.path.join(model_dir, "scores.csv"))

            # plot the scores
            rolling_mean = pd.Series(scores_history).rolling(win_len).mean()
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.plot(np.arange(len(scores_history)), scores_history)
            plt.axhline(y=tgt_score, color='r', linestyle='dashed')
            plt.plot(rolling_mean, lw=3)
            plt.ylabel('Score')
            plt.xlabel('Episode #')
            # plt.show()
            fig.savefig(os.path.join(model_dir, 'Average_Score.pdf'))
            fig.savefig(os.path.join(model_dir, 'Average_Score.jpg'))
            plt.close()

        if avg_rewards > tgt_score:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count,
                        episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.
                format(i_episode, avg_rewards, noise_factor),
                end="\n")
            break

    env.close()
    logger.close()
    timer.finish()
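The exploration noise in main() decays multiplicatively with a floor; a brief sketch of that schedule (toy values matching the config above):

def decay_noise(noise_factor: float,
                noise_reduction: float = 0.9999,
                noise_floor: float = 0.0) -> float:
    # applied once per episode, after the rollout
    return max(noise_floor, noise_factor * noise_reduction)

nf = 2.0
for _ in range(10000):
    nf = decay_noise(nf)
# after 10000 episodes the factor is roughly 2 * 0.9999**10000 ~= 0.74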
Esempio n. 14
0
class MADDPG:
    def __init__(self,
                 num_agents,
                 local_obs_dim,
                 local_action_size,
                 global_obs_dim,
                 global_action_size,
                 discount_factor=0.95,
                 tau=0.02,
                 device=device,
                 random_seed=4,
                 lr_critic=1.0e-4,
                 weight_decay=0.0):
        super(MADDPG, self).__init__()

        # parameter configuration
        self.num_agents = num_agents
        self.device = device
        self.discount_factor = discount_factor
        self.tau = tau
        self.global_action_size = global_action_size
        self.global_obs_dim = global_obs_dim
        torch.manual_seed(random_seed)
        random.seed(random_seed)
        self.random_seed = random_seed
        self.weight_decay = weight_decay

        # define actors
        self.actors = [
            DDPGActor(num_agents,
                      local_obs_dim,
                      local_action_size,
                      global_obs_dim,
                      global_action_size,
                      device=device) for _ in range(num_agents)
        ]
        # define centralized critic
        self.critic = Critic(global_obs_dim, global_action_size,
                             self.random_seed).to(self.device)
        self.target_critic = Critic(global_obs_dim, global_action_size,
                                    self.random_seed).to(self.device)
        hard_update(self.target_critic, self.critic)

        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=self.weight_decay)

        # noise coef
        self.noise_coef = 1.0
        self.noise_coef_decay = 1e-6

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, obs_all_agents):
        actions = [
            ddpg_actor.act(local_obs, self.noise_coef)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def target_act(self, obs_all_agents):
        actions = [
            ddpg_actor.target_act(local_obs, noise_coef=0, add_noise=False)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def step(self, obs, obs_full, actions, rewards, next_obs, next_obs_full,
             dones, timestep):
        self.memory.add(obs, obs_full, actions, rewards, next_obs,
                        next_obs_full, dones)

        timestep = timestep % TRAIN_EVERY

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep == 0:
            for _ in range(N_LEARN_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, self.discount_factor)

    def learn(self, experiences, gamma):
        obs, obs_full, action, reward, next_obs, next_obs_full, done = experiences

        obs = obs.permute(1, 0, -1)  # agent_id * batch_size * state_size
        obs_full = obs_full.view(-1, self.global_obs_dim)
        next_obs = next_obs.permute(1, 0, -1)
        next_obs_full = next_obs_full.view(-1, self.global_obs_dim)
        action = action.reshape(-1, self.global_action_size)

        # ---------------- update centralized critic ----------------------- #
        self.critic_optimizer.zero_grad()

        # get target actions from all target_actors
        target_actions = np.array(self.target_act(next_obs))
        target_actions = torch.from_numpy(target_actions).float().permute(
            1, 0, -1)
        target_actions = target_actions.reshape(-1, self.global_action_size)

        # update critic
        with torch.no_grad():
            q_next = self.target_critic.forward(next_obs_full,
                                                target_actions.to(self.device))

        y = reward + gamma * q_next * (1 - done)

        q = self.critic.forward(obs_full, action)

        critic_loss = 0
        for i in range(self.num_agents):
            critic_loss += F.mse_loss(q, y[:, i].detach().reshape(
                -1, 1)) / self.num_agents
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------- update actor for all agents --------------------- #
        for ii in range(len(self.actors)):
            self.actors[ii].actor_optimizer.zero_grad()

            q_action = [
                self.actors[i].actor_local(ob) if i == ii
                else self.actors[i].actor_local(ob).detach()
                for i, ob in enumerate(obs)
            ]

            q_action = torch.stack(q_action).permute(1, 0, -1)
            q_action = q_action.reshape(-1, self.global_action_size).to(
                self.device)

            # policy_gradient
            actor_loss = -self.critic.forward(obs_full, q_action).mean()
            actor_loss.backward()
            self.actors[ii].actor_optimizer.step()

        # --------------- soft update all target networks ------------------- #
        soft_update(self.target_critic, self.critic, self.tau)
        for actor in self.actors:
            actor.update_target(self.tau)

        # -------------- reset noise --------------------------------------- #
        for actor in self.actors:
            actor.action_noise.reset()

        self.noise_coef -= self.noise_coef_decay
        if self.noise_coef < 0.01:
            self.noise_coef = 0.01
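Shape bookkeeping for the permute/reshape calls in learn() above, shown on toy sizes (the sizes are assumptions, not taken from the example):

import torch

batch, agents, obs_dim, act_dim = 4, 2, 3, 2
obs = torch.zeros(batch, agents, obs_dim)
obs_agent_major = obs.permute(1, 0, -1)                   # agents x batch x obs_dim
actions = torch.zeros(batch, agents, act_dim)
global_actions = actions.reshape(-1, agents * act_dim)    # batch x global_action_size

assert obs_agent_major.shape == (agents, batch, obs_dim)
assert global_actions.shape == (batch, agents * act_dim)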
Esempio n. 15
0
class DDPG:
    def __init__(self, env, batch_size, mem_size, discount, actor_params,
                 critic_params):
        self._batch_size = batch_size
        self._mem_size = mem_size
        self._discount = discount
        self._sess = tensorflow.Session()
        k_backend.set_session(self._sess)
        self._env = env
        self._state_dim = env.observation_space.shape[0]
        self._action_dim = env.action_space.shape[0]
        self._action_min = env.action_space.low
        self._action_max = env.action_space.high
        self._state_min = env.observation_space.low
        self._state_max = env.observation_space.high
        self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                            self._action_min, self._action_max, actor_params)
        self._critic = Critic(self._sess, 0.5, self._state_dim,
                              self._action_dim, critic_params)
        self._memory = ReplayBuffer(mem_size)

    def get_action(self, state):
        return self._actor._model.predict(state)

    def train(self):
        '''
        No training takes place until the replay buffer contains
        at least batch size number of experiences
        '''

        if (self._memory.size() > self._batch_size):
            self._train()

    def _train(self):
        states, actions, rewards, done, next_states = self._memory.sample(
            self._batch_size)
        self._train_critic(states, actions, rewards, done, next_states)
        action_gradients = self._critic.action_gradients(states, actions)
        self._actor.train(states, action_gradients)

    def q_estimate(self, state, action):
        return self._critic._model.predict(state, action)

    def _get_q_targets(self, next_states, done, rewards):
        '''
        q = r if done else r + gamma * q_next
        '''
        # use actor network to determine the next action under current policy
        # estimate Q values from the critic network

        actions = self.get_action(next_states)
        qnext = self.q_estimate(next_states, actions)

        q_targets = [
            reward if end else reward + self._discount * next_q
            for (reward, next_q, end) in zip(rewards, qnext, done)
        ]
        return q_targets

    def _train_critic(self, states, actions, rewards, done, next_states):
        q_targets = self._get_q_targets(next_states, done, rewards)
        self._critic.train(states, actions, q_targets)

    def experience(self, state, action, reward, done, next_state):
        # store in replay buffer
        self._memory.add(state, action, reward, done, next_state)

        self.train()
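A vectorised NumPy equivalent (a sketch, not the example's API) of the target rule in _get_q_targets: y = r for terminal transitions, y = r + gamma * Q(s', a') otherwise:

import numpy as np

rewards = np.array([1.0, 0.0, 0.5])
q_next = np.array([2.0, 1.0, 3.0])
done = np.array([False, True, False])
gamma = 0.99

q_targets = rewards + gamma * q_next * (~done)   # done masks out the bootstrap term
# -> [2.98, 0.0, 3.47]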
Esempio n. 16
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, agent_id):

        self.state_size  = state_size
        self.action_size = action_size
        self.seed        = args['seed']
        self.device      = args['device']
        #self.args        = args

        # Q-Network
        self.actor_network    = ActorNetwork(state_size, action_size).to(self.device)
        self.actor_target     = ActorNetwork(state_size, action_size).to(self.device)
        self.actor_optimizer  = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])
        
        #Model takes too long to run --> load model weights from previous run (took > 24hours on my machine)
        #if not agent_id:
        #    self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
        #    self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
        #else:
        #    self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
        #    self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
        
        # Replay memory
        self.memory      = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.device, self.seed)
        
        # Noise process
        self.noise       = OUNoise(action_size, self.seed)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step      = 0
        
        self.mCriticLoss = 0
        
        self.actorLoss   = 0
        
    
    def step(self, state, action, reward, next_state, done, mCritic):
        # Save experience in replay memory
        
        self.memory.add(state, action, reward, next_state, done)
        
        if len(self.memory) > args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences, mCritic)
            

    def act(self, current_state):
        
        with torch.no_grad():
                
            self.actor_network.eval()
            
            # PUT CONDITIONAL CHECK -> if CNN reshape, ELSE...dont
            #input_state = torch.from_numpy(current_state).float().reshape(args['reshape_size']).unsqueeze(0).unsqueeze(0).to(self.device)
            
            input_state = torch.from_numpy(current_state).float().to(self.device)
                
            action = self.actor_network(input_state).cpu().data.numpy()

            self.actor_network.train()
                
            #action     += self.noise.sample()
            
        return action

    def reset(self):
        self.noise.reset()
        
        
    def train(self, experiences, mCritic):
        
        global states_
        global next_states_
        global actions_
        global max_min_actions_vector
        global max_min_states_vector

        states, actions, rewards, next_states, dones = experiences

        
        # ---------------------------- update critic ---------------------------- #
        
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next   = self.actor_target(next_states)
            
            #PUT CONDITIONAL CHECK: if CNN reshape ELSE..Dont...
            #Q_targets_next = mCritic.target(next_states, actions_next[np.newaxis, :])
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets      = rewards + (args['GAMMA'] * Q_targets_next * (1 - dones))
        
        
        # Compute critic loss
        Q_expected         = mCritic.network(states, actions)
        mCritic_loss       = F.mse_loss(Q_expected, Q_targets)
Esempio n. 17
0
class Agent():
    """ DDPG Agent, interacts with environment and learns from environment """
    def __init__(self, device, state_size, n_agents, action_size, random_seed, \
                         buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay,  \
                         learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'):

        # Set Computational device
        self.DEVICE = device

        # Init State, action and agent dimensions
        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.l_step = 0
        self.log_interval = 200

        # Init Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num

        # Init Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Init Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Init Noise Process
        self.noise = OUNoise((n_agents, action_size),
                             random_seed,
                             mu=0.,
                             theta=ou_theta,
                             sigma=ou_sigma)

        # Init Replay Memory
        self.memory = ReplayBuffer(device, action_size, buffer_size,
                                   batch_size, random_seed)

    # think
    def act(self, states, add_noise=True):
        """ Decide what action to take next """

        # evaluate state through actor_local
        states = torch.from_numpy(states).float().to(self.DEVICE)
        actions = np.zeros((self.n_agents, self.action_size))

        self.actor_local.eval()  # put actor_local network in "evaluation" mode
        with torch.no_grad():
            for n, state in enumerate(states):
                actions[n, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()  # put actor_local back into "training" mode

        # add noise for better performance
        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    # embody
    def step(self, t, s, a, r, s_, done):
        """ Commit step into the brain """

        # Save SARS' to replay buffer --- state-action-reward-next_state tuple
        for n in range(self.n_agents):
            self.memory.add(s[n], a[n], r[n], s_[n], done[n])

        if t % self.LEARN_INTERVAL != 0:
            return

        # Learn if enough samples are available in memory
        if len(self.memory) > self.BATCH_SIZE:
            for _ in range(self.LEARN_NUM):
                experiences = self.memory.sample()  # get a memory sample
                self.learn(experiences, self.GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Learn from experiences, with discount factor gamma
        
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ------ Update Critic ------ #

        # get predicted next-state actions and Q values from target networks
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #         torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ------ Update Actor ------ #

        # compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------ Update Target Networks ------ #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

        # keep count of steps taken
        # self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
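The OUNoise class used by several of these agents is not shown; a minimal Ornstein-Uhlenbeck sketch with commonly used (assumed) parameters:

import numpy as np

class SimpleOUNoise:
    """Discrete-time Ornstein-Uhlenbeck process: x += theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, seed=0):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        self.state = self.mu.copy()

    def sample(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state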
Esempio n. 18
0
def train(sess, env, actor, critic):
    # Set up summary ops
    summary_ops, summary_vars = build_summaries()

    # Initialize Tensorflow variables
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):

        s = env.reset()

        episode_reward = 0
        episode_ave_max_q = 0

        noise = ExplorationNoise.ou_noise(OU_THETA, OU_MU, OU_SIGMA,
                                          MAX_STEPS_EPISODE)
        noise = ExplorationNoise.exp_decay(noise, EXPLORATION_TIME)

        for j in range(MAX_STEPS_EPISODE):

            if RENDER_ENV:
                env.render()

            # Add exploratory noise according to Ornstein-Uhlenbeck process to action
            # Decay exploration exponentially from 1 to 0 in EXPLORATION_TIME steps
            if i < EXPLORATION_TIME:
                a = actor.predict(
                    np.reshape(s,
                               (1, env.observation_space.shape[0]))) + noise[j]
            else:
                a = actor.predict(
                    np.reshape(s, (1, env.observation_space.shape[0])))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, actor.state_dim),
                              np.reshape(a, actor.action_dim), r, terminal,
                              np.reshape(s2, actor.state_dim))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    # If state is terminal assign reward only
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    # Else assign reward + net target Q
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = \
                    critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                episode_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                a_grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, a_grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            episode_reward += r

            if terminal or j == MAX_STEPS_EPISODE - 1:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: episode_reward,
                                           summary_vars[1]: episode_ave_max_q
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('Reward: %.2i' % int(episode_reward), '| Episode', i,
                      '| Qmax: %.4f' % (episode_ave_max_q / float(j)))

                break
Esempio n. 19
0
class Agent():
    """Interacts with and learns from the environment"""

    def __init__(self, state_size, action_size, fc1_units=256, fc2_units=128, device=torch.device('cpu')):
        """DQN agent

        Args:
          state_size (int): dimension of each state
          action_size (int): dimension of each action (or the number of action choices)
          seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Initialize qnetwork_target parameters to qnetwork_local
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device=self.device)

        # Initialize the time step counter (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Args:
          state (array_like): current state
          eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Set qnetwork_local to evaluation mode
        self.qnetwork_local.eval()

        # This operation should not be included in gradient calculation
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Set back qnetwork_local to training mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Args:
          experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
          gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states with actual rewards
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----- Update the target network -----
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Args:
          local_model (torch.nn.Module): weights will be copied from
          target_model (torch.nn.Module): weights will be copied to
          tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
Esempio n. 20
0
class DDPG():
    """ Deep Deterministic Policy Gradient Model """

    def __init__(self, state_size, action_size, random_seed):
        """ Initialize the model with arguments as follows:
                
                    ARGUMENTS
                    =========
                        - state_size (int) = dimension of input space
                        - action_size (int) = dimension of action space
                        - random_seed (int) = random seed

                    Returns 
                    =======
                        - best learned action to take after Actor-Critic Learning
         """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # create noise
        self.noise = OUNoise(action_size, random_seed)
                
        # create memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)
                


        # Actor Networks (local online net + target net)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = LR_ACTOR)

        # Critic Networks (local online net + target net)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
                
        # instantiate online and target networks with same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)
    
    
    def hard_update(self, local, target):
        for local_param, target_param in zip(local.parameters(), target.parameters()):
            target_param.data.copy_(local_param.data)
                
                
    def act(self, state, add_noise=True):
        """ Choose an action while interacting and learning in the environment """

        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)


    def soft_update(self, local_model, target_model, tau):
        # Perform soft update of the target networks
        # at every time step, keep 1-tau of target network
        # and add only a small fraction (tau) of the current online networks
        # to prevent oscillation
        for local_param, target_param in zip(local_model.parameters(), target_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def step(self, state, action, reward, next_state, done):
        # at every iteration, add new SARS' trajectory to memory, then learn from batches 
        # if learning_step is reached and enough samples are in the buffer
        
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
Esempio n. 21
0
class DDQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 head_name="DuelingDQN",
                 head_scale="max"):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int ; optional): number of each layer nodes
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Assign model parameters and assign device
        model_params = [
            state_size, action_size, seed, hidden_layers, head_name, head_scale
        ]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)

        # Set up optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Initialize Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Update time step
        self.t_step = self.t_step + 1

        # Learn every self.update_every time steps.
        if self.t_step % self.update_every == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Go to evaluation mode and get Q values for current state
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # get back to train mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data
        states, actions, rewards, next_states, dones = experiences

        # Go to evaluation mode
        self.qnetwork_target.eval()
        with torch.no_grad():
            # get Q values for the next state
            Q_dash_local = self.qnetwork_local(next_states)
            Q_dash_target = self.qnetwork_target(next_states)

            # Find the predicted action based on the local Q_network
            argmax_action = torch.max(Q_dash_local, dim=1, keepdim=True)[1]

            # Get the Q-value from the target network
            Q_dash_max = Q_dash_target.gather(1, argmax_action)

            # Update the target value
            y = rewards + gamma * Q_dash_max * (1 - dones)

        # Go back to train mode
        self.qnetwork_target.train()

        # Predict Q-values based on the local network
        self.optimizer.zero_grad()
        Q = self.qnetwork_local(states)
        y_pred = Q.gather(1, actions)

        # TD-error/loss function
        loss = torch.sum((y - y_pred)**2)

        # Optimize the network
        loss.backward()
        self.optimizer.step()

        # Update the target network using the local and target networks
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
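A toy-tensor illustration (a sketch under assumed values, not the class above) of the Double DQN target in learn(): the local network chooses the next action, the target network evaluates it:

import torch

q_next_local = torch.tensor([[1.0, 2.0], [0.5, 0.1]])
q_next_target = torch.tensor([[0.3, 0.9], [0.7, 0.2]])

argmax_action = q_next_local.max(1, keepdim=True)[1]   # action picked by the local net
q_dash_max = q_next_target.gather(1, argmax_action)    # ...but valued by the target net

rewards = torch.tensor([[1.0], [0.0]])
dones = torch.tensor([[0.0], [1.0]])
y = rewards + 0.99 * q_dash_max * (1 - dones)          # -> [[1.891], [0.0]]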
Esempio n. 22
0
class Agent(object):
    """
    The Agent interacts with and learns from the environment.
    """
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed=0,
                 params=params):
        """
        Initialize an Agent object.
        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.params = params

        # Actor (Policy) Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.params['DEVICE'])
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.params['DEVICE'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.params['LR_ACTOR'])

        # Critic (Value) Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.params['DEVICE'])
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.params['DEVICE'])
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.params['LR_CRITIC'],
            weight_decay=self.params['WEIGHT_DECAY'])

        # Initialize target and local to same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                                   self.params['BATCH_SIZE'], random_seed)

    def hard_update(self, local_model, target_model):
        """
        Hard update model parameters.
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, states, actions, rewards, next_states, dones):
        """
        Save experiences in replay memory and use random sample from buffer to learn.
        """

        # Save experience / reward, cater for when multiples
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.params['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['GAMMA'])

    def act(self, states, add_noise=True):
        """
        Returns actions for the given states (one per agent) as per the current policy.
        """
        states = torch.from_numpy(states).float().to(self.params['DEVICE'])
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma=params['GAMMA']):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update Critic (Value) network
        # Get predicted next-state actions and Q-Values from target Network
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimise the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # Stabilize learning per benchmark guidelines
        self.critic_optimizer.step()

        # Update Actor (Policy)
        # Compute Actor Loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local,
                         self.critic_target,
                         tau=self.params['TAU'])
        self.soft_update(self.actor_local,
                         self.actor_target,
                         tau=self.params['TAU'])

    def soft_update(self, local_model, target_model, tau=params['TAU']):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
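
A quick way to sanity-check the soft (Polyak) update used above is to apply it to two small networks and verify the interpolation formula numerically. This is a minimal, self-contained sketch: the soft_update helper is re-declared locally and the Linear layers are stand-ins, not the Actor/Critic models from the example.

import torch
import torch.nn as nn

def soft_update(local_model, target_model, tau):
    # theta_target = tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)

local = nn.Linear(4, 2)
target = nn.Linear(4, 2)
before = target.weight.data.clone()
soft_update(local, target, tau=1e-3)
# Each target weight has moved 0.1% of the way towards the local weight.
assert torch.allclose(target.weight.data,
                      1e-3 * local.weight.data + (1.0 - 1e-3) * before)
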
Esempio n. 23
0
class Agent():
    """Code adapted from the Udacity course"""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: pick the greedy next-state actions with the local
        # network, then evaluate those actions with the target network

        indexes_of_Q_local_for_next_states = self.qnetwork_local(
            next_states).detach().max(1)[1].unsqueeze(1)
        Q_target_for_next_states = self.qnetwork_target(next_states).detach()
        Q_thetas = Q_target_for_next_states.gather(
            1, indexes_of_Q_local_for_next_states)

        Q_targets = rewards + (gamma * Q_thetas * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        Polyak averaging
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
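
The learn() method above is the Double DQN variant: the local network chooses the greedy next-state action and the target network evaluates it. A minimal sketch on dummy tensors isolates that target computation; none of these tensor names come from the example, they are stand-ins for the network outputs.

import torch

torch.manual_seed(0)
q_local_next = torch.rand(3, 5)    # stand-in for qnetwork_local(next_states)
q_target_next = torch.rand(3, 5)   # stand-in for qnetwork_target(next_states)
rewards = torch.rand(3, 1)
dones = torch.zeros(3, 1)
gamma = 0.99

# Double DQN: choose the action with the local network ...
best_actions = q_local_next.argmax(dim=1, keepdim=True)
# ... but evaluate that action with the target network.
q_next = q_target_next.gather(1, best_actions)
q_targets = rewards + gamma * q_next * (1 - dones)
print(q_targets.shape)  # torch.Size([3, 1])
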
class DDPG_Agent():
    """Interacts with and learns from the environment."""
#self.state_size, self.action_size, self.seed, hidden_layers_actor, hidden_layers_critic, self.buffer_size, learning_rate_actor, learning_rate_critic
    def __init__(self, state_size, action_size, num_agents, seed, device,
                 buffer_size=int(1e5), batch_size=128, num_batches = 5, update_every=10,
                 gamma=0.99, tau=8e-3,
                 learning_rate_actor=1e-3, learning_rate_critic=1e-3, weight_decay=0.0001,                
                 hidden_layers_actor=[32,32], hidden_layers_critic=[32, 32, 32],
                 add_noise=True, start_eps=5.0, end_eps=0.0, end_eps_episode=500,
                 agent_id=-1):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
            hidden_layers_actor, hidden_layers_critic (list of int; optional): number of nodes in each hidden layer
            buffer_size (int; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate_actor, learning_rate_critic (float; optional): learning rates for actor and critic
        """
        print('In DDPG_Agent: seed = ', seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.device = device
        
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.num_batches = num_batches
        
        self.gamma = gamma
        self.tau = tau
        
        self.lr_actor = learning_rate_actor
        self.lr_critic = learning_rate_critic
        self.weight_decay_critic = weight_decay
        
        self.add_noise = add_noise
        self.start_eps = start_eps
        self.eps = start_eps
        self.end_eps = end_eps
        self.eps_decay = 1/(end_eps_episode*num_batches)  # set decay rate based on epsilon end target
        self.timestep = 0
        
        self.agent_id = agent_id
     
        ### SET UP THE ACTOR NETWORK ###
        # Assign model parameters and assign device
        model_params_actor  = [state_size, action_size, seed, hidden_layers_actor]
        
        # Create the Actor Network (w/ Target Network)
        self.actor_local = Actor(*model_params_actor).to(self.device)
        self.actor_target = Actor(*model_params_actor).to(self.device)
        #print('actor_local network is: ', print(self.actor_local))
        
        # Set up optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)       
        
        ### SET UP THE CRITIC NETWORK ###
        model_params_critic = [state_size, action_size, seed, hidden_layers_critic]

        # Create the Critic Network (w/ Target Network)
        self.critic_local = Critic(*model_params_critic).to(self.device)
        self.critic_target = Critic(*model_params_critic).to(self.device)
        
        # Set up optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise(action_size, seed)  # pass the integer seed; random.seed() returns None
        
        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, device)

    def step(self, states, actions, rewards, next_states, dones, agent_number):
        # Increment timestep by 1
        self.timestep += 1
        
        # Save experience in replay memory
        self.memory.add(states, actions, rewards, next_states, dones)
        
         # If there are enough samples and a model update is to be made at this time step
        if len(self.memory) > self.batch_size and self.timestep%self.update_every == 0:
            # For each batch
            for i in range(self.num_batches):
                # Sample experiences from memory
                experiences = self.memory.sample()
        
                # Learn from the experience
                self.learn(experiences, self.gamma, agent_number)

    def act(self, state, scale_noise=True):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            scale_noise (bool): whether to add eps-scaled exploration noise
        """
        state = torch.from_numpy(state).float().to(self.device)
        
        # Switch to evaluation mode and compute the action for the current state
        self.actor_local.eval()
        with torch.no_grad():
            # Get the action for this agent from the local (online) actor
            action = [self.actor_local(state[0]).cpu().data.numpy()]
            
        # Switch back to training mode
        self.actor_local.train()

        # Add exploration noise to the action, scaled by eps so that the
        # noise magnitude decreases as the agent keeps learning
        action += int(scale_noise) * self.eps * self.noise.sample()
        
        return np.clip(action, -1.0, 1.0)
    
    def reset(self):
        """
        Reset the noise, and all neural network parameters for the current agent
        """
        self.noise.reset()
        self.eps = self.start_eps
        self.timestep = 0
        self.critic_local.reset_parameters()
        self.actor_local.reset_parameters()
        self.critic_target.reset_parameters()
        self.actor_target.reset_parameters()
        
        # Re-create the optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Re-create the optimizer for the Critic network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)
        
        # Clear the experience buffer
        self.memory.clear_buffer()
        
    def reset_noise(self):
        """
        Reset the noise only
        """
        self.noise.reset()
   
    def learn(self, experiences, gamma, agent_number):
        ####     DRAW FROM MEMORY AND PREPARE SARS DATA        ####
        # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data
        states, actions, rewards, next_states, dones = experiences
        
        # NOTE: actions has shape (batch_size, action_size * num_agents), i.e. the concatenated actions of all agents
      
        # get the next action for the current agent for the entire batch
        actions_next = self.actor_target(next_states)
    
        # Construct next action vector for the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:,2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:,:2], actions_next), dim=1)
        
        ####    UPDATE CRITIC   ####
        # Get predicted next-state actions and Q values from target models
        # Get the next targets
        Q_targets_next = self.critic_target(next_states, actions_next)
        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        
        # Define the loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        
        # Clip the gradient norm at 1 to stabilize learning
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
   

        # --------------UPDATE ACTOR -----------------------#
        # Compute actor loss
        actions_pred = self.actor_local(states)

        # Construct action prediction vector relative to each agent
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:,2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:,:2], actions_pred), dim=1)
        
        # Calculate the loss. Note the negative sign since we use steepest ascent
        actor_loss = -self.critic_local(states, actions_pred).mean()
        
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks using the local and target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        
        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, self.end_eps)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        X_target = tau*X_local + (1 - tau)*X_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
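
The per-agent slicing in learn() above assumes two agents with two-dimensional actions, so agent 0 owns columns [:2] of the joint action and agent 1 owns columns [2:]. A small sketch with dummy tensors (hypothetical shapes, not taken from the training code) shows how the joint action fed to the centralized critic is rebuilt:

import torch

batch_size, action_size = 4, 2
# Joint actions stored in the replay buffer: agent 0 in columns [:2],
# agent 1 in columns [2:].
actions = torch.rand(batch_size, action_size * 2)
# Fresh actions proposed by agent 0's target actor for its own slots.
actions_next_agent0 = torch.rand(batch_size, action_size)

# Replace only agent 0's columns; keep agent 1's stored actions.
joint_next = torch.cat((actions_next_agent0, actions[:, 2:]), dim=1)
print(joint_next.shape)  # torch.Size([4, 4])
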
Esempio n. 25
0
def main(test=False):
    try:
        # Setup for the training process
        if (test == False):
            # init wandb cloud
            wandb.init(project="dqn_maze")

            # hyperparameters
            wandb.config.batch_size = 32
            wandb.config.gamma = 0.98
            wandb.config.h1 = 128
            wandb.config.h2 = 128
            wandb.config.lr = 0.001
            wandb.config.tau = 0.01
            max_episodes = 5000
            max_steps = 100
        # Setup for the testing process
        else:
            max_episodes = 20
            max_steps = 100

            np.random.seed(99)

            # init file
            log_file = open("log/statistics.txt", "w")
            log_file.write("episode;score;step;time;apples;mines;end\n")

        if (test == False):
            a1 = Agent(26, 4, [wandb.config.h1, wandb.config.h2],
                       wandb.config.lr)
            a1.save_plot()
        else:
            a1 = Agent(fileName="model.h5")
            a1.remove_noise()

        # experiences replay buffer
        replay_buffer = ReplayBuffer()

        # generate env
        env1 = Prostredie(10, 10, [
            0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
            0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
            0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
            0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
            1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 4, 0
        ])

        # Main game loop
        for episode in range(1, max_episodes + 1):
            start_time = time.time()

            state = env1.reset(testing=test)

            # reset score
            score, avg_loss = 0.0, 0.0

            for step in range(1, max_steps + 1):
                if test == True:
                    env1.render()
                    time.sleep(0.2)
                else:
                    # reset Q net's noise params
                    a1.reset_noise()

                # human player (keyboard control)
                #in_key = input()
                #if in_key == 'w':
                #    action = 1
                #elif in_key == 's':
                #    action = 0
                #elif in_key == 'a':
                #    action = 2
                #elif in_key == 'd':
                #    action = 3

                # random agent
                #action = np.random.randint(0, 4)

                # neural network agent
                action = np.argmax(a1.predict(state))

                next_state, reward, done, info = env1.step(action)

                score += reward

                if (test == False):
                    replay_buffer.add(
                        (state, action, reward, next_state, float(done)))

                    if len(replay_buffer.buffer) >= wandb.config.batch_size:
                        loss = a1.train(replay_buffer, wandb.config.batch_size,
                                        wandb.config.gamma, wandb.config.tau)
                        avg_loss += loss
                #else:
                #    print(f"stav: {state}")
                #    print(f"akcia: {action}")
                #    print(f"odmena: {reward}")
                #    print(f"done: {done}")
                #    print(f"step: {step}")
                #    print(f"replay_buffer_train: {len(replay_buffer.buffer)}")
                #    print(f"epoch: {episode}/{max_episodes}")
                #    print(f"score: {score}")
                #    print(f"apples: {info['apples']}/{env1.count_apple}")
                #    print(f"mines: {info['mines']}/{env1.count_mine}")

                # critical: roll the state forward before the next step
                state = next_state

                if done == True:
                    break

            # statistics
            avg_loss /= step

            if (test == False):
                log_dict = {
                    'epoch': episode,
                    'score': score,
                    'steps': step,
                    'loss': avg_loss,
                    'replay_buffer': len(replay_buffer.buffer),
                    'time': time.time() - start_time,
                    'apple': (info['apples'] / env1.count_apple) * 100.0,
                    'mine': (info['mines'] / env1.count_mine) * 100.0,
                    'end': info['end'] * 100.0
                }

                wandb.log(log_dict)
            else:
                log_file.write(
                    f"{episode};{score};{step};{time.time()-start_time};{(info['apples'] / env1.count_apple) * 100.0};{(info['mines'] / env1.count_mine) * 100.0};{info['end'] * 100.0}\n"
                )

    except KeyboardInterrupt:
        print("Game terminated")
        sys.exit()
    finally:
        # Save model to file
        if (test == False):
            a1.model.save("model.h5")
        else:
            log_file.close()

        env1.f_startPosition.close()
        env1.f_apples.close()
        env1.f_mines.close()
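
The script above relies on a ReplayBuffer that exposes an add() method and a .buffer container, but its definition is not shown. A minimal buffer compatible with those calls could look like the following sketch (hypothetical; the author's actual implementation may differ, e.g. in how sample() formats the batch):

import random
from collections import deque

class ReplayBuffer:
    """Minimal FIFO experience buffer (illustrative sketch only)."""
    def __init__(self, max_size=100_000):
        self.buffer = deque(maxlen=max_size)

    def add(self, transition):
        # transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform random minibatch of stored transitions
        return random.sample(self.buffer, batch_size)
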
Esempio n. 26
0
        action = env.action_space.sample()
    else:  # select an action from the actor network with noise
        action = policy.select_action(state, noise=True)

    # the agent plays the action
    next_state, reward, done, info = env.step(action)

    # add to the total episode reward
    episode_reward += reward

    # treat a pure time-limit termination as not-done so the critic can still bootstrap
    done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(
        done)

    # add to the memory buffer
    memory.add((state, next_state, action, reward, done_bool))

    # update the state, episode timestep and total timestep
    state = next_state
    episode_timesteps += 1
    total_timesteps += 1
    eval_counter += 1

    # train once the initial exploration phase (start_timesteps) is over
    if total_timesteps > start_timesteps:
        policy.train(memory)

    # save the model periodically
    if total_timesteps % save_freq == 0:
        policy.save(int(total_timesteps / save_freq))
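
The done_bool line in the loop above masks out terminations caused purely by hitting the environment's time limit, so the TD target r + γ·Q(s', a') can still bootstrap from the next state. A tiny numeric illustration (the variable values are made up for the example):

# Hypothetical values illustrating the time-limit mask used above.
max_episode_steps = 200
episode_timesteps = 199   # the step just taken was the 200th and last allowed
done = True               # env reports done only because the time limit was hit

# A pure time-limit termination is treated as "not done" (0), so the critic
# still bootstraps from the next state; a genuine terminal state keeps done=1.
done_bool = 0 if episode_timesteps + 1 == max_episode_steps else float(done)
print(done_bool)  # 0
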
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0  # 0.0
        self.exploration_theta = 0.1  # 0.15
        self.exploration_sigma = 0.1  # 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def act_no_noise(self, state):
        """Returns actions for given state(s) as per current policy, without exploration noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action)  # no exploration noise added

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
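
The DDPG class above configures an Ornstein-Uhlenbeck noise process through exploration_mu, exploration_theta and exploration_sigma but does not show OUNoise itself. A compatible sketch, assuming the common discretized OU update x ← x + θ(μ − x) + σ·N(0, 1), might look like this (illustrative, not the author's implementation):

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (illustrative sketch)."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start every episode from the long-run mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
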
Esempio n. 28
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.eps = 3.0
        self.eps_decay = 0.9999

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size * 2, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size * 2, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * 2, action_size * 2,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=0)

        # Noise process
        self.noise = OUNoise((1, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self,
             state,
             action,
             reward,
             next_state,
             done,
             agent_number,
             learn_iterations=5):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        #self.timestep += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory and at learning interval settings
        if len(self.memory) > BATCH_SIZE:  # and self.timestep % LEARN_EVERY == 0:
            for _ in range(learn_iterations):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        states = torch.from_numpy(states).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Since the critic takes the actions of both agents we need to update only
        # one part of the given action
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        elif agent_number == 1:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Since the critic takes the actions of both agents we need to update only
        # one part of the given action
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        elif agent_number == 1:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update epsilon
        self.eps *= self.eps_decay
        self.eps = max(self.eps, 1)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
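
The noise scale eps above starts at 3.0, shrinks by a factor of 0.9999 on every learn() call and is floored at 1.0. A short back-of-the-envelope check (plain arithmetic, not code from the example) shows how long that decay takes:

import math

# Number of learn() calls for eps to decay from 3.0 to its floor of 1.0
# with the multiplicative factor 0.9999 used above.
eps_start, eps_floor, decay = 3.0, 1.0, 0.9999
n = math.log(eps_floor / eps_start) / math.log(decay)
print(round(n))  # roughly 10986 learning steps
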
Esempio n. 29
0
class Agent():
    """ Interacts with and learns from then environment."""
    def __init__(self, state_size, action_size, seed, model=QNetwork):
        """Initialize an Agent object.
        
        Param
        =====
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (object): model to use
            
        Return
        ======
            None
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.qnetwork_local = model(state_size, action_size, seed).to(device)
        self.qnetwork_target = model(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=hyperparameters["lr"])

        # Replay memory
        self.memory = ReplayBuffer(action_size, hyperparameters["buffer_size"],
                                   hyperparameters["batch_size"], seed, device)
        # Initialize time step (for updating every hyperparameters["update_every"] steps)
        self.t_step = 0

        # Init tracking of params
        wandb.login()
        wandb.init(project=project_name, name=name, config=hyperparameters)
        jovian.log_hyperparams(hyperparameters)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every hyperparameters["update_every"] time steps.
        self.t_step = (self.t_step + 1) % hyperparameters["update_every"]
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > hyperparameters["batch_size"]:
                experiences = self.memory.sample()
                self.learn(experiences, hyperparameters["gamma"])

    def act(self, state, eps=0.):
        """Return actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        
        Params:
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', d) tuples
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ---------------- update target network ----------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target,
                         hyperparameters["tau"])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def get_model_name(self):
        return name

    def get_project_name(self):
        return project_name
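
The Agent above pulls all of its settings from a module-level hyperparameters dict (plus project_name and name for logging), none of which are shown. A plausible configuration covering exactly the keys the class reads could look like this; the values are illustrative guesses, not the author's:

project_name = "dqn-navigation"   # wandb project (assumed)
name = "dqn-baseline"             # run / model name (assumed)

hyperparameters = {
    "lr": 5e-4,              # Adam learning rate for the local Q-network
    "buffer_size": 100_000,  # replay buffer capacity
    "batch_size": 64,        # minibatch size per learning step
    "update_every": 4,       # environment steps between learning updates
    "gamma": 0.99,           # discount factor
    "tau": 1e-3,             # soft-update interpolation parameter
}
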
Esempio n. 30
0
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.AE = Actor(state_dim,action_dim).cuda()
        self.CE = Critic(state_dim,action_dim).cuda()
        self.AT = Actor(state_dim,action_dim).cuda()
        self.CT = Critic(state_dim,action_dim).cuda()
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.time_step = 0

        self.AE.load_state_dict(torch.load(MODEL_DIR+'/obs/actor_340000.pkl'))
        # self.AT.load_state_dict(torch.load(MODEL_DIR+'/actor_280000.pkl'))
        # self.CE.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))
        # self.CT.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))

        self.optimizer_a = torch.optim.Adam(self.AE.parameters(), lr=1e-4)
        self.optimizer_c = torch.optim.Adam(self.CE.parameters(), lr=1e-4)

    def train(self):
        self.AE.train()
        data = self.replay_buffer.get_batch(BATCH_SIZE)
        bs =  np.array([da[0] for da in data])
        ba =  np.array([da[1] for da in data])
        br =  np.array([da[2] for da in data])
        bs_ = np.array([da[3] for da in data])
        bd =  np.array([da[4] for da in data])

        bs = torch.FloatTensor(bs).cuda()
        ba = torch.FloatTensor(ba).cuda()
        br = torch.FloatTensor(br).cuda()
        bs_ = torch.FloatTensor(bs_).cuda()

        a_ = self.AT(bs_)
        # NOTE: the value loss uses the sampled action batch ba (q2 below),
        # while the policy loss further down uses actions recomputed by the
        # actor, q1 = self.CE(bs, a).
        q2 = self.CE(bs, ba)
        q_ = self.CT(bs_, a_).detach()
        q_tar = torch.FloatTensor(BATCH_SIZE)
        for i in range(len(data)):
            if bd[i]:
                q_tar[i] = br[i]
            else:
                q_tar[i] = br[i]+GAMMA*q_[i]

        q_tar = q_tar.view(BATCH_SIZE,1).cuda()
        # minimize mse_loss of q2 and q_tar
        td_error = F.mse_loss(q2, q_tar.detach())  # minimize td_error
        self.CE.zero_grad()
        td_error.backward(retain_graph=True)
        self.optimizer_c.step()

        a = self.AE(bs)
        q1 = self.CE(bs, a)

        a_loss = -torch.mean(q1) # maximize q
        self.AE.zero_grad()
        a_loss.backward(retain_graph=True)
        self.optimizer_a.step()

        self.soft_replace()

    def soft_replace(self):
        for t,e in zip(self.AT.parameters(),self.AE.parameters()):
            t.data = (1-TAU)*t.data + TAU*e.data
        for t,e in zip(self.CT.parameters(),self.CE.parameters()):
            t.data = (1-TAU)*t.data + TAU*e.data

    def action(self, state):
        self.AE.eval()
        state_tensor = torch.FloatTensor(state).unsqueeze(0).cuda()  # add batch_sz=1
        ac_tensor = self.AE(state_tensor)
        ac = ac_tensor.squeeze(0).cpu().detach().numpy()
        return ac
    
    def perceive(self,state,action,reward,next_state,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state,action,reward,next_state,done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            torch.save(self.AE.state_dict(), MODEL_DIR + '/obs/actor_{}.pkl'.format(self.time_step))
            torch.save(self.CE.state_dict(), MODEL_DIR + '/obs/critic_{}.pkl'.format(self.time_step))
            print('Save model state_dict successfully in obs dir...')

        return self.time_step
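
The per-sample loop that builds q_tar in train() above can be written as a single vectorized expression, q_tar = br + GAMMA * q_ * (1 - bd), which is numerically equivalent. A self-contained sketch with dummy tensors (shapes and values are made up for illustration):

import torch

GAMMA = 0.99
br = torch.rand(8)                                   # rewards
q_ = torch.rand(8)                                   # detached target Q-values
bd = torch.tensor([0., 0., 1., 0., 1., 0., 0., 0.])  # done flags

# Equivalent to: q_tar[i] = br[i] if bd[i] else br[i] + GAMMA * q_[i]
q_tar = (br + GAMMA * q_ * (1.0 - bd)).view(-1, 1)
print(q_tar.shape)  # torch.Size([8, 1])
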