Example #1
class DDPG_agent(nn.Module):
    def __init__(self, in_actor, in_critic, action_size, num_agents,
                 random_seed):
        """Initialize the agent."""
        super(DDPG_agent, self).__init__()

        self.action_size = action_size
        self.seed = random_seed

        # Fully connected actor network
        self.actor_local = Actor(in_actor, self.action_size,
                                 self.seed).to(device)
        self.actor_target = Actor(in_actor, self.action_size,
                                  self.seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Fully connected critic network
        self.critic_local = Critic(in_critic, num_agents * self.action_size,
                                   self.seed).to(device)
        self.critic_target = Critic(in_critic, num_agents * self.action_size,
                                    self.seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise process for exploration
        self.noise = OUNoise((action_size), random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def target_act(self, state, add_noise=True):
        """Returns actions for the given state as per the target policy (add_noise is unused)."""
        action = self.actor_target(state)
        return action

    def reset(self):
        """ Resets noise """
        self.noise.reset()
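
The OUNoise class used by these agents is not shown on this page. A minimal Ornstein-Uhlenbeck process compatible with the (size, seed) constructor calls above might look like the following sketch; the mu, theta and sigma defaults are assumptions, not taken from the original code.

import copy
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)      # long-run mean of the process
        self.theta = theta                # mean-reversion rate
        self.sigma = sigma                # noise scale
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state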
Example #2
class Agent():
    def __init__(self, actor_size, action_size, critic_size):
        super().__init__()
        gpu = torch.cuda.is_available()
        if (gpu):
            print('GPU/CUDA works! Happy fast training :)')
            torch.cuda.current_device()
            torch.cuda.empty_cache()
            self.device = torch.device("cuda")
        else:
            print('training on cpu...')
            self.device = torch.device("cpu")

        self.actor = Actor(actor_size, action_size).to(self.device)
        self.actor_target = Actor(actor_size, action_size).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)
        self.critic = Critic(critic_size).to(self.device)
        self.critic_target = Critic(critic_size).to(self.device)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=0.001,
                                       weight_decay=0)
        self.gamma = 0.95  #0.99
        self.tau = 0.001
        self.noise = OUNoise((action_size), 2)
        self.target_network_update(self.actor_target, self.actor, 1.0)
        self.target_network_update(self.critic_target, self.critic, 1.0)

    def select_actions(self, state):
        state = torch.from_numpy(state).float().to(self.device).view(1, -1)
        #print(state.shape)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(state).cpu().data.squeeze(0)
        self.actor.train()
        actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def target_network_update(self, target_network, network, tau):
        for network_param, target_param in zip(network.parameters(),
                                               target_network.parameters()):
            target_param.data.copy_(tau * network_param.data +
                                    (1.0 - tau) * target_param.data)
Example #3
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = [
            OUNoise(action_size, random_seed, sigma=0.1)
            for i in range(self.num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Make sure target is with the same weight as the source
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done,
                        self.num_agents)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(UPDATES_PER_STEP):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            for i in range(self.num_agents):
                # add each agent's own noise sample to its action row in place
                action[i] += self.noise[i].sample()

        return np.clip(action, -1, 1)

    def reset(self):
        for i in range(self.num_agents):
            self.noise[i].reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
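
ReplayBuffer is referenced throughout these examples but never defined. A minimal uniform-sampling buffer matching the (action_size, buffer_size, batch_size, seed) constructor might look like the sketch below; note that some examples also pass a device argument, and Example #3 additionally passes num_agents to add(), both of which this version omits.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a single transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random minibatch as float tensors (dones converted to 0/1)."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)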
Example #4
class DyNODESacAgent(object):
    """DyNODE-SAC."""
    def __init__(self,
                 obs_shape,
                 action_shape,
                 device,
                 model_kind,
                 kind='D',
                 step_MVE=5,
                 hidden_dim=256,
                 discount=0.99,
                 init_temperature=0.01,
                 alpha_lr=1e-3,
                 alpha_beta=0.9,
                 actor_lr=1e-3,
                 actor_beta=0.9,
                 actor_log_std_min=-10,
                 actor_log_std_max=2,
                 critic_lr=1e-3,
                 critic_beta=0.9,
                 critic_tau=0.005,
                 critic_target_update_freq=2,
                 model_lr=1e-3,
                 log_interval=100):

        self.device = device
        self.discount = discount
        self.critic_tau = critic_tau
        self.critic_target_update_freq = critic_target_update_freq
        self.log_interval = log_interval
        self.step_MVE = step_MVE
        self.model_kind = model_kind

        self.actor = Actor(obs_shape, action_shape, hidden_dim,
                           actor_log_std_min, actor_log_std_max).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr,
                                                betas=(actor_beta, 0.999))

        self.critic = Critic(obs_shape, action_shape, hidden_dim).to(device)
        self.critic_target = Critic(obs_shape, action_shape,
                                    hidden_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr,
                                                 betas=(critic_beta, 0.999))

        self.log_alpha = torch.tensor(np.log(init_temperature)).to(device)
        self.log_alpha.requires_grad = True
        self.target_entropy = -np.prod(
            action_shape)  # set target entropy to -|A|
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                    lr=alpha_lr,
                                                    betas=(alpha_beta, 0.999))

        if self.model_kind == 'dynode_model':
            self.model = DyNODE(obs_shape,
                                action_shape,
                                hidden_dim_p=200,
                                hidden_dim_r=200).to(device)
        elif self.model_kind == 'nn_model':
            self.model = NN_Model(obs_shape,
                                  action_shape,
                                  hidden_dim_p=200,
                                  hidden_dim_r=200,
                                  kind=kind).to(device)
        else:
            raise ValueError('model is not supported')

        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=model_lr)

        self.train()
        self.critic_target.train()

    def train(self, training=True):
        self.training = training
        self.actor.train(training)
        self.critic.train(training)
        self.model.train(training)

    @property
    def alpha(self):
        return self.log_alpha.exp()

    def select_action(self, obs):
        with torch.no_grad():
            obs = torch.FloatTensor(obs).to(self.device)
            obs = obs.unsqueeze(0)
            mu, _, _, _ = self.actor(obs,
                                     compute_pi=False,
                                     compute_log_pi=False)
            return mu.cpu().data.numpy().flatten()

    def sample_action(self, obs):
        with torch.no_grad():
            obs = torch.FloatTensor(obs).to(self.device)
            obs = obs.unsqueeze(0)
            mu, pi, _, _ = self.actor(obs, compute_log_pi=False)
            return pi.cpu().data.numpy().flatten()

    def update_model(self, replay_buffer, L, step):

        if self.model_kind == 'dynode_model':
            obs_m, action_m, reward_m, next_obs_m, _ = replay_buffer.sample_dynode(
            )
            transition_loss, reward_loss = self.model.loss(
                obs_m, action_m, reward_m, next_obs_m)
            model_loss = transition_loss + reward_loss
        elif self.model_kind == 'nn_model':
            obs, action, reward, next_obs, _ = replay_buffer.sample()
            transition_loss, reward_loss = self.model.loss(
                obs, action, reward, next_obs)
            model_loss = transition_loss + reward_loss
        else:
            raise ValueError('model is not supported')

        # Optimize the Model
        self.model_optimizer.zero_grad()
        model_loss.backward()
        self.model_optimizer.step()

        if step % self.log_interval == 0:
            L.log('train/model_loss', model_loss, step)

    def MVE_prediction(self, replay_buffer, L, step):

        obs, action, reward, next_obs, not_done = replay_buffer.sample()

        trajectory = []
        next_ob = next_obs
        with torch.no_grad():
            while len(trajectory) < self.step_MVE:
                ob = next_ob
                _, act, _, _ = self.actor(ob)
                rew, next_ob = self.model(ob, act)
                trajectory.append([ob, act, rew, next_ob])

            _, next_action, log_pi, _ = self.actor(next_ob)
            target_Q1, target_Q2 = self.critic_target(next_ob, next_action)
            ret = torch.min(target_Q1,
                            target_Q2) - self.alpha.detach() * log_pi

        critic_loss = 0
        for ob, act, rew, _ in reversed(trajectory):
            current_Q1, current_Q2 = self.critic(ob, act)
            ret = rew + self.discount * ret
            # critic_loss = critic_loss + utils.huber(current_Q1 - ret).mean() + utils.huber(current_Q2 - ret).mean()
            critic_loss = critic_loss + F.mse_loss(
                current_Q1, ret) + F.mse_loss(current_Q2, ret)
        current_Q1, current_Q2 = self.critic(obs, action)
        ret = reward + self.discount * ret
        # critic_loss = critic_loss + utils.huber(current_Q1 - ret).mean() + utils.huber(current_Q2 - ret).mean()
        critic_loss = critic_loss + F.mse_loss(current_Q1, ret) + F.mse_loss(
            current_Q2, ret)
        critic_loss = critic_loss / (self.step_MVE + 1)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # actor
        _, pi, log_pi, log_std = self.actor(obs)
        actor_Q1, actor_Q2 = self.critic(obs.detach(), pi)
        actor_Q = torch.min(actor_Q1, actor_Q2)
        actor_loss = (self.alpha.detach() * log_pi - actor_Q).mean()

        # optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.log_alpha_optimizer.zero_grad()
        alpha_loss = (self.alpha *
                      (-log_pi - self.target_entropy).detach()).mean()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

    def update_critic(self, obs, action, reward, next_obs, not_done, L, step):
        with torch.no_grad():
            _, policy_action, log_pi, _ = self.actor(next_obs)
            target_Q1, target_Q2 = self.critic_target(next_obs, policy_action)
            target_V = torch.min(target_Q1,
                                 target_Q2) - self.alpha.detach() * log_pi
            target_Q = reward + (not_done * self.discount * target_V)

        # get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)
        if step % self.log_interval == 0:
            L.log('train_critic/loss', critic_loss, step)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.critic.log(L, step)

    def update_actor_and_alpha(self, obs, L, step):
        _, pi, log_pi, log_std = self.actor(obs)
        actor_Q1, actor_Q2 = self.critic(obs, pi)

        actor_Q = torch.min(actor_Q1, actor_Q2)
        actor_loss = (self.alpha.detach() * log_pi - actor_Q).mean()

        if step % self.log_interval == 0:
            L.log('train_actor/loss', actor_loss, step)
            L.log('train_actor/target_entropy', self.target_entropy, step)
        entropy = 0.5 * log_std.shape[1] * (
            1.0 + np.log(2 * np.pi)) + log_std.sum(dim=-1)
        if step % self.log_interval == 0:
            L.log('train_actor/entropy', entropy.mean(), step)

        # optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.actor.log(L, step)

        self.log_alpha_optimizer.zero_grad()
        alpha_loss = (self.alpha *
                      (-log_pi - self.target_entropy).detach()).mean()
        if step % self.log_interval == 0:
            L.log('train_alpha/loss', alpha_loss, step)
            L.log('train_alpha/value', self.alpha, step)
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

    def update(self, replay_buffer, L, step):

        if step < 2000:
            for _ in range(2):
                obs, action, reward, next_obs, not_done = replay_buffer.sample(
                )
                self.update_critic(obs, action, reward, next_obs, not_done, L,
                                   step)
                self.update_actor_and_alpha(obs, L, step)

            if step % self.log_interval == 0:
                L.log('train/batch_reward', reward.mean(), step)

        else:
            obs, action, reward, next_obs, not_done = replay_buffer.sample()

            if step % self.log_interval == 0:
                L.log('train/batch_reward', reward.mean(), step)

            self.MVE_prediction(replay_buffer, L, step)
            self.update_critic(obs, action, reward, next_obs, not_done, L,
                               step)
            self.update_actor_and_alpha(obs, L, step)

        if step % self.critic_target_update_freq == 0:
            utils.soft_update_params(self.critic.Q1, self.critic_target.Q1,
                                     self.critic_tau)
            utils.soft_update_params(self.critic.Q2, self.critic_target.Q2,
                                     self.critic_tau)

    def save(self, model_dir, step):
        torch.save(self.actor.state_dict(),
                   '%s/actor_%s.pt' % (model_dir, step))
        torch.save(self.critic.state_dict(),
                   '%s/critic_%s.pt' % (model_dir, step))

    def save_model(self, model_dir, step):
        torch.save(self.model.state_dict(),
                   '%s/model_%s.pt' % (model_dir, step))

    def load(self, model_dir, step):
        self.actor.load_state_dict(
            torch.load('%s/actor_%s.pt' % (model_dir, step)))
        self.critic.load_state_dict(
            torch.load('%s/critic_%s.pt' % (model_dir, step)))
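
Stripped of the networks, the H-step target construction in MVE_prediction() above reduces to a simple backward recursion: bootstrap with min(Q1, Q2) - alpha * log_pi at the end of the imagined rollout, then fold in the model-predicted rewards and finally the real reward. A toy illustration with made-up numbers, purely for intuition and not part of the agent:

discount = 0.99
bootstrap = 10.0                    # min(Q1, Q2) - alpha * log_pi at the rollout's end
model_rewards = [1.0, 0.5, 0.2]     # rewards predicted along the imagined rollout
real_reward = 0.8                   # reward from the sampled real transition

ret = bootstrap
targets = []
for rew in reversed(model_rewards): # walk the imagined trajectory backwards
    ret = rew + discount * ret
    targets.append(ret)             # critic target at that imagined state
ret = real_reward + discount * ret  # target for the real (obs, action) pair
targets.append(ret)
print(targets)                      # each (Q1, Q2) pair is regressed toward its target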
Example #5
class DDPG:
    def __init__(self,
                 state_size,
                 action_size,
                 memory_size=int(1e5), # replay buffer size
                 batch_size=128,       # minibatch size
                 gamma=0.99,           # discount factor
                 tau=1e-3,           # for soft update of target parameters
                 update_every=10,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 random_seed=2):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.params = {"lr_actor": lr_actor,
                       "lr_critic": lr_critic,
                       "gamma": gamma,
                       "tau": tau,
                       "memory_size": memory_size,
                       "batch_size": batch_size,
                       "optimizer": "adam"}

        self.actor_local = Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        self.critic_local = Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        self.memory = ReplayBuffer(action_size, memory_size, batch_size, random_seed)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        self.learn_steps = 0
        self.update_every = update_every

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        # for single agent only
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor_local.eval()  # must set to eval mode, since BatchNorm used
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action.squeeze(), -1, 1)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.params["batch_size"]:
            experiences = self.memory.sample()
            self.learn(experiences, self.params["gamma"])

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ------------------------------------------
        # update critic
        # ------------------------------------------
        # recall DQN
        # Q[s][a] = Q[s][a] + alpha * (r + gamma * np.max(Q[s_next]) - Q[s][a])
        # thus, here
        # Q_local = Q[s][a]
        #         = critic_local(s, a)
        # Q_target = r + gamma * np.max(Q[s_next])
        #          = r + gamma * (critic_target[s_next, actor_target(s_next)])
        #
        # calculate np.max(Q[s_next]) with critic_target[s_next, actor_target(s_next)]
        # because actor suppose to output action which max Q(s)
        #
        # loss = mse(Q_local - Q_target)
        best_actions = self.actor_target(next_states)  # actions from the target actor, treated as the best actions
        Q_next_max = self.critic_target(next_states, best_actions)
        Q_target = rewards + gamma * Q_next_max * (1 - dones)
        # Q_target_detached = Q_target.detach()

        Q_local = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_local, Q_target)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ------------------------------------------
        # update actor
        # ------------------------------------------
        # suppose critic(s,a) give us q_max as a baseline or guidance
        # we want actor(s) to output the right a
        # which let critic(s,a)->q_max happen
        # so we want find a_actor to max Q_critic(s, a)
        # a_actor is function of θ
        # so the gradient is dQ/da*da/dθ
        actions_pred = self.actor_local(states)
        Q_baseline = self.critic_local(states, actions_pred)
        actor_loss = -Q_baseline.mean()  # negate and average over the minibatch to get a scalar loss

        # note, gradients from both actor_local and critic_local will be calculated
        # however we only update actor_local
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        if self.learn_steps % self.update_every == 0:
            self.soft_update(self.critic_local, self.critic_target, self.params["tau"])
            self.soft_update(self.actor_local, self.actor_target, self.params["tau"])

        self.learn_steps += 1

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
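
For context, a minimal training loop driving the DDPG class above might look like the following sketch. It assumes the classic Gym API (env.reset() returns an observation, env.step() returns a 4-tuple) and a continuous-control task; the environment name and episode limits are illustrative only.

import gym
import numpy as np

env = gym.make("Pendulum-v1")
agent = DDPG(state_size=env.observation_space.shape[0],
             action_size=env.action_space.shape[0])

for episode in range(200):
    state = env.reset()
    agent.reset()                     # reset the OU noise process
    episode_return = 0.0
    for t in range(300):
        action = agent.act(state)     # noisy action from the local actor
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store and learn
        state = next_state
        episode_return += reward
        if done:
            break
    print("episode {}: return {:.1f}".format(episode, episode_return))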
Example #6
class Agent():
    """ Interacts with and learns from the environment. """
    def __init__(self, state_size, action_size, fc1_units, fc2_units):
        """Initialize an Agent object.

        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = torch.manual_seed(SEED)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, fc1_units,
                                 fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, fc1_units,
                                  fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, fc1_units,
                                   fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, fc1_units,
                                    fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OrnsteinUhlenbeck(action_size, SEED)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, SEED,
                                   device)

    def step(self, time_step, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn only every N_TIME_STEPS
        if time_step % N_TIME_STEPS != 0:
            return

        # Learn if enough samples are available in replay buffer
        if len(self.memory) > BATCH_SIZE:
            for i in range(N_LEARN_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """ Returns actions for given state as per current policy. """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets from current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
        local_model: PyTorch model (weights will be copied from)
        target_model: PyTorch model (weights will be copied to)
        tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def store(self):
        torch.save(self.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(self.critic_local.state_dict(), 'checkpoint_critic.pth')

    def load(self):
        if os.path.isfile('checkpoint_actor.pth') and os.path.isfile(
                'checkpoint_critic.pth'):
            print("=> loading checkpoints for Actor and Critic... ")
            self.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
            self.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))
            print("done!")
        else:
            print("no checkpoints found for Actor and Critic...")
Example #7
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, num_agents, state_size, action_size, random_seed=2018):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.device = torch.device('cuda' if cuda else 'cpu')

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)


        # Learning is deferred to sampleandlearn(), which the caller invokes explicitly:
        #     if len(self.memory) > BATCH_SIZE:
        #         experiences = self.memory.sample()
        #         self.learn(experiences, GAMMA)

    def sampleandlearn(self):
        ''' Learn from stored experiences '''
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        # Deactivate gradients and perform forward pass
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            for a in range(self.num_agents):
                action[a] += self.noise.sample()
        # Clip action
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #

        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #8
class Agent():
    ''' Interacts with and learns from the environment '''
    def __init__(self, num_agents, state_size, action_size, random_seed=2018):
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.device = torch.device('cuda' if cuda else 'cpu')

        self.update = UPDATE_EVERY
        self.updates = NUMBER_OF_UPDATES

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)

    def step(self, state, action, reward, next_state, done, timestep):
        ''' Save experience in replay memory, and use random sample from buffer to learn '''

        # Save experience into memory __for each agent__
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # If we are in the timestep to update
        if timestep % self.update == 0:

            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:

                # Do learning "updates" times
                for _ in range(self.updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        ''' Returns actions for given state as per current policy '''
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))

        # Deactivate gradients and perform forward pass
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()

        if add_noise:
            for a in range(self.num_agents):
                actions[a, :] += self.noise.sample()
        # Clip action
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        ''' 
        Update policy and value parameters using given batch of experience tuples.
        
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
        '''

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        # Get predicted next-state actions and Q values from target models  # Dimensions
        actions_next = self.actor_target(next_states)  # (BSx2)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #

        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(
            states, actions_pred).mean()  # Average over the minibatch

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        ''' Soft update model parameters '''
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #9
class DDPG():
    def __init__(self,
                 env,
                 log_dir,
                 gamma=0.99,
                 batch_size=64,
                 sigma=0.2,
                 batch_norm=True,
                 merge_layer=2,
                 buffer_size=int(1e6),
                 buffer_min=int(1e4),
                 tau=1e-3,
                 Q_wd=1e-2,
                 num_episodes=1000):

        self.s_dim = env.reset().shape[0]
        # self.a_dim = env.action_space.shape[0]
        self.a_dim = env.action_space2.shape[0]
        # self.a_dim = 1

        self.env = env
        # self.mu = Actor(self.s_dim, self.a_dim, env.action_space, batch_norm=batch_norm)
        self.mu = Actor(self.s_dim,
                        self.a_dim,
                        env.action_space2,
                        batch_norm=batch_norm)
        self.Q = Critic(self.s_dim,
                        self.a_dim,
                        batch_norm=batch_norm,
                        merge_layer=merge_layer)
        self.targ_mu = copy.deepcopy(self.mu).eval()
        self.targ_Q = copy.deepcopy(self.Q).eval()
        self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim),
                                       sigma=sigma * torch.ones(self.a_dim))
        self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim)
        self.buffer_min = buffer_min
        self.mse_fn = torch.nn.MSELoss()
        self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=1e-3,
                                            weight_decay=Q_wd)

        self.gamma = gamma
        self.batch_size = batch_size
        self.num_episodes = num_episodes
        self.tau = tau
        self.log_dir = log_dir

        self.fill_buffer()

    #updates the target network to slowly track the main network
    def track_network(self, target, main):
        with torch.no_grad():
            for pt, pm in zip(target.parameters(), main.parameters()):
                pt.data.copy_(self.tau * pm.data + (1 - self.tau) * pt.data)

    # updates the target nets to slowly track the main ones
    def track_networks(self):
        self.track_network(self.targ_mu, self.mu)
        self.track_network(self.targ_Q, self.Q)

    def run_episode(self):
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        t = 0
        tot_r = 0
        while not done:

            self.mu = self.mu.eval()
            # a_ = torch.squeeze(self.mu(s)).detach().numpy()
            a = torch.squeeze(self.mu(s)).detach().numpy()
            # print("a {}\n".format(a))

            self.mu = self.mu.train()

            ac_noise = self.noise().detach().numpy()
            a = a + ac_noise
            # print("ac_noise {}\n".format(ac_noise))
            # print("a+ac_noise {}\n".format(a))

            if a < self.env.action_space2.low:
                a = self.env.action_space2.low
            elif a > self.env.action_space2.high:
                a = self.env.action_space2.high

            s = s.detach().numpy()

            a_updated = self.LQR(s, a)
            # s_p, r, done, _ = self.env.step(a)
            s_p, r, done, _ = self.env.step(a_updated)

            tot_r += r
            self.buffer.add_tuple(s, a, r, s_p, done)

            s_batch, a_batch, r_batch, s_p_batch, done_batch = self.buffer.sample(
                batch_size=self.batch_size)

            # update critic
            with torch.no_grad():
                q_p_pred = self.targ_Q(s_p_batch, self.targ_mu(s_p_batch))
                q_p_pred = torch.squeeze(q_p_pred)
                y = r_batch + (1.0 - done_batch) * self.gamma * q_p_pred
            self.Q_optimizer.zero_grad()
            q_pred = self.Q(s_batch, a_batch)
            q_pred = torch.squeeze(q_pred)
            #print(torch.mean(q_pred))
            Q_loss = self.mse_fn(q_pred, y)
            Q_loss.backward(retain_graph=False)
            self.Q_optimizer.step()

            # update actor
            self.mu_optimizer.zero_grad()
            q_pred_mu = self.Q(s_batch, self.mu(s_batch))
            q_pred_mu = torch.squeeze(q_pred_mu)
            #print(torch.mean(q_pred_mu))
            mu_loss = -torch.mean(q_pred_mu)
            # print(mu_loss)
            mu_loss.backward(retain_graph=False)
            #print(torch.sum(self.mu.layers[0].weight.grad))
            self.mu_optimizer.step()
            self.track_networks()

            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
            t += 1
        return tot_r, t

    def train(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 10 == 0:
                torch.save(self.mu, self.log_dir + '/models/model_' + str(i))
        np.save(self.log_dir + '/results_train.npy', np.array(results))

    def train1(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 10 == 0:
                torch.save(self.mu, self.log_dir + '/models1/model_' + str(i))
        np.save(self.log_dir + '/results_train1.npy', np.array(results))

    def train2(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 10 == 0:
                torch.save(self.mu, self.log_dir + '/models2/model_' + str(i))
        np.save(self.log_dir + '/results_train2.npy', np.array(results))

    def train3(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 10 == 0:
                torch.save(self.mu, self.log_dir + '/models3/model_' + str(i))
        np.save(self.log_dir + '/results_train3.npy', np.array(results))

    def eval_all(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval.npy', np.array(results))

    def eval_all1(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval1.npy', np.array(results))

    def eval_all2(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval2.npy', np.array(results))

    def eval_all3(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval3.npy', np.array(results))

    def eval(self, num_eps=10, mu=None):
        if mu is None:
            mu = self.mu

        results = []
        mu = mu.eval()
        for i in range(num_eps):
            r, t = self.run_eval_episode(mu=mu)
            results.append([r, t])
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
        return np.mean(results, axis=0)

    def run_eval_episode(self, mu=None):
        if mu is None:
            mu = self.mu
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        tot_r = t = 0
        while not done:
            a = mu(s).view(-1).detach().numpy()

            a_updated = self.LQR(s, a)
            # s_p, r, done, _ = self.env.step(a)
            s_p, r, done, _ = self.env.step(a_updated)

            tot_r += r
            t += 1
            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
        return tot_r, t

    def LQR(self, s, a):

        FPS = 50
        SCALE = 30.0  # affects how fast-paced the game is, forces should be adjusted as well
        VIEWPORT_W = 600
        VIEWPORT_H = 400

        gravity = 9.8 / FPS / FPS  # gravity is enhanced by scaling
        thrust_main_max = gravity / 0.56
        thrust_side_max = thrust_main_max * 0.095 / 0.7  # m/frame^2 # determined by test
        m_main_inv = thrust_main_max  # gravity*0.57
        m_side_inv = thrust_side_max  # gravity*0.225
        a_i_inv = 0.198 / 100  # rad/frame^2 # determined by test # not depend on SCALE
        align = 0.87  # 0.87 = sin30

        # target point set
        x_target = 0
        y_target = 0  # the landing point is 0
        Vx_target = 0
        Vy_target = 0
        theta_target = 0
        omega_target = 0

        if a < self.env.action_space2.low:
            a = self.env.action_space2.low
        elif a > self.env.action_space2.high:
            a = self.env.action_space2.high

        a_float = float(a)
        y_target = s[1] * (VIEWPORT_H / SCALE /
                           2) / a_float  # 1.6 succeeds all the times

        X = np.array([ \
        [s[0]*(VIEWPORT_W/SCALE/2)-x_target], \
        [s[1]*(VIEWPORT_H/SCALE/2)-y_target], \
        [s[2]/(VIEWPORT_W/SCALE/2)-Vx_target], \
        [s[3]/(VIEWPORT_H/SCALE/2)-Vy_target], \
        [s[4]-theta_target], \
        [s[5]/20.0-omega_target]])

        # print("X {}\n".format(X))

        A = np.array([ \
        [0, 0, 1, 0, 0, 0], \
        [0, 0, 0, 1, 0, 0], \
        [0, 0, 0, 0, -1*gravity, 0], \
        [0, 0, 0, 0, 0, 0], \
        [0, 0, 0, 0, 0, 1], \
        [0, 0, 0, 0, 0, 0]])

        B = np.array([ \
        [0, 0], \
        [0, 0], \
        [0, m_side_inv*align], \
        [1*m_main_inv, 0], \
        [0, 0], \
        [0, -1*a_i_inv]])

        sigma = np.array([ \
        [0], \
        [0], \
        [0], \
        [-1*gravity], \
        [0], \
        [0]])

        # gravity compensation
        BTB = np.dot(B.T, B)
        u_sigma = -1 * np.linalg.inv(BTB).dot(B.T).dot(sigma)
        # print("u_sigma {}\n".format(u_sigma))

        # Design of LQR
        # Solve Riccati equation to find a optimal control input
        R = np.array([ \
        [1, 0], \
        [0, 1]])

        Q = np.array([ \
        [1, 0, 0, 0, 0, 0], \
        [0, 1, 0, 0, 0, 0], \
        [0, 0, 1, 0, 0, 0], \
        [0, 0, 0, 1, 0, 0], \
        [0, 0, 0, 0, 100, 0], \
        [0, 0, 0, 0, 0, 100]])

        # Solving Riccati equation
        P = sp.linalg.solve_continuous_are(A, B, Q, R)
        # print("P {}\n".format(P))

        # u = -K X, with K = R^-1 * B^T * P
        K = np.linalg.inv(R).dot(B.T).dot(P)
        thrust = -1 * np.dot(K, X) + u_sigma

        BK = np.dot(B, K)
        A_ = A - BK
        a_eig = np.linalg.eig(A_)
        a_sort = np.sort(a_eig[0])
        # print("eigen values {}\n".format(a_sort))

        # print("thrust {}\n".format(thrust))
        # thrust[0] = 0
        # thrust[1] = 1

        if s[1] < 0.3 / SCALE:
            thrust[0] = 0
            thrust[1] = 0

        # conversion to compensate main thruster's tricky thrusting
        thrust[0] = thrust[0] / 0.5 - 1.0

        if self.env.continuous:
            a_updated = np.array([thrust[0], thrust[1]])
            # print("a_updated {}\n".format(a_updated))
            # a = (0.5, 0)
            a_updated = np.clip(
                a_updated, -1,
                +1)  #  if the value is less than 0.5, it's ignored
            # print("a_updated * {}\n".format(a_updated))
        else:
            print("please change to cts mode")

        return a_updated

    def fill_buffer(self):
        print('Filling buffer')
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)

        temp_number = 0

        while self.buffer.size < self.buffer_min:

            # self.action_space = spaces.Box(-1, +1, (2,), dtype=np.float32)
            a = np.random.uniform(self.env.action_space2.low,
                                  self.env.action_space2.high,
                                  size=(self.a_dim))
            a_updated = self.LQR(s, a)

            if temp_number < 3:
                print("a {}\n".format(a), "actions:",
                      "{} {}".format(a_updated[0], a_updated[1]))
                # print("a_updated*** {}\n".format(a_updated))
                temp_number += 1

            # s_p, r, done, _ = self.env.step(a)
            s_p, r, done, _ = self.env.step(a_updated)

            self.buffer.add_tuple(s, a, r, s_p, done)

            # restart the episode when it terminates so a stale terminal state
            # is not reused as the next starting state
            s = self.env.reset() if done else s_p
Example #10
0
class Agent():
    def __init__(self, state_size, action_size):
        super().__init__()
        gpu = torch.cuda.is_available()
        if (gpu):
            print('GPU/CUDA works! Happy fast training :)')
            torch.cuda.current_device()
            torch.cuda.empty_cache()
            self.device = torch.device("cuda")
        else:
            print('training on cpu...')
            self.device = torch.device("cpu")

        self.actor = Actor(state_size, action_size).to(self.device)
        self.actor_target = Actor(state_size, action_size).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)
        self.critic = Critic(state_size, action_size).to(self.device)
        self.critic_target = Critic(state_size, action_size).to(self.device)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=0.001,
                                       weight_decay=0)
        self.replay_buffer = deque(maxlen=1000000)  #1m
        self.gamma = 0.95  #0.99
        self.batch_size = 128
        self.tau = 0.001
        self.seed = random.seed(2)
        self.noise = OUNoise((20, action_size), 2)
        self.target_network_update(self.actor_target, self.actor, 1.0)
        self.target_network_update(self.critic_target, self.critic, 1.0)

    def select_actions(self, state):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(state).cpu().data.numpy()
        self.actor.train()
        actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def add(self, sars):
        self.replay_buffer.append(sars)

    def train(self):
        if (len(self.replay_buffer) > self.batch_size):
            states, actions, rewards, next_states, dones = self.sample()
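            # bootstrapped TD target: r + gamma * Q_target(s', mu_target(s')),
            # with the bootstrap term zeroed at terminal states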
            next_actions = self.actor_target(next_states)
            next_state_q_v = self.critic_target(next_states, next_actions)
            #print(next_state_q_v)
            q_targets = rewards + (self.gamma * next_state_q_v * (1 - dones))
            current_q_v = self.critic(states, actions)
            critic_loss = F.mse_loss(current_q_v, q_targets)
            self.critic_optim.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
            self.critic_optim.step()

            actions = self.actor(states)
            actor_loss = -self.critic(states, actions).mean()
            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()
            self.target_network_update(self.actor_target, self.actor, self.tau)
            self.target_network_update(self.critic_target, self.critic,
                                       self.tau)

    def target_network_update(self, target_network, network, tau):
        for network_param, target_param in zip(network.parameters(),
                                               target_network.parameters()):
            target_param.data.copy_(tau * network_param.data +
                                    (1.0 - tau) * target_param.data)

    def sample(self):
        samples = random.sample(self.replay_buffer, k=self.batch_size)
        states = torch.tensor([s[0] for s in samples]).float().to(self.device)
        actions = torch.tensor([s[1] for s in samples]).float().to(self.device)
        rewards = torch.tensor([s[2] for s in samples
                                ]).float().unsqueeze(1).to(self.device)
        next_states = torch.tensor([s[3]
                                    for s in samples]).float().to(self.device)
        dones = torch.tensor([s[4] for s in samples
                              ]).float().unsqueeze(1).to(self.device)
        return states, actions, rewards, next_states, dones
class DDPG():
    """ This is an Individual DDPG Agent """

    def __init__(self, state_size, action_size, seed):
        """ Initialize a DDPG Agent Object
        :param state_size: dimension of state (input) for this decentralized actor
        :param action_size: dimension of action (output) for this decentralized actor
        :param seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Hyperparameters
        self.buffer_size = 100000
        self.batch_size = 256
        self.gamma = 0.99
        self.tau = 0.01
        self.lr_actor = 0.0001
        self.lr_critic = 0.001

        # Setup Networks (Actor: State -> Action, Critic: (States for all agents, Actions for all agents) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,  self.seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.lr_critic)

        # Initialize local and target networks to start with the same parameters
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, self.seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def __str__(self):
        return "DDPG_Agent"

    def reset_noise(self):
        """ resets to noise parameters """
        self.noise.reset()

    def act(self, state, epsilon, add_noise=True):
        """ Returns actions for given states as per current policy. Policy comes from the actor network.
        :param state: observations for this individual agent
        :param epsilon: probability of exploration
        :param add_noise: bool on whether or not to potentially have exploration for action
        :return: clipped actions
        """
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise and epsilon > np.random.random():
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def step(self):
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def learn(self, experiences):
        """ Update actor and critic networks using a given batch of experiences
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> actions
            critic_target(states, actions) -> Q-value
        :param experiences: tuple of arrays (states, actions, rewards, next_states, dones)  sampled from the replay buffer
        """

        states, actions, rewards, next_states, dones = experiences
        # -------------------- Update Critic -------------------- #
        # Use target networks for getting next actions and q values and calculate q_targets
        next_actions = self.actor_target(next_states)
        next_q_targets = self.critic_target(next_states, next_actions)
        q_targets = rewards + (self.gamma * next_q_targets * (1 - dones))
        # Compute critic loss (Same as DQN Loss)
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # -------------------- Update Actor --------------------- #
        # Compute actor loss (maximize mean of Q(states,actions))
        action_preds = self.actor_local(states)
        # Optimizer minimizes and we want to maximize so multiply by -1
        actor_loss = -1 * self.critic_local(states, action_preds).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------- Update Target Networks ---------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_network, target_network, tau):
        """ soft update newtwork parametes
        θ_target = τ*θ_local + (1 - τ)*θ_target
        :param local_network: PyTorch Network that is always up to date
        :param target_network: PyTorch Network that is not up to date
        :param tau: update (interpolation) parameter
        """
        for target_param, local_param in zip(target_network.parameters(), local_network.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #12
0
class A2C():
    def __init__(self, state_dim, action_dim, action_lim, update_type='soft',
                lr_actor=1e-4, lr_critic=1e-3, tau=1e-3,
                mem_size=1e6, batch_size=256, gamma=0.99,
                other_cars=False, ego_dim=None):
        self.device = torch.device("cuda:0" if torch.cuda.is_available()
                                        else "cpu")

        self.joint_model = False
        if len(state_dim) == 3:
            self.model = ActorCriticCNN(state_dim, action_dim, action_lim)
            self.model_optim = optim.Adam(self.model.parameters(), lr=lr_actor)

            self.target_model = ActorCriticCNN(state_dim, action_dim, action_lim)
            self.target_model.load_state_dict(self.model.state_dict())

            self.model.to(self.device)
            self.target_model.to(self.device)

            self.joint_model = True
        else:
            self.actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim)
            self.actor_optim = optim.Adam(self.actor.parameters(), lr=lr_actor)
            self.target_actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim)
            self.target_actor.load_state_dict(self.actor.state_dict())
            self.target_actor.eval()

            self.critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim)
            self.critic_optim = optim.Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1e-2)
            self.target_critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim)
            self.target_critic.load_state_dict(self.critic.state_dict())
            self.target_critic.eval()

            self.actor.to(self.device)
            self.target_actor.to(self.device)
            self.critic.to(self.device)
            self.target_critic.to(self.device)

        self.action_lim = action_lim
        self.tau = tau # hard update if tau is None
        self.update_type = update_type
        self.batch_size = batch_size
        self.gamma = gamma

        if self.joint_model:
            mem_size = mem_size//100
        self.memory = Memory(int(mem_size), action_dim, state_dim)

        mu = np.zeros(action_dim)
        sigma = np.array([0.5, 0.05])
        self.noise = OrnsteinUhlenbeckActionNoise(mu, sigma)
        self.target_noise = OrnsteinUhlenbeckActionNoise(mu, sigma)

        self.initialised = True
        self.training = False

    def select_action(self, obs):
        with torch.no_grad():
            obs = torch.FloatTensor(np.expand_dims(obs, axis=0)).to(self.device)
            if self.joint_model:
                action, _ = self.model(obs)
                action = action.data.cpu().numpy().flatten()
            else:
                action = self.actor(obs).data.cpu().numpy().flatten()

        if self.training:
            action += self.noise()
            return action
        else:
            return action

    def append(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)

    def reset_noise(self):
        self.noise.reset()
        self.target_noise.reset()

    def train(self):
        if self.joint_model:
            self.model.train()
            self.target_model.train()
        else:
            self.actor.train()
            self.target_actor.train()
            self.critic.train()
            self.target_critic.train()

        self.training = True

    def eval(self):
        if self.joint_model:
            self.model.eval()
            self.target_model.eval()
        else:
            self.actor.eval()
            self.target_actor.eval()
            self.critic.eval()
            self.target_critic.eval()

        self.training = False

    def save(self, folder, episode, previous=None, solved=False):
        filename = lambda type, ep : folder + '%s' % type + \
                                    (not solved) * ('_ep%d' % (ep)) + \
                                    (solved * '_solved') + '.pth'

        if self.joint_model:
            torch.save(self.model.state_dict(), filename('model', episode))
            torch.save(self.target_model.state_dict(), filename('target_model', episode))
        else:
            torch.save(self.actor.state_dict(), filename('actor', episode))
            torch.save(self.target_actor.state_dict(), filename('target_actor', episode))

            torch.save(self.critic.state_dict(), filename('critic', episode))
            torch.save(self.target_critic.state_dict(), filename('target_critic', episode))

        if previous is not None and previous > 0:
            if self.joint_model:
                os.remove(filename('model', previous))
                os.remove(filename('target_model', previous))
            else:
                os.remove(filename('actor', previous))
                os.remove(filename('target_actor', previous))
                os.remove(filename('critic', previous))
                os.remove(filename('target_critic', previous))

    def load_actor(self, actor_filepath):
        qualifier = '_' + actor_filepath.split("_")[-1]
        folder = actor_filepath[:actor_filepath.rfind("/")+1]
        filename = lambda type : folder + '%s' % type + qualifier

        if self.joint_model:
            self.model.load_state_dict(torch.load(filename('model'),
                                                    map_location=self.device))
            self.target_model.load_state_dict(torch.load(filename('target_model'),
                                                    map_location=self.device))
        else:
            self.actor.load_state_dict(torch.load(filename('actor'),
                                                    map_location=self.device))
            self.target_actor.load_state_dict(torch.load(filename('target_actor'),
                                                    map_location=self.device))

    def load_all(self, actor_filepath):
        self.load_actor(actor_filepath)
        qualifier = '_' + actor_filepath.split("_")[-1]
        folder = actor_filepath[:actor_filepath.rfind("/")+1]
        filename = lambda type : folder + '%s' % type + qualifier

        if not self.joint_model:
            self.critic.load_state_dict(torch.load(filename('critic'),
                                                    map_location=self.device))
            self.target_critic.load_state_dict(torch.load(filename('target_critic'),
                                                    map_location=self.device))

    def update(self, target_noise=True):
        try:
            minibatch = self.memory.sample(self.batch_size) # dict of ndarrays
        except ValueError as e:
            print('Replay memory not big enough. Continue.')
            return None, None

        # torch.autograd.Variable is deprecated; plain tensors carry gradients
        states = torch.FloatTensor(minibatch['obs0']).to(self.device)
        actions = torch.FloatTensor(minibatch['actions']).to(self.device)
        rewards = torch.FloatTensor(minibatch['rewards']).to(self.device)
        next_states = torch.FloatTensor(minibatch['obs1']).to(self.device)
        terminals = torch.FloatTensor(minibatch['terminals1']).to(self.device)

        if self.joint_model:
            target_actions, _ = self.target_model(next_states)
            if target_noise:
                for sample in range(target_actions.shape[0]):
                    target_actions[sample] += self.target_noise()
                    # clamp in place; clamp() alone would discard the result
                    target_actions[sample].clamp_(-self.action_lim, self.action_lim)
            _, target_qvals = self.target_model(next_states, target_actions=target_actions)
            y = rewards + self.gamma * (1 - terminals) * target_qvals

            _, model_qvals = self.model(states, target_actions=actions)
            value_loss = F.mse_loss(y, model_qvals)
            model_actions, _ = self.model(states)
            _, model_qvals = self.model(states, target_actions=model_actions)
            action_loss = -model_qvals.mean()

            self.model_optim.zero_grad()
            (value_loss + action_loss).backward()
            self.model_optim.step()
        else:
            target_actions = self.target_actor(next_states)
            if target_noise:
                for sample in range(target_actions.shape[0]):
                    target_actions[sample] += self.target_noise()
                    # clamp in place; clamp() alone would discard the result
                    target_actions[sample].clamp_(-self.action_lim, self.action_lim)
            target_critic_qvals = self.target_critic(next_states, target_actions)
            y = rewards + self.gamma * (1 - terminals) * target_critic_qvals

            # optimise critic
            critic_qvals = self.critic(states, actions)
            value_loss = F.mse_loss(y, critic_qvals)
            self.critic_optim.zero_grad()
            value_loss.backward()
            self.critic_optim.step()

            # optimise actor
            action_loss = -self.critic(states, self.actor(states)).mean()
            self.actor_optim.zero_grad()
            action_loss.backward()
            self.actor_optim.step()

        # optimise target networks
        if self.update_type == 'soft':
            if self.joint_model:
                soft_update(self.target_model, self.model, self.tau)
            else:
                soft_update(self.target_actor, self.actor, self.tau)
                soft_update(self.target_critic, self.critic, self.tau)
        else:
            if self.joint_model:
                hard_update(self.target_model, self.model)
            else:
                hard_update(self.target_actor, self.actor)
                hard_update(self.target_critic, self.critic)

        return action_loss.item(), value_loss.item()
Example #13
0
class DDPG():
    def __init__(self,
                 env,
                 log_dir,
                 gamma=0.99,
                 batch_size=64,
                 sigma=0.2,
                 batch_norm=True,
                 merge_layer=2,
                 buffer_size=int(1e6),
                 buffer_min=int(1e4),
                 tau=1e-3,
                 Q_wd=1e-2,
                 num_episodes=1000):

        self.s_dim = env.reset().shape[0]
        self.a_dim = env.action_space.shape[0]

        self.env = env
        self.mu = Actor(self.s_dim,
                        self.a_dim,
                        env.action_space,
                        batch_norm=batch_norm)
        self.Q = Critic(self.s_dim,
                        self.a_dim,
                        batch_norm=batch_norm,
                        merge_layer=merge_layer)
        self.targ_mu = copy.deepcopy(self.mu).eval()
        self.targ_Q = copy.deepcopy(self.Q).eval()
        self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim),
                                       sigma=sigma * torch.ones(self.a_dim))
        self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim)
        self.buffer_min = buffer_min
        self.mse_fn = torch.nn.MSELoss()
        self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=1e-3,
                                            weight_decay=Q_wd)

        self.gamma = gamma
        self.batch_size = batch_size
        self.num_episodes = num_episodes
        self.tau = tau
        self.log_dir = log_dir

        self.fill_buffer()

    # updates the target network to slowly track the main network
    def track_network(self, target, main):
        with torch.no_grad():
            for pt, pm in zip(target.parameters(), main.parameters()):
                pt.data.copy_(self.tau * pm.data + (1 - self.tau) * pt.data)

    # updates the target nets to slowly track the main ones
    def track_networks(self):
        self.track_network(self.targ_mu, self.mu)
        self.track_network(self.targ_Q, self.Q)

    def run_episode(self):
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        t = 0
        tot_r = 0
        while not done:

            self.mu = self.mu.eval()
            a = torch.squeeze(self.mu(s)).detach().numpy()
            self.mu = self.mu.train()

            ac_noise = self.noise().detach().numpy()
            a = a + ac_noise

            s = s.detach().numpy()
            s_p, r, done, _ = self.env.step(a)
            tot_r += r
            self.buffer.add_tuple(s, a, r, s_p, done)

            s_batch, a_batch, r_batch, s_p_batch, done_batch = self.buffer.sample(
                batch_size=self.batch_size)

            # update critic
            with torch.no_grad():
                q_p_pred = self.targ_Q(s_p_batch, self.targ_mu(s_p_batch))
                q_p_pred = torch.squeeze(q_p_pred)
                y = r_batch + (1.0 - done_batch) * self.gamma * q_p_pred
            self.Q_optimizer.zero_grad()
            q_pred = self.Q(s_batch, a_batch)
            q_pred = torch.squeeze(q_pred)
            #print(torch.mean(q_pred))
            Q_loss = self.mse_fn(q_pred, y)
            Q_loss.backward(retain_graph=False)
            self.Q_optimizer.step()

            # update actor
            self.mu_optimizer.zero_grad()
            q_pred_mu = self.Q(s_batch, self.mu(s_batch))
            q_pred_mu = torch.squeeze(q_pred_mu)
            #print(torch.mean(q_pred_mu))
            mu_loss = -torch.mean(q_pred_mu)
            # print(mu_loss)
            mu_loss.backward(retain_graph=False)
            #print(torch.sum(self.mu.layers[0].weight.grad))
            self.mu_optimizer.step()
            self.track_networks()

            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
            t += 1
        return tot_r, t

    def train(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 20 == 0:
                torch.save(self.mu, self.log_dir + '/models/model_' + str(i))
        np.save(self.log_dir + '/results_train.npy', np.array(results))

    def train1(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 20 == 0:
                torch.save(self.mu, self.log_dir + '/models1/model_' + str(i))
        np.save(self.log_dir + '/results_train1.npy', np.array(results))

    def train2(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 20 == 0:
                torch.save(self.mu, self.log_dir + '/models2/model_' + str(i))
        np.save(self.log_dir + '/results_train2.npy', np.array(results))

    def train3(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 20 == 0:
                torch.save(self.mu, self.log_dir + '/models3/model_' + str(i))
        np.save(self.log_dir + '/results_train3.npy', np.array(results))

    def eval_all(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval.npy', np.array(results))

    def eval(self, num_eps=10, mu=None):
        if mu is None:
            mu = self.mu

        results = []
        mu = mu.eval()
        for i in range(num_eps):
            r, t = self.run_eval_episode(mu=mu)
            results.append([r, t])
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
        return np.mean(results, axis=0)

    def run_eval_episode(self, mu=None):
        if mu is None:
            mu = self.mu
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        tot_r = t = 0
        while not done:
            a = mu(s).view(-1).detach().numpy()

            s_p, r, done, _ = self.env.step(a)
            tot_r += r
            t += 1
            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
        return tot_r, t

    def fill_buffer(self):
        print('Filling buffer')
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        while self.buffer.size < self.buffer_min:
            a = np.random.uniform(self.env.action_space.low,
                                  self.env.action_space.high,
                                  size=(self.a_dim))

            s_p, r, done, _ = self.env.step(a)
            self.buffer.add_tuple(s, a, r, s_p, done)
            # restart the episode when it terminates so a stale terminal state
            # is not reused as the next starting state
            s = self.env.reset() if done else s_p
Example #14
0
score = 0
steps = 0
noise_std = args.noise_std_start

for i in range(args.episodes):
    env_info = env.reset(train_mode=True)[brain_name]
    state = torch.from_numpy(
        env_info.vector_observations).view(-1).float().to(device)
    for t in range(args.max_t):
        with torch.no_grad():
            actor.eval()
            action = torch.clamp(
                actor_target(state.unsqueeze(0)) + torch.zeros(
                    (1, action_size * 2)).normal_(0, noise_std).to(device), -1,
                1).squeeze().float()  #+ ou_process.sample()
            actor.train()
            env_info = env.step(
                torch.stack(
                    (action[:action_size],
                     action[action_size:])).to('cpu').numpy())[brain_name]
            next_state = torch.from_numpy(
                env_info.vector_observations).view(-1).float()
            reward = torch.tensor(env_info.rewards).sum().float()
            score += reward.item()
            done = torch.tensor(env_info.local_done[0]
                                or env_info.local_done[1]).float()
            replay_buffer.push(state.to('cpu'), action.to('cpu'), next_state,
                               reward, done)
        if ((steps + 1) % args.n_steps == 0
                and len(replay_buffer) >= args.batch_size):
            for iteration in range(args.iterations):
class DDPG():
    """ Deep Deterministic Policy Gradients Agent used to interaction with and learn from an environment """
    def __init__(self, state_size: int, action_size: int, num_agents: int,
                 epsilon, random_seed: int):
        """ Initialize a DDPG Agent Object

        :param state_size: dimension of state (input)
        :param action_size: dimension of action (output)
        :param num_agents: number of concurrent agents in the environment
        :param epsilon: initial value of epsilon for exploration
        :param random_seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.t_step = 0

        # Hyperparameters
        self.buffer_size = 1000000
        self.batch_size = 128
        self.update_every = 10
        self.num_updates = 10
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.0001
        self.lr_critic = 0.001
        self.weight_decay = 0
        self.epsilon = epsilon
        self.epsilon_decay = 0.97
        self.epsilon_min = 0.005

        # Networks (Actor: State -> Action, Critic: (State,Action) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)
        # Initialize target networks to start with the same parameters as the local networks
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, random_seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def __str__(self):
        return "DDPG_Agent"

    def train(self,
              env,
              brain_name,
              num_episodes=200,
              max_time=1000,
              print_every=10):
        """ Interacts with and learns from a given Unity Environment

        :param env: Unity Environment the agents is trying to learn
        :param brain_name: Brain for Environment
        :param num_episodes: Number of episodes to train
        :param max_time: How long each episode runs for
        :param print_every: How often in episodes to print a running average
        :return: Returns episodes scores and 100 episode averages as lists
        """
        # --------- Set Everything up --------#
        scores = []
        avg_scores = []
        scores_deque = deque(maxlen=print_every)

        # -------- Simulation Loop --------#
        for episode_num in range(1, num_episodes + 1):
            # Reset everything
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            episode_scores = np.zeros(self.num_agents)
            self.reset_noise()
            # Run the episode
            for t in range(max_time):
                actions = self.act(states, self.epsilon)
                env_info = env.step(actions)[brain_name]
                next_states, rewards, dones = env_info.vector_observations, env_info.rewards, env_info.local_done
                self.step(states, actions, rewards, next_states, dones)
                episode_scores += rewards
                states = next_states
                if np.any(dones):
                    break

            # -------- Episode Finished ---------#
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)
            scores.append(np.mean(episode_scores))
            scores_deque.append(np.mean(episode_scores))
            avg_scores.append(np.mean(scores_deque))
            if episode_num % print_every == 0:
                print(
                    f'Episode: {episode_num} \tAverage Score: {round(np.mean(scores_deque), 2)}'
                )
                torch.save(
                    self.actor_local.state_dict(),
                    f'{PATH}/checkpoints/{self.__str__()}_Actor_Multiple.pth')
                torch.save(
                    self.critic_local.state_dict(),
                    f'{PATH}/checkpoints/{self.__str__()}_Critic_Multiple.pth')

        # -------- All Episodes finished Save parameters and scores --------#
        # Save Model Parameters
        torch.save(self.actor_local.state_dict(),
                   f'{PATH}/checkpoints/{self.__str__()}_Actor_Multiple.pth')
        torch.save(self.critic_local.state_dict(),
                   f'{PATH}/checkpoints/{self.__str__()}_Critic_Multiple.pth')
        # Save mean score per episode (of the 20 agents)
        with open(f'{PATH}/scores/{self.__str__()}_Multiple_Scores.txt', 'w') as f:
            f.write("\n".join(str(score) for score in scores))
        # Save the running-average scores (window of size print_every)
        with open(f'{PATH}/scores/{self.__str__()}_Multiple_AvgScores.txt', 'w') as f:
            f.write("\n".join(str(score) for score in avg_scores))
        return scores, avg_scores

    def step(self, states, actions, rewards, next_states, dones):
        """ what the agent needs to do for every time step that occurs in the environment. Takes
        in a (s,a,r,s',d) tuple and saves it to memeory and learns from experiences. Note: this is not
        the same as a step in the environment. Step is only called once per environment time step.

        :param states: array of states agent used to select actions
        :param actions: array of actions taken by agents
        :param rewards: array of rewards for last action taken in environment
        :param next_states: array of next states after actions were taken
        :param dones: array of bools representing if environment is finished or not
        """
        # Save experiences in replay memory
        for agent_num in range(self.num_agents):
            self.memory.add(states[agent_num], actions[agent_num],
                            rewards[agent_num], next_states[agent_num],
                            dones[agent_num])

        # Learn "num_updates" times every "update_every" time step
        self.t_step += 1
        if len(self.memory
               ) > self.batch_size and self.t_step % self.update_every == 0:
            self.t_step = 0
            for _ in range(self.num_updates):
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, states, epsilon, add_noise=True):
        """ Returns actions for given states as per current policy. Policy comes from the actor network.

        :param states: array of states from the environment
        :param epsilon: probability of exploration
        :param add_noise: bool on whether or not to potentially have exploration for action
        :return: clipped actions
        """
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_local.eval()  # eval mode (dropout/batch-norm in inference mode)
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()  # back to train mode
        if add_noise and epsilon > np.random.random():
            actions += [self.noise.sample() for _ in range(self.num_agents)]
        return np.clip(actions, -1, 1)

    def reset_noise(self):
        """ resets to noise parameters """
        self.noise.reset()

    def learn(self, experiences):
        """ Update actor and critic networks using a given batch of experiences
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> actions
            critic_target(states, actions) -> Q-value
        :param experiences: tuple of arrays (states, actions, rewards, next_states, dones)  sampled from the replay buffer
        """

        states, actions, rewards, next_states, dones = experiences
        # -------------------- Update Critic -------------------- #
        # Use target networks for getting next actions and q values and calculate q_targets
        next_actions = self.actor_target(next_states)
        next_q_targets = self.critic_target(next_states, next_actions)
        q_targets = rewards + (self.gamma * next_q_targets * (1 - dones))
        # Compute critic loss (Same as DQN Loss)
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # -------------------- Update Actor --------------------- #
        # Compute actor loss (maximize mean of Q(states,actions))
        action_preds = self.actor_local(states)
        # Optimizer minimizes and we want to maximize so multiply by -1
        actor_loss = -1 * self.critic_local(states, action_preds).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------- Update Target Networks ---------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_network, target_network, tau):
        """ soft update newtwork parametes
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_network: PyTorch Network that is always up to date
        :param target_network: PyTorch Network that is not up to date
        :param tau: update (interpolation) parameter
        """
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)