Example #1
class DDPG:
    def __init__(self,
                 in_actor,
                 out_actor,
                 in_critic,  # e.g. = n_agent * (state_size + action_size)
                 lr_actor=1e-4,
                 lr_critic=1e-3,  # the critic is usually trained with a higher learning rate than the actor
                 random_seed=2):
        self.state_size = in_actor
        self.action_size = out_actor
        self.seed = random.seed(random_seed)

        self.params = {"lr_actor": lr_actor,
                       "lr_critic": lr_critic,
                       "optimizer": "adam"}

        self.local_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.target_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=lr_actor)

        # for a single agent, the critic takes the global observation as input and outputs the action-value Q
        # e.g. global_states = all_states + all_actions
        self.local_critic = Critic(in_shape=in_critic).to(device)
        self.target_critic = Critic(in_shape=in_critic).to(device)
        self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=lr_critic)

        # Q: should the local and target networks start with the same weights,
        #    given that they get synchronized by the soft updates anyway?
        # A: yes - hard-copy the local weights into the targets at initialization
        hard_update_A_from_B(self.target_actor, self.local_actor)
        hard_update_A_from_B(self.target_critic, self.local_critic)

        # Noise process
        self.noise = OUNoise(out_actor, scale=1.0)

    def act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # debug noise
        # noise = torch.from_numpy(noise_scale*0.5*np.random.randn(1, self.action_size)).float().to(device)
        # action = self.local_actor(obs) + noise
        action = self.local_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def target_act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # noise = torch.from_numpy(noise_scale*0.5 * np.random.randn(1, self.action_size)).float().to(device)
        # action = self.target_actor(obs) + noise_scale * noise
        action = self.target_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def reset(self):
        self.noise.reset()
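Example #1 calls hard_update_A_from_B and an OUNoise class with a noise() method, neither of which appears in the snippet. A minimal sketch of what they might look like, assuming a standard Ornstein-Uhlenbeck process that returns a torch tensor (the source's versions may differ):

import numpy as np
import torch

def hard_update_A_from_B(A, B):
    """Copy the parameters of network B into network A (hypothetical helper)."""
    for a_param, b_param in zip(A.parameters(), B.parameters()):
        a_param.data.copy_(b_param.data)

class OUNoise:
    """Ornstein-Uhlenbeck process; noise() returns a torch tensor (assumed implementation)."""
    def __init__(self, action_dimension, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return torch.tensor(self.state * self.scale).float()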
Example #2
class DDPG_agent(nn.Module):
    def __init__(self, in_actor, in_critic, action_size, num_agents,
                 random_seed):
        """Initialize the agent."""
        super(DDPG_agent, self).__init__()

        self.action_size = action_size
        self.seed = random_seed

        # Fully connected actor network
        self.actor_local = Actor(in_actor, self.action_size,
                                 self.seed).to(device)
        self.actor_target = Actor(in_actor, self.action_size,
                                  self.seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Fully connected critic network
        self.critic_local = Critic(in_critic, num_agents * self.action_size,
                                   self.seed).to(device)
        self.critic_target = Critic(in_critic, num_agents * self.action_size,
                                    self.seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise process for exploration
        self.noise = OUNoise((action_size), random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def target_act(self, state, add_noise=True):
        """Returns actions for the given state as per the target policy (noise is not added here)."""
        action = self.actor_target(state)
        return action

    def reset(self):
        """ Resets noise """
        self.noise.reset()
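Example #2 assumes LR_ACTOR, LR_CRITIC, WEIGHT_DECAY and a device object defined at module level. A plausible configuration (the values are illustrative, not taken from the source):

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
LR_ACTOR = 1e-4      # actor learning rate (assumed)
LR_CRITIC = 1e-3     # critic learning rate (assumed)
WEIGHT_DECAY = 0.0   # L2 penalty for the critic optimizer (assumed)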
Example #3
class Agent():
    def __init__(self, actor_size, action_size, critic_size):
        super().__init__()
        gpu = torch.cuda.is_available()
        if gpu:
            print('GPU/CUDA works! Happy fast training :)')
            torch.cuda.current_device()
            torch.cuda.empty_cache()
            self.device = torch.device("cuda")
        else:
            print('training on cpu...')
            self.device = torch.device("cpu")  # fall back to CPU only when CUDA is unavailable

        self.actor = Actor(actor_size, action_size).to(self.device)
        self.actor_target = Actor(actor_size, action_size).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)
        self.critic = Critic(critic_size).to(self.device)
        self.critic_target = Critic(critic_size).to(self.device)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=0.001,
                                       weight_decay=0)
        self.gamma = 0.95  #0.99
        self.tau = 0.001
        self.noise = OUNoise((action_size), 2)
        self.target_network_update(self.actor_target, self.actor, 1.0)
        self.target_network_update(self.critic_target, self.critic, 1.0)

    def select_actions(self, state):
        state = torch.from_numpy(state).float().to(self.device).view(1, -1)
        #print(state.shape)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(state).cpu().data.numpy().squeeze(0)
        self.actor.train()
        actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def target_network_update(self, target_network, network, tau):
        for network_param, target_param in zip(network.parameters(),
                                               target_network.parameters()):
            target_param.data.copy_(tau * network_param.data +
                                    (1.0 - tau) * target_param.data)
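A short usage sketch for the Agent above; the sizes and the placeholder observation are illustrative assumptions, and Actor/Critic must be defined elsewhere:

import numpy as np

agent = Agent(actor_size=24, action_size=2, critic_size=24 + 2)  # hypothetical sizes
agent.reset()                              # re-initialise the OU noise at episode start
state = np.zeros(24, dtype=np.float32)     # placeholder observation
actions = agent.select_actions(state)      # noisy actions clipped to [-1, 1]
# tau=1.0 performs a hard copy (as in __init__); tau=agent.tau performs a slow soft update
agent.target_network_update(agent.actor_target, agent.actor, agent.tau)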
Example #4
class PPO():
    def __init__(self, state_dim, action_dim):
        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim)
        self.optimizer = torch.optim.Adam(
            itertools.chain(self.actor.parameters(), self.critic.parameters()),
            LR)

    def _calc_loss(self, state, action, old_log_prob, expected_values, gae):
        new_log_prob, action_distr = self.actor.compute_proba(state, action)
        state_values = self.critic.get_value(state).squeeze(1)

        critic_loss = ((expected_values - state_values)**2).mean()

        unclipped_ratio = torch.exp(new_log_prob - old_log_prob)
        clipped_ratio = torch.clamp(unclipped_ratio, 1 - CLIP, 1 + CLIP)
        actor_loss = -torch.min(clipped_ratio * gae,
                                unclipped_ratio * gae).mean()

        entropy_loss = -action_distr.entropy().mean()

        return critic_loss * VALUE_COEFF + actor_loss + entropy_loss * ENTROPY_COEF

    def update(self, trajectories):
        trajectories = map(self._compute_lambda_returns_and_gae, trajectories)
        transitions = sum(
            trajectories,
            [])  # Turn a list of trajectories into list of transitions

        state, action, old_log_prob, target_value, advantage = zip(
            *transitions)
        state = np.array(state)
        action = np.array(action)
        old_log_prob = np.array(old_log_prob)
        target_value = np.array(target_value)
        advantage = np.array(advantage)
        advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)  # normalize advantages

        for _ in range(BATCHES_PER_UPDATE):
            idx = np.random.randint(0, len(transitions),
                                    BATCH_SIZE)  # Choose random batch
            s = torch.from_numpy(state[idx]).float()
            a = torch.from_numpy(action[idx]).float()
            op = torch.from_numpy(old_log_prob[idx]).float(
            )  # Log probability of the action in state s.t. old policy
            v = torch.from_numpy(
                target_value[idx]).float()  # Estimated by lambda-returns
            adv = torch.from_numpy(advantage[idx]).float(
            )  # Estimated by generalized advantage estimation

            loss = self._calc_loss(s, a, op, v, adv)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def _compute_lambda_returns_and_gae(self, trajectory):
        lambda_returns = []
        gae = []
        last_lr = 0.
        last_v = 0.
        for s, _, r, _ in reversed(trajectory):
            ret = r + GAMMA * (last_v * (1 - LAMBDA) + last_lr * LAMBDA)
            last_lr = ret
            last_v = self.get_value(s)
            lambda_returns.append(last_lr)
            gae.append(last_lr - last_v)

        # Each transition contains state, action, old action probability, value estimation and advantage estimation
        return [(s, a, p, v, adv) for (s, a, _, p), v, adv in zip(
            trajectory, reversed(lambda_returns), reversed(gae))]

    def get_value(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0)
            value = self.critic.get_value(state)
        return value.cpu().item()

    def act(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0)
            action, pure_action, log_prob = self.actor.act(state)
        return action.cpu().numpy()[0], pure_action.cpu().numpy(
        )[0], log_prob.cpu().item()

    def save(self):
        torch.save(self.actor, "agent.pkl")
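The PPO class above relies on hyperparameters defined outside the snippet. One plausible (hypothetical) set of values:

GAMMA = 0.99             # discount factor (assumed)
LAMBDA = 0.95            # lambda-return / GAE mixing coefficient (assumed)
CLIP = 0.2               # PPO clipping range (assumed)
LR = 3e-4                # shared Adam learning rate (assumed)
VALUE_COEFF = 0.5        # weight of the critic (value) loss (assumed)
ENTROPY_COEF = 1e-2      # weight of the entropy bonus (assumed)
BATCHES_PER_UPDATE = 64  # minibatches sampled per update() call (assumed)
BATCH_SIZE = 256         # transitions per minibatch (assumed)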
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, num_agents, state_size, action_size, random_seed=2018):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)


#        # Learn, if enough samples are available in memory
#        if len(self.memory) > BATCH_SIZE:
#            experiences = self.memory.sample()
#            self.learn(experiences, GAMMA)

    def sampleandlearn(self):
        ''' Learn from stored experiences '''
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        # Deactivate gradients and perform forward pass
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            for a in range(self.num_agents):
                action[a] += self.noise.sample()
        # Clip action
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #

        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
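Several of the DDPG agents in this collection, including the one above, construct a ReplayBuffer that is never shown. A minimal sketch, assuming the usual (state, action, reward, next_state, done) interface and torch tensors on device:

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size experience buffer (hypothetical implementation, not from the source)."""
    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        self.action_size = action_size        # kept for interface compatibility
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        stack = lambda xs: torch.from_numpy(np.vstack(xs)).float().to(self.device)
        states = stack([e.state for e in experiences])
        actions = stack([e.action for e in experiences])
        rewards = stack([e.reward for e in experiences])
        next_states = stack([e.next_state for e in experiences])
        dones = stack([float(e.done) for e in experiences])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)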
Example #6
states = env_info.vector_observations
state_size = states.shape[1]

vis = Visdom()
win_score = None
win_actor_score = None
win_critic_loss = None

actor = Actor(state_size * 2, action_size * 2).to(device)
actor_target = Actor(state_size * 2, action_size * 2).to(device)
critic = Critic(state_size * 2, n_action=action_size * 2).to(device)
critic_target = Critic(state_size * 2, n_action=action_size * 2).to(device)
for target_param, param in zip(critic_target.parameters(),
                               critic.parameters()):
    target_param.data.copy_(param.data)
for target_param, param in zip(actor_target.parameters(), actor.parameters()):
    target_param.data.copy_(param.data)
replay_buffer = ReplayMemory(args.replay_capacity)
criterion = nn.MSELoss()
optim_critic = torch.optim.Adam(critic.parameters(),
                                lr=args.lr_critic,
                                weight_decay=args.weight_decay_critic)
optim_actor = torch.optim.Adam(actor.parameters(), lr=args.lr_actor)

loss_critic = []
score_actor = []
score = 0
steps = 0
noise_std = args.noise_std_start

for i in range(args.episodes):
    ...  # the body of the episode loop is not included in this snippet
Example #7
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = [
            OUNoise(action_size, random_seed, sigma=0.1)
            for i in range(self.num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Make sure target is with the same weight as the source
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done,
                        self.num_agents)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(UPDATES_PER_STEP):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            for i in range(self.num_agents):
                # add per-agent OU noise directly to each agent's action row
                action[i] += self.noise[i].sample()

        return np.clip(action, -1, 1)

    def reset(self):
        for i in range(self.num_agents):
            self.noise[i].reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
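The agent above schedules its learning with module-level constants that are not part of the snippet; a plausible (hypothetical) set:

BUFFER_SIZE = int(1e6)  # replay buffer capacity (assumed)
BATCH_SIZE = 128        # minibatch size (assumed)
GAMMA = 0.99            # discount factor (assumed)
TAU = 1e-3              # soft-update interpolation factor (assumed)
UPDATE_EVERY = 20       # environment steps between learning phases (assumed)
UPDATES_PER_STEP = 10   # gradient updates per learning phase (assumed)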
Example #8
class PPO():
    def __init__(self, state_dim, action_dim, device):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device

        self.actor = Actor(state_dim, action_dim).to(device)
        self.critic = Critic(state_dim).to(device)
        self.optimizer = torch.optim.Adam(
            itertools.chain(self.actor.parameters(), self.critic.parameters()),
            LR)

        self.philosophers = list()
        for i in range(P_COUNT):
            self.philosophers.append(Critic(state_dim).to(device))

        self.p_optimizers = [
            torch.optim.Adam(p.parameters(), lr=P_LR)
            for p in self.philosophers
        ]
        self.update_cnt = 0

    def _calc_loss(self, state, action, old_log_prob, expected_values, gae):
        new_log_prob, action_distr = self.actor.compute_proba(state, action)
        state_values = self.critic.get_value(state).squeeze(1)

        critic_loss = ((expected_values - state_values)**2).mean()

        unclipped_ratio = torch.exp(new_log_prob - old_log_prob)
        clipped_ratio = torch.clamp(unclipped_ratio, 1 - CLIP, 1 + CLIP)
        actor_loss = -torch.min(clipped_ratio * gae,
                                unclipped_ratio * gae).mean()

        entropy_loss = -action_distr.entropy().mean()

        p_loss = 0
        for p in self.philosophers:
            p_state_values = p.get_value(state).squeeze(1)  # each philosopher regresses onto the main critic's values
            p_loss += ((p_state_values - state_values.detach())**2).mean()

        return critic_loss * VALUE_COEFF + actor_loss + entropy_loss * ENTROPY_COEF + p_loss

    def update(self, trajectories):
        trajectories = map(self._compute_lambda_returns_and_gae, trajectories)
        transitions = sum(
            trajectories,
            [])  # Turn a list of trajectories into list of transitions

        state, action, old_log_prob, target_value, advantage = zip(
            *transitions)
        state = torch.from_numpy(np.array(state)).float().to(self.device)
        action = torch.from_numpy(np.array(action)).float().to(self.device)
        old_log_prob = torch.from_numpy(np.array(old_log_prob)).float().to(
            self.device)
        target_value = torch.from_numpy(np.array(target_value)).float().to(
            self.device)
        advantage = torch.from_numpy(np.array(advantage)).float().to(
            self.device)

        for _ in range(BATCHES_PER_UPDATE):
            idx = np.random.randint(0, len(transitions), BATCH_SIZE)
            loss = self._calc_loss(state[idx], action[idx], old_log_prob[idx],
                                   target_value[idx], advantage[idx])

            self.optimizer.zero_grad()
            for p_optimizer in self.p_optimizers:
                p_optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            for p_optimizer in self.p_optimizers:
                p_optimizer.step()

        self.update_cnt += 1
        if self.update_cnt % P_DELAY == 0:
            self.critic = self.philosophers[0]
            self.optimizer = self.p_optimizers[0]

            self.philosophers.pop(0)
            self.philosophers.append(Critic(self.state_dim).to(self.device))
            self.p_optimizers.pop(0)
            self.p_optimizers.append(
                torch.optim.Adam(self.philosophers[-1].parameters(), lr=P_LR))

    def _compute_lambda_returns_and_gae(self, trajectory):
        lambda_returns = []
        gae = []
        last_lr = 0.
        last_v = 0.
        for s, _, r, _ in reversed(trajectory):
            ret = r + GAMMA * (last_v * (1 - LAMBDA) + last_lr * LAMBDA)
            last_lr = ret
            last_v = self.get_value(s)
            lambda_returns.append(last_lr)
            gae.append(last_lr - last_v)

        # Each transition contains state, action, old action probability, value estimation and advantage estimation
        return [(s, a, p, v, adv) for (s, a, _, p), v, adv in zip(
            trajectory, reversed(lambda_returns), reversed(gae))]

    def get_value(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0).to(
                self.device)
            value = self.critic.get_value(state)
        return value.cpu().item()

    def act(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0).to(
                self.device)
            action, pure_action, log_prob = self.actor.act(state)
        return action.cpu().numpy()[0], pure_action.cpu().numpy(
        )[0], log_prob.cpu().item()

    def save(self):
        torch.save(self.actor, "agent.pkl")
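Example #8 augments the PPO class from Example #4 with an ensemble of auxiliary "philosopher" critics; beyond the usual PPO constants it also expects these, with values that are assumptions here:

P_COUNT = 3   # number of auxiliary "philosopher" critics in the ensemble (assumed)
P_LR = 3e-4   # learning rate of the philosopher optimizers (assumed)
P_DELAY = 10  # every P_DELAY updates the oldest philosopher replaces the main critic (assumed)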
Example #9
class PPO():
    def __init__(self, state_dim, action_dim, num_shared, device):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device

        self.actor = Actor(state_dim, action_dim, num_shared).to(device)
        self.critic = Critic(state_dim, num_shared).to(device)

    def parameters(self):
        return itertools.chain(self.actor.parameters(), self.critic.parameters())

    def first_parameters(self):
        return itertools.chain(self.actor.first_parameters(), self.critic.first_parameters())

    def shared_parameters(self):
        return itertools.chain(self.actor.shared_parameters(), self.critic.shared_parameters())

    def rest_parameters(self):
        return itertools.chain(self.actor.rest_parameters(), self.critic.rest_parameters())

    def _calc_loss(self, state, action, old_log_prob, expected_values, gae):
        new_log_prob, action_distr = self.actor.compute_proba(state, action)
        state_values = self.critic.get_value(state).squeeze(1)

        critic_loss = ((expected_values - state_values) ** 2).mean()

        unclipped_ratio = torch.exp(new_log_prob - old_log_prob)
        clipped_ratio = torch.clamp(unclipped_ratio, 1 - CLIP, 1 + CLIP)
        actor_loss = -torch.min(clipped_ratio * gae, unclipped_ratio * gae).mean()

        entropy_loss = -action_distr.entropy().mean()

        return critic_loss * VALUE_COEFF + actor_loss + entropy_loss * ENTROPY_COEF


    def update(self, trajectories):
        trajectories = map(self._compute_lambda_returns_and_gae, trajectories)
        transitions = sum(trajectories, []) # Turn a list of trajectories into list of transitions

        state, action, old_log_prob, target_value, advantage = zip(*transitions)
        state = torch.from_numpy(np.array(state)).float().to(self.device)
        action = torch.from_numpy(np.array(action)).float().to(self.device)
        old_log_prob = torch.from_numpy(np.array(old_log_prob)).float().to(self.device)
        target_value = torch.from_numpy(np.array(target_value)).float().to(self.device)
        advantage = torch.from_numpy(np.array(advantage)).float().to(self.device)

        for _ in range(BATCHES_PER_UPDATE):
            idx = np.random.randint(0, len(transitions), BATCH_SIZE)
            loss = self._calc_loss(state[idx], action[idx], old_log_prob[idx], target_value[idx], advantage[idx])

            # optimization is intentionally left to the caller,
            # which consumes the losses yielded here
            yield loss


    def _compute_lambda_returns_and_gae(self, trajectory):
        lambda_returns = []
        gae = []
        last_lr = 0.
        last_v = 0.
        for s, _, r, _ in reversed(trajectory):
            ret = r + GAMMA * (last_v * (1 - LAMBDA) + last_lr * LAMBDA)
            last_lr = ret
            last_v = self.get_value(s)
            lambda_returns.append(last_lr)
            gae.append(last_lr - last_v)
        
        # Each transition contains state, action, old action probability, value estimation and advantage estimation
        return [(s, a, p, v, adv) for (s, a, _, p), v, adv in zip(trajectory, reversed(lambda_returns), reversed(gae))]
            
            
    def get_value(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
            value = self.critic.get_value(state)
        return value.cpu().item()

    def act(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
            action, pure_action, log_prob = self.actor.act(state)
        return action.cpu().numpy()[0], pure_action.cpu().numpy()[0], log_prob.cpu().item()

    def save(self):
        torch.save(self.actor, "agent.pkl")
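In Example #9, update() is a generator that yields one loss per minibatch and leaves optimization to the caller, which is what the comment about optimizing outside refers to. A sketch of how a caller might drive it; the dimensions and optimizer setup are assumptions:

import torch

ppo = PPO(state_dim=28, action_dim=8, num_shared=64, device=torch.device("cpu"))  # sizes are placeholders
optimizer = torch.optim.Adam(ppo.parameters(), lr=3e-4)

def run_update(trajectories):
    for loss in ppo.update(trajectories):  # one loss per sampled minibatch
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()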
Example #10
class DDPG:
    def __init__(self,
                 state_size,
                 action_size,
                 memory_size=int(1e5), # replay buffer size
                 batch_size=128,       # minibatch size
                 gamma=0.99,           # discount factor
                 tau=1e-3,           # for soft update of target parameters
                 update_every=10,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 random_seed=2):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.params = {"lr_actor": lr_actor,
                       "lr_critic": lr_critic,
                       "gamma": gamma,
                       "tau": tau,
                       "memory_size": memory_size,
                       "batch_size": batch_size,
                       "optimizer": "adam"}

        self.actor_local = Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        self.critic_local = Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        self.memory = ReplayBuffer(action_size, memory_size, batch_size, random_seed)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        self.learn_steps = 0
        self.update_every = update_every

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        # for single agent only
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor_local.eval()  # must set to eval mode, since BatchNorm used
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action.squeeze(), -1, 1)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.params["batch_size"]:
            experiences = self.memory.sample()
            self.learn(experiences, self.params["gamma"])

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ------------------------------------------
        # update critic
        # ------------------------------------------
        # recall DQN
        # Q[s][a] = Q[s][a] + alpha * (r + gamma * np.max(Q[s_next]) - Q[s][a])
        # thus, here
        # Q_local = Q[s][a]
        #         = critic_local(s, a)
        # Q_target = r + gamma * np.max(Q[s_next])
        #          = r + gamma * (critic_target[s_next, actor_target(s_next)])
        #
        # approximate np.max(Q[s_next]) with critic_target(s_next, actor_target(s_next)),
        # because the actor is trained to output the action that maximizes Q(s)
        #
        # loss = mse(Q_local, Q_target)
        best_actions = self.actor_target(next_states)  # the target actor's estimate of the best actions
        Q_next_max = self.critic_target(next_states, best_actions)
        Q_target = rewards + gamma * Q_next_max * (1 - dones)
        # Q_target_detached = Q_target.detach()

        Q_local = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_local, Q_target)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ------------------------------------------
        # update actor
        # ------------------------------------------
        # the critic(s, a) provides a value estimate to use as guidance:
        # we want actor(s) to output the action a that maximizes critic(s, a),
        # i.e. we search for actor parameters θ that maximize Q_critic(s, actor(s)),
        # so the gradient is dQ/da * da/dθ
        actions_pred = self.actor_local(states)
        Q_baseline = self.critic_local(states, actions_pred)
        actor_loss = -Q_baseline.mean()  # negate the mean Q so that minimizing the scalar loss maximizes Q

        # note: gradients are computed through both actor_local and critic_local,
        # but only actor_local's parameters are updated by this optimizer
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        if self.learn_steps % self.update_every == 0:
            self.soft_update(self.critic_local, self.critic_target, self.params["tau"])
            self.soft_update(self.actor_local, self.actor_target, self.params["tau"])

        self.learn_steps += 1

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
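A hedged sketch of how the DDPG class above might be driven by a classic Gym environment (the environment name, episode count, and old 4-tuple step API are assumptions):

import gym
import numpy as np

env = gym.make("Pendulum-v1")  # any continuous-control task; name is illustrative
agent = DDPG(state_size=env.observation_space.shape[0],
             action_size=env.action_space.shape[0])

for episode in range(200):
    state = env.reset()
    agent.reset()              # reset the OU noise between episodes
    episode_return, done = 0.0, False
    while not done:
        action = np.atleast_1d(agent.act(state, add_noise=True))  # guard against 0-d actions from squeeze()
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        episode_return += reward
    print("episode {}: return {:.1f}".format(episode, episode_return))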
Example #11
class Agent:
    def __init__(self,
                 env,
                 hidden_size=256,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 gamma=0.99,
                 tau=1e-3,
                 max_memory=int(1e6)):
        obs = env.reset()
        self.num_states = obs['desired_goal'].shape[0] + obs[
            'observation'].shape[0]
        self.num_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.action_max = env.action_space.high[0]

        self.actor = Actor(self.num_states, hidden_size, self.num_actions)
        self.critic = Critic(self.num_states + self.num_actions, hidden_size,
                             1)

        self.target_actor = Actor(self.num_states, hidden_size,
                                  self.num_actions)
        self.target_critic = Critic(self.num_states + self.num_actions,
                                    hidden_size, 1)

        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.experience_replay = ExperienceReplay(max_memory)
        self.critic_loss_func = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)

    def get_action(self, state):
        state = Variable(torch.from_numpy(state).float().unsqueeze(0))
        action = self.actor.forward(state)
        action = action.detach().numpy()[0]
        return action

    def update(self, size):
        states, actions, rewards, next_states, _ = self.experience_replay.sample(
            size)
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)

        with torch.no_grad():
            next_actions = self.target_actor.forward(next_states)
            q_next = self.target_critic.forward(next_states,
                                                next_actions).detach()
            target_q = rewards.reshape((-1, 1)) + self.gamma * q_next
            target_q = target_q.detach()
            c = 1 / (1 - self.gamma)
            target_q = torch.clamp(target_q, -c, 0)

        real_q = self.critic.forward(states, actions)
        dif = (target_q - real_q)
        critic_loss = dif.pow(2).mean()
        real_actions = self.actor.forward(states)
        actor_loss = -self.critic.forward(states, real_actions).mean()
        actor_loss += (real_actions / self.action_max).pow(2).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update target networks
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
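The Agent in Example #11 sizes its networks from obs['observation'] and obs['desired_goal'], so callers are expected to feed it a concatenated state vector. A sketch of that glue code for a goal-conditioned Gym environment (environment name and API are assumptions):

import gym
import numpy as np

env = gym.make("FetchReach-v1")  # any goal-conditioned environment; name is illustrative
agent = Agent(env)

obs = env.reset()
state = np.concatenate([obs['observation'], obs['desired_goal']])  # matches self.num_states in __init__
action = agent.get_action(state)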
Example #12
class DDPG():
    """ This is an Individual DDPG Agent """

    def __init__(self, state_size, action_size, seed):
        """ Initialize a DDPG Agent Object
        :param state_size: dimension of state (input) for this decentralized actor
        :param action_size: dimension of action (output) for this decentralized actor
        :param seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Hyperparameters
        self.buffer_size = 100000
        self.batch_size = 256
        self.gamma = 0.99
        self.tau = 0.01
        self.lr_actor = 0.0001
        self.lr_critic = 0.001

        # Setup Networks (Actor: State -> Action, Critic: (States for all agents, Actions for all agents) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,  self.seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.lr_critic)

        # Initialize local and target networks to start with the same parameters
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, self.seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def __str__(self):
        return "DDPG_Agent"

    def reset_noise(self):
        """ resets to noise parameters """
        self.noise.reset()

    def act(self, state, epsilon, add_noise=True):
        """ Returns actions for given states as per current policy. Policy comes from the actor network.
        :param state: observations for this individual agent
        :param epsilon: probability of exploration
        :param add_noise: whether exploration noise may be added to the action
        :return: clipped actions
        """
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise and epsilon > np.random.random():
            actions += self.noise.sample()
        return np.clip(actions, -1,1)

    def step(self):
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def learn(self, experiences):
        """ Update actor and critic networks using a given batch of experiences
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> actions
            critic_target(states, actions) -> Q-value
        :param experiences: tuple of arrays (states, actions, rewards, next_states, dones)  sampled from the replay buffer
        """

        states, actions, rewards, next_states, dones = experiences
        # -------------------- Update Critic -------------------- #
        # Use target networks for getting next actions and q values and calculate q_targets
        next_actions = self.actor_target(next_states)
        next_q_targets = self.critic_target(next_states, next_actions)
        q_targets = rewards + (self.gamma * next_q_targets * (1 - dones))
        # Compute critic loss (Same as DQN Loss)
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # -------------------- Update Actor --------------------- #
        # Compute actor loss (maximize mean of Q(states,actions))
        action_preds = self.actor_local(states)
        # Optimizer minimizes and we want to maximize so multiply by -1
        actor_loss = -1 * self.critic_local(states, action_preds).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------- Update Target Networks ---------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_network, target_network, tau):
        """ soft update newtwork parametes
        θ_target = τ*θ_local + (1 - τ)*θ_target
        :param local_network: PyTorch Network that is always up to date
        :param target_network: PyTorch Network that is not up to date
        :param tau: update (interpolation) parameter
        """
        for target_param, local_param in zip(target_network.parameters(), local_network.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
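act() above only injects OU noise when epsilon > np.random.random(), so the caller owns the exploration schedule. A possible decay loop; the sizes and schedule values are assumptions:

import numpy as np

agent = DDPG(state_size=24, action_size=2, seed=0)  # hypothetical dimensions
epsilon, epsilon_min, epsilon_decay = 1.0, 0.05, 0.995

state = np.zeros(24, dtype=np.float32)  # placeholder observation
for episode in range(1000):
    action = agent.act(state, epsilon, add_noise=True)
    # ... step the environment, add experiences to agent.memory, call agent.step() ...
    epsilon = max(epsilon_min, epsilon * epsilon_decay)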
Example #13
class A2C():
    def __init__(self, state_dim, action_dim, action_lim, update_type='soft',
                lr_actor=1e-4, lr_critic=1e-3, tau=1e-3,
                mem_size=1e6, batch_size=256, gamma=0.99,
                other_cars=False, ego_dim=None):
        self.device = torch.device("cuda:0" if torch.cuda.is_available()
                                        else "cpu")

        self.joint_model = False
        if len(state_dim) == 3:
            self.model = ActorCriticCNN(state_dim, action_dim, action_lim)
            self.model_optim = optim.Adam(self.model.parameters(), lr=lr_actor)

            self.target_model = ActorCriticCNN(state_dim, action_dim, action_lim)
            self.target_model.load_state_dict(self.model.state_dict())

            self.model.to(self.device)
            self.target_model.to(self.device)

            self.joint_model = True
        else:
            self.actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim)
            self.actor_optim = optim.Adam(self.actor.parameters(), lr=lr_actor)
            self.target_actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim)
            self.target_actor.load_state_dict(self.actor.state_dict())
            self.target_actor.eval()

            self.critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim)
            self.critic_optim = optim.Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1e-2)
            self.target_critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim)
            self.target_critic.load_state_dict(self.critic.state_dict())
            self.target_critic.eval()

            self.actor.to(self.device)
            self.target_actor.to(self.device)
            self.critic.to(self.device)
            self.target_critic.to(self.device)

        self.action_lim = action_lim
        self.tau = tau # hard update if tau is None
        self.update_type = update_type
        self.batch_size = batch_size
        self.gamma = gamma

        if self.joint_model:
            mem_size = mem_size//100
        self.memory = Memory(int(mem_size), action_dim, state_dim)

        mu = np.zeros(action_dim)
        sigma = np.array([0.5, 0.05])
        self.noise = OrnsteinUhlenbeckActionNoise(mu, sigma)
        self.target_noise = OrnsteinUhlenbeckActionNoise(mu, sigma)

        self.initialised = True
        self.training = False

    def select_action(self, obs):
        with torch.no_grad():
            obs = torch.FloatTensor(np.expand_dims(obs, axis=0)).to(self.device)
            if self.joint_model:
                action, _ = self.model(obs)
                action = action.data.cpu().numpy().flatten()
            else:
                action = self.actor(obs).data.cpu().numpy().flatten()

        if self.training:
            action += self.noise()
            return action
        else:
            return action

    def append(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)

    def reset_noise(self):
        self.noise.reset()
        self.target_noise.reset()

    def train(self):
        if self.joint_model:
            self.model.train()
            self.target_model.train()
        else:
            self.actor.train()
            self.target_actor.train()
            self.critic.train()
            self.target_critic.train()

        self.training = True

    def eval(self):
        if self.joint_model:
            self.model.eval()
            self.target_model.eval()
        else:
            self.actor.eval()
            self.target_actor.eval()
            self.critic.eval()
            self.target_critic.eval()

        self.training = False

    def save(self, folder, episode, previous=None, solved=False):
        filename = lambda type, ep : folder + '%s' % type + \
                                    (not solved) * ('_ep%d' % (ep)) + \
                                    (solved * '_solved') + '.pth'

        if self.joint_model:
            torch.save(self.model.state_dict(), filename('model', episode))
            torch.save(self.target_model.state_dict(), filename('target_model', episode))
        else:
            torch.save(self.actor.state_dict(), filename('actor', episode))
            torch.save(self.target_actor.state_dict(), filename('target_actor', episode))

            torch.save(self.critic.state_dict(), filename('critic', episode))
            torch.save(self.target_critic.state_dict(), filename('target_critic', episode))

        if previous is not None and previous > 0:
            if self.joint_model:
                os.remove(filename('model', previous))
                os.remove(filename('target_model', previous))
            else:
                os.remove(filename('actor', previous))
                os.remove(filename('target_actor', previous))
                os.remove(filename('critic', previous))
                os.remove(filename('target_critic', previous))

    def load_actor(self, actor_filepath):
        qualifier = '_' + actor_filepath.split("_")[-1]
        folder = actor_filepath[:actor_filepath.rfind("/")+1]
        filename = lambda type : folder + '%s' % type + qualifier

        if self.joint_model:
            self.model.load_state_dict(torch.load(filename('model'),
                                                    map_location=self.device))
            self.target_model.load_state_dict(torch.load(filename('target_model'),
                                                    map_location=self.device))
        else:
            self.actor.load_state_dict(torch.load(filename('actor'),
                                                    map_location=self.device))
            self.target_actor.load_state_dict(torch.load(filename('target_actor'),
                                                    map_location=self.device))

    def load_all(self, actor_filepath):
        self.load_actor(actor_filepath)
        qualifier = '_' + actor_filepath.split("_")[-1]
        folder = actor_filepath[:actor_filepath.rfind("/")+1]
        filename = lambda type : folder + '%s' % type + qualifier

        if not self.joint_model:
            self.critic.load_state_dict(torch.load(filename('critic'),
                                                    map_location=self.device))
            self.target_critic.load_state_dict(torch.load(filename('target_critic'),
                                                    map_location=self.device))

    def update(self, target_noise=True):
        try:
            minibatch = self.memory.sample(self.batch_size) # dict of ndarrays
        except ValueError as e:
            print('Replay memory not big enough. Continue.')
            return None, None

        states = Variable(torch.FloatTensor(minibatch['obs0'])).to(self.device)
        actions = Variable(torch.FloatTensor(minibatch['actions'])).to(self.device)
        rewards = Variable(torch.FloatTensor(minibatch['rewards'])).to(self.device)
        next_states = Variable(torch.FloatTensor(minibatch['obs1'])).to(self.device)
        terminals = Variable(torch.FloatTensor(minibatch['terminals1'])).to(self.device)

        if self.joint_model:
            target_actions, _ = self.target_model(next_states)
            if target_noise:
                for sample in range(target_actions.shape[0]):
                    target_actions[sample] += self.target_noise()
                    target_actions[sample].clamp_(-self.action_lim, self.action_lim)  # in-place; a plain clamp() result would be discarded
            _, target_qvals = self.target_model(next_states, target_actions=target_actions)
            y = rewards + self.gamma * (1 - terminals) * target_qvals

            _, model_qvals = self.model(states, target_actions=actions)
            value_loss = F.mse_loss(y, model_qvals)
            model_actions, _ = self.model(states)
            _, model_qvals = self.model(states, target_actions=model_actions)
            action_loss = -model_qvals.mean()

            self.model_optim.zero_grad()
            (value_loss + action_loss).backward()
            self.model_optim.step()
        else:
            target_actions = self.target_actor(next_states)
            if target_noise:
                for sample in range(target_actions.shape[0]):
                    target_actions[sample] += self.target_noise()
                    target_actions[sample].clamp_(-self.action_lim, self.action_lim)  # in-place; a plain clamp() result would be discarded
            target_critic_qvals = self.target_critic(next_states, target_actions)
            y = rewards + self.gamma * (1 - terminals) * target_critic_qvals

            # optimise critic
            critic_qvals = self.critic(states, actions)
            value_loss = F.mse_loss(y, critic_qvals)
            self.critic_optim.zero_grad()
            value_loss.backward()
            self.critic_optim.step()

            # optimise actor
            action_loss = -self.critic(states, self.actor(states)).mean()
            self.actor_optim.zero_grad()
            action_loss.backward()
            self.actor_optim.step()

        # optimise target networks
        if self.update_type == 'soft':
            if self.joint_model:
                soft_update(self.target_model, self.model, self.tau)
            else:
                soft_update(self.target_actor, self.actor, self.tau)
                soft_update(self.target_critic, self.critic, self.tau)
        else:
            if self.joint_model:
                hard_update(self.target_model, self.model)
            else:
                hard_update(self.target_actor, self.actor)
                hard_update(self.target_critic, self.critic)

        return action_loss.item(), value_loss.item()
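Example #13 calls module-level soft_update and hard_update helpers that are not part of the snippet. They conventionally look like this (a sketch; argument order matches the calls above, target first and source second):

def soft_update(target, source, tau):
    """Polyak-average source into target: θ_target = τ*θ_source + (1 - τ)*θ_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


def hard_update(target, source):
    """Copy source parameters into target verbatim."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)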
Example #14
class DDPG():
    def __init__(self,
                 env,
                 log_dir,
                 gamma=0.99,
                 batch_size=64,
                 sigma=0.2,
                 batch_norm=True,
                 merge_layer=2,
                 buffer_size=int(1e6),
                 buffer_min=int(1e4),
                 tau=1e-3,
                 Q_wd=1e-2,
                 num_episodes=1000):

        self.s_dim = env.reset().shape[0]
        self.a_dim = env.action_space.shape[0]

        self.env = env
        self.mu = Actor(self.s_dim,
                        self.a_dim,
                        env.action_space,
                        batch_norm=batch_norm)
        self.Q = Critic(self.s_dim,
                        self.a_dim,
                        batch_norm=batch_norm,
                        merge_layer=merge_layer)
        self.targ_mu = copy.deepcopy(self.mu).eval()
        self.targ_Q = copy.deepcopy(self.Q).eval()
        self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim),
                                       sigma=sigma * torch.ones(self.a_dim))
        self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim)
        self.buffer_min = buffer_min
        self.mse_fn = torch.nn.MSELoss()
        self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=1e-3,
                                            weight_decay=Q_wd)

        self.gamma = gamma
        self.batch_size = batch_size
        self.num_episodes = num_episodes
        self.tau = tau
        self.log_dir = log_dir

        self.fill_buffer()

    #updates the target network to slowly track the main network
    def track_network(self, target, main):
        with torch.no_grad():
            for pt, pm in zip(target.parameters(), main.parameters()):
                pt.data.copy_(self.tau * pm.data + (1 - self.tau) * pt.data)

    # updates the target nets to slowly track the main ones
    def track_networks(self):
        self.track_network(self.targ_mu, self.mu)
        self.track_network(self.targ_Q, self.Q)

    def run_episode(self):
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        t = 0
        tot_r = 0
        while not done:

            self.mu = self.mu.eval()
            a = torch.squeeze(self.mu(s)).detach().numpy()
            self.mu = self.mu.train()

            ac_noise = self.noise().detach().numpy()
            a = a + ac_noise

            s = s.detach().numpy()
            s_p, r, done, _ = self.env.step(a)
            tot_r += r
            self.buffer.add_tuple(s, a, r, s_p, done)

            s_batch, a_batch, r_batch, s_p_batch, done_batch = self.buffer.sample(
                batch_size=self.batch_size)

            # update critic
            with torch.no_grad():
                q_p_pred = self.targ_Q(s_p_batch, self.targ_mu(s_p_batch))
                q_p_pred = torch.squeeze(q_p_pred)
                y = r_batch + (1.0 - done_batch) * self.gamma * q_p_pred
            self.Q_optimizer.zero_grad()
            q_pred = self.Q(s_batch, a_batch)
            q_pred = torch.squeeze(q_pred)
            #print(torch.mean(q_pred))
            Q_loss = self.mse_fn(q_pred, y)
            Q_loss.backward(retain_graph=False)
            self.Q_optimizer.step()

            # update actor
            self.mu_optimizer.zero_grad()
            q_pred_mu = self.Q(s_batch, self.mu(s_batch))
            q_pred_mu = torch.squeeze(q_pred_mu)
            #print(torch.mean(q_pred_mu))
            mu_loss = -torch.mean(q_pred_mu)
            # print(mu_loss)
            mu_loss.backward(retain_graph=False)
            #print(torch.sum(self.mu.layers[0].weight.grad))
            self.mu_optimizer.step()
            self.track_networks()

            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
            t += 1
        return tot_r, t

    def train(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 20 == 0:
                torch.save(self.mu, self.log_dir + '/models/model_' + str(i))
        np.save(self.log_dir + '/results_train.npy', np.array(results))

    def train1(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 20 == 0:
                torch.save(self.mu, self.log_dir + '/models1/model_' + str(i))
        np.save(self.log_dir + '/results_train1.npy', np.array(results))

    def train2(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 20 == 0:
                torch.save(self.mu, self.log_dir + '/models2/model_' + str(i))
        np.save(self.log_dir + '/results_train2.npy', np.array(results))

    def train3(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 20 == 0:
                torch.save(self.mu, self.log_dir + '/models3/model_' + str(i))
        np.save(self.log_dir + '/results_train3.npy', np.array(results))

    def eval_all(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval.npy', np.array(results))

    def eval(self, num_eps=10, mu=None):
        if mu is None:
            mu = self.mu

        results = []
        mu = mu.eval()
        for i in range(num_eps):
            r, t = self.run_eval_episode(mu=mu)
            results.append([r, t])
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
        return np.mean(results, axis=0)

    def run_eval_episode(self, mu=None):
        if mu is None:
            mu = self.mu
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        tot_r = t = 0
        while not done:
            a = mu(s).view(-1).detach().numpy()

            s_p, r, done, _ = self.env.step(a)
            tot_r += r
            t += 1
            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
        return tot_r, t

    def fill_buffer(self):
        print('Filling buffer')
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        while self.buffer.size < self.buffer_min:
            a = np.random.uniform(self.env.action_space.low,
                                  self.env.action_space.high,
                                  size=(self.a_dim))

            s_p, r, done, _ = self.env.step(a)
            self.buffer.add_tuple(s, a, r, s_p, done)
            # restart from the new episode's initial state after a terminal transition,
            # otherwise the next stored tuple pairs a stale state with the new episode
            s = self.env.reset() if done else s_p
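The Buffer class used by this agent is not shown. A minimal sketch of the interface it is assumed to expose (add_tuple, sample, size), using flat NumPy storage and returning torch tensors in the order the training loop above expects:

import numpy as np
import torch

class Buffer:
    # minimal circular replay buffer sketch matching the interface used above (assumption)
    def __init__(self, max_size, s_dim, a_dim):
        self.max_size = max_size
        self.size = 0
        self.idx = 0
        self.s = np.zeros((max_size, s_dim), dtype=np.float32)
        self.a = np.zeros((max_size, a_dim), dtype=np.float32)
        self.r = np.zeros(max_size, dtype=np.float32)
        self.s_p = np.zeros((max_size, s_dim), dtype=np.float32)
        self.done = np.zeros(max_size, dtype=np.float32)

    def add_tuple(self, s, a, r, s_p, done):
        self.s[self.idx] = s
        self.a[self.idx] = a
        self.r[self.idx] = r
        self.s_p[self.idx] = s_p
        self.done[self.idx] = float(done)
        self.idx = (self.idx + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size=64):
        ids = np.random.randint(0, self.size, size=batch_size)
        to_t = lambda x: torch.from_numpy(x[ids])
        return to_t(self.s), to_t(self.a), to_t(self.r), to_t(self.s_p), to_t(self.done)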
Exemple #15
0
class PPO(object):
    def __init__(self, args, env):

        self.learning_rate = args.learning_rate
        self.gamma = args.gamma
        self.lamb = args.lamb
        self.batch_size = args.batch_size
        self.step = 0
        self.epochs = args.epochs
 
        self.actor = Actor()
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.learning_rate)

        self.critic = Critic()
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.learning_rate)

        self.env = env
        self.num_actions = env.num_actions
        self.num_states = env.num_states

        self.data = {'step' : [], 'reward' : [], 'losses' : []}



    def train(self):

        with torch.no_grad():  # rollouts need no gradients, so skip building the autograd graph
            batch = {'s' : [], 'a' : [], 'r' : [], 'w' : [], 'V_target' : [], 'pi' : []}

            for i in range(self.batch_size):
                traj = {'s' : [], 'a' : [], 'r' : [], 'V' : [], 'pi' : []}
                s = self.env.reset()
                done = False
                while not done:
                    (mu, std) = self.actor(torch.from_numpy(s))
                    dist = torch.distributions.normal.Normal(mu, std)
                    a = dist.sample().numpy()
                    s1, r, done = self.env.step(a)
                    V = self.critic(torch.from_numpy(s)).item()
                    traj['s'].append(s)
                    traj['a'].append(a)
                    traj['r'].append(r)
                    traj['V'].append(V)
                    traj['pi'].append(dist.log_prob(torch.tensor(a)))
                    s = s1

                traj_len = len(traj['r'])
                r = np.append(traj['r'], 0.)
                V = np.append(traj['V'], 0.)
                delta = r[:-1] + (self.gamma * V[1:]) - V[:-1]
                A = delta.copy()

                for t in reversed(range(traj_len - 1)):
                    A[t] = A[t] + (self.gamma * self.lamb * A[t + 1])

                for t in reversed(range(traj_len)):
                    V[t] = r[t] + (self.gamma * V[t + 1])

                V = V[:-1]

                batch['s'].extend(traj['s'])
                batch['a'].extend(traj['a'])
                batch['r'].extend(traj['r'])
                batch['w'].extend(A)
                batch['V_target'].extend(V)
                batch['pi'].extend(traj['pi'])

            batch['num_steps'] = len(batch['r'])
            batch['s'] = torch.tensor(batch['s'], requires_grad=False, dtype=torch.double)
            batch['a'] = torch.tensor(batch['a'], requires_grad=False, dtype=torch.double)
            batch['r'] = torch.tensor(batch['r'], requires_grad=False, dtype=torch.double)
            batch['w'] = torch.tensor(batch['w'], requires_grad=False, dtype=torch.double)
            batch['V_target'] = torch.tensor(batch['V_target'], requires_grad=False, dtype=torch.double)
            batch['pi'] = torch.tensor(batch['pi'], requires_grad=False, dtype=torch.double)


        with torch.no_grad():
            N = batch['r'].shape[0] / self.batch_size

        #optimize Actor network
        for actor_epoch in range(10):
            self.actor_optimizer.zero_grad()
            (mu, std) = self.actor(batch['s'])
            dist = torch.distributions.normal.Normal(mu, std)
            pi = dist.log_prob(batch['a']).sum(axis=-1)
            ratio = torch.exp(pi - batch['pi'])
            surrogate = ratio * batch['w']
            clipped = torch.clamp(ratio, min= 1 - 0.2, max = 1 + 0.2) * batch['w']
            loss = - torch.mean((torch.min(surrogate, clipped)))
            loss.backward()
            self.actor_optimizer.step()

        #optimize Critic network
        for critic_epoch in range(10):
            self.critic_optimizer.zero_grad()
            V = self.critic(batch['s'])
            loss = nn.MSELoss()(V.squeeze(1), batch['V_target'])
            loss.backward()
            self.critic_optimizer.step()
        self.data['losses'].append(loss.item())


        #logging
        self.step += batch['r'].shape[0]
        self.data['step'].append(self.step)
        self.data['reward'].append(batch['r'].mean() * N)


    def save_model(self):

        if not os.path.exists('./model_save/'):
            os.makedirs('./model_save/')

        save_dir = './model_save/' + str(self.epochs) + '_epochs.pt'

        torch.save({'data': self.data,
                    'actor': self.actor,
                    'actor_optim': self.actor_optimizer,
                    'critic_optim': self.critic_optimizer,
                    'critic': self.critic}, save_dir)

    
    def generate_results(self):

        #LOAD MODEL        
        if not os.path.isdir('./results/'):
            os.makedirs('./results/')

        model_steps = np.array(self.data['step'])
        model_rewards = np.array(self.data['reward'])
        model_losses = np.array(self.data['losses'])

        #LEARNING CURVES
        print('Generating Learning Curves...')
        fig = plt.figure()
        plt.plot(model_steps, model_rewards)
        plt.xlabel('Simulation Steps')
        plt.ylabel('Total Reward')
        plt.title('Actor Learning Curve')
        plt.savefig('./results/Actor_Learning_Curve_' + str(self.epochs) + '_Epochs.png')

        fig = plt.figure()
        plt.plot(model_steps, model_losses)
        plt.xlabel('Simulation Steps')
        plt.ylabel('Critic Loss')
        plt.title('Critic Learning Curve')
        plt.savefig('./results/Critic_Learning_Curve_' + str(self.epochs) + '_Epochs.png')
  

        #EXAMPLE TRAJECTORY
        print('Generating Example Trajectory...')
        s = self.env.reset()
        # Create dict to store data from simulation
        data = {
            't': [0],
            's': [s],
            'a': [],
            'r': [],
        }
        # Simulate until episode is done
        done = False
        while not done:
            (mu, std) = self.actor(torch.from_numpy(s))
            dist = torch.distributions.normal.Normal(mu, std)
            a = dist.sample().numpy()
            s, r, done = self.env.step(a)
            data['t'].append(data['t'][-1] + 1)
            data['s'].append(s)
            data['a'].append(a)
            data['r'].append(r)
        # Parse data from simulation
        data['s'] = np.array(data['s'])
        theta = data['s'][:, 0]
        thetadot = data['s'][:, 1]
        # Plot data and save to png file
        fig = plt.figure()
        plt.plot(data['t'], theta, label='theta')
        plt.plot(data['t'], thetadot, label='thetadot')
        plt.legend()
        plt.savefig('./results/Example_Trajectory_' + str(self.epochs) + '_Epochs.png' )


        #ANIMATED TRAJECTORY
        print('Generating Animated Trajectory...')
        filename='./results/Animated_Trajectory_' + str(self.epochs) + '_Epochs.gif'
        writer='imagemagick'
        s = self.env.reset()
        s_traj = [s]
        done = False
        while not done:
            (mu, std) = self.actor(torch.from_numpy(s))
            dist = torch.distributions.normal.Normal(mu, std)
            a = dist.sample().numpy()
            s, r, done = self.env.step(a)
            s_traj.append(s)
        fig = plt.figure(figsize=(5, 4))
        ax = fig.add_subplot(111, autoscale_on=False, xlim=(-1.2, 1.2), ylim=(-1.2, 1.2))
        ax.set_aspect('equal')
        ax.grid()
        line, = ax.plot([], [], 'o-', lw=2)
        text = ax.set_title('')

        def animate(i):
            theta = s_traj[i][0]
            line.set_data([0, -np.sin(theta)], [0, np.cos(theta)])
            text.set_text(f'time = {i * self.env.dt:3.1f}')
            return line, text

        anim = animation.FuncAnimation(fig, animate, len(s_traj), interval=(1000 * self.env.dt), blit=True, repeat=False)
        anim.save(filename, writer=writer, fps=10)
        plt.close()


        #POLICY VISUALIZATION
        print('Generating Policy Visualization...')
        theta_range = np.linspace(-np.pi, np.pi, 200)
        theta_dot_range = np.linspace(-self.env.max_thetadot_for_init, self.env.max_thetadot_for_init, 200)
        policy = np.zeros((len(theta_range), len(theta_dot_range)))
        for i in range(len(theta_range)):
            for j in range(len(theta_dot_range)):
                state = torch.tensor([theta_range[i], theta_dot_range[j]], dtype=torch.float64)
                (mu, std) = self.actor(state)
                dist = torch.distributions.normal.Normal(mu, std)
                a = dist.sample().numpy()
                policy[i][j] = a
        fig = plt.figure()
        plt.imshow(policy, cmap='coolwarm')
        plt.xlabel('theta dot')
        plt.ylabel('theta')
        plt.colorbar()
        plt.title('Policy Visualization')
        fig.savefig('./results/Policy_Visualization_' + str(self.epochs) + '_Epochs.png')
        fig.clf()


        #VALUE FUNCTION VISUALIZATION
        print('Generating Value Function Visualization...')
        theta_range = np.linspace(-np.pi, np.pi, 200)
        theta_dot_range = np.linspace(-self.env.max_thetadot_for_init, self.env.max_thetadot_for_init, 200)
        value = np.zeros((len(theta_range), len(theta_dot_range)))
        for i in range(len(theta_range)):
            for j in range(len(theta_dot_range)):
                state = torch.tensor([theta_range[i], theta_dot_range[j]], dtype=torch.float64)
                V = self.critic(state).item()
                value[len(theta_range) - 1 - i][j] = V  # -1 avoids an out-of-range index at i=0
        fig = plt.figure()
        plt.imshow(value, cmap='coolwarm')
        plt.xlabel('theta dot')
        plt.ylabel('theta')
        plt.colorbar()
        plt.title('Value Function Visualization')
        fig.savefig('./results/Value_Visualization_' + str(self.epochs) + '_Epochs.png')
        fig.clf()


        print('done')
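The core of the PPO update above is the clipped surrogate objective applied to GAE advantages. A small standalone sketch of both computations follows; the function names and the fixed 0.2 clip range are my own choices, mirroring the loop in train() above.

import numpy as np
import torch

def gae_advantages(rewards, values, gamma, lam):
    # generalized advantage estimation over one trajectory (terminal value assumed 0)
    r = np.append(rewards, 0.0)
    v = np.append(values, 0.0)
    delta = r[:-1] + gamma * v[1:] - v[:-1]
    adv = delta.copy()
    for t in reversed(range(len(adv) - 1)):
        adv[t] += gamma * lam * adv[t + 1]
    return adv

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, clip=0.2):
    # PPO-clip objective: take the pessimistic (minimum) of the raw and clipped ratio terms
    ratio = torch.exp(new_log_probs - old_log_probs)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip, 1.0 + clip) * advantages
    return -torch.mean(torch.min(unclipped, clipped))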
Exemple #16
0
class A2C():
    """
    Advantage Actor-Critic RL agent. 
    
    Notes
    -----
    * GPU implementation is still work in progress.
    * Always uses 2 separate networks for the critic,one that learns from new experience 
      (student/critic) and the other one (critic_target/teacher)that is more conservative 
      and whose weights are updated through an exponential moving average of the weights 
      of the critic, i.e.
          target.params = (1-tau)*target.params + tau* critic.params
    * In the case of Monte Carlo estimation the critic_target is never used
    * Possible to use twin networks for the critic and the critic target for improved 
      stability. Critic target is used for updates of both the actor and the critic and
      its output is the minimum between the predictions of its two internal networks.
      
    """
    def __init__(self,
                 observation_space,
                 action_space,
                 lr,
                 gamma,
                 TD=True,
                 discrete=False,
                 project_dim=8,
                 hiddens=[64, 32],
                 twin=False,
                 tau=1.,
                 n_steps=1,
                 device='cpu',
                 debug=False):
        """
        Parameters
        ----------
        observation_space: int
            Number of flattened entries of the state
        action_space: int
            Number of (discrete) possible actions to take
        lr: float in [0,1]
            Learning rate
        gamma: float in [0,1]
            Discount factor
        TD: bool (default=True)
            If True, uses Temporal Difference for the critic's estimates
            Otherwise uses Monte Carlo estimation
        discrete: bool (default=False)
            If True, adds an embedding layer both in the actor 
            and the critic networks before processing the state.
            Should be used if the state is a simple integer in [0, observation_space -1]
        project_dim: int (default=8)
            Number of dimensions of the embedding space (e.g. number of dimensions of
            embedding(state) ). Higher dimensions are more expressive.
        hiddens: list of int (default=[64, 32])
            Number of neurons in each linear hidden layer.
            The same architecture is used for the actor and the critic, except for the
            output layer: the actor's output has the dimension of the action space with a
            LogSoftmax activation, while the critic outputs a scalar (the state value).
        twin: bool (default=False)
            Enables twin networks for both the critic and the critic_target.
        tau: float in [0,1] (default=1.)
            Regulates how fast the critic_target is updated, i.e. what fraction of the weights
            it inherits from the critic. If tau=1., critic and critic_target are identical
            at every step; if tau=0., critic_target is never updated.
            By default this feature is disabled (tau=1.); a good empirical value when enabling
            it is 0.005.
        n_steps: int (default=1)
            Number of steps considered in the TD update.
        device: str in {'cpu','cuda'} (default='cpu')
            GPU is implemented but slower than CPU, since it is hard to keep a GPU busy for an
            on-policy agent that cannot use a replay buffer (replay is only valid off-policy).
        """

        self.gamma = gamma
        self.lr = lr

        self.n_actions = action_space
        self.discrete = discrete
        self.TD = TD
        self.twin = twin
        self.tau = tau
        self.n_steps = n_steps

        self.actor = Actor(observation_space,
                           action_space,
                           discrete,
                           project_dim,
                           hiddens=hiddens)
        self.critic = Critic(observation_space,
                             discrete,
                             project_dim,
                             twin,
                             hiddens=hiddens)

        if self.TD:
            self.critic_trg = Critic(observation_space,
                                     discrete,
                                     project_dim,
                                     twin,
                                     target=True,
                                     hiddens=hiddens)

            # Init critic target identical to critic
            for trg_params, params in zip(self.critic_trg.parameters(),
                                          self.critic.parameters()):
                trg_params.data.copy_(params.data)

        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.device = device
        self.actor.to(self.device)
        self.critic.to(self.device)
        if self.TD:
            self.critic_trg.to(self.device)

        if debug:
            print("=" * 10 + " A2C HyperParameters " + "=" * 10)
            print("Discount factor: ", self.gamma)
            print("Learning rate: ", self.lr)
            print("Action space: ", self.n_actions)
            print("Discrete state space: ", self.discrete)
            print("Temporal Difference learning: ", self.TD)
            if self.TD:
                print("Number of TD steps: ", self.n_steps)
            print("Twin networks: ", self.twin)
            print("Update critic target factor: ", self.tau)
            print("Device used: ", self.device)
            print("\n\n" + "=" * 10 + " A2C Architecture " + "=" * 10)
            print("Actor architecture: \n", self.actor)
            print("Critic architecture: \n", self.critic)
            print("Critic target architecture: ")
            if self.TD:
                print(self.critic_trg)
            else:
                print("Not used")

    def get_action(self, state, return_log=False):
        log_probs = self.forward(state)
        dist = torch.exp(log_probs)
        probs = Categorical(dist)
        action = probs.sample().item()
        if return_log:
            return action, log_probs.view(-1)[action]
        else:
            return action

    def forward(self, state):
        """
        Makes a tensor out of a numpy array state and then forward
        it with the actor network.
        
        Parameters
        ----------
        state:
            If self.discrete is True state.shape = (episode_len,)
            Otherwise state.shape = (episode_len, observation_space)
        """
        if self.discrete:
            state = torch.from_numpy(state).to(self.device)
        else:
            state = torch.from_numpy(state).float().unsqueeze(0).to(
                self.device)
        log_probs = self.actor(state)
        return log_probs

    def update(self, *args):
        if self.TD:
            critic_loss, actor_loss = self.update_TD(*args)
        else:
            critic_loss, actor_loss = self.update_MC(*args)

        return critic_loss, actor_loss

    def update_TD(self, rewards, log_probs, states, done, bootstrap=None):

        ### Compute n-steps rewards, states, discount factors and done mask ###

        n_step_rewards = self.compute_n_step_rewards(rewards)
        if debug:
            print("n_step_rewards.shape: ", n_step_rewards.shape)
            print("rewards.shape: ", rewards.shape)
            print("n_step_rewards: ", n_step_rewards)
            print("rewards: ", rewards)

        if bootstrap is not None:
            done[bootstrap] = False
        if debug:
            print("done.shape: (before n_steps)", done.shape)
            print("done: (before n_steps)", done)

        if self.discrete:
            old_states = torch.tensor(states[:-1]).to(self.device)

            new_states, Gamma_V, done = self.compute_n_step_states(
                states, done)
            new_states = torch.tensor(new_states).to(self.device)

        else:
            old_states = torch.tensor(states[:, :-1]).float().to(self.device)

            new_states, Gamma_V, done = self.compute_n_step_states(
                states[0], done)
            new_states = torch.tensor(new_states).float().unsqueeze(0).to(
                self.device)

        if debug:
            print("done.shape: (after n_steps)", done.shape)
            print("Gamma_V.shape: ", Gamma_V.shape)
            print("done: (after n_steps)", done)
            print("Gamma_V: ", Gamma_V)
            print("old_states.shape: ", old_states.shape)
            print("new_states.shape: ", new_states.shape)

        ### Wrap variables into tensors ###

        done = torch.LongTensor(done.astype(int)).to(self.device)
        log_probs = torch.stack(log_probs).to(self.device)
        n_step_rewards = torch.tensor(n_step_rewards).float().to(self.device)
        Gamma_V = torch.tensor(Gamma_V).float().to(self.device)

        ### Update critic and then actor ###
        critic_loss = self.update_critic_TD(n_step_rewards, new_states,
                                            old_states, done, Gamma_V)
        actor_loss = self.update_actor_TD(n_step_rewards, log_probs,
                                          new_states, old_states, done,
                                          Gamma_V)

        return critic_loss, actor_loss

    def update_critic_TD(self, n_step_rewards, new_states, old_states, done,
                         Gamma_V):

        # Compute loss

        with torch.no_grad():
            V_trg = self.critic_trg(new_states).squeeze()
            if debug:
                print("V_trg.shape (after critic): ", V_trg.shape)
            V_trg = (1 - done) * Gamma_V * V_trg + n_step_rewards
            if debug:
                print("V_trg.shape (after sum): ", V_trg.shape)
            V_trg = V_trg.squeeze()
            if debug:
                print("V_trg.shape (after squeeze): ", V_trg.shape)

        if self.twin:
            V1, V2 = self.critic(old_states)
            loss1 = 0.5 * F.mse_loss(V1.squeeze(), V_trg)
            loss2 = 0.5 * F.mse_loss(V2.squeeze(), V_trg)
            loss = loss1 + loss2
        else:
            V = self.critic(old_states).squeeze()
            loss = F.mse_loss(V, V_trg)

        # Backpropagate and update

        self.critic_optim.zero_grad()
        loss.backward()
        self.critic_optim.step()

        # Update critic_target: (1-tau)*old + tau*new

        for trg_params, params in zip(self.critic_trg.parameters(),
                                      self.critic.parameters()):
            trg_params.data.copy_((1. - self.tau) * trg_params.data +
                                  self.tau * params.data)

        return loss.item()

    def update_actor_TD(self, n_step_rewards, log_probs, new_states,
                        old_states, done, Gamma_V):

        # Compute gradient

        if self.twin:
            V1, V2 = self.critic(old_states)
            V_pred = torch.min(V1.squeeze(), V2.squeeze())
            V1_new, V2_new = self.critic(new_states)
            V_new = torch.min(V1_new.squeeze(), V2_new.squeeze())
            V_trg = (1 - done) * Gamma_V * V_new + n_step_rewards
        else:
            V_pred = self.critic(old_states).squeeze()
            V_trg = (1 - done) * Gamma_V * self.critic(
                new_states).squeeze() + n_step_rewards

        A = V_trg - V_pred
        policy_gradient = -log_probs * A
        if debug:
            print("V_trg.shape: ", V_trg.shape)
            print("V_pred.shape: ", V_pred.shape)
            print("A.shape: ", A.shape)
            print("policy_gradient.shape: ", policy_gradient.shape)
        policy_grad = torch.sum(policy_gradient)

        # Backpropagate and update

        self.actor_optim.zero_grad()
        policy_grad.backward()
        self.actor_optim.step()

        return policy_grad.item()

    def compute_n_step_rewards(self, rewards):
        """
        Computes n-steps discounted reward padding with zeros the last elements of the trajectory.
        This means that the rewards considered are AT MOST n, but can be less for the last n-1 elements.
        """
        T = len(rewards)

        # concatenate n_steps zeros to the rewards -> they do not change the cumsum
        r = np.concatenate((rewards, [0 for _ in range(self.n_steps)]))

        Gamma = np.array([self.gamma**i for i in range(r.shape[0])])

        # reverse everything to use cumsum in right order, then reverse again
        Gt = np.cumsum(r[::-1] * Gamma[::-1])[::-1]

        G_nstep = Gt[:T] - Gt[
            self.n_steps:]  # compute n-steps discounted return

        Gamma = Gamma[:T]

        assert len(
            G_nstep) == T, "Something went wrong computing n-steps reward"

        n_steps_r = G_nstep / Gamma

        return n_steps_r

    def compute_n_step_states(self, states, done):
        """
        Computes n-steps target states (to be used by the critic as target values together with the
        n-steps discounted reward). For last n-1 elements the target state is the last one available.
        Adjusts also the `done` mask used for disabling the bootstrapping in the case of terminal states
        and returns Gamma_V, that are the discount factors for the target state-values, since they are 
        n-steps away (except for the last n-1 states, whose discount is adjusted accordingly).
        
        Return
        ------
        new_states, Gamma_V, done: arrays with first dimension = len(states)-1
        """

        # Compute indexes for (at most) n-step away states

        n_step_idx = np.arange(len(states) - 1) + self.n_steps
        diff = n_step_idx - len(states) + 1
        mask = (diff > 0)
        n_step_idx[mask] = len(states) - 1

        # Compute new states

        new_states = states[n_step_idx]

        # Compute discount factors

        pw = np.array([self.n_steps for _ in range(len(new_states))])
        pw[mask] = self.n_steps - diff[mask]
        Gamma_V = self.gamma**pw

        # Adjust done mask

        mask = (diff >= 0)
        done[mask] = done[-1]

        return new_states, Gamma_V, done

    def update_MC(self, rewards, log_probs, states, done, bootstrap=None):

        ### Compute MC discounted returns ###

        if bootstrap is not None:

            if bootstrap[-1]:

                last_state = torch.tensor(states[0, -1, :]).float().to(
                    self.device).view(1, -1)

                if self.twin:
                    V1, V2 = self.critic(last_state)
                    V_bootstrap = torch.min(V1,
                                            V2).cpu().detach().numpy().reshape(
                                                1, )
                else:
                    V_bootstrap = self.critic(
                        last_state).cpu().detach().numpy().reshape(1, )

                rewards = np.concatenate((rewards, V_bootstrap))

        Gamma = np.array([self.gamma**i for i in range(rewards.shape[0])])
        # reverse everything to use cumsum in right order, then reverse again
        Gt = np.cumsum(rewards[::-1] * Gamma[::-1])[::-1]
        # Rescale so that present reward is never discounted
        discounted_rewards = Gt / Gamma

        if bootstrap is not None:
            if bootstrap[-1]:
                discounted_rewards = discounted_rewards[:-1]  # drop last

        ### Wrap variables into tensors ###

        dr = torch.tensor(discounted_rewards).float().to(self.device)

        if self.discrete:
            old_states = torch.tensor(states[:-1]).to(self.device)
            new_states = torch.tensor(states[1:]).to(self.device)
        else:
            old_states = torch.tensor(states[:, :-1]).float().to(self.device)
            new_states = torch.tensor(states[:, 1:]).float().to(self.device)

        done = torch.LongTensor(done.astype(int)).to(self.device)
        log_probs = torch.stack(log_probs).to(self.device)

        ### Update critic and then actor ###

        critic_loss = self.update_critic_MC(dr, old_states)
        actor_loss = self.update_actor_MC(dr, log_probs, old_states)

        return critic_loss, actor_loss

    def update_critic_MC(self, dr, old_states):

        # Compute loss

        if self.twin:
            V1, V2 = self.critic(old_states)
            V_pred = torch.min(V1.squeeze(), V2.squeeze())
        else:
            V_pred = self.critic(old_states).squeeze()

        loss = F.mse_loss(V_pred, dr)

        # Backpropagate and update

        self.critic_optim.zero_grad()
        loss.backward()
        self.critic_optim.step()

        return loss.item()

    def update_actor_MC(self, dr, log_probs, old_states):

        # Compute gradient

        if self.twin:
            V1, V2 = self.critic(old_states)
            V_pred = torch.min(V1.squeeze(), V2.squeeze())
        else:
            V_pred = self.critic(old_states).squeeze()

        A = dr - V_pred
        policy_gradient = -log_probs * A
        policy_grad = torch.sum(policy_gradient)

        # Backpropagate and update

        self.actor_optim.zero_grad()
        policy_grad.backward()
        self.actor_optim.step()

        return policy_grad.item()
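To make compute_n_step_rewards concrete, here is a small standalone version with a numeric check. The helper name and the example numbers are mine and purely illustrative; the computation mirrors the method above.

import numpy as np

def n_step_returns(rewards, gamma, n_steps):
    # same idea as A2C.compute_n_step_rewards: pad with zeros, reversed cumsum, renormalize
    T = len(rewards)
    r = np.concatenate((rewards, np.zeros(n_steps)))
    Gamma = gamma ** np.arange(len(r))
    Gt = np.cumsum(r[::-1] * Gamma[::-1])[::-1]
    return (Gt[:T] - Gt[n_steps:]) / Gamma[:T]

# e.g. with gamma=0.5, n_steps=2 and rewards [1, 1, 1]:
# G_0 = 1 + 0.5*1 = 1.5, G_1 = 1 + 0.5*1 = 1.5, G_2 = 1 (no later reward left to add)
print(n_step_returns(np.array([1.0, 1.0, 1.0]), gamma=0.5, n_steps=2))  # [1.5 1.5 1. ]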
Exemple #17
0
class Agent():
    """ Interacts with and learns from the environment. """
    def __init__(self, state_size, action_size, fc1_units, fc2_units):
        """Initialize an Agent object.

        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        fc1_units (int): number of units in the first hidden layer
        fc2_units (int): number of units in the second hidden layer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = torch.manual_seed(SEED)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, fc1_units,
                                 fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, fc1_units,
                                  fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, fc1_units,
                                   fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, fc1_units,
                                    fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OrnsteinUhlenbeck(action_size, SEED)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, SEED,
                                   device)

    def step(self, time_step, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn only every N_TIME_STEPS
        if time_step % N_TIME_STEPS != 0:
            return

        # Learn if enough samples are available in replay buffer
        if len(self.memory) > BATCH_SIZE:
            for i in range(N_LEARN_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """ Returns actions for given state as per current policy. """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets from current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
        local_model: PyTorch model (weights will be copied from)
        target_model: PyTorch model (weights will be copied to)
        tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def store(self):
        torch.save(self.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(self.critic_local.state_dict(), 'checkpoint_critic.pth')

    def load(self):
        if os.path.isfile('checkpoint_actor.pth') and os.path.isfile(
                'checkpoint_critic.pth'):
            print("=> loading checkpoints for Actor and Critic... ")
            self.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
            self.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))
            print("done !")
        else:
            print("no checkpoints found for Actor and Critic...")
Exemple #18
0
class Agent():
    def __init__(self, state_size, action_size):
        super().__init__()
        gpu = torch.cuda.is_available()
        if (gpu):
            print('GPU/CUDA works! Happy fast training :)')
            torch.cuda.current_device()
            torch.cuda.empty_cache()
            self.device = torch.device("cuda")
        else:
            print('training on cpu...')
            self.device = torch.device("cpu")

        self.actor = Actor(state_size, action_size).to(self.device)
        self.actor_target = Actor(state_size, action_size).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)
        self.critic = Critic(state_size, action_size).to(self.device)
        self.critic_target = Critic(state_size, action_size).to(self.device)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=0.001,
                                       weight_decay=0)
        self.replay_buffer = deque(maxlen=1000000)  #1m
        self.gamma = 0.95  #0.99
        self.batch_size = 128
        self.tau = 0.001
        self.seed = random.seed(2)
        self.noise = OUNoise((20, action_size), 2)
        self.target_network_update(self.actor_target, self.actor, 1.0)
        self.target_network_update(self.critic_target, self.critic, 1.0)

    def select_actions(self, state):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(state).cpu().data.numpy()
        self.actor.train()
        actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def add(self, sars):
        self.replay_buffer.append(sars)

    def train(self):
        if (len(self.replay_buffer) > self.batch_size):
            states, actions, rewards, next_states, dones = self.sample()
            next_actions = self.actor_target(next_states)
            next_state_q_v = self.critic_target(next_states, next_actions)
            #print(next_state_q_v)
            q_targets = rewards + (self.gamma * next_state_q_v * (1 - dones))
            current_q_v = self.critic(states, actions)
            critic_loss = F.mse_loss(current_q_v, q_targets)
            self.critic_optim.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
            self.critic_optim.step()

            actions = self.actor(states)
            actor_loss = -self.critic(states, actions).mean()
            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()
            self.target_network_update(self.actor_target, self.actor, self.tau)
            self.target_network_update(self.critic_target, self.critic,
                                       self.tau)

    def target_network_update(self, target_network, network, tau):
        for network_param, target_param in zip(network.parameters(),
                                               target_network.parameters()):
            target_param.data.copy_(tau * network_param.data +
                                    (1.0 - tau) * target_param.data)

    def sample(self):
        samples = random.sample(self.replay_buffer, k=self.batch_size)
        states = torch.tensor([s[0] for s in samples]).float().to(self.device)
        actions = torch.tensor([s[1] for s in samples]).float().to(self.device)
        rewards = torch.tensor([s[2] for s in samples
                                ]).float().unsqueeze(1).to(self.device)
        next_states = torch.tensor([s[3]
                                    for s in samples]).float().to(self.device)
        dones = torch.tensor([s[4] for s in samples
                              ]).float().unsqueeze(1).to(self.device)
        return states, actions, rewards, next_states, dones
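A sketch of how this agent might be driven by an environment loop, assuming a Gym-style env with the classic (obs, reward, done, info) step API. The function name is mine; the agent above was written with 20 parallel arms in mind (see the OUNoise shape), so state and reward shapes may need adjusting for a single-agent environment.

def run(env, agent, n_episodes=200, max_steps=1000):
    # hypothetical driver loop: collect transitions, store them, and train every step
    for ep in range(n_episodes):
        state = env.reset()
        agent.reset()
        ep_reward = 0.0
        for _ in range(max_steps):
            action = agent.select_actions(state)
            next_state, reward, done, _ = env.step(action)
            agent.add((state, action, reward, next_state, float(done)))
            agent.train()
            state = next_state
            ep_reward += reward
            if done:
                break
        print('episode {}: reward {:.2f}'.format(ep, ep_reward))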
Exemple #19
0
class PolicyGradEnt():
    """
    Implements an RL agent with policy gradient method.
    
    Notes
    -----
    GPU implementation is just sketched; it works but it's slower than with CPU.
    """
    def __init__(self,
                 observation_space,
                 action_space,
                 lr,
                 gamma,
                 H,
                 discrete=True,
                 project_dim=4,
                 device='cpu'):
        """
        Parameters
        ----------
        observation_space: int
            Number of flattened entries of the state
        action_space: int
            Number of (discrete) possible actions to take
        """

        self.gamma = gamma
        self.lr = lr
        self.H = H  # entropy coeff

        self.n_actions = action_space
        self.discrete = discrete
        if self.discrete:
            self.net = Actor(observation_space, action_space, discrete,
                             project_dim)
        else:
            self.net = Actor(observation_space, action_space, discrete)
        self.optim = torch.optim.Adam(self.net.parameters(), lr=self.lr)

        self.device = device
        self.net.to(self.device)  # move network to device

    def get_action(self, state, return_log=False):
        log_probs = self.forward(state)
        dist = torch.exp(log_probs)

        probs = Categorical(dist)
        action = probs.sample().item()

        if return_log:
            return action, log_probs.view(-1)[action], dist
        else:
            return action

    def forward(self, state):
        if self.discrete:
            state = torch.from_numpy(state).to(self.device)
        else:
            state = torch.from_numpy(state).float().unsqueeze(0).to(
                self.device)
        return self.net(state)

    def update(self, rewards, log_probs, distributions):

        ### Compute MC discounted returns ###

        Gamma = np.array([self.gamma**i for i in range(rewards.shape[0])])
        # reverse everything to use cumsum in right order, then reverse again
        Gt = np.cumsum(rewards[::-1] * Gamma[::-1])[::-1]
        # Rescale so that present reward is never discounted
        discounted_rewards = Gt / Gamma

        dr = torch.tensor(discounted_rewards).to(self.device)
        dr = (dr - dr.mean()) / dr.std()

        policy_gradient = []
        for log_prob, Gt in zip(log_probs, dr):
            policy_gradient.append(
                -log_prob * Gt)  # "-" for minimization instead of maximization

        distributions = torch.stack(distributions).squeeze()  # shape = (T,2)
        # Compute negative entropy (no - in front)
        entropy = torch.sum(distributions * torch.log(distributions),
                            axis=1).sum()
        policy_grad = torch.stack(policy_gradient).sum()
        loss = policy_grad + self.H * entropy

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        return policy_grad.item()
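The entropy term above is the sum of p * log(p) over actions, i.e. the negative entropy, so adding H * entropy to the loss and minimizing pushes the policy toward higher entropy. A tiny standalone check of that computation (the probability values are illustrative only):

import torch

# negative entropy of a batch of action distributions, as in PolicyGradEnt.update (sketch)
dists = torch.tensor([[0.5, 0.5],
                      [0.9, 0.1]])
neg_entropy = torch.sum(dists * torch.log(dists), axis=1)
print(neg_entropy)  # ~[-0.6931, -0.3251]: the uniform row is more negative, i.e. higher entropy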
Exemple #20
0
class DDPG():
    def __init__(self,
                 env,
                 log_dir,
                 gamma=0.99,
                 batch_size=64,
                 sigma=0.2,
                 batch_norm=True,
                 merge_layer=2,
                 buffer_size=int(1e6),
                 buffer_min=int(1e4),
                 tau=1e-3,
                 Q_wd=1e-2,
                 num_episodes=1000):

        self.s_dim = env.reset().shape[0]
        # self.a_dim = env.action_space.shape[0]
        self.a_dim = env.action_space2.shape[0]
        # self.a_dim = 1

        self.env = env
        # self.mu = Actor(self.s_dim, self.a_dim, env.action_space, batch_norm=batch_norm)
        self.mu = Actor(self.s_dim,
                        self.a_dim,
                        env.action_space2,
                        batch_norm=batch_norm)
        self.Q = Critic(self.s_dim,
                        self.a_dim,
                        batch_norm=batch_norm,
                        merge_layer=merge_layer)
        self.targ_mu = copy.deepcopy(self.mu).eval()
        self.targ_Q = copy.deepcopy(self.Q).eval()
        self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim),
                                       sigma=sigma * torch.ones(self.a_dim))
        self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim)
        self.buffer_min = buffer_min
        self.mse_fn = torch.nn.MSELoss()
        self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=1e-3,
                                            weight_decay=Q_wd)

        self.gamma = gamma
        self.batch_size = batch_size
        self.num_episodes = num_episodes
        self.tau = tau
        self.log_dir = log_dir

        self.fill_buffer()

    #updates the target network to slowly track the main network
    def track_network(self, target, main):
        with torch.no_grad():
            for pt, pm in zip(target.parameters(), main.parameters()):
                pt.data.copy_(self.tau * pm.data + (1 - self.tau) * pt.data)

    # updates the target nets to slowly track the main ones
    def track_networks(self):
        self.track_network(self.targ_mu, self.mu)
        self.track_network(self.targ_Q, self.Q)

    def run_episode(self):
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        t = 0
        tot_r = 0
        while not done:

            self.mu = self.mu.eval()
            # a_ = torch.squeeze(self.mu(s)).detach().numpy()
            a = torch.squeeze(self.mu(s)).detach().numpy()
            # print("a {}\n".format(a))

            self.mu = self.mu.train()

            ac_noise = self.noise().detach().numpy()
            a = a + ac_noise
            # print("ac_noise {}\n".format(ac_noise))
            # print("a+ac_noise {}\n".format(a))

            if a < self.env.action_space2.low:
                a = self.env.action_space2.low
            elif a > self.env.action_space2.high:
                a = self.env.action_space2.high

            s = s.detach().numpy()

            a_updated = self.LQR(s, a)
            # s_p, r, done, _ = self.env.step(a)
            s_p, r, done, _ = self.env.step(a_updated)

            tot_r += r
            self.buffer.add_tuple(s, a, r, s_p, done)

            s_batch, a_batch, r_batch, s_p_batch, done_batch = self.buffer.sample(
                batch_size=self.batch_size)

            # update critic
            with torch.no_grad():
                q_p_pred = self.targ_Q(s_p_batch, self.targ_mu(s_p_batch))
                q_p_pred = torch.squeeze(q_p_pred)
                y = r_batch + (1.0 - done_batch) * self.gamma * q_p_pred
            self.Q_optimizer.zero_grad()
            q_pred = self.Q(s_batch, a_batch)
            q_pred = torch.squeeze(q_pred)
            #print(torch.mean(q_pred))
            Q_loss = self.mse_fn(q_pred, y)
            Q_loss.backward(retain_graph=False)
            self.Q_optimizer.step()

            # update actor
            self.mu_optimizer.zero_grad()
            q_pred_mu = self.Q(s_batch, self.mu(s_batch))
            q_pred_mu = torch.squeeze(q_pred_mu)
            #print(torch.mean(q_pred_mu))
            mu_loss = -torch.mean(q_pred_mu)
            # print(mu_loss)
            mu_loss.backward(retain_graph=False)
            #print(torch.sum(self.mu.layers[0].weight.grad))
            self.mu_optimizer.step()
            self.track_networks()

            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
            t += 1
        return tot_r, t

    def train(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 10 == 0:
                torch.save(self.mu, self.log_dir + '/models/model_' + str(i))
        np.save(self.log_dir + '/results_train.npy', np.array(results))

    def train1(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 10 == 0:
                torch.save(self.mu, self.log_dir + '/models1/model_' + str(i))
        np.save(self.log_dir + '/results_train1.npy', np.array(results))

    def train2(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 10 == 0:
                torch.save(self.mu, self.log_dir + '/models2/model_' + str(i))
        np.save(self.log_dir + '/results_train2.npy', np.array(results))

    def train3(self):
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])

            if i % 10 == 0:
                torch.save(self.mu, self.log_dir + '/models3/model_' + str(i))
        np.save(self.log_dir + '/results_train3.npy', np.array(results))

    def eval_all(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval.npy', np.array(results))

    def eval_all1(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval1.npy', np.array(results))

    def eval_all2(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval2.npy', np.array(results))

    def eval_all3(self, model_dir, num_eps=5):
        results = []

        for model_fname in sorted(os.listdir(model_dir),
                                  key=lambda x: int(x.split('_')[1])):
            print(model_fname)
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
        np.save(self.log_dir + '/results_eval3.npy', np.array(results))

    def eval(self, num_eps=10, mu=None):
        if mu is None:
            mu = self.mu

        results = []
        mu = mu.eval()
        for i in range(num_eps):
            r, t = self.run_eval_episode(mu=mu)
            results.append([r, t])
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
        return np.mean(results, axis=0)

    def run_eval_episode(self, mu=None):
        if mu is None:
            mu = self.mu
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        tot_r = t = 0
        while not done:
            a = mu(s).view(-1).detach().numpy()

            a_updated = self.LQR(s, a)
            # s_p, r, done, _ = self.env.step(a)
            s_p, r, done, _ = self.env.step(a_updated)

            tot_r += r
            t += 1
            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
        return tot_r, t

    def LQR(self, s, a):

        FPS = 50
        SCALE = 30.0  # affects how fast-paced the game is, forces should be adjusted as well
        VIEWPORT_W = 600
        VIEWPORT_H = 400

        gravity = 9.8 / FPS / FPS  # gravity is enhanced by scaling
        thrust_main_max = gravity / 0.56
        thrust_side_max = thrust_main_max * 0.095 / 0.7  # m/frame^2 # determined by test
        m_main_inv = thrust_main_max  # gravity*0.57
        m_side_inv = thrust_side_max  # gravity*0.225
        a_i_inv = 0.198 / 100  # rad/frame^2 # determined by test # not depend on SCALE
        align = 0.87  # 0.87 = sin30

        # target point set
        x_target = 0
        y_target = 0  # the landing point is 0
        Vx_target = 0
        Vy_target = 0
        theta_target = 0
        omega_target = 0

        if a < self.env.action_space2.low:
            a = self.env.action_space2.low
        elif a > self.env.action_space2.high:
            a = self.env.action_space2.high

        a_float = float(a)
        y_target = s[1] * (VIEWPORT_H / SCALE /
                           2) / a_float  # a_float = 1.6 succeeds every time

        X = np.array([ \
        [s[0]*(VIEWPORT_W/SCALE/2)-x_target], \
        [s[1]*(VIEWPORT_H/SCALE/2)-y_target], \
        [s[2]/(VIEWPORT_W/SCALE/2)-Vx_target], \
        [s[3]/(VIEWPORT_H/SCALE/2)-Vy_target], \
        [s[4]-theta_target], \
        [s[5]/20.0-omega_target]])

        # print("X {}\n".format(X))

        A = np.array([ \
        [0, 0, 1, 0, 0, 0], \
        [0, 0, 0, 1, 0, 0], \
        [0, 0, 0, 0, -1*gravity, 0], \
        [0, 0, 0, 0, 0, 0], \
        [0, 0, 0, 0, 0, 1], \
        [0, 0, 0, 0, 0, 0]])

        B = np.array([ \
        [0, 0], \
        [0, 0], \
        [0, m_side_inv*align], \
        [1*m_main_inv, 0], \
        [0, 0], \
        [0, -1*a_i_inv]])

        sigma = np.array([ \
        [0], \
        [0], \
        [0], \
        [-1*gravity], \
        [0], \
        [0]])

        # gravity compensation
        BTB = np.dot(B.T, B)
        u_sigma = -1 * np.linalg.inv(BTB).dot(B.T).dot(sigma)
        # print("u_sigma {}\n".format(u_sigma))

        # Design of LQR
        # Solve the continuous-time Riccati equation to find an optimal control input
        R = np.array([ \
        [1, 0], \
        [0, 1]])

        Q = np.array([ \
        [1, 0, 0, 0, 0, 0], \
        [0, 1, 0, 0, 0, 0], \
        [0, 0, 1, 0, 0, 0], \
        [0, 0, 0, 1, 0, 0], \
        [0, 0, 0, 0, 100, 0], \
        [0, 0, 0, 0, 0, 100]])

        # Solving Riccati equation
        P = sp.linalg.solve_continuous_are(A, B, Q, R)
        # print("P {}\n".format(P))

        # u = -K X + u_sigma
        # K = R^-1 * B^T * P
        K = np.linalg.inv(R).dot(B.T).dot(P)
        thrust = -1 * np.dot(K, X) + u_sigma

        BK = np.dot(B, K)
        A_ = A - BK
        a_eig = np.linalg.eig(A_)
        a_sort = np.sort(a_eig[0])
        # print("eigen values {}\n".format(a_sort))

        # print("thrust {}\n".format(thrust))
        # thrust[0] = 0
        # thrust[1] = 1

        if s[1] < 0.3 / SCALE:
            thrust[0] = 0
            thrust[1] = 0

        # map the main-thruster command (0..1 of max thrust) to the env's [-1, 1] action convention
        thrust[0] = thrust[0] / 0.5 - 1.0

        if self.env.continuous:
            a_updated = np.array([thrust[0], thrust[1]])
            # print("a_updated {}\n".format(a_updated))
            # a = (0.5, 0)
            a_updated = np.clip(
                a_updated, -1,
                +1)  #  if the value is less than 0.5, it's ignored
            # print("a_updated * {}\n".format(a_updated))
        else:
            print("please change to cts mode")

        return a_updated

    def fill_buffer(self):
        print('Filling buffer')
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)

        temp_number = 0

        while self.buffer.size < self.buffer_min:

            # self.action_space = spaces.Box(-1, +1, (2,), dtype=np.float32)
            a = np.random.uniform(self.env.action_space2.low,
                                  self.env.action_space2.high,
                                  size=(self.a_dim))
            a_updated = self.LQR(s, a)

            if temp_number < 3:
                print("a {}\n".format(a), "actions:",
                      "{} {}".format(a_updated[0], a_updated[1]))
                # print("a_updated*** {}\n".format(a_updated))
                temp_number += 1

            # s_p, r, done, _ = self.env.step(a)
            s_p, r, done, _ = self.env.step(a_updated)

            self.buffer.add_tuple(s, a, r, s_p, done)

            if done:
                s = torch.tensor(self.env.reset().astype(np.float32),
                                 requires_grad=False)
            else:
                s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
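
A minimal standalone sketch of the LQR design used in LQR() above, assuming only numpy and scipy and reusing the constants hard-coded in that method: solve the continuous-time algebraic Riccati equation, form the gain K = R^-1 * B^T * P and the least-squares gravity feed-forward u_sigma, and verify that the closed-loop matrix A - BK is stable.

import numpy as np
import scipy.linalg

FPS = 50
gravity = 9.8 / FPS / FPS
thrust_main_max = gravity / 0.56
thrust_side_max = thrust_main_max * 0.095 / 0.7
m_main_inv, m_side_inv = thrust_main_max, thrust_side_max
a_i_inv, align = 0.198 / 100, 0.87

A = np.array([[0, 0, 1, 0, 0, 0],
              [0, 0, 0, 1, 0, 0],
              [0, 0, 0, 0, -gravity, 0],
              [0, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 1],
              [0, 0, 0, 0, 0, 0]], dtype=float)
B = np.array([[0, 0],
              [0, 0],
              [0, m_side_inv * align],
              [m_main_inv, 0],
              [0, 0],
              [0, -a_i_inv]], dtype=float)
sigma = np.array([[0.0], [0.0], [0.0], [-gravity], [0.0], [0.0]])
Q = np.diag([1.0, 1.0, 1.0, 1.0, 100.0, 100.0])
R = np.eye(2)

# gravity compensation: least-squares solution of B u = -sigma
u_sigma = -np.linalg.inv(B.T @ B) @ B.T @ sigma

# LQR gain from the continuous-time algebraic Riccati equation
P = scipy.linalg.solve_continuous_are(A, B, Q, R)
K = np.linalg.inv(R) @ B.T @ P          # control law: u = -K @ X + u_sigma

eigs = np.linalg.eigvals(A - B @ K)
print("closed-loop eigenvalues (real parts):", np.sort(eigs.real))
assert np.all(eigs.real < 0), "A - BK should be Hurwitz"
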
Exemple #21
0
class DyNODESacAgent(object):
    """DyNODE-SAC."""
    def __init__(self,
                 obs_shape,
                 action_shape,
                 device,
                 model_kind,
                 kind='D',
                 step_MVE=5,
                 hidden_dim=256,
                 discount=0.99,
                 init_temperature=0.01,
                 alpha_lr=1e-3,
                 alpha_beta=0.9,
                 actor_lr=1e-3,
                 actor_beta=0.9,
                 actor_log_std_min=-10,
                 actor_log_std_max=2,
                 critic_lr=1e-3,
                 critic_beta=0.9,
                 critic_tau=0.005,
                 critic_target_update_freq=2,
                 model_lr=1e-3,
                 log_interval=100):

        self.device = device
        self.discount = discount
        self.critic_tau = critic_tau
        self.critic_target_update_freq = critic_target_update_freq
        self.log_interval = log_interval
        self.step_MVE = step_MVE
        self.model_kind = model_kind

        self.actor = Actor(obs_shape, action_shape, hidden_dim,
                           actor_log_std_min, actor_log_std_max).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr,
                                                betas=(actor_beta, 0.999))

        self.critic = Critic(obs_shape, action_shape, hidden_dim).to(device)
        self.critic_target = Critic(obs_shape, action_shape,
                                    hidden_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr,
                                                 betas=(critic_beta, 0.999))

        self.log_alpha = torch.tensor(np.log(init_temperature)).to(device)
        self.log_alpha.requires_grad = True
        self.target_entropy = -np.prod(
            action_shape)  # set target entropy to -|A|
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                    lr=alpha_lr,
                                                    betas=(alpha_beta, 0.999))

        if self.model_kind == 'dynode_model':
            self.model = DyNODE(obs_shape,
                                action_shape,
                                hidden_dim_p=200,
                                hidden_dim_r=200).to(device)
        elif self.model_kind == 'nn_model':
            self.model = NN_Model(obs_shape,
                                  action_shape,
                                  hidden_dim_p=200,
                                  hidden_dim_r=200,
                                  kind=kind).to(device)
        else:
            raise ValueError('model is not supported')

        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=model_lr)

        self.train()
        self.critic_target.train()

    def train(self, training=True):
        self.training = training
        self.actor.train(training)
        self.critic.train(training)
        self.model.train(training)

    @property
    def alpha(self):
        return self.log_alpha.exp()

    def select_action(self, obs):
        with torch.no_grad():
            obs = torch.FloatTensor(obs).to(self.device)
            obs = obs.unsqueeze(0)
            mu, _, _, _ = self.actor(obs,
                                     compute_pi=False,
                                     compute_log_pi=False)
            return mu.cpu().data.numpy().flatten()

    def sample_action(self, obs):
        with torch.no_grad():
            obs = torch.FloatTensor(obs).to(self.device)
            obs = obs.unsqueeze(0)
            mu, pi, _, _ = self.actor(obs, compute_log_pi=False)
            return pi.cpu().data.numpy().flatten()

    def update_model(self, replay_buffer, L, step):

        if self.model_kind == 'dynode_model':
            obs_m, action_m, reward_m, next_obs_m, _ = replay_buffer.sample_dynode(
            )
            transition_loss, reward_loss = self.model.loss(
                obs_m, action_m, reward_m, next_obs_m)
            model_loss = transition_loss + reward_loss
        elif self.model_kind == 'nn_model':
            obs, action, reward, next_obs, _ = replay_buffer.sample()
            transition_loss, reward_loss = self.model.loss(
                obs, action, reward, next_obs)
            model_loss = transition_loss + reward_loss
        else:
            raise ValueError('model is not supported')

        # Optimize the Model
        self.model_optimizer.zero_grad()
        model_loss.backward()
        self.model_optimizer.step()

        if step % self.log_interval == 0:
            L.log('train/model_loss', model_loss, step)

    def MVE_prediction(self, replay_buffer, L, step):

        obs, action, reward, next_obs, not_done = replay_buffer.sample()

        trajectory = []
        next_ob = next_obs
        with torch.no_grad():
            while len(trajectory) < self.step_MVE:
                ob = next_ob
                _, act, _, _ = self.actor(ob)
                rew, next_ob = self.model(ob, act)
                trajectory.append([ob, act, rew, next_ob])

            _, next_action, log_pi, _ = self.actor(next_ob)
            target_Q1, target_Q2 = self.critic_target(next_ob, next_action)
            ret = torch.min(target_Q1,
                            target_Q2) - self.alpha.detach() * log_pi

        critic_loss = 0
        for ob, act, rew, _ in reversed(trajectory):
            current_Q1, current_Q2 = self.critic(ob, act)
            ret = rew + self.discount * ret
            # critic_loss = critic_loss + utils.huber(current_Q1 - ret).mean() + utils.huber(current_Q2 - ret).mean()
            critic_loss = critic_loss + F.mse_loss(
                current_Q1, ret) + F.mse_loss(current_Q2, ret)
        current_Q1, current_Q2 = self.critic(obs, action)
        ret = reward + self.discount * ret
        # critic_loss = critic_loss + utils.huber(current_Q1 - ret).mean() + utils.huber(current_Q2 - ret).mean()
        critic_loss = critic_loss + F.mse_loss(current_Q1, ret) + F.mse_loss(
            current_Q2, ret)
        critic_loss = critic_loss / (self.step_MVE + 1)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # actor
        _, pi, log_pi, log_std = self.actor(obs)
        actor_Q1, actor_Q2 = self.critic(obs.detach(), pi)
        actor_Q = torch.min(actor_Q1, actor_Q2)
        actor_loss = (self.alpha.detach() * log_pi - actor_Q).mean()

        # optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.log_alpha_optimizer.zero_grad()
        alpha_loss = (self.alpha *
                      (-log_pi - self.target_entropy).detach()).mean()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

    def update_critic(self, obs, action, reward, next_obs, not_done, L, step):
        with torch.no_grad():
            _, policy_action, log_pi, _ = self.actor(next_obs)
            target_Q1, target_Q2 = self.critic_target(next_obs, policy_action)
            target_V = torch.min(target_Q1,
                                 target_Q2) - self.alpha.detach() * log_pi
            target_Q = reward + (not_done * self.discount * target_V)

        # get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)
        if step % self.log_interval == 0:
            L.log('train_critic/loss', critic_loss, step)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.critic.log(L, step)

    def update_actor_and_alpha(self, obs, L, step):
        _, pi, log_pi, log_std = self.actor(obs)
        actor_Q1, actor_Q2 = self.critic(obs, pi)

        actor_Q = torch.min(actor_Q1, actor_Q2)
        actor_loss = (self.alpha.detach() * log_pi - actor_Q).mean()

        if step % self.log_interval == 0:
            L.log('train_actor/loss', actor_loss, step)
            L.log('train_actor/target_entropy', self.target_entropy, step)
        entropy = 0.5 * log_std.shape[1] * (
            1.0 + np.log(2 * np.pi)) + log_std.sum(dim=-1)
        if step % self.log_interval == 0:
            L.log('train_actor/entropy', entropy.mean(), step)

        # optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.actor.log(L, step)

        self.log_alpha_optimizer.zero_grad()
        alpha_loss = (self.alpha *
                      (-log_pi - self.target_entropy).detach()).mean()
        if step % self.log_interval == 0:
            L.log('train_alpha/loss', alpha_loss, step)
            L.log('train_alpha/value', self.alpha, step)
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

    def update(self, replay_buffer, L, step):

        if step < 2000:
            for _ in range(2):
                obs, action, reward, next_obs, not_done = replay_buffer.sample(
                )
                self.update_critic(obs, action, reward, next_obs, not_done, L,
                                   step)
                self.update_actor_and_alpha(obs, L, step)

            if step % self.log_interval == 0:
                L.log('train/batch_reward', reward.mean(), step)

        else:
            obs, action, reward, next_obs, not_done = replay_buffer.sample()

            if step % self.log_interval == 0:
                L.log('train/batch_reward', reward.mean(), step)

            self.MVE_prediction(replay_buffer, L, step)
            self.update_critic(obs, action, reward, next_obs, not_done, L,
                               step)
            self.update_actor_and_alpha(obs, L, step)

        if step % self.critic_target_update_freq == 0:
            utils.soft_update_params(self.critic.Q1, self.critic_target.Q1,
                                     self.critic_tau)
            utils.soft_update_params(self.critic.Q2, self.critic_target.Q2,
                                     self.critic_tau)

    def save(self, model_dir, step):
        torch.save(self.actor.state_dict(),
                   '%s/actor_%s.pt' % (model_dir, step))
        torch.save(self.critic.state_dict(),
                   '%s/critic_%s.pt' % (model_dir, step))

    def save_model(self, model_dir, step):
        torch.save(self.model.state_dict(),
                   '%s/model_%s.pt' % (model_dir, step))

    def load(self, model_dir, step):
        self.actor.load_state_dict(
            torch.load('%s/actor_%s.pt' % (model_dir, step)))
        self.critic.load_state_dict(
            torch.load('%s/critic_%s.pt' % (model_dir, step)))
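
MVE_prediction() above builds its critic targets by rolling the learned model forward for step_MVE steps and accumulating the return backwards. A small numeric check (plain floats, no networks; every value below is made up for illustration) that the backward recursion ret = rew + discount * ret reproduces the explicit k-step return r_0 + gamma*r_1 + ... + gamma**k * r_k + gamma**(k+1) * V(s_{k+1}):

import numpy as np

discount = 0.99
rewards = [1.0, 0.5, -0.2, 0.3]   # r_1..r_k from the learned model (hypothetical values)
r0 = 2.0                          # reward of the real transition from the replay buffer
terminal_value = 10.0             # min(Q1, Q2) - alpha * log_pi at the last imagined state

# backward pass, exactly as in MVE_prediction()
ret = terminal_value
for rew in reversed(rewards):
    ret = rew + discount * ret
ret = r0 + discount * ret

# explicit k-step return for comparison
k = len(rewards)
explicit = r0 + sum(discount ** (i + 1) * r for i, r in enumerate(rewards)) \
           + discount ** (k + 1) * terminal_value
assert np.isclose(ret, explicit)
print(ret, explicit)
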
class A2C_v1():
    """
    Implements an Advantage Actor-Critic (A2C) agent that updates from full episode trajectories.
    
    Notes
    -----
    The GPU implementation is only sketched; it works, but it is slower than running on the CPU.
    """
    
    def __init__(self, observation_space, action_space, lr, gamma, 
                 device='cpu', discrete=False, project_dim=8):
        """
        Parameters
        ----------
        observation_space: int
            Number of flattened entries of the state
        action_space: int
            Number of (discrete) possible actions to take
        """
        
        self.gamma = gamma
        self.lr = lr
        
        self.n_actions = action_space
        self.discrete = discrete
        if self.discrete:
            self.actor = DiscreteActor(observation_space, action_space, project_dim)
            self.critic = DiscreteCritic(observation_space, project_dim)
        else:
            self.actor = Actor(observation_space, action_space)
            self.critic = Critic(observation_space)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr)
        
        self.device = device 
        ### Not implemented ###
        #self.actor.to(self.device) # move network to device
        #self.critic.to(self.device)
        
    def get_action(self, state, return_log=False):
        log_probs = self.forward(state)
        dist = torch.exp(log_probs)
        probs = Categorical(dist)
        action =  probs.sample().item()
        if return_log:
            return action, log_probs.view(-1)[action]
        else:
            return action
    
    def forward(self, state):
        if self.discrete:
            state = torch.from_numpy(state)
        else:
            state = torch.from_numpy(state).float().unsqueeze(0) 
        log_probs = self.actor(state)
        return log_probs
    
    def update(self, rewards, log_probs, states, done):
        # Wrap variables in tensors
        if self.discrete:
            old_states = torch.tensor(states[:,:-1])
            new_states = torch.tensor(states[:,1:])
        else:
            old_states = torch.tensor(states[:,:-1]).float()
            new_states = torch.tensor(states[:,1:]).float()
        done = torch.LongTensor(done.astype(int))
        #log_probs = torch.tensor(log_probs.astype(float)) ### ERROR HERE
        log_probs = torch.stack(log_probs)
        # Update critic and then actor
        self.update_critic(rewards, new_states, old_states, done)
        self.update_actor(rewards, log_probs, new_states, old_states)
        return
    
    def update_critic(self, rewards, new_states, old_states, done):
        """
        Minimize sum_{t=0}^{T-1} (rewards[t] + gamma * V(new_states[t]) - V(old_states[t]))**2,
        where V(state) is the prediction of the critic.
        
        Parameters
        ----------
        reward: shape (T,)
        old_states, new_states: shape (T, observation_space)
        """
        rewards = torch.tensor(rewards)    #.to(self.device)
        #print("rewards.shape ", rewards.shape)
        # Predictions
        V_pred = self.critic(old_states).squeeze()
        #print("V_pred.shape ", V_pred.shape)
        # Targets
        V_trg = self.critic(new_states).squeeze().detach()
        #print("V_trg.shape ", V_trg.shape)
        V_trg = (1-done)*self.gamma*V_trg + rewards
        #print("V_trg.shape ", V_trg.shape)
        # MSE loss
        loss = torch.sum((V_pred - V_trg)**2)
        # backprop and update
        self.critic_optim.zero_grad()
        loss.backward()
        self.critic_optim.step()
        return
    
    def update_actor(self, rewards, log_probs, new_states, old_states):
        # Discount factors
        Gamma = np.array([self.gamma**i for i in range(rewards.shape[1])]).reshape(1,-1)
        # reverse everything to use cumsum in right order, then reverse again
        Gt = np.cumsum(rewards[:,::-1]*Gamma[:,::-1], axis=1)[:,::-1]
        # Rescale so that present reward is never discounted
        discounted_rewards =  Gt/Gamma
        # Wrap into tensor
        dr = torch.tensor(discounted_rewards).float()    #.to(self.device)
        #print("dr ", dr.shape)
        # Get value as baseline
        V = self.critic(old_states).squeeze()
        # Compute advantage as total (discounted) return - value
        A = dr - V 
        # Rescale to unitary variance for a trajectory (axis=1)
        #A = (A - A.mean(axis=1).unsqueeze(1))/(A.std(axis=1).unsqueeze(1))
        #print("A ", A.shape)
        #print("log_probs ", log_probs.shape)
        # Compute - gradient
        policy_gradient = - log_probs*A
        #print("policy_gradient ", policy_gradient.shape)
        # Use it as loss
        policy_grad = torch.sum(policy_gradient)
        # backprop and update
        self.actor_optim.zero_grad()
        policy_grad.backward()
        self.actor_optim.step()
        return
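
update_actor() in A2C_v1 above computes reward-to-go returns with a reversed cumulative sum divided by the discount vector Gamma. A quick check (illustration only, with an arbitrary reward sequence) that Gt / Gamma equals the explicit per-step return sum_{k>=t} gamma**(k-t) * r_k:

import numpy as np

gamma = 0.9
rewards = np.array([[1.0, 0.0, 2.0, -1.0, 0.5]])          # one trajectory, shape (1, T)
Gamma = np.array([gamma**i for i in range(rewards.shape[1])]).reshape(1, -1)

# reverse, accumulate, reverse again, then undo the absolute discounting
Gt = np.cumsum(rewards[:, ::-1] * Gamma[:, ::-1], axis=1)[:, ::-1]
discounted_rewards = Gt / Gamma

# explicit reward-to-go for comparison
T = rewards.shape[1]
explicit = np.array([[sum(gamma**(k - t) * rewards[0, k] for k in range(t, T))
                      for t in range(T)]])
assert np.allclose(discounted_rewards, explicit)
print(discounted_rewards)
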
class TD3MultiAgent:
    def __init__(self):
      
        self.max_action = 1
        self.policy_freq = 2
        self.policy_freq_it = 0
        self.batch_size = 512
        self.discount = 0.99
        self.buffer_size = int(1e5)
        
        
        self.device = 'cuda'
        
        self.state_dim = 24
        self.action_dim = 2
        self.max_action = 1
        self.policy_noise = 0.1
        self.agents = 1
        
        self.random_period = 1e4
        
        self.tau = 5e-3
        
        self.replay_buffer = ReplayBuffer(self.buffer_size)
        
        self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
#         self.actor.load_state_dict(torch.load('actor2.pth'))
#         self.actor_target.load_state_dict(torch.load('actor2.pth'))

        self.noise = OUNoise(2, 32)
        
        
        self.critic = Critic(48, self.action_dim).to(self.device)
        self.critic_target = Critic(48, self.action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

    
    def select_action_with_noise(self, state, i):
        ratio = len(self.replay_buffer)/self.random_period

        if len(self.replay_buffer)>self.random_period:
            
            state = torch.FloatTensor(state[i,:]).to(self.device)
            action = self.actor(state).cpu().data.numpy()

            if self.policy_noise != 0: 
                action = (action + self.noise.sample())
            return action.clip(-self.max_action,self.max_action)
        
        else:
            q= self.noise.sample()
            return q
   
    
    def step(self, i):
        if len(self.replay_buffer)>self.random_period/2:
            # Sample mini batch
#         if True:
            s, a, r, s_, d = self.replay_buffer.sample(self.batch_size)
            
            state = torch.FloatTensor(s[:,i,:]).to(self.device)
            action = torch.FloatTensor(a[:,i,:]).to(self.device)
            next_state = torch.FloatTensor(s_[:,i,:]).to(self.device)
            
            a_state = torch.FloatTensor(s).to(self.device).reshape(-1,48)
            a_action = torch.FloatTensor(a).to(self.device).reshape(-1,4)
            a_next_state = torch.FloatTensor(s_).to(self.device).reshape(-1,48)
            
            done = torch.FloatTensor(1 - d[:,i]).to(self.device)
            reward = torch.FloatTensor(r[:,i]).to(self.device)
#             pdb.set_trace()
            # Select action with the actor target and apply clipped noise
            noise = torch.FloatTensor(a[:,i,:]).data.normal_(0, self.policy_noise).to(self.device)
            noise = noise.clamp(-0.1, 0.1)  # clip target-policy smoothing noise (TD3 noise_clip)
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)
            # Compute the target Q value

            target_Q1, target_Q2 = self.critic_target(a_next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward.reshape(-1,1) + (done.reshape(-1,1) * self.discount * target_Q).detach()
            
            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(a_state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Delayed policy updates
            if self.policy_freq_it % self.policy_freq == 0:
                # Compute actor loss
                actor_loss = -self.critic.Q1(a_state, self.actor(state)).mean()
                # Optimize the actor 
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


            self.policy_freq_it += 1
        
        return True
        
    
    def reset(self):
        self.policy_freq_it = 0
        self.noise.reset()
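
The core of TD3MultiAgent.step() above is the TD3 target: smooth the target-actor action with clipped Gaussian noise, take the minimum of the two target critics, and bootstrap only where the episode is not done. The sketch below recomputes just that target with throw-away nn.Sequential stand-ins rather than the Actor/Critic classes of this example, so all shapes and hyperparameters are illustrative assumptions.

import torch
import torch.nn as nn

state_dim, action_dim, batch = 24, 2, 8
max_action, policy_noise, noise_clip, discount = 1.0, 0.1, 0.1, 0.99

# stand-in target networks (not the classes used in the example above)
actor_target = nn.Sequential(nn.Linear(state_dim, 64), nn.ReLU(),
                             nn.Linear(64, action_dim), nn.Tanh())
q1_target = nn.Sequential(nn.Linear(state_dim + action_dim, 64), nn.ReLU(), nn.Linear(64, 1))
q2_target = nn.Sequential(nn.Linear(state_dim + action_dim, 64), nn.ReLU(), nn.Linear(64, 1))

next_state = torch.randn(batch, state_dim)
reward = torch.randn(batch, 1)
not_done = torch.ones(batch, 1)

with torch.no_grad():
    # target policy smoothing: clipped Gaussian noise on the target action
    noise = (torch.randn(batch, action_dim) * policy_noise).clamp(-noise_clip, noise_clip)
    next_action = (actor_target(next_state) + noise).clamp(-max_action, max_action)
    # clipped double-Q target
    sa = torch.cat([next_state, next_action], dim=1)
    target_Q = torch.min(q1_target(sa), q2_target(sa))
    target_Q = reward + not_done * discount * target_Q

print(target_Q.shape)  # torch.Size([8, 1])
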
Exemple #24
0
class Agent():
    ''' Interacts with and learns from the environment '''
    def __init__(self, num_agents, state_size, action_size, random_seed=2018):
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.device = torch.device('cuda' if cuda else 'cpu')

        self.update = UPDATE_EVERY
        self.updates = NUMBER_OF_UPDATES

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)

    def step(self, state, action, reward, next_state, done, timestep):
        ''' Save experience in replay memory, and use random sample from buffer to learn '''

        # Save experience into memory __for each agent__
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # If we are in the timestep to update
        if timestep % self.update == 0:

            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:

                # Do learning "updates" times
                for _ in range(self.updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        ''' Returns actions for given state as per current policy '''
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))

        # Deactivate gradients and perform forward pass
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()

        if add_noise:
            for a in range(self.num_agents):
                actions[a, :] += self.noise.sample()
        # Clip action
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        ''' 
        Update policy and value parameters using given batch of experience tuples.
        
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
        '''

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        # Get predicted next-state actions and Q values from the target models
        actions_next = self.actor_target(next_states)  # (batch_size x action_size)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #

        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(
            states, actions_pred).mean()  # Average over the minibatch

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        ''' Soft update model parameters '''
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
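
soft_update() above applies theta_target <- tau * theta_local + (1 - tau) * theta_target. A tiny check (two small nn.Linear layers with arbitrary weights) of that rule, and of the fact that tau = 1.0 reduces to a hard copy, which is how the DDPG class further down initialises its target networks:

import torch
import torch.nn as nn

def soft_update(local_model, target_model, tau):
    # same interpolation as in Agent.soft_update() above
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)

local, target = nn.Linear(4, 2), nn.Linear(4, 2)
before = target.weight.data.clone()

soft_update(local, target, tau=0.001)
expected = 0.001 * local.weight.data + 0.999 * before
assert torch.allclose(target.weight.data, expected)

soft_update(local, target, tau=1.0)           # tau = 1 is a hard copy
assert torch.allclose(target.weight.data, local.weight.data)
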
class A2C_v0(): # STILL DOES NOT WORK
    """
    Implements an Advantage Actor-Critic (A2C) agent that updates step by step.
    
    Notes
    -----
    The GPU implementation is only sketched; it works, but it is slower than running on the CPU.
    """
    
    def __init__(self, observation_space, action_space, lr_actor, lr_critic, gamma, 
                 device='cpu', discrete=False, project_dim=8):
        """
        Parameters
        ----------
        observation_space: int
            Number of flattened entries of the state
        action_space: int
            Number of (discrete) possible actions to take
        """
        
        self.gamma = gamma
        
        self.n_actions = action_space
        self.discrete = discrete
        if self.discrete:
            self.actor = DiscreteActor(observation_space, action_space, project_dim)
            self.critic = DiscreteCritic(observation_space, project_dim)
        else:
            self.actor = Actor(observation_space, action_space)
            self.critic = Critic(observation_space)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)
        
        self.device = device 
        ### Not implemented ###
        #self.actor.to(self.device) 
        #self.critic.to(self.device)
        
    def get_action(self, state, return_log=False):
        log_probs = self.forward(state)
        dist = torch.exp(log_probs)
        probs = Categorical(dist)
        action =  probs.sample().item()
        if return_log:
            return action, log_probs.view(-1)[action]
        else:
            return action
    
    def forward(self, state):
        if self.discrete:
            state = torch.from_numpy(state)
        else:
            state = torch.from_numpy(state).float().unsqueeze(0) 
        log_probs = self.actor(state)
        return log_probs
         
    def update(self, reward, log_prob, state, new_state, done):
        # Wrap variables in tensors
        reward = torch.tensor(reward)
        if self.discrete:
            old_state = torch.tensor(state).unsqueeze(0)    
            new_state = torch.tensor(new_state).unsqueeze(0)
        else:
            old_state = torch.tensor(state).float().unsqueeze(0)    
            new_state = torch.tensor(new_state).float().unsqueeze(0)
        #log_prob = torch.tensor([log_prob]) # THIS DETACHES THE TENSOR!!
        log_prob = log_prob.view(1,1)
        # Update critic and then actor
        self.update_critic(reward, new_state, old_state, done)
        self.update_actor(reward, log_prob, new_state, old_state, done)
        return
    
    def update_critic(self, reward, new_state, old_state, done):
        # Predictions
        V_pred = self.critic(old_state).squeeze()
        #print("V_pred ", V_pred)
        # Targets
        V_trg = self.critic(new_state).squeeze()
        #print("V_trg (net) ", V_trg)
        # done = 1 if new_state is a terminal state
        V_trg = (1-done)*self.gamma*V_trg + reward
        V_trg = V_trg.detach()
        #print("V_trg (+r) ", V_trg)
        # MSE loss
        loss = (V_pred - V_trg).pow(2).sum()
        #print("loss ", loss)
        # backprop and update
        self.critic_optim.zero_grad()
        loss.backward()
        self.critic_optim.step()
        return
    
    def update_actor(self, reward, log_prob, new_state, old_state, done):
        # compute advantage
        A = (1-done)*self.gamma*self.critic(new_state).squeeze() + reward - self.critic(old_state).squeeze()
        #print("Advantage ", A)
        # compute gradient
        policy_gradient = - log_prob*A
        #print("policy_gradient ", policy_gradient)
        # backprop and update
        self.actor_optim.zero_grad()
        policy_gradient.backward()
        self.actor_optim.step()
        return
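
The '# THIS DETACHES THE TENSOR!!' comment in A2C_v0.update() above and the '### ERROR HERE' note in A2C_v1.update() point at the same pitfall: re-wrapping a stored log-probability with torch.tensor() yields a leaf tensor with no grad_fn, so the policy-gradient loss cannot backpropagate into the actor, whereas torch.stack() keeps the graph intact. A minimal demonstration:

import torch
from torch.distributions import Categorical

logits = torch.zeros(3, requires_grad=True)
dist = Categorical(logits=logits)
log_prob = dist.log_prob(torch.tensor(1))

detached = torch.tensor([log_prob.item()])   # loses the computation graph
kept = torch.stack([log_prob])               # keeps it

print(detached.requires_grad, detached.grad_fn)       # False None
print(kept.requires_grad, kept.grad_fn is not None)   # True True
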
class DDPG():
    """ Deep Deterministic Policy Gradients Agent used to interaction with and learn from an environment """
    def __init__(self, state_size: int, action_size: int, num_agents: int,
                 epsilon, random_seed: int):
        """ Initialize a DDPG Agent Object

        :param state_size: dimension of state (input)
        :param action_size: dimension of action (output)
        :param num_agents: number of concurrent agents in the environment
        :param epsilon: initial value of epsilon for exploration
        :param random_seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.t_step = 0

        # Hyperparameters
        self.buffer_size = 1000000
        self.batch_size = 128
        self.update_every = 10
        self.num_updates = 10
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.0001
        self.lr_critic = 0.001
        self.weight_decay = 0
        self.epsilon = epsilon
        self.epsilon_decay = 0.97
        self.epsilon_min = 0.005

        # Networks (Actor: State -> Action, Critic: (State,Action) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)
        # Initialize actor and critic networks to start with same parameters
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, random_seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def __str__(self):
        return "DDPG_Agent"

    def train(self,
              env,
              brain_name,
              num_episodes=200,
              max_time=1000,
              print_every=10):
        """ Interacts with and learns from a given Unity Environment

        :param env: Unity Environment the agent is learning from
        :param brain_name: Brain for Environment
        :param num_episodes: Number of episodes to train
        :param max_time: How long each episode runs for
        :param print_every: How often in episodes to print a running average
        :return: Returns episodes scores and 100 episode averages as lists
        """
        # --------- Set Everything up --------#
        scores = []
        avg_scores = []
        scores_deque = deque(maxlen=print_every)

        # -------- Simulation Loop --------#
        for episode_num in range(1, num_episodes + 1):
            # Reset everything
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            episode_scores = np.zeros(self.num_agents)
            self.reset_noise()
            # Run the episode
            for t in range(max_time):
                actions = self.act(states, self.epsilon)
                env_info = env.step(actions)[brain_name]
                next_states, rewards, dones = env_info.vector_observations, env_info.rewards, env_info.local_done
                self.step(states, actions, rewards, next_states, dones)
                episode_scores += rewards
                states = next_states
                if np.any(dones):
                    break

            # -------- Episode Finished ---------#
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)
            scores.append(np.mean(episode_scores))
            scores_deque.append(np.mean(episode_scores))
            avg_scores.append(np.mean(scores_deque))
            if episode_num % print_every == 0:
                print(
                    f'Episode: {episode_num} \tAverage Score: {round(np.mean(scores_deque), 2)}'
                )
                torch.save(
                    self.actor_local.state_dict(),
                    f'{PATH}/checkpoints/{self.__str__()}_Actor_Multiple.pth')
                torch.save(
                    self.critic_local.state_dict(),
                    f'{PATH}/checkpoints/{self.__str__()}_Critic_Multiple.pth')

        # -------- All Episodes finished Save parameters and scores --------#
        # Save Model Parameters
        torch.save(self.actor_local.state_dict(),
                   f'{PATH}/checkpoints/{self.__str__()}_Actor_Multiple.pth')
        torch.save(self.critic_local.state_dict(),
                   f'{PATH}/checkpoints/{self.__str__()}_Critic_Multiple.pth')
        # Save mean score per episode (of the 20 agents)
        f = open(f'{PATH}/scores/{self.__str__()}_Multiple_Scores.txt', 'w')
        scores_string = "\n".join([str(score) for score in scores])
        f.write(scores_string)
        f.close()
        # Save average scores for 100 window average
        f = open(f'{PATH}/scores/{self.__str__()}_Multiple_AvgScores.txt', 'w')
        avgScores_string = "\n".join([str(score) for score in avg_scores])
        f.write(avgScores_string)
        f.close()
        return scores, avg_scores

    def step(self, states, actions, rewards, next_states, dones):
        """ what the agent needs to do for every time step that occurs in the environment. Takes
        in a (s,a,r,s',d) tuple and saves it to memeory and learns from experiences. Note: this is not
        the same as a step in the environment. Step is only called once per environment time step.

        :param states: array of states agent used to select actions
        :param actions: array of actions taken by agents
        :param rewards: array of rewards for last action taken in environment
        :param next_states: array of next states after actions were taken
        :param dones: array of bools representing if environment is finished or not
        """
        # Save experienced in replay memory
        for agent_num in range(self.num_agents):
            self.memory.add(states[agent_num], actions[agent_num],
                            rewards[agent_num], next_states[agent_num],
                            dones[agent_num])

        # Learn "num_updates" times every "update_every" time step
        self.t_step += 1
        if len(self.memory) > self.batch_size and self.t_step % self.update_every == 0:
            self.t_step = 0
            for _ in range(self.num_updates):
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, states, epsilon, add_noise=True):
        """ Returns actions for given states as per current policy. Policy comes from the actor network.

        :param states: array of states from the environment
        :param epsilon: probability of exploration
        :param add_noise: bool, whether to add exploration noise to the action
        :return: clipped actions
        """
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_local.eval()  # Sets to eval mode (no gradients)
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()  # Sets to train mode (gradients back on)
        if add_noise and epsilon > np.random.random():
            actions += [self.noise.sample() for _ in range(self.num_agents)]
        return np.clip(actions, -1, 1)

    def reset_noise(self):
        """ resets to noise parameters """
        self.noise.reset()

    def learn(self, experiences):
        """ Update actor and critic networks using a given batch of experiences
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> actions
            critic_target(states, actions) -> Q-value
        :param experiences: tuple of arrays (states, actions, rewards, next_states, dones)  sampled from the replay buffer
        """

        states, actions, rewards, next_states, dones = experiences
        # -------------------- Update Critic -------------------- #
        # Use target networks for getting next actions and q values and calculate q_targets
        next_actions = self.actor_target(next_states)
        next_q_targets = self.critic_target(next_states, next_actions)
        q_targets = rewards + (self.gamma * next_q_targets * (1 - dones))
        # Compute critic loss (Same as DQN Loss)
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # -------------------- Update Actor --------------------- #
        # Compute actor loss (maximize the mean of Q(states, actions))
        action_preds = self.actor_local(states)
        # Optimizer minimizes and we want to maximize so multiply by -1
        actor_loss = -1 * self.critic_local(states, action_preds).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #---------------- Update Target Networks ---------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_network, target_network, tau):
        """ soft update newtwork parametes
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_network: PyTorch Network that is always up to date
        :param target_network: PyTorch Network that is not up to date
        :param tau: update (interpolation) parameter
        """
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)