class AlgoA2C(AlgoBase):
    def __init__(self, num_state, num_action, configDict, train=True):
        super(AlgoA2C, self).__init__(num_state,
                                      num_action,
                                      configDict,
                                      createResults=False)

        # parameters of Internal DRL algorithm:
        ## Memory:
        self.MEMORY_CAPACITY = 100000
        self.GAMMA = 0.95
        ## Deep network:
        self.MEMORY_BATCH_SIZE = 64  # number of samples per training step (could also be set to MEMORY_CAPACITY)

        self.train = train
        if train:
            ## RL algorithm:
            ## Random selection proportion:
            self.MAX_EPSILON = 1.0
            self.MIN_EPSILON = 0.01
            self.LAMBDA = 0.005  # speed of decay
            self.epsilon = self.MAX_EPSILON
        else:
            self.epsilon = 0.0

        self.brain = Brain(num_state,
                           num_action,
                           configDict,
                           RL_GAMMA=self.GAMMA)

        self.memory = ExperienceReplay(self.MEMORY_CAPACITY)
        self.next_model(configDict)

    def next_model(self, configDict, load=False):
        super(AlgoA2C, self).next_model(configDict, load)
        self.brain.set_model(configDict)

    def load(self):
        loaded = self.brain.load()
        self.resultFile.Load()
        if loaded:
            self.episodes = self.resultFile.NumRuns()

    def act(self, state):  # action in [0, 1, ..., num_action - 1]
        if random.random() < self.epsilon:
            action = random.randint(0, self.num_action - 1)
        else:
            # pick the action with the highest predicted value
            action = np.argmax(self.brain.predictOne(state_test=state))

        return action

    def observe(self, s, a, r, s_, done):
        # store the transition; terminal transitions keep s_ = None so that
        # learn() skips bootstrapping on them
        self.memory.add((s, a, r, None if done else s_))

    def end_episode(self, r, sumR, steps, realR):
        # decrease epsilon so the agent gradually relies less on random actions
        # and more on the greedy policy
        self.epsilon = self.MIN_EPSILON + (self.MAX_EPSILON -
                                           self.MIN_EPSILON) * math.exp(
                                               -self.LAMBDA * self.episodes)
        self.episodes += 1
        saveModel = self.resultFile.end_run(r, sumR, steps, realR)
        if saveModel:
            self.brain.save_latest()

        return saveModel, ""

    def replay(self):
        pass

    def learn(self):
        # train on the entire replay history collected so far
        batch = self.memory.sample(self.memory.num_experience())
        len_batch = len(batch)
        no_state = np.zeros(self.num_state)

        s = np.array([o[0] for o in batch])
        s_ = np.array([(no_state if o[3] is None else o[3]) for o in batch])

        v = self.brain.predict(s)
        v_ = self.brain.predict(s_)

        # inputs and targets of the deep network
        x = np.zeros((len_batch, self.num_state))
        y = np.zeros((len_batch, self.num_action))

        for i in range(len_batch):
            o = batch[i]
            state, action, reward, next_state = o[0], int(o[1]), o[2], o[3]

            v_t = v[i]
            if next_state is None:
                # terminal transition: no bootstrapping
                v_t[action] = reward
            else:
                # bootstrap with the discounted value of the best next action
                v_t[action] = reward + self.GAMMA * np.amax(v_[i])

            x[i] = state
            y[i] = v_t

        self.brain.train(x, y, batch_size=len_batch)

    def Results(self, size):
        return self.resultFile.Results(size)
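

# A minimal usage sketch for AlgoA2C (not part of the original module). It assumes a
# classic gym-style environment (reset() -> state, step(a) -> (s_, r, done, info)) and a
# configDict accepted by AlgoBase/Brain; `env` and `configDict` are placeholders.
def run_a2c(env, configDict, num_episodes=100):
    agent = AlgoA2C(env.observation_space.shape[0], env.action_space.n, configDict)
    for _ in range(num_episodes):
        s = env.reset()
        sumR, steps, done = 0.0, 0, False
        while not done:
            a = agent.act(s)
            s_, r, done, _ = env.step(a)
            agent.observe(s, a, r, s_, done)
            s, sumR, steps = s_, sumR + r, steps + 1
        agent.learn()  # fit the network on the replayed history
        agent.end_episode(r, sumR, steps, sumR)
    return agent
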
class Agent():
    def __init__(self, state_size, action_size, num_agents, seed, \
                 gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2, \
                 buffer_size = 10e5, buffer_type = 'replay', policy_update = 1):
        # General info
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.t_step = 0
        self.gamma = gamma
        # Actor Network -- Policy-based
        self.actor = DDPG_Actor(state_size,
                                action_size,
                                hidden_dims=(128, 128),
                                seed=seed)
        self.target_actor = DDPG_Actor(state_size,
                                       action_size,
                                       hidden_dims=(128, 128),
                                       seed=seed)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        # Critic Network -- Value-based
        self.critic = DDPG_Critic(state_size,
                                  action_size,
                                  hidden_dims=(128, 128),
                                  seed=seed)
        self.target_critic = DDPG_Critic(state_size,
                                         action_size,
                                         hidden_dims=(128, 128),
                                         seed=seed)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)
        self.tau = tau
        # Replay memory
        self.buffer_type = buffer_type
        self.memory = ExperienceReplay(action_size,
                                       int(buffer_size))  #ExperienceReplay
        self.per = PrioritizedExperienceReplay(capacity=int(buffer_size),
                                               alpha=0.6,
                                               beta=0.9,
                                               error_offset=0.001)
        # NormalNoiseStrategy
        self.normal_noise = NormalNoiseStrategy()
        # Delayed Updates from TD3
        self.policy_update = policy_update

    def select_action(self, state):
        return self.normal_noise.select_action(self.actor, state)

    def select_action_evaluation(self, state):
        return self.actor(state).cpu().detach().data.numpy().squeeze()

    def _critic_error(self, state, action, reward, next_state, done):
        done = int(done)
        reward = float(reward)
        with torch.no_grad():
            argmax_a = self.target_actor(next_state)
            q_target_next = self.target_critic(next_state, argmax_a)
            q_target = reward + (self.gamma * q_target_next * (1 - done))
            q_expected = self.critic(state, action)
            td_error = q_expected - q_target.detach()
        return td_error.detach().numpy()

    def step(self, state, action, reward, next_state, done, batch_size=64):
        self.t_step += 1
        if self.buffer_type == 'prioritized':
            if self.num_agents == 20:
                reward = np.asarray(reward)[:, np.newaxis]
                done = np.asarray(done)[:, np.newaxis]
                for i in range(self.num_agents):
                    error = self._critic_error(state[i], action[i], reward[i],
                                               next_state[i], done[i])
                    self.per.add(error, (state[i], action[i], reward[i],
                                         next_state[i], done[i]))
            else:
                done = np.asarray(done)
                reward = np.asarray(reward)
                state = state.squeeze()
                next_state = next_state.squeeze()
                error = self._critic_error(state, action, reward, next_state,
                                           done)
                self.per.add(error, (state, action, reward, next_state, done))

            # train if enough samples
            if self.t_step > batch_size:
                experiences, mini_batch, idxs, is_weights = self.per.sample(
                    batch_size)
                self.learn(experiences, batch_size, idxs, is_weights)

        # add to replay buffer
        else:
            if self.num_agents == 20:
                reward = np.asarray(reward)[:, np.newaxis]
                done = np.asarray(done)[:, np.newaxis]
                for i in range(self.num_agents):
                    self.memory.add(state[i], action[i], reward[i],
                                    next_state[i], done[i])
            else:
                self.memory.add(state, action, reward, next_state, done)
            # train if enough samples
            if len(self.memory) > batch_size:
                experiences = self.memory.sample(batch_size)
                self.learn(experiences, batch_size)

    def learn(self, experiences, batch_size, idxs=0, is_weights=0):
        states, actions, rewards, next_states, dones = experiences

        # *** 1. UPDATE Online Critic Network ***
        # 1.1. Calculate Targets for Critic
        argmax_a = self.target_actor(next_states)
        q_target_next = self.target_critic(next_states, argmax_a)
        q_target = rewards + (self.gamma * q_target_next * (1 - dones))
        q_expected = self.critic(states, actions)
        # 1.2. Compute loss
        td_error = q_expected - q_target.detach()

        if self.buffer_type == 'prioritized':
            # PER --> update priority
            with torch.no_grad():
                error = td_error.detach().numpy()
                for i in range(batch_size):
                    idx = idxs[i]
                    self.per.update(idx, error[i])
            value_loss = (torch.FloatTensor(is_weights) *
                          td_error.pow(2).mul(0.5)).mean()
        else:
            value_loss = td_error.pow(2).mul(0.5).mean()
            # value_loss = F.mse_loss(q_expected,q_target)
        # 1.3. Update Critic
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        if self.t_step % self.policy_update == 0:
            """
                Delaying Target Networks and Policy Updates from:
                ***Addressing Function Approximation Error in Actor-Critic Methods***
            """
            # *** 2. UPDATE Online Actor Network ***
            argmax_a = self.actor(states)
            max_val = self.critic(states, argmax_a)
            policy_loss = -max_val.mean()  # minus sign because this is gradient ascent on Q
            # Update Actor
            self.actor_optimizer.zero_grad()
            policy_loss.backward()
            # torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
            self.actor_optimizer.step()

            # 3. UPDATE TARGET networks
            self.soft_update(self.actor, self.target_actor, self.tau)
            self.soft_update(self.critic, self.target_critic, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
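
# A minimal training-loop sketch for the DDPG Agent above (not part of the original code).
# It assumes a single-agent continuous-control environment with the classic gym API
# (reset() -> state, step(a) -> (s_, r, done, info)); `env` is a placeholder.
def train_ddpg(env, num_episodes=200, batch_size=64, seed=0):
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = Agent(state_size, action_size, num_agents=1, seed=seed)
    for _ in range(num_episodes):
        state, done = env.reset(), False
        while not done:
            action = agent.select_action(state)  # noisy action for exploration
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done, batch_size=batch_size)
            state = next_state
    return agent
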
class DQNAgent:

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def __init__(self,
                 osize,
                 asize,
                 seed,
                 buffersize=int(1e6),
                 gamma=0.99,
                 epsilon=0.05,
                 epsilondecay=1e-6,  # per-step multiplicative decay rate
                 epsilonmin=0.1,
                 minibatchsize=128,
                 lr=0.01,
                 tau=0.01):
        """
        Initialize DQN agent parameters.
        """

        # initialize agent parameters
        self.osize = osize
        self.asize = asize
        self.gamma = gamma
        self.epsilon0 = epsilon
        self.epsilon = epsilon
        self.epsilondecay = epsilondecay
        self.epsilonmin = epsilonmin
        self.minibatchsize = minibatchsize
        self.lr = lr
        self.tau = tau
        self.stepcount = 0
        self.loss_log = []

        # set the random seed
        self.seed = torch.manual_seed(seed)

        # create local and target Q networks
        self.Q = QNetwork(osize, asize).to(self.device)
        self.targetQ = QNetwork(osize, asize).to(self.device)

        # initialize optimizer
        self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr)

        # initialize experience replay
        self.replay = ExperienceReplay(asize, buffersize, minibatchsize, seed)

    def step(self, state, action, reward, next_state, done):
        """
        Step the agent, and learn if necessary.
        """

        # add experience to replay
        self.replay.add(state, action, reward, next_state, done)

        # learn from experiences
        if len(self.replay) > self.minibatchsize:
            # create mini batch for learning
            experiences = self.replay.sample(self.device)
            # train the agent
            self.learn(experiences)

        # increase step count
        self.stepcount += 1

        # decay epsilon
        decayed_epsilon = self.epsilon * (1 - self.epsilondecay)
        self.epsilon = max(self.epsilonmin, decayed_epsilon)

    def get_action(self, state):
        """
        Get an epsilon greedy action.
        """

        # convert network input to torch variable
        x = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # obtain network output
        self.Q.eval()
        with torch.no_grad():  # do not calculate network gradients, which speeds things up
            y = self.Q(x)
        self.Q.train()

        # select action
        if random.random() > self.epsilon:
            # epsilon greedy action
            action = np.argmax(
                y.cpu().data.numpy())  # action is actually action index
        else:
            # random action selection
            action = np.random.choice(np.arange(self.asize))

        return action

    def learn(self, experiences):
        """
        Learn using Double DQN algorithm.
        """

        # unpack experience
        states, actions, rewards, next_states, dones = experiences

        # Double DQN target: pick the greedy next action with the online network,
        # then evaluate it with the target network
        with torch.no_grad():
            a_max = self.Q(next_states).argmax(dim=1, keepdim=True)
            target_q = self.targetQ(next_states).gather(1, a_max)
            # calculate the TD target
            target = rewards + self.gamma * target_q * (1 - dones)

        # local Q values of the actions actually taken
        local = self.Q(states).gather(1, actions)

        # calculate loss
        loss = F.mse_loss(local, target)
        self.loss_log.append(loss.cpu().data.numpy())

        # perform gradient descent step
        self.optimizer.zero_grad()  # reset the gradients to zero
        loss.backward()
        self.optimizer.step()

        # soft update target network
        for target_params, params in zip(self.targetQ.parameters(),
                                         self.Q.parameters()):
            target_params.data.copy_(self.tau * params +
                                     (1 - self.tau) * target_params.data)
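
# A minimal interaction-loop sketch for DQNAgent (not part of the original code). It assumes
# a discrete-action environment with the classic gym API; `env` is a placeholder.
def train_dqn(env, num_episodes=500, seed=0):
    agent = DQNAgent(env.observation_space.shape[0], env.action_space.n, seed)
    scores = []
    for _ in range(num_episodes):
        state, score, done = env.reset(), 0.0, False
        while not done:
            action = agent.get_action(state)  # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state, score = next_state, score + reward
        scores.append(score)
    return agent, scores
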
class MADDPG_Agent():
    def __init__(self, state_size, action_size, num_agents, \
                 gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2, \
                 buffer_size = 1e5, buffer_type = 'replay', policy_update = 1, \
                 noise_init = 1.0, noise_decay=0.9995, min_noise=0.1):
        # General info
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.t_step = 0
        self.gamma = gamma
        # Actor Networks -- Policy-based
        self.actors = [
            DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
            for i in range(num_agents)
        ]
        self.actor_optimizers = [
            optim.Adam(actor.parameters(), lr=lr_actor)
            for actor in self.actors
        ]
        # targets
        self.target_actors = [
            DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
            for i in range(num_agents)
        ]
        [
            self.hard_update(self.actors[i], self.target_actors[i])
            for i in range(num_agents)
        ]
        # Critic Network -- Value-based --> in this approach we will use one common network for all the actors
        self.critic = DDPG_Critic(state_size,
                                  action_size,
                                  hidden_dims=(128, 128))
        self.target_critic = DDPG_Critic(state_size,
                                         action_size,
                                         hidden_dims=(128, 128))
        self.hard_update(self.critic, self.target_critic)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)
        # How to update networks
        self.tau = tau
        self.policy_update = policy_update
        # Replay memory
        self.buffer_type = buffer_type
        self.memory = ExperienceReplay(action_size,
                                       int(buffer_size))  #ExperienceReplay
        self.per = PrioritizedExperienceReplay(capacity=int(buffer_size),
                                               alpha=0.6,
                                               beta=0.9,
                                               error_offset=0.001)
        # NormalNoiseStrategy
        self.normal_noise = NormalNoiseStrategy(noise_init=noise_init,\
                                                noise_decay=noise_decay,\
                                                min_noise_ratio = min_noise)

    def select_action(self, state):
        actions = []
        for i in range(self.num_agents):
            actions.append(
                self.normal_noise.select_action(self.actors[i], state[i]))
        return np.array(actions)

    def select_action_evaluation(self, state):
        actions = []
        for i in range(self.num_agents):
            actions.append(self.actors[i](
                state[i]).cpu().detach().data.numpy().squeeze())
        return np.array(actions)

    def _critic_error(self, state, action, reward, next_state, done):
        states = torch.Tensor(state).view(-1, self.num_agents *
                                          self.state_size)  # batch X 2*24
        next_states = torch.Tensor(next_state).view(
            -1, self.num_agents * self.state_size)  # batch X 2*24
        actions = torch.Tensor(action).view(-1, self.num_agents *
                                            self.action_size)  # batch X 2*2
        rewards = torch.Tensor(reward).view(-1, self.num_agents * 1)
        dones = torch.Tensor(done.astype(int)).view(-1, self.num_agents * 1)

        with torch.no_grad():
            # 1.1. Calculate Target
            target_actions = []
            for i in range(self.num_agents):
                target_actions.append(self.target_actors[i](
                    next_states[:, self.state_size * i:self.state_size *
                                (i + 1)]))
            target_actions = torch.stack(
                target_actions
            )  # shape: 2(num_agents) x batch x 2(num_actions)
            target_actions = target_actions.permute(
                1, 0,
                2)  # transform from 2 X batch_size X 2 --> batch_size X 2 X 2
            target_actions = target_actions.contiguous().view(
                -1, self.num_agents * self.action_size)  # batch_size X 2*2
            q_target_next = self.target_critic(next_states, target_actions)

            q_target = rewards + (
                self.gamma * q_target_next * (1 - dones)
            )  # we get batch_size X 2 (one q target for each agent --> we have rewards and dones for each agent)
            # 1.2. Expected
            q_expected = self.critic(states, actions)
            # 1.3. Compute loss
            td_error = q_expected - q_target.detach()
        return td_error.mean().detach().numpy()

    def step(self, state, action, reward, next_state, done, batch_size=64):
        self.t_step += 1  #increment number of visits
        # transform to np.array with proper shapes
        reward = np.asarray(reward)[:, np.newaxis]
        done = np.asarray(done)[:, np.newaxis]
        # default losses, returned when no learning step happens on this call
        c_loss, a_loss = torch.Tensor([0]), (torch.Tensor([0]), torch.Tensor([0]))
        # add experiences to buffer (PER | replay) and learn once enough samples are stored
        if self.buffer_type == 'prioritized':
            # the error is computed on the joint (all-agent) transition, so it is added once
            error = self._critic_error(state, action, reward, next_state, done)
            self.per.add(error, (state, action, reward, next_state, done))
            # train if enough samples
            if self.t_step > batch_size:
                experiences, mini_batch, idxs, is_weights = self.per.sample(
                    batch_size)
                c_loss, a_loss = self.learn(experiences, batch_size, idxs,
                                            is_weights)
        else:  # replay buffer
            self.memory.add(state, action, reward, next_state, done)
            # train if enough samples
            if len(self.memory) > batch_size:
                experiences = self.memory.sample(batch_size)
                c_loss, a_loss = self.learn(experiences, batch_size)
        return c_loss, a_loss

    def _update_critic_network(self, experiences, batch_size, idxs,
                               is_weights):
        states, actions, rewards, next_states, dones = experiences
        # s,s' --> 64x2x24
        # a --> 64x2x2
        # r,w --> 64x2x1

        # transform to proper shape for the network --> batch_size X expected value
        states = states.view(-1,
                             self.num_agents * self.state_size)  # batch X 2*24
        next_states = next_states.view(-1, self.num_agents *
                                       self.state_size)  # batch X 2*24
        actions = actions.view(-1, self.num_agents *
                               self.action_size)  # batch X 2*2
        rewards = rewards.view(-1, self.num_agents * 1)
        dones = dones.view(-1, self.num_agents * 1)

        # 1.1. Calculate Target
        target_actions = []
        for i in range(self.num_agents):
            target_actions.append(self.target_actors[i](
                next_states[:, self.state_size * i:self.state_size * (i + 1)]))
        target_actions = torch.stack(
            target_actions)  # shape: 2(num_agents) x batch x 2(num_actions)
        # transform to proper shape
        target_actions = target_actions.permute(
            1, 0,
            2)  # transform from 2 X batch_size X 2 --> batch_size X 2 X 2
        target_actions = target_actions.contiguous().view(
            -1, self.num_agents * self.action_size)  # batch_size X 2*2

        q_target_next = self.target_critic(next_states, target_actions)

        q_target = rewards + (
            self.gamma * q_target_next * (1 - dones)
        )  # we get batch_size X 2 (one q target for each agent --> we have rewards and dones for each agent)
        # 1.2. Expected
        q_expected = self.critic(states, actions)
        # 1.3. Compute loss
        td_error = q_expected - q_target.detach()

        if self.buffer_type == 'prioritized':
            # PER --> update priority
            with torch.no_grad():
                error = td_error.detach().numpy()
                for i in range(batch_size):
                    idx = idxs[i]
                    self.per.update(idx, error[i])
            value_loss = (torch.FloatTensor(is_weights) *
                          td_error.pow(2).mul(0.5)).mean()
        else:
            value_loss = td_error.pow(2).mul(0.5).mean()
            # value_loss = F.mse_loss(q_expected,q_target)
        # 1.4. Update Critic
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        return value_loss

    def _update_actor_networks(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # transform to proper shape for the network --> batch_size X expected value
        states = states.view(-1,
                             self.num_agents * self.state_size)  # batch X 2*24
        next_states = next_states.view(-1, self.num_agents *
                                       self.state_size)  # batch X 2*24
        actions = actions.view(-1, self.num_agents *
                               self.action_size)  # batch X 2*2
        rewards = rewards.view(-1, self.num_agents * 1)
        dones = dones.view(-1, self.num_agents * 1)

        policy_losses = []
        for ID_actor in range(self.num_agents):
            # load network and optimizer
            optimizer = self.actor_optimizers[ID_actor]
            actor = self.actors[ID_actor]

            q_input_actions = []
            for i in range(self.num_agents):
                q_input_actions.append(
                    actor(states[:, self.state_size * i:self.state_size *
                                 (i + 1)]))  #only states of the current agent
            q_input_actions = torch.stack(q_input_actions)
            # transform to proper shape
            q_input_actions = q_input_actions.permute(
                1, 0,
                2)  # transform from 2 X batch_size X 2 --> batch_size X 2 X 2
            q_input_actions = q_input_actions.contiguous().view(
                -1, self.num_agents * self.action_size)  # batch_size X 2*2

            max_val = self.critic(states, q_input_actions)
            policy_loss = -max_val.mean()  # minus sign because this is gradient ascent on Q
            policy_losses.append(policy_loss)

            optimizer.zero_grad()
            policy_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.actors[ID_actor].parameters(),
                                           1)
            optimizer.step()

            # save new network and optimizer state
            self.actor_optimizers[ID_actor] = optimizer
            self.actors[ID_actor] = actor

        return policy_losses[0], policy_losses[1]

    def learn(self, experiences, batch_size, idxs=0, is_weights=0):
        # *** 1. UPDATE Online Critic Network ***
        critic_loss = self._update_critic_network(experiences, batch_size,
                                                  idxs, is_weights)
        # default actor losses, returned when the policy update is delayed on this step
        actor_loss = (torch.Tensor([0]), torch.Tensor([0]))
        if self.t_step % self.policy_update == 0:
            # *** 2. UPDATE Online Actor Networks ***
            actor_loss = self._update_actor_networks(experiences)
            # *** 3. UPDATE TARGET/Offline networks ***
            for i in range(self.num_agents):
                self.soft_update(self.actors[i], self.target_actors[i],
                                 self.tau)
            self.soft_update(self.critic, self.target_critic, self.tau)
        return critic_loss, actor_loss

    def hard_update(self, local_model, target_model):
        """Hard update model parameters. Copy the values of local network into the target.
        θ_target = θ_local

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
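
# A minimal multi-agent loop sketch for MADDPG_Agent (not part of the original code). It
# assumes a two-agent environment (e.g. a Tennis-style task) that returns per-agent states,
# rewards and dones as arrays; `env` and its API are placeholders.
def train_maddpg(env, num_agents=2, num_episodes=1000, batch_size=64):
    state_size = env.observation_space.shape[0]    # per-agent observation size (assumption)
    action_size = env.action_space.shape[0]        # per-agent action size (assumption)
    agent = MADDPG_Agent(state_size, action_size, num_agents)
    for _ in range(num_episodes):
        states = env.reset()                       # shape: num_agents x state_size
        dones = [False] * num_agents
        while not any(dones):
            actions = agent.select_action(states)  # one noisy action per agent
            next_states, rewards, dones, _ = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones, batch_size=batch_size)
            states = next_states
    return agent
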