Example #1
class I3QLearner():
    def __init__(self, num_features, num_actions, timestep, action_space, scope):
        self.scope = scope
        self._lr = 0.5
        self.discount = 1.
        self.replay_buffer = ReplayBuffer(int(1e4))

        with tf.variable_scope(self.scope):
            self.act_trajectory = tf.placeholder(tf.float32, shape=(None, timestep, action_space))
            self.target = tf.placeholder(tf.float32, shape=(None,))
            self.act = tf.placeholder(tf.int32, shape=(None,))

            self.tau = lstm_model(self.act_trajectory, num_actions, scope="tau_model_{}".format(scope))
            self.q_input = self.tau
            # train network
            self.q = mlp_model(self.q_input, num_actions, scope="q_model_{}".format(scope))
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_model_{}".format(scope)))
            # target network
            self.target_q = mlp_model(self.q_input, num_actions, scope="target_q_model_{}".format(scope))
            target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_model_{}".format(scope)))

            # take action
            self.softmax = tf.nn.softmax(self.target_q)
            self.pred = tf.argmax(self.softmax, axis = 1)

            # calculate the TD loss
            self.q_t_selected = tf.reduce_sum(self.q * tf.one_hot(self.act, num_actions), 1)
            # NOTE: the bootstrapped target is supplied externally through the `target`
            # placeholder, so these two values are currently unused.
            q_tp1_best = tf.reduce_max(self.q, 1)
            q_tp1_best_masked = q_tp1_best
            td_error = self.q_t_selected - tf.stop_gradient(self.target)
            self.errors = U.huber_loss(td_error)
            self.q_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.errors, var_list=q_func_vars)

            self.tau_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.tau, labels=self.act))
            self.tau_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.tau_loss)

            self.get_pred = U.function(inputs=[self.act_trajectory], outputs=[self.softmax])
            self.train_q = U.function(inputs=[self.act_trajectory, self.target, self.act], outputs=[self.errors, self.q], updates=[self.q_opt_op])
            self.train_tau = U.function(inputs=[self.act, self.act_trajectory], outputs=[self.tau_loss], updates=[self.tau_opt_op])
            self.update_model = make_update_exp(q_func_vars, target_q_func_vars)

    def experience(self, action1, act_tra1, reward1):
        self.replay_buffer.add(action1, act_tra1, reward1)

    # Exploration policy: epsilon-greedy over the softmax action distribution
    # (Boltzmann sampling with temperature 1 is left commented out below).
    def get_act(self, act_trajectory):

        acpd = self.get_pred(act_trajectory)[0][0]  # action probability distribution from softmax(target Q)
        # action = np.random.choice([0,1], p = acpd)
        action = epsilon_greedy(acpd, 0.1)
        return action

    def supervise_tau(self, a_next, action_trajectory):

        loss = self.train_tau(a_next, action_trajectory)[0]
        return loss

    def update_target(self):
        self.update_model()

    def learn(self, batch_size):

        replay_sample_index = self.replay_buffer.make_index(batch_size)
        act, act_tra, reward = self.replay_buffer.sample_index(replay_sample_index)
        loss, q = self.train_q(act_tra, reward, act)
        return loss, q
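
The snippet below is a minimal usage sketch for `I3QLearner`, not part of the original example. It assumes the helpers the class depends on (`lstm_model`, `mlp_model`, `ReplayBuffer`, `epsilon_greedy`, `make_update_exp`, and a baselines-style `U` providing `single_threaded_session` and `initialize`) are importable from the surrounding project; the reward signal and the trajectory bookkeeping are illustrative placeholders.

```python
# Hypothetical wiring around I3QLearner; the session helpers and the dummy
# reward below are assumptions, not part of the example above.
import numpy as np

learner = I3QLearner(num_features=2, num_actions=2, timestep=5,
                     action_space=2, scope="agent_0")

with U.single_threaded_session():
    U.initialize()
    act_traj = np.zeros((1, 5, 2), dtype=np.float32)    # rolling one-hot action history
    for step in range(10000):
        action = learner.get_act(act_traj)              # epsilon-greedy over softmax(target Q)
        reward = float(action == 0)                     # dummy reward, for illustration only
        learner.experience(action, act_traj[0], reward)
        act_traj = np.roll(act_traj, -1, axis=1)
        act_traj[0, -1] = np.eye(2)[action]             # append the latest action to the trajectory
        if step > 1000 and step % 100 == 0:
            loss, q = learner.learn(batch_size=32)      # TD update on the train Q network
            learner.update_target()                     # sync the target Q network
```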
Example #2
class Agent:

    def __init__(self, pos, actor, critic, actor_target, critic_target, train_mode, discrete_action, args,
                 alg_mode='MADDPG'):

        self.pos = pos
        self.BATCH_SIZE = args.batch_size
        self.GAMMA = args.GAMMA
        self.args = args
        self.train_mode = train_mode
        self.discrete_action = discrete_action
        self.algorithm = alg_mode

        self.critic = critic
        self.critic_target = critic_target

        self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(5))  # exploration noise over a 5-dimensional action space

        self.actor = actor
        self.actor_target = actor_target

        self.actor_target.hard_copy(actor)
        self.critic_target.hard_copy(critic)

        self.replay_buffer = ReplayBuffer(int(1e6))
        self.max_replay_buffer_len = self.BATCH_SIZE * 25

    def preupdate(self):
        self.replay_sample_index = None

    def step(self, agents, t, terminal):

        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if t % 100 != 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.BATCH_SIZE)

        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for agent in agents:
            obs, act, rew, obs_next, done = agent.replay_buffer.sample_index(index)
            obs_n.append(torch.FloatTensor(obs).to(device))
            obs_next_n.append(torch.FloatTensor(obs_next).to(device))
            act_n.append(torch.FloatTensor(act).to(device))

        state_batch, action_batch, reward_batch, state_next_batch, t_batch = self.replay_buffer.sample_index(index)

        state_batch = torch.FloatTensor(state_batch).to(device)
        action_batch = torch.FloatTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        t_batch = torch.FloatTensor(t_batch).to(device)
        state_next_batch = torch.FloatTensor(state_next_batch).to(device)
        reward_batch = torch.reshape(reward_batch, (-1, 1))  # reshape to column vectors (batch_size, 1)
        t_batch = torch.reshape(t_batch, (-1, 1))

        # Train the critic network.
        if self.algorithm == 'MADDPG':
            if self.discrete_action:
                target_actions = [onehot_from_logits(agent.actor_target(nobs)) for agent, nobs in
                                  zip(agents, obs_next_n)]
            else:
                target_actions = [agent.actor_target(nobs) for agent, nobs in zip(agents, obs_next_n)]

            obs_next_concat = torch.cat(obs_next_n, dim=-1)
            target_actions = torch.cat(target_actions, dim=-1)
        else:  # Get actions in DDPG mode.
            if self.discrete_action:
                target_actions = onehot_from_logits(self.actor_target(state_next_batch))
            else:
                target_actions = self.actor_target(state_next_batch)
            obs_next_concat = state_next_batch

        predicted_q_value = self.critic_target(obs_next_concat, target_actions)
        Q_targets = reward_batch + ((1 - t_batch) * self.GAMMA * predicted_q_value).detach()

        if self.algorithm == 'MADDPG':
            obs_concat = torch.cat(obs_n, dim=-1)
            action_concat = torch.cat(act_n, dim=-1)
        else:
            obs_concat = state_batch
            action_concat = action_batch

        self.critic.train_step(obs_concat, action_concat, Q_targets)

        all_actions = []
        if self.discrete_action:
            curr_pol_out = self.actor(state_batch)
            curr_pol_vf_in = gumbel_softmax(curr_pol_out, hard=True)
        else:
            curr_pol_out = self.actor(state_batch)
            curr_pol_vf_in = curr_pol_out

        if self.algorithm == 'MADDPG':  # Get the actions of all actors in MADDPG mode.
            for i, (agent, obs) in enumerate(zip(agents, obs_n)):
                if i == self.pos:
                    all_actions.append(curr_pol_vf_in)
                elif self.discrete_action:
                    all_actions.append(onehot_from_logits(agent.actor(obs)))
                else:
                    all_actions.append(agent.actor(obs))
            actions_concatenated = torch.cat(all_actions, dim=-1)
        else:  # Get ONLY the action of the current actor in DDPG.
            actions_concatenated = curr_pol_vf_in

        self.actor.train_step(self.critic, obs_concat, actions_concatenated, curr_pol_out)

        self.soft_update(self.actor, self.actor_target, tau=self.args.tau)
        self.soft_update(self.critic, self.critic_target, tau=self.args.tau)

    def experience(self, obs, act, rew, new_obs, done):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, done)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.FloatTensor(state).unsqueeze(0).to(device)

        noise = self.noise()
        noise = torch.FloatTensor(noise).unsqueeze(0).to(device)
        action = self.actor(state)

        if self.discrete_action:
            if add_noise:
                action = gumbel_softmax(action, hard=True)
            else:
                action = onehot_from_logits(action)
        else:
            if add_noise:
                action = action + noise
            action = action.clamp(-1, 1)

        action = action.cpu().detach().numpy()[0]
        return action

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
        local_model: PyTorch model (weights will be copied from)
        target_model: PyTorch model (weights will be copied to)
        tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
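
Below is a minimal training-loop sketch showing how this `Agent` might be driven, not part of the original example. The `Actor`/`Critic` network wrappers (which must expose `train_step` and `hard_copy`), the multi-agent environment `env`, the dimensions `obs_dim`/`act_dim`, and the hyper-parameter values are assumptions about the surrounding project.

```python
# Hypothetical multi-agent training loop; `Actor`, `Critic`, and `env` are
# placeholders for project-specific classes and are not defined above.
import argparse

args = argparse.Namespace(batch_size=1024, GAMMA=0.95, tau=0.01)
obs_dim, act_dim = 10, 5                                 # hypothetical dimensions (act_dim matches the OU noise size)

agents = []
for i in range(env.n):                                   # one Agent per environment agent
    actor, critic = Actor(obs_dim, act_dim), Critic(obs_dim, act_dim)
    actor_t, critic_t = Actor(obs_dim, act_dim), Critic(obs_dim, act_dim)
    agents.append(Agent(i, actor, critic, actor_t, critic_t,
                        train_mode=True, discrete_action=True, args=args))

obs_n = env.reset()
for t in range(100000):
    act_n = [agent.act(obs, add_noise=True) for agent, obs in zip(agents, obs_n)]
    new_obs_n, rew_n, done_n, _ = env.step(act_n)
    for i, agent in enumerate(agents):
        agent.experience(obs_n[i], act_n[i], rew_n[i], new_obs_n[i], done_n[i])
    for i, agent in enumerate(agents):
        agent.preupdate()
        agent.step(agents, t, done_n[i])                 # trains critic/actor, then soft-updates targets
    obs_n = new_obs_n
```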