Example #1
    def __init__(self, name, mlp_model, lstm_model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())

        # LSTM placeholders
        p_res = 7
        q_res = 1

        # set up initial states
        self.q_c, self.q_h = create_init_state(num_batches=1, len_sequence=args.num_units)
        self.p_c, self.p_h = create_init_state(num_batches=1, len_sequence=args.num_units)

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_LSTM_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=lstm_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )

        self.act, self.p_train, self.p_update, self.p_debug = p_LSTM_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=lstm_model,
            q_func=lstm_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            q_debug=self.q_debug
        )
        # Create experience buffer
        self.replay_buffer = ReplayBufferLSTM(1e6)
        # self.replay_buffer = PrioritizedReplayBuffer(10000, 0.45)
        # self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size
        self.replay_sample_index = None

        # Information tracking
        self.tracker = InfoTracker(self.name, self.args)
Example #2
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer

        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

        # set up tracking
        self.tracker = InfoTracker(self.name, self.args)
Example #3
    def __init__(self,
                 name,
                 mlp_model,
                 lstm_model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # LSTM placeholders
        p_res = 7
        q_res = 1

        # set up initial states
        self.q_c, self.q_h = create_init_state(num_batches=1,
                                               len_sequence=args.num_units)
        self.p_c, self.p_h = create_init_state(num_batches=1,
                                               len_sequence=args.num_units)

        q_model = lstm_model if self.args.critic_lstm else mlp_model
        p_model = lstm_model if self.args.actor_lstm else mlp_model

        # Just to verify:
        print("Q model: {} because critic_lstm: {}".format(
            q_model, self.args.critic_lstm))
        print("P model: {} because actor_lstm: {}".format(
            p_model, self.args.actor_lstm))

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = _q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=q_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            q_lstm_on=self.args.critic_lstm,
            p_lstm_on=self.args.actor_lstm,
            centralized_p=self.args.centralized_actor,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        self.act, self.p_train, self.p_update, self.p_debug = _p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=p_model,
            q_func=q_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            q_lstm_on=self.args.critic_lstm,
            p_lstm_on=self.args.actor_lstm,
            centralized_p=self.args.centralized_actor,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            q_debug=self.q_debug)

        # number of entries per transition in the replay buffer
        self.experience_size = 6  # o, a, r, o', done, new_ep
        if self.args.actor_lstm:
            self.experience_size += 4  # p_c_in, p_h_in, p_c_out, p_h_out
        if self.args.critic_lstm:
            self.experience_size += 4  # q_c_in, q_h_in, q_c_out, q_h_out

        # Create experience buffer
        self.replay_buffer = _ReplayBuffer(
            size=1e6,
            experience_size=self.experience_size,
            q_lstm_on=self.args.critic_lstm,
            p_lstm_on=self.args.actor_lstm)
        # self.replay_buffer = ReplayBufferLSTM(1e6)
        # self.replay_buffer = PrioritizedReplayBuffer(10000, 0.45)
        # self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size
        self.replay_sample_index = None

        # Information tracking
        self.tracker = InfoTracker(self.name, self.args)
Example #4
class _RMADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 mlp_model,
                 lstm_model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # LSTM placeholders
        p_res = 7
        q_res = 1

        # set up initial states
        self.q_c, self.q_h = create_init_state(num_batches=1,
                                               len_sequence=args.num_units)
        self.p_c, self.p_h = create_init_state(num_batches=1,
                                               len_sequence=args.num_units)

        q_model = lstm_model if self.args.critic_lstm else mlp_model
        p_model = lstm_model if self.args.actor_lstm else mlp_model

        # Just to verify:
        print("Q model: {} because critic_lstm: {}".format(
            q_model, self.args.critic_lstm))
        print("P model: {} because actor_lstm: {}".format(
            p_model, self.args.actor_lstm))

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = _q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=q_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            q_lstm_on=self.args.critic_lstm,
            p_lstm_on=self.args.actor_lstm,
            centralized_p=self.args.centralized_actor,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        self.act, self.p_train, self.p_update, self.p_debug = _p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=p_model,
            q_func=q_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            q_lstm_on=self.args.critic_lstm,
            p_lstm_on=self.args.actor_lstm,
            centralized_p=self.args.centralized_actor,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            q_debug=self.q_debug)

        # number of entries per transition in the replay buffer
        self.experience_size = 6  # o, a, r, o', done, new_ep
        if self.args.actor_lstm:
            self.experience_size += 4  # p_c_in, p_h_in, p_c_out, p_h_out
        if self.args.critic_lstm:
            self.experience_size += 4  # q_c_in, q_h_in, q_c_out, q_h_out

        # Create experience buffer
        self.replay_buffer = _ReplayBuffer(
            size=1e6,
            experience_size=self.experience_size,
            q_lstm_on=self.args.critic_lstm,
            p_lstm_on=self.args.actor_lstm)
        # self.replay_buffer = ReplayBufferLSTM(1e6)
        # self.replay_buffer = PrioritizedReplayBuffer(10000, 0.45)
        # self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size
        self.replay_sample_index = None

        # Information tracking
        self.tracker = InfoTracker(self.name, self.args)

    def reset_lstm(self):
        self.q_c, self.q_h = create_init_state(num_batches=1,
                                               len_sequence=self.q_h.shape[-1])
        self.p_c, self.p_h = create_init_state(num_batches=1,
                                               len_sequence=self.p_h.shape[-1])

    def action(self, obs):
        if self.args.actor_lstm:
            action, state = self.act(*[obs[None], self.p_c, self.p_h])
            self.p_c, self.p_h = state
        else:
            action = self.act(obs[None])

        action = action[0]
        if self.args.tracking:
            self.tracker.record_information("communication",
                                            np.argmax(action[0][-2:]))
        return action

    def experience(self, *args):
        assert len(
            args
        ) == self.experience_size, "Got {} experience args, but expected {}".format(
            len(args), self.experience_size)
        # Store transition in the replay buffer.
        self.replay_buffer.add(*args)

    def preupdate(self, inds):
        self.replay_sample_index = inds

    def update(self, agents, t):
        # replay buffer is not large enough yet
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            return
        if t % 100 != 0:  # only update every 100 steps
            return

        self.tracker.start()

        if self.replay_sample_index is None:
            self.replay_sample_index = self.replay_buffer.make_index_lstm(
                self.args.batch_size)
            # raise ValueError("Didn't want to resample indices")

        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        p_c_in, p_h_in = [], []
        p_c_out, p_h_out = [], []
        q_c_in, q_h_in = [], []
        q_c_out, q_h_out = [], []

        index = self.replay_sample_index
        for i in range(self.n):
            # if self.args.actor_lstm and self.args.critic_lstm:
            # print("getting both lstm states")
            obs, act, rew, obs_next, done, p_c_in_t, p_h_in_t, p_c_out_t, p_h_out_t, q_c_in_t, q_h_in_t, q_c_out_t, q_h_out_t = agents[
                i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)

            if self.args.actor_lstm:
                p_c_in.append(p_c_in_t)
                p_h_in.append(p_h_in_t)
                p_c_out.append(p_c_out_t)
                p_h_out.append(p_h_out_t)
            if self.args.critic_lstm:
                q_c_in.append(q_c_in_t)
                q_h_in.append(q_h_in_t)
                q_c_out.append(q_c_out_t)
                q_h_out.append(q_h_out_t)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # target actor
            if self.args.actor_lstm:
                target_act_next_n = [
                    agents[i].p_debug['target_act'](obs_next_n[i], p_c_out[i],
                                                    p_h_out[i])
                    for i in range(self.n)
                ]  # next lstm state
            else:
                target_act_next_n = [
                    agents[i].p_debug['target_act'](obs_next_n[i])
                    for i in range(self.n)
                ]

            # target critic
            if self.args.critic_lstm:
                target_q_next = self.q_debug['target_q_values'](
                    *(obs_next_n + target_act_next_n + q_c_out +
                      q_h_out))  # take in next lstm state
            else:
                target_q_next = self.q_debug['target_q_values'](
                    *(obs_next_n + target_act_next_n))

            rew = np.reshape(rew, target_q_next.shape)
            done = np.reshape(done, target_q_next.shape)
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next

        target_q /= num_sample

        if self.args.critic_lstm and self.args.actor_lstm:
            q_loss = self.q_train(*(obs_n + act_n + q_c_in + q_h_in +
                                    [target_q]))  # past p, q vals
            p_loss = self.p_train(*(obs_n + act_n + q_c_in + q_h_in + p_c_in +
                                    p_h_in))
        elif self.args.critic_lstm:
            q_loss = self.q_train(*(obs_n + act_n + q_c_in + q_h_in +
                                    [target_q]))  # past p, q vals
            p_loss = self.p_train(*(obs_n + act_n + q_c_in + q_h_in))
        elif self.args.actor_lstm:
            q_loss = self.q_train(*(obs_n + act_n +
                                    [target_q]))  # past p, q vals
            p_loss = self.p_train(*(obs_n + act_n + p_c_in + p_h_in))
        else:
            q_loss = self.q_train(*(obs_n + act_n +
                                    [target_q]))  # past p, q vals
            p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        if self.args.tracking:
            self.tracker.record_information("q_loss", q_loss)
            self.tracker.record_information("p_loss", p_loss)
            self.tracker.record_information("target_q_mean", np.mean(target_q))
            self.tracker.record_information("reward_mean", np.mean(rew))
            self.tracker.record_information("target_q_next_mean",
                                            np.mean(target_q_next))
            self.tracker.record_information("target_q_std", np.std(target_q))

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
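The recurrent trainer above keeps per-episode LSTM state in self.p_c/self.p_h and self.q_c/self.q_h, so whatever script drives it has to clear that state at every episode boundary. The lines below are a hypothetical rollout sketch, not part of the original code: env, trainers, num_episodes and max_episode_len are placeholders assumed to exist in the surrounding script; only the reset_lstm() and action() calls mirror the class above.

# Hypothetical driver loop; `env`, `trainers`, `num_episodes` and
# `max_episode_len` are assumed to exist in the surrounding script.
for episode in range(num_episodes):
    obs_n = env.reset()
    for trainer in trainers:
        trainer.reset_lstm()  # clear p_c/p_h and q_c/q_h at the episode boundary
    for step in range(max_episode_len):
        # action() also advances each agent's actor hidden state (p_c, p_h)
        action_n = [trainer.action(obs) for trainer, obs in zip(trainers, obs_n)]
        obs_n, rew_n, done_n, info_n = env.step(action_n)
        if all(done_n):
            break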
Example #5
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer

        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

        # set up tracking
        self.tracker = InfoTracker(self.name, self.args)

    def action(self, obs):
        action = self.act(obs[None])[0]
        if self.args.tracking:
            self.tracker.record_information("communication",
                                            np.argmax(action[0][-2:]))
        return action

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        # replay buffer is not large enough yet
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            return
        if t % 100 != 0:  # only update every 100 steps
            return

        self.tracker.start()

        self.replay_sample_index = self.replay_buffer.make_index(
            self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[
                i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            rew = np.reshape(rew, target_q_next.shape)
            done = np.reshape(done, target_q_next.shape)
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next

        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        if self.args.tracking:
            self.tracker.record_information("q_loss", q_loss)
            self.tracker.record_information("p_loss", p_loss)
            self.tracker.record_information("target_q_mean", np.mean(target_q))
            self.tracker.record_information("reward_mean", np.mean(rew))
            self.tracker.record_information("target_q_next_mean",
                                            np.mean(target_q_next))
            self.tracker.record_information("target_q_std", np.std(target_q))

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
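For the feed-forward trainer, the surrounding training script is expected to store one transition per agent each step and then call preupdate() and update() across all trainers. The snippet below is a rough sketch of that wiring under stated assumptions, not the original script: env, trainers, obs_n, action_n, terminal and train_step are taken to come from an outer rollout loop; only the trainer method calls mirror the class above.

# Hypothetical per-step bookkeeping; `env`, `trainers`, `obs_n`, `action_n`,
# `terminal` and `train_step` come from an assumed outer rollout loop.
new_obs_n, rew_n, done_n, info_n = env.step(action_n)
for i, trainer in enumerate(trainers):
    # each agent stores only its own (o, a, r, o', done) transition
    trainer.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                       done_n[i], terminal)
obs_n = new_obs_n

for trainer in trainers:
    trainer.preupdate()  # drop any cached replay-sample indices
for trainer in trainers:
    # update() returns early until the buffer holds max_replay_buffer_len
    # transitions, and only trains every 100 steps
    trainer.update(trainers, train_step)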