Example #1
    def test_len(self):
        rb = ReplayBuffer(5)
        rb.add(Transitions[0]).add(Transitions[1]).add(Transitions[2])
        assert len(rb) == 3
        for i in range(8):
            rb.add(Transitions[i])
        assert len(rb) == 5
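
This test assumes a ReplayBuffer whose add() returns the buffer itself (so calls can be chained) and whose length never exceeds the capacity passed to the constructor. A minimal sketch consistent with just this test is shown below; the deque-based storage is an assumption here, and the circular-buffer test in Example #4 pins the actual storage layout down more precisely.

from collections import deque

class ReplayBuffer:
    """Minimal sketch: capacity-bounded storage with a chainable add()."""

    def __init__(self, capacity):
        self._items = deque(maxlen=capacity)   # oldest entries are dropped automatically

    def add(self, transition):
        self._items.append(transition)
        return self                            # enables rb.add(a).add(b).add(c)

    def __len__(self):
        return len(self._items)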
Example #2
class PlayerTrainer(object):
    def __init__(self, actor, critic, buffersize, game, player, batch_size, gamma):
        self.actor = actor
        self.critic = critic
        self.replay = ReplayBuffer(buffersize)
        self.game = game
        self.player = player
        self.batch_size = batch_size
        self.gamma = gamma


    def noisyMaxQMove(self):
        state = self.game.space
        As = self.actor.predict(np.reshape(state, (1, *state.shape)))
        avail = self.game.avail()
        availQ = {}
        availP = []
        for k in avail:
            availQ[k] = As[0][k]
            availP.append(As[0][k])
        # if sum(availP)> 0:
        availP = np.array(availP)

        availP = [round(i, 5) if i >= 0 else (-.001 * round(i, 5)) for i in availP]
        availNorm = [i / sum(availP) for i in availP]

        a = np.random.choice(avail, p=availNorm)

        self.game.move(a,self.player)
        next_state, reward = self.game.step(self.player)

        self.bufferAdd(state,As,reward,self.game.game_over,next_state)
        if self.replay.size() > self.batch_size:
            s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay.sample_batch(self.batch_size)
            target_q = self.critic.predict_target(s2_batch,self.actor.predict_target(s2_batch))
            y_i = []
            for k in range(self.batch_size):
                if t_batch[k]:
                    y_i.append(r_batch[k])
                else:
                    y_i.append(r_batch[k] + self.gamma * target_q[k])

            predicted_q_value, _ = self.critic.train(
                s_batch, a_batch, np.reshape(y_i, (self.batch_size, 1)))

            #ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = self.actor.predict(s_batch)
            grads = self.critic.action_gradients(s_batch, a_outs)
            self.actor.train(s_batch, grads[0])

            # Update target networks
            self.actor.update_target_network()
            self.critic.update_target_network()
        return self.game.space , reward

    def bufferAdd(self, state, Qs, reward, terminal, next_state):
        self.replay.add(np.reshape(state, (self.actor.s_dim,)),
                        np.reshape(Qs, (self.actor.a_dim,)),
                        reward, terminal,
                        np.reshape(next_state, (self.actor.s_dim,)))
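
The action-selection step in noisyMaxQMove turns the actor's outputs for the available moves into a sampling distribution: negative scores are shrunk toward zero (scaled by -0.001) rather than discarded, and the result is normalized. A self-contained sketch of that normalization, with made-up positions and Q-values, looks like this:

import numpy as np

avail = [0, 2, 3]                     # available board positions (illustrative)
q = {0: 0.42, 2: -0.10, 3: 0.18}      # actor outputs for those positions (illustrative)

# Shrink negative scores toward zero instead of dropping them outright,
# then normalize so the values form a valid probability distribution over `avail`.
scores = [round(q[a], 5) if q[a] >= 0 else -0.001 * round(q[a], 5) for a in avail]
probs = [s / sum(scores) for s in scores]

action = np.random.choice(avail, p=probs)
print(action, probs)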
Example #3
    def test_random_sampling(self):
        rb = ReplayBuffer(3)
        rb.add(Transitions[0]).add(Transitions[1]).add(Transitions[1]).add(
            Transitions[2])

        samples = rb.sample(100)
        n_1, n_2 = 0, 0
        for sample in samples:
            if sample == Transitions[1]:
                n_1 += 1
            elif sample == Transitions[2]:
                n_2 += 1
            else:
                pytest.fail()

        assert n_1 > n_2
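
This test relies on the eviction behaviour of a capacity-3 buffer: after the four add() calls, Transitions[0] has been overwritten, leaving two copies of Transitions[1] and one of Transitions[2], so uniform sampling returns Transitions[1] roughly twice as often and the final assertion holds with high probability (it is statistical, not deterministic). A tiny illustration of that frequency argument, assuming uniform sampling over the stored entries:

import random

storage = ['T1', 'T1', 'T2']                      # contents once T0 has been evicted
samples = [random.choice(storage) for _ in range(100)]
print(samples.count('T1') > samples.count('T2'))  # True with high probability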
Example #4
    def test_circular_buffer(self):
        rb = ReplayBuffer(4)
        rb.add(Transitions[0])
        rb.add(Transitions[1])
        rb.add(Transitions[2])
        rb.add(Transitions[3])
        rb.add(Transitions[4])
        rb.add(Transitions[5])

        assert (rb._storage == [
            Transitions[4], Transitions[5], Transitions[2], Transitions[3]
        ]).all()
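
The asserted ordering [T4, T5, T2, T3] implies that writes wrap around via a modulo index rather than shifting existing entries (unlike the deque sketch after Example #1). One hypothetical implementation that reproduces exactly this layout:

import numpy as np

class ReplayBuffer:
    """Sketch of circular storage that reproduces the asserted ordering."""

    def __init__(self, capacity):
        self._storage = np.empty(capacity, dtype=object)
        self._capacity = capacity
        self._next = 0      # next write position, wraps around
        self._size = 0

    def add(self, transition):
        self._storage[self._next] = transition
        self._next = (self._next + 1) % self._capacity
        self._size = min(self._size + 1, self._capacity)
        return self

    def __len__(self):
        return self._size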
Example #5
class DQNAgent(object):
    """
    refs: https://github.com/skumar9876/Hierarchical-DQN/blob/master/dqn.py
    """
    def __init__(self,
                 states_n: tuple,
                 actions_n: int,
                 hidden_layers: list,
                 scope_name: str,
                 sess=None,
                 learning_rate=1e-4,
                 discount=0.98,
                 replay_memory_size=100000,
                 batch_size=32,
                 begin_train=1000,
                 targetnet_update_freq=1000,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay_step=50000,
                 seed=1,
                 logdir='logs',
                 savedir='save',
                 save_freq=10000,
                 use_tau=False,
                 tau=0.001):
        """

        :param states_n: tuple
        :param actions_n: int
        :param hidden_layers: list
        :param scope_name: str
        :param sess: tf.Session
        :param learning_rate: float
        :param discount: float
        :param replay_memory_size: int
        :param batch_size: int
        :param begin_train: int
        :param targetnet_update_freq: int
        :param epsilon_start: float
        :param epsilon_end: float
        :param epsilon_decay_step: int
        :param seed: int
        :param logdir: str
        :param savedir: str
        :param save_freq: int
        :param use_tau: bool
        :param tau: float
        """
        self.states_n = states_n
        self.actions_n = actions_n
        self._hidden_layers = hidden_layers
        self._scope_name = scope_name
        self.lr = learning_rate
        self._target_net_update_freq = targetnet_update_freq
        self._current_time_step = 0
        self._epsilon_schedule = LinearSchedule(epsilon_decay_step,
                                                epsilon_end, epsilon_start)
        self._train_batch_size = batch_size
        self._begin_train = begin_train
        self._gamma = discount

        self._use_tau = use_tau
        self._tau = tau

        self.savedir = savedir
        self.save_freq = save_freq

        self.qnet_optimizer = tf.train.AdamOptimizer(self.lr)

        self._replay_buffer = ReplayBuffer(replay_memory_size)

        self._seed(seed)

        with tf.Graph().as_default():
            self._build_graph()
            self._merged_summary = tf.summary.merge_all()

            if sess is None:
                self.sess = tf.Session()
            else:
                self.sess = sess
            self.sess.run(tf.global_variables_initializer())

            self._saver = tf.train.Saver()

            self._summary_writer = tf.summary.FileWriter(logdir=logdir)
            self._summary_writer.add_graph(tf.get_default_graph())

    def show_memory(self):
        print(self._replay_buffer.show())

    def _q_network(self, state, hidden_layers, outputs, scope_name, trainable):

        with tf.variable_scope(scope_name):
            out = state
            for ly in hidden_layers:
                out = layers.fully_connected(out,
                                             ly,
                                             activation_fn=tf.nn.relu,
                                             trainable=trainable)
            out = layers.fully_connected(out,
                                         outputs,
                                         activation_fn=None,
                                         trainable=trainable)
        return out

    def _build_graph(self):
        self._state = tf.placeholder(dtype=tf.float32,
                                     shape=(None, ) + self.states_n,
                                     name='state_input')

        with tf.variable_scope(self._scope_name):
            self._q_values = self._q_network(self._state, self._hidden_layers,
                                             self.actions_n, 'q_network', True)
            self._target_q_values = self._q_network(self._state,
                                                    self._hidden_layers,
                                                    self.actions_n,
                                                    'target_q_network', False)

        with tf.variable_scope('q_network_update'):
            self._actions_onehot = tf.placeholder(dtype=tf.float32,
                                                  shape=(None, self.actions_n),
                                                  name='actions_onehot_input')
            self._td_targets = tf.placeholder(dtype=tf.float32,
                                              shape=(None, ),
                                              name='td_targets')
            self._q_values_pred = tf.reduce_sum(self._q_values *
                                                self._actions_onehot,
                                                axis=1)

            self._error = tf.abs(self._q_values_pred - self._td_targets)
            quadratic_part = tf.clip_by_value(self._error, 0.0, 1.0)
            linear_part = self._error - quadratic_part
            self._loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) +
                                        linear_part)

            qnet_gradients = self.qnet_optimizer.compute_gradients(
                self._loss, tf.trainable_variables())
            for i, (grad, var) in enumerate(qnet_gradients):
                if grad is not None:
                    qnet_gradients[i] = (tf.clip_by_norm(grad, 10), var)
            self.train_op = self.qnet_optimizer.apply_gradients(qnet_gradients)

            tf.summary.scalar('loss', self._loss)

            with tf.name_scope('target_network_update'):
                q_network_params = [
                    t for t in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope=self._scope_name +
                                                 '/q_network')
                    if t.name.startswith(self._scope_name + '/q_network/')
                ]
                target_q_network_params = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope=self._scope_name + '/target_q_network')

                self.target_update_ops = []
                for var, var_target in zip(
                        sorted(q_network_params, key=lambda v: v.name),
                        sorted(target_q_network_params, key=lambda v: v.name)):
                    # self.target_update_ops.append(var_target.assign(var))

                    # soft target update
                    self.target_update_ops.append(
                        var_target.assign(
                            tf.multiply(var_target, 1 - self._tau) +
                            tf.multiply(var, self._tau)))
                self.target_update_ops = tf.group(*self.target_update_ops)

    def choose_action(self, state, epsilon=None):
        """
        for one agent
        :param state:
        :param epsilon:
        :return:
        """
        if epsilon is not None:
            epsilon_used = epsilon
        else:
            epsilon_used = self._epsilon_schedule.value(
                self._current_time_step)
        if np.random.random() < epsilon_used:
            return np.random.randint(0, self.actions_n)
        else:
            q_values = self.sess.run(self._q_values,
                                     feed_dict={self._state: state[None]})

            return np.argmax(q_values[0])

    def choose_actions(self, states, epsilons=None):
        """
        for multi-agent
        :param states:
        :param epsilons:
        :return:
        """
        if epsilons is not None:
            epsilons_used = epsilons
        else:
            # Fall back to the scheduled epsilon, replicated for every agent.
            epsilons_used = [self._epsilon_schedule.value(
                self._current_time_step)] * len(states)

        actions = []
        for i, state in enumerate(states):
            if np.random.random() < epsilons_used[i]:
                actions.append(np.random.randint(0, self.actions_n))
            else:
                q_values = self.sess.run(self._q_values,
                                         feed_dict={self._state: state[None]})

                actions.append(np.argmax(q_values[0]))

        return actions

    def check_network_output(self, state):
        q_values = self.sess.run(self._q_values,
                                 feed_dict={self._state: state[None]})
        print(q_values[0])

    def store(self, state, action, reward, next_state, terminate):
        self._replay_buffer.add(state, action, reward, next_state, terminate)

    def get_max_target_Q_s_a(self, next_states):
        next_state_q_values = self.sess.run(
            self._q_values, feed_dict={self._state: next_states})
        next_state_target_q_values = self.sess.run(
            self._target_q_values, feed_dict={self._state: next_states})

        next_select_actions = np.argmax(next_state_q_values, axis=1)
        bt_sz = len(next_states)
        next_select_actions_onehot = np.zeros((bt_sz, self.actions_n))
        for i in range(bt_sz):
            next_select_actions_onehot[i, next_select_actions[i]] = 1.

        next_state_max_q_values = np.sum(next_state_target_q_values *
                                         next_select_actions_onehot,
                                         axis=1)
        return next_state_max_q_values

    def train(self):

        self._current_time_step += 1

        if self._current_time_step == 1:
            print('Training starts.')
            self.sess.run(self.target_update_ops)

        if self._current_time_step > self._begin_train:
            states, actions, rewards, next_states, terminates = self._replay_buffer.sample(
                batch_size=self._train_batch_size)

            actions_onehot = np.zeros((self._train_batch_size, self.actions_n))
            for i in range(self._train_batch_size):
                actions_onehot[i, actions[i]] = 1.

            next_state_q_values = self.sess.run(
                self._q_values, feed_dict={self._state: next_states})
            next_state_target_q_values = self.sess.run(
                self._target_q_values, feed_dict={self._state: next_states})

            next_select_actions = np.argmax(next_state_q_values, axis=1)
            next_select_actions_onehot = np.zeros(
                (self._train_batch_size, self.actions_n))
            for i in range(self._train_batch_size):
                next_select_actions_onehot[i, next_select_actions[i]] = 1.

            next_state_max_q_values = np.sum(next_state_target_q_values *
                                             next_select_actions_onehot,
                                             axis=1)

            td_targets = rewards + self._gamma * next_state_max_q_values * (
                1 - terminates)

            _, str_ = self.sess.run(
                [self.train_op, self._merged_summary],
                feed_dict={
                    self._state: states,
                    self._actions_onehot: actions_onehot,
                    self._td_targets: td_targets
                })

            self._summary_writer.add_summary(str_, self._current_time_step)

        # update target_net
        if self._use_tau:
            self.sess.run(self.target_update_ops)
        else:
            if self._current_time_step % self._target_net_update_freq == 0:
                self.sess.run(self.target_update_ops)

        # save model
        if self._current_time_step % self.save_freq == 0:

            # TODO save the model with highest performance
            self._saver.save(sess=self.sess,
                             save_path=self.savedir + '/my-model',
                             global_step=self._current_time_step)

    def train_without_replaybuffer(self, states, actions, target_values):

        self._current_time_step += 1

        if self._current_time_step == 1:
            print('Training starts.')
            self.sess.run(self.target_update_ops)

        bt_sz = len(states)
        actions_onehot = np.zeros((bt_sz, self.actions_n))
        for i in range(bt_sz):
            actions_onehot[i, actions[i]] = 1.

        _, str_ = self.sess.run(
            [self.train_op, self._merged_summary],
            feed_dict={
                self._state: states,
                self._actions_onehot: actions_onehot,
                self._td_targets: target_values
            })

        self._summary_writer.add_summary(str_, self._current_time_step)

        # update target_net
        if self._use_tau:
            self.sess.run(self.target_update_ops)
        else:
            if self._current_time_step % self._target_net_update_freq == 0:
                self.sess.run(self.target_update_ops)

        # save model
        if self._current_time_step % self.save_freq == 0:

            # TODO save the model with highest performance
            self._saver.save(sess=self.sess,
                             save_path=self.savedir + '/my-model',
                             global_step=self._current_time_step)

    def load_model(self):
        self._saver.restore(self.sess,
                            tf.train.latest_checkpoint(self.savedir))

    def _seed(self, lucky_number):
        tf.set_random_seed(lucky_number)
        np.random.seed(lucky_number)
        random.seed(lucky_number)
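
DQNAgent relies on a LinearSchedule constructed as LinearSchedule(epsilon_decay_step, epsilon_end, epsilon_start) and queried with value(t). A sketch consistent with that usage follows; note that Example #7 passes the arguments in a different order, so this constructor signature is an assumption for this example only.

class LinearSchedule:
    """Sketch: linear interpolation from initial_p down to final_p over schedule_timesteps."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# epsilon decays from 1.0 to 0.1 over 50000 steps, then stays at 0.1
schedule = LinearSchedule(50000, 0.1, 1.0)
print(schedule.value(0), schedule.value(25000), schedule.value(200000))  # approx. 1.0, 0.55, 0.1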
Example #6
class DDPGLearner():
    def __init__(self, input_space, act_space, scope, args):
        self.input_shape = input_space
        self.act_space = act_space
        self.scope = scope
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        self.optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
        self.grad_norm_clipping = 0.5
        with tf.variable_scope(self.scope):
            act_pdtype = make_pdtype(act_space)

            # act_ph = act_pdtype.sample_placeholder([None], name= "action")
            act_ph = tf.placeholder(tf.float32, shape=(None, 1))
            if args.game == "RoboschoolPong-v1":
                obs_ph = tf.placeholder(tf.float32,
                                        shape=(None, input_space.shape[0]))
            elif args.game == "Pong-2p-v0":
                obs_ph = tf.placeholder(tf.float32,
                                        shape=(None, input_space.shape[0],
                                               input_space.shape[1],
                                               input_space.shape[2]))
            q_target = tf.placeholder(tf.float32, shape=(None, ))

            #build the world representation z
            z = conv_model(obs_ph, 20, scope="world_model")
            p_input = z

            p = mlp_model(p_input, 2, scope="p_func")
            p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

            act_pd = act_pdtype.pdfromflat(p)
            act_sample = act_pd.sample()

            p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

            q_input = tf.concat([z, act_sample], -1)
            q = mlp_model(q_input, 1, scope="q_func")
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
            pg_loss = -tf.reduce_mean(q)

            q_loss = tf.reduce_mean(tf.square(q - q_target))
            # q_reg = tf.reduce_mean(tf.square(q))
            q_optimize_expr = U.minimize_and_clip(self.optimizer, q_loss,
                                                  q_func_vars,
                                                  self.grad_norm_clipping)

            p_loss = pg_loss + p_reg * 1e-3

            p_optimize_expr = U.minimize_and_clip(self.optimizer, p_loss,
                                                  p_func_vars,
                                                  self.grad_norm_clipping)

            p_values = U.function([obs_ph], p)

            target_p = mlp_model(z, 2, scope="target_p_func")
            target_p_func_vars = U.scope_vars(
                U.absolute_scope_name("target_p_func"))

            target_q = mlp_model(q_input, 1, scope="target_q_func")
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func"))
            target_act_sample = act_pdtype.pdfromflat(target_p).sample()

            self.update_target_p = make_update_exp(p_func_vars,
                                                   target_p_func_vars)
            self.update_target_q = make_update_exp(q_func_vars,
                                                   target_q_func_vars)

            self.act = U.function(inputs=[obs_ph], outputs=act_sample)
            self.target_act = U.function(inputs=[obs_ph],
                                         outputs=target_act_sample)
            self.p_train = U.function(inputs=[obs_ph] + [act_ph],
                                      outputs=p_loss,
                                      updates=[p_optimize_expr])
            self.q_train = U.function(inputs=[obs_ph] + [act_ph] + [q_target],
                                      outputs=q_loss,
                                      updates=[q_optimize_expr])
            self.q_values = U.function([obs_ph] + [act_ph], q)
            self.target_q_values = U.function([obs_ph] + [act_ph], target_q)

    def get_act(self, obs):
        return self.act(*([obs]))[0]

    def experience(self, obs, act, rew, new_obs, done):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def learn(self, batch_size, gamma):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough yet
            return 0, 0
        self.replay_sample_index = self.replay_buffer.make_index(batch_size)
        # collect replay sample from all agents
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(
            self.replay_sample_index)
        # train q network
        target_q = [0.] * batch_size
        target_act_next = self.target_act(obs_next)
        target_q_next = self.target_q_values(*([obs_next] + [target_act_next]))
        for i in range(batch_size):
            target_q[i] += rew[i] + gamma * (1.0 - done[i]) * target_q_next[i]
        target_q = np.squeeze(target_q, axis=1)
        q_loss = self.q_train(*([obs] + [act] + [target_q]))
        p_loss = self.p_train(*([obs] + [act]))

        self.update_target_p()
        self.update_target_q()
        return q_loss, p_loss

    def reset_replay_buffer(self):
        self.replay_buffer = ReplayBuffer(1e6)
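
DDPGLearner assumes a replay buffer exposing add(obs, act, rew, new_obs, done), make_index(batch_size) and sample_index(indices), in the style of the MADDPG buffer. The sketch below illustrates that interface; the internals are illustrative, not the project's actual implementation.

import random
import numpy as np

class ReplayBuffer:
    """Sketch of the buffer interface DDPGLearner appears to rely on."""

    def __init__(self, size):
        self._storage = []
        self._maxsize = int(size)
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs, act, rew, new_obs, done):
        data = (obs, act, rew, new_obs, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data   # overwrite oldest entry
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def make_index(self, batch_size):
        return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]

    def sample_index(self, idxes):
        batch = [self._storage[i] for i in idxes]
        obs, act, rew, obs_next, done = map(np.array, zip(*batch))
        return obs, act, rew, obs_next, done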
Example #7
def main(env_name='KungFuMasterNoFrameskip-v0',
         train_freq=4,
         target_update_freq=10000,
         checkpoint_freq=100000,
         log_freq=1,
         batch_size=32,
         train_after=200000,
         max_timesteps=5000000,
         buffer_size=50000,
         vmin=-10,
         vmax=10,
         n=51,
         gamma=0.99,
         final_eps=0.1,
         final_eps_update=1000000,
         learning_rate=0.00025,
         momentum=0.95):
    env = gym.make(env_name)
    env = wrap_env(env)

    state_dim = (4, 84, 84)
    action_count = env.action_space.n

    with C.default_options(activation=C.relu, init=C.he_uniform()):
        model_func = Sequential([
            Convolution2D((8, 8), 32, strides=4, name='conv1'),
            Convolution2D((4, 4), 64, strides=2, name='conv2'),
            Convolution2D((3, 3), 64, strides=1, name='conv3'),
            Dense(512, name='dense1'),
            Dense((action_count, n), activation=None, name='out')
        ])

    agent = CategoricalAgent(state_dim, action_count, model_func, vmin, vmax, n, gamma,
                             lr=learning_rate, mm=momentum, use_tensorboard=True)
    logger = agent.writer

    epsilon_schedule = LinearSchedule(1.0, final_eps, final_eps_update)
    replay_buffer = ReplayBuffer(buffer_size)

    try:
        obs = env.reset()
        episode = 0
        rewards = 0
        steps = 0

        for t in range(max_timesteps):
            # Take action
            if t > train_after:
                action = agent.act(obs, epsilon=epsilon_schedule.value(t))
            else:
                action = np.random.choice(action_count)
            obs_, reward, done, _ = env.step(action)

            # Store transition in replay buffer
            replay_buffer.add(obs, action, reward, obs_, float(done))

            obs = obs_
            rewards += reward

            if t > train_after and (t % train_freq) == 0:
                # Minimize error in projected Bellman update on a batch sampled from replay buffer
                experience = replay_buffer.sample(batch_size)
                agent.train(*experience)  # experience is (s, a, r, s_, t) tuple
                logger.write_value('loss', agent.trainer.previous_minibatch_loss_average, t)

            if t > train_after and (t % target_update_freq) == 0:
                agent.update_target()

            if t > train_after and (t % checkpoint_freq) == 0:
                agent.checkpoint('checkpoints/model_{}.chkpt'.format(t))

            if done:
                episode += 1
                obs = env.reset()

                if episode % log_freq == 0:
                    steps = t - steps + 1

                    logger.write_value('rewards', rewards, episode)
                    logger.write_value('steps', steps, episode)
                    logger.write_value('epsilon', epsilon_schedule.value(t), episode)
                    logger.flush()

                rewards = 0
                steps = t

    finally:
        agent.save_model('checkpoints/{}.cdqn'.format(env_name))
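
The vmin, vmax and n arguments parameterize the categorical (C51) value distribution that CategoricalAgent is assumed to learn: n atoms evenly spaced between vmin and vmax. A quick sketch of that support, under the usual C51 convention:

import numpy as np

vmin, vmax, n = -10, 10, 51
delta_z = (vmax - vmin) / (n - 1)     # spacing between neighbouring atoms (0.4 here)
atoms = np.linspace(vmin, vmax, n)    # support of the categorical value distribution
print(len(atoms), delta_z, atoms[0], atoms[-1])   # 51 0.4 -10.0 10.0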
Example #8
def main():
    env = MultiEnvRunnerWrapper(ENV_NUM, CMOTP)

    lucky_no = RANDOM_SEED
    set_seed(lucky_no)

    agent1 = LenientDQNAgent(env.envs[0], ENV_NUM, [256, 256], 'LenientAgent1',
                             learning_rate=1e-4,
                             use_tau=True, tau=1e-3,
                             mu=MAX_U,
                             logdir='logs/logs1_ERM_{}_{}_{}_{}'.format(ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES),
                             savedir='save/save1_ERM_{}_{}_{}_{}'.format(ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES),
                             auto_save=False, discount=GAMMA)

    agent2 = LenientDQNAgent(env.envs[0], ENV_NUM, [256, 256], 'LenientAgent2',
                             learning_rate=1e-4,
                             use_tau=True, tau=1e-3,
                             mu=MAX_U,
                             logdir='logs/logs2_ERM_{}_{}_{}_{}'.format(ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES),
                             savedir='save/save2_ERM_{}_{}_{}_{}'.format(ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES),
                             auto_save=False, discount=GAMMA)
    erm1 = ReplayBuffer(ERM_FACTOR * ENV_NUM * STEP_N)
    erm2 = ReplayBuffer(ERM_FACTOR * ENV_NUM * STEP_N)

    print('after init')
    begintime = time.time()

    if TRAIN:

        train_input_shape = (ENV_NUM * STEP_N,) + env.envs[0].observation_space.shape

        episodes_1 = [[] for _ in range(ENV_NUM)]
        episodes_2 = [[] for _ in range(ENV_NUM)]
        states_1, states_2 = env.reset()

        ep_cnt = 0

        ep_len_log = []
        min_len = 10000.

        train_num = 0
        train_log = []

        # For each state, record that state's temperature value across all environments
        temp_log = [[] for _ in range(ENV_NUM)]

        while len(ep_len_log) < TRAIN_EPISODES:

            sts_1 = [[] for _ in range(ENV_NUM)]
            acts_1 = [[] for _ in range(ENV_NUM)]
            rwds_1 = [[] for _ in range(ENV_NUM)]
            n_sts_1 = [[] for _ in range(ENV_NUM)]
            dns_1 = [[] for _ in range(ENV_NUM)]
            ln_1 = [[] for _ in range(ENV_NUM)]

            sts_2 = [[] for _ in range(ENV_NUM)]
            acts_2 = [[] for _ in range(ENV_NUM)]
            rwds_2 = [[] for _ in range(ENV_NUM)]
            n_sts_2 = [[] for _ in range(ENV_NUM)]
            dns_2 = [[] for _ in range(ENV_NUM)]
            ln_2 = [[] for _ in range(ENV_NUM)]

            # get a batch of train data
            for j in range(ENV_NUM):
                for k in range(STEP_N):
                    action_1 = agent1.choose_action(states_1[j], j)
                    action_2 = agent2.choose_action(states_2[j], j)
                    action_n = [action_1, action_2]
                    next_state, reward, done, _ = env.envs[j].step(action_n)
                    next_state_1, next_state_2 = next_state
                    reward_1, reward_2 = reward
                    done_1, done_2 = done

                    episodes_1[j].append((states_1[j], action_1))
                    episodes_2[j].append((states_2[j], action_2))

                    sts_1[j].append(states_1[j])
                    acts_1[j].append(action_1)
                    rwds_1[j].append(reward_1)
                    n_sts_1[j].append(next_state_1)
                    dns_1[j].append(done_1)
                    ln_1[j].append(
                        agent1.leniency_calculator.calc_leniency(agent1.temp_recorders[j].get_state_temp(states_1[j])))

                    sts_2[j].append(states_2[j])
                    acts_2[j].append(action_2)
                    rwds_2[j].append(reward_2)
                    n_sts_2[j].append(next_state_2)
                    dns_2[j].append(done_2)
                    ln_2[j].append(
                        agent2.leniency_calculator.calc_leniency(agent2.temp_recorders[j].get_state_temp(states_2[j])))

                    states_1[j] = next_state_1
                    states_2[j] = next_state_2

                    if done_1:
                        states_1[j], states_2[j] = env.envs[j].reset()
                        agent1.temp_recorders[j].decay_temp(episodes_1[j])
                        agent2.temp_recorders[j].decay_temp(episodes_2[j])

                        ep_cnt += 1

                        this_train_log = (train_num, ep_cnt, j,
                                          agent1.temp_recorders[j].get_ave_temp(),
                                          agent1.temp_recorders[j].get_temp_len(),
                                          len(episodes_1[j]))
                        train_log.append(this_train_log)

                        print('train_num: {}, episode_cnt: {}, env: {} , mean_temp: {}, temp_len: {}, len: {} '.format(
                            *this_train_log))
                        checked_temp = agent1.temp_recorders[j].show_temp(big=True, narrow=False)
                        temp_log[j].append(checked_temp)

                        if ep_cnt % 100 == 0:
                            print('testing...')
                            print('average episode length: ', test(agent1, agent2, render=False, load_model=False))

                        ep_len_log.append(len(episodes_1[j]))
                        tmp = np.mean(ep_len_log[-10:])
                        if tmp < min_len:
                            print('update min_len with ', tmp)
                            min_len = tmp
                            agent1.save_model()
                            agent2.save_model()

                        episodes_1[j] = []
                        episodes_2[j] = []

            # discount reward
            last_values_1 = agent1.get_max_target_Q_s_a(states_1)
            for j, (rwd_j, dn_j, l_v_j) in enumerate(zip(rwds_1, dns_1, last_values_1)):
                if type(rwd_j) is np.ndarray:
                    rwd_j = rwd_j.tolist()
                if type(dn_j) is np.ndarray:
                    dn_j = dn_j.tolist()

                if dn_j[-1] == 0:
                    rwd_j = discount_with_dones(rwd_j + [l_v_j], dn_j + [0], GAMMA)[:-1]
                else:
                    rwd_j = discount_with_dones(rwd_j, dn_j, GAMMA)

                rwds_1[j] = rwd_j

            last_values_2 = agent2.get_max_target_Q_s_a(states_2)
            for j, (rwd_j, dn_j, l_v_j) in enumerate(zip(rwds_2, dns_2, last_values_2)):
                if type(rwd_j) is np.ndarray:
                    rwd_j = rwd_j.tolist()
                if type(dn_j) is np.ndarray:
                    dn_j = dn_j.tolist()

                if dn_j[-1] == 0:
                    rwd_j = discount_with_dones(rwd_j + [l_v_j], dn_j + [0], GAMMA)[:-1]
                else:
                    rwd_j = discount_with_dones(rwd_j, dn_j, GAMMA)

                rwds_2[j] = rwd_j

            # flatten
            sts_1 = np.asarray(sts_1, dtype=np.float32).reshape(train_input_shape)
            acts_1 = np.asarray(acts_1, dtype=np.int32).flatten()
            rwds_1 = np.asarray(rwds_1, dtype=np.float32).flatten()
            n_sts_1 = np.asarray(n_sts_1, dtype=np.float32).reshape(train_input_shape)
            dns_1 = np.asarray(dns_1, dtype=np.bool).flatten()
            ln_1 = np.asarray(ln_1, dtype=np.float32).flatten()

            sts_2 = np.asarray(sts_2, dtype=np.float32).reshape(train_input_shape)
            acts_2 = np.asarray(acts_2, dtype=np.int32).flatten()
            rwds_2 = np.asarray(rwds_2, dtype=np.float32).flatten()
            n_sts_2 = np.asarray(n_sts_2, dtype=np.float32).reshape(train_input_shape)
            dns_2 = np.asarray(dns_2, dtype=np.bool).flatten()
            ln_2 = np.asarray(ln_2, dtype=np.float32).flatten()

            # train
            agent1.train_without_replaybuffer(sts_1, acts_1, rwds_1, ln_1)
            agent2.train_without_replaybuffer(sts_2, acts_2, rwds_2, ln_2)
            train_num += 1


            # store these transitions to ERM
            for ii, (s1, a1, td1, l1) in enumerate(zip(sts_1, acts_1, rwds_1, ln_1)):
                erm1.add(s1, a1, td1, [], l1)
            for ii, (s2, a2, td2, l2) in enumerate(zip(sts_2, acts_2, rwds_2, ln_2)):
                erm2.add(s2, a2, td2, [], l2)

            # print(sts_1)
            # print(acts_1)
            # print(rwds_1)
            # print(ln_1)
            # print('----------------------')
            # erm1.show()
            # exit()

            # train with transitions from ERM
            for ii in range(ERM_TRAIN_NUM):
                erm_s1, erm_a1, erm_td1, _, erm_l1 = erm1.sample(ENV_NUM * STEP_N)
                erm_s2, erm_a2, erm_td2, _, erm_l2 = erm2.sample(ENV_NUM * STEP_N)
                # print('*************************')
                # print(erm_s1)
                # print(erm_a1)
                # print(erm_td1)
                # print(erm_l1)
                # exit()
                agent1.train_without_replaybuffer(erm_s1, erm_a1, erm_td1, erm_l1)
                agent2.train_without_replaybuffer(erm_s2, erm_a2, erm_td2, erm_l2)
                train_num += 1


        endtime = time.time()
        print('training time: {}'.format(endtime - begintime))

        with open('./train_log.txt', 'a') as f:
            f.write('ERM num_env: {}, n_step: {}, rand_seed: {}, episodes: {}, training time: {}'.format(
                ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES, endtime - begintime) + '\n')

        # np.save('ep_len_{}_{}_{}.npy'.format(ENV_NUM, STEP_N, lucky_no), ep_len_log)

        train_log = np.array(train_log)
        np.save('train_log_ERM_{}_{}_{}_{}.npy'.format(ENV_NUM, STEP_N, lucky_no, TRAIN_TIMES), train_log)

        temp_log = np.array(temp_log)
        np.save('temp_log_ERM_{}_{}_{}_{}.npy'.format(ENV_NUM, STEP_N, lucky_no, TRAIN_TIMES), temp_log)

    else:
        test(agent1, agent2, render=True, load_model=True)

    env.close()
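
discount_with_dones is assumed to behave like the helper of the same name in the A2C/baselines code: it walks the rewards backwards and resets the running return at terminal steps, which is why the loops above append the bootstrap value and then drop it with [:-1]. A reference sketch:

def discount_with_dones(rewards, dones, gamma):
    """Backward pass over n-step rewards, resetting the return at terminal steps."""
    discounted = []
    running = 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running = reward + gamma * running * (1.0 - done)
        discounted.append(running)
    return discounted[::-1]

print(discount_with_dones([1, 0, 0, 1], [0, 0, 0, 1], 0.9))
# approximately [1.729, 0.81, 0.9, 1.0]: the return restarts at the terminal step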
Example #9
def main():
    with open('cartpole.json', encoding='utf-8') as config_file:
        config = json.load(config_file)

    env = gym.make('CartPole-v0')
    state_shape = env.observation_space.shape
    action_count = env.action_space.n

    layers = []
    for layer in config['layers']:
        layers.append(Dense(layer, activation=C.relu))

    layers.append(Dense((action_count, config['n']), activation=None))
    model_func = Sequential(layers)

    replay_buffer = ReplayBuffer(config['buffer_capacity'])

    # Fill the buffer with randomly generated samples
    state = env.reset()
    for i in range(config['buffer_capacity']):
        action = env.action_space.sample()
        post_state, reward, done, _ = env.step(action)
        replay_buffer.add(state.astype(np.float32), action, reward, post_state.astype(np.float32), float(done))

        if done:
            state = env.reset()

    reward_buffer = np.zeros(config['max_episodes'], dtype=np.float32)
    losses = []

    epsilon_schedule = LinearSchedule(1, 0.01, config['max_episodes'])
    agent = CategoricalAgent(state_shape, action_count, model_func, config['vmin'], config['vmax'], config['n'],
                             lr=config['lr'], gamma=config['gamma'])

    log_freq = config['log_freq']
    for episode in range(1, config['max_episodes'] + 1):
        state = env.reset().astype(np.float32)
        done = False

        while not done:
            action = agent.act(state, epsilon_schedule.value(episode))
            post_state, reward, done, _ = env.step(action)

            post_state = post_state.astype(np.float32)
            replay_buffer.add(state, action, reward, post_state, float(done))
            reward_buffer[episode - 1] += reward

            state = post_state

        minibatch = replay_buffer.sample(config['minibatch_size'])
        agent.train(*minibatch)
        loss = agent.trainer.previous_minibatch_loss_average
        losses.append(loss)

        if episode % config['target_update_freq'] == 0:
            agent.update_target()

        if episode % log_freq == 0:
            average = np.sum(reward_buffer[episode - log_freq: episode]) / log_freq
            print('Episode {:4d} | Loss: {:6.4f} | Reward: {}'.format(episode, loss, average))

    agent.model.save('cartpole.cdqn')

    sns.set_style('dark')
    pd.Series(reward_buffer).rolling(window=log_freq).mean().plot()
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.title('CartPole - Reward with Time')
    plt.show()

    plt.plot(np.arange(len(losses)), losses)
    plt.xlabel('Episode')
    plt.ylabel('Loss')
    plt.title('CartPole - Loss with Time')
    plt.show()
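
The script reads all of its hyperparameters from cartpole.json, whose contents are not shown above. A configuration containing the keys the script actually accesses would look something like the following; every value here is illustrative only.

import json

# Illustrative values only -- not the project's real cartpole.json.
config = {
    "layers": [64, 64],            # hidden Dense layer widths
    "n": 51,                       # number of distribution atoms
    "vmin": -10, "vmax": 10,       # support of the value distribution
    "lr": 0.00025,
    "gamma": 0.99,
    "buffer_capacity": 10000,
    "minibatch_size": 32,
    "max_episodes": 1000,
    "target_update_freq": 25,
    "log_freq": 10,
}

with open("cartpole.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)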