Code Example #1
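Both examples below are class definitions lifted from the project and omit their module-level imports. A plausible preamble, inferred from the names actually used in the code, would look like the sketch below; the project-local helpers (Actor, Critic, MemoryBuffer, CsvBuffer, Dataset, top_ratio_hit_rate, tf_summary) are defined elsewhere in the repository, and the module paths shown for them are only a guess.

import os
import numpy as np
import tensorflow as tf        # TF 1.x-style summary/session APIs are used below
from tensorflow import keras   # or `import keras` with the TF backend; keras.backend.get_session() assumes TF 1.x
from tqdm import tqdm
# Project-local modules (paths are assumptions):
# from .networks import Actor, Critic
# from .memory import MemoryBuffer
# from .data import CsvBuffer, Dataset
# from .utils import top_ratio_hit_rate, tf_summary
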
class DDPG(object):
    """deep deterministic policy gradient
    """
    def __init__(self,
                 n_state,
                 n_action,
                 a_bound,
                 gamma=0.99,
                 tau=0.01,
                 actor_lr=0.0005,
                 critic_lr=0.001,
                 noise_std=0.1,
                 noise_decay=0.9995,
                 noise_decay_steps=1000,
                 buffer_size=20000,
                 save_interval=5000,
                 assess_interval=10,
                 logger=None,
                 checkpoint_queen=None):
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.pointer = 0
        self.buffer_size = buffer_size
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state,
                           self.n_action,
                           gamma=gamma,
                           lr=actor_lr,
                           tau=tau,
                           l2_reg=0)
        self.critic = Critic(self.n_state,
                             self.n_action,
                             gamma=gamma,
                             lr=critic_lr,
                             tau=tau,
                             l2_reg=0)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen

        self.prefix = self.__class__.__name__.lower()

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1',
                             self.critic.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1',
                             self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2',
                             self.critic.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2',
                             self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def policy_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        q_target = np.zeros_like(rewards)  # cf. np.asarray(copy=False) vs. np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.critic.gamma * q_nexts[i]
        return q_target

    def update_model(self, states, actions, q_values):
        # train critic
        loss_names, loss_values = self.critic.train_on_batch(
            states, actions, q_values)

        # train actor
        # p_actions = self.actor.predict(states)  #actions with no noise
        grad_ys = self.critic.gradients(
            states, self.actor.predict(states))  #(batch, n-action)
        actor_output = self.actor.train(states, self.actor.predict(states),
                                        grad_ys)

        # copy network
        self.actor.copy_weights()
        self.critic.copy_weights()

        # print(grad_ys, grad_ys.shape)
        # print(actor_output[0],actor_output[0].shape)
        # print(np.mean(grad_ys*actor_output[0]))

        return loss_names, loss_values, grad_ys, actor_output

    def save_weights(self, path):
        self.actor.save(path)
        self.critic.save(path)

    def save_model(self, path, file):
        self.actor.model.save(
            os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic.model.save(
            os.path.join(path, self.prefix + '_critic_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4f}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            actor = os.path.join(
                path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            critic = os.path.join(
                path, self.prefix + '_critic_' + to_delete[1] + '.h5')
            os.remove(actor)
            os.remove(critic)
        if need_save:
            self.save_model(path, signature)

    def train(self,
              args,
              summary_writer,
              train_data=None,
              val_data=None,
              test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data) if val_data is not None else None
        val_rate, top_k = 0.0, 0  # defaults for logging before/without validation data
        # First, gather experience
        tqdm_e = tqdm(range(args.batchs),
                      desc='score',
                      leave=True,
                      unit=" epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir,
                                args.reg_pattern,
                                chunksize=args.batch_size)  # each chunk: batch_size rows of n_state features + 1 label, e.g. 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
        # noise = OrnsteinUhlenbeckProcess(size=self.n_action)
        else:
            dataset = Dataset(train_data, args.batch_size, shuffle=True)
        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)

            a = self.policy_action(states)  #(batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            # a = np.clip(np.random.normal(a, self.noise_std), self.a_bound[0], self.a_bound[1])
            # a = np.clip(a + noise.generate(time, a.shape[0]), self.a_bound[0], self.a_bound[1])
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)  # clipped log-likelihood ratio of the action
            r = np.where(labels == 1, llr.ravel(), -llr.ravel())  #(batch,)
            # q_nexts = self.critic.target_predict(new_states, self.actor.target_predict(new_states))
            q_ = self.bellman_q_value(rewards=r,
                                      q_nexts=0,
                                      dones=[True] * r.shape[0])  #(batch,)
            loss_names, loss_values, grad_ys, actor_output = self.update_model(
                states, a, q_.reshape(-1, 1))

            score = r.mean()

            if e % self.noise_decay_steps == 0:  # decay exploration noise every noise_decay_steps batches
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)
            if e % self.assess_interval == 0 or e == args.batchs - 1:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    val_rate, top_k = top_ratio_hit_rate(
                        val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate
                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(
                        test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

                summary_writer.add_summary(tf_summary(['mean-reward'],
                                                      [score]),
                                           global_step=e)
                summary_writer.add_summary(tf_summary(loss_names,
                                                      [loss_values]),
                                           global_step=e)
                merge = keras.backend.get_session().run(
                    self.merge,
                    feed_dict={
                        self.critic.model.input[0]: states,
                        self.critic.model.input[1]: a,
                        self.actor.model.input: states
                    })
                summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)
            # print(grad_ys,grad_ys.shape)
            # print(actor_output)
            self.logger.log_tabular(
                'dQ/da', '%.4f+%.4f' %
                (grad_ys.mean(), grad_ys.std()))  # grad_ys (batch,act_dim)
            self.logger.log_tabular(
                'aout',
                '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward', '%.4f+%.4f' % (score, r.std()))
            self.logger.dump_tabular()
            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4f}'.format(self.noise_std),
                               max_val_rate='{:.4f}'.format(max_val_rate),
                               val_rate='{:.4f}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()

        return results
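
As a side note, the reward shaping and one-step target used in train() above can be exercised on their own. The following is a minimal NumPy sketch with made-up actions and labels, not part of the original example:

import numpy as np

a = np.array([[0.9], [0.2], [0.6]])                    # actor outputs in (0, 1)
labels = np.array([1, 0, 1])                           # supervised labels
llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)       # clipped log-likelihood ratio
r = np.where(labels == 1, llr.ravel(), -llr.ravel())   # reward: +llr for positives, -llr otherwise
# every transition is terminal (dones all True), so the critic target is just the reward
q_target = r.copy()
print(q_target)                                        # approx. [2.197, 1.386, 0.405]
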
Code Example #2
File: TD3.py  Project: sameul-yuan/deep_learning_work
class TD3(object):
    """deep deterministic policy gradient
    """
    def __init__(self,
                 n_state,
                 n_action,
                 a_bound,
                 discount=0.99,
                 tau=0.05,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 policy_freq=2,
                 exp_noise_std=0.1,
                 noise_decay=0.9995,
                 noise_decay_steps=1000,
                 smooth_noise_std=0.1,
                 clip=0.2,
                 buffer_size=20000,
                 save_interval=5000,
                 assess_interval=20,
                 logger=None,
                 checkpoint_queen=None):
        #self.__dict__.update(locals())
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = exp_noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.policy_freq = policy_freq
        self.smooth_noise_std = smooth_noise_std
        self.clip = clip
        self.discount = discount

        self.pointer = 0
        self.buffer = MemoryBuffer(buffer_size, with_per=True)
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state,
                           self.n_action,
                           gamma=discount,
                           lr=actor_lr,
                           tau=tau)
        self.critic1 = Critic(self.n_state,
                              self.n_action,
                              gamma=discount,
                              lr=critic_lr,
                              tau=tau)
        self.critic2 = Critic(self.n_state,
                              self.n_action,
                              gamma=discount,
                              lr=critic_lr,
                              tau=tau)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen
        self.prefix = self.__class__.__name__

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic1.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1',
                             self.critic1.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1',
                             self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2',
                             self.critic1.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2',
                             self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def select_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        q_target = np.zeros_like(rewards)  # cf. np.asarray(copy=False) vs. np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.discount * q_nexts[i]
        return q_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        if (self.buffer.with_per):
            q_val = reward
            q_val_t = self.critic1.target_predict(state, action)
            td_error = abs(q_val_t - q_val)[0]
            # print(td_error)
        else:
            td_error = 0
        state = state.reshape(-1)
        action = action.reshape(-1)
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_actor(self, states):
        actions = self.actor.predict(states)
        grad_ys = self.critic1.gradients(states, actions)
        actor_output = self.actor.train(states, actions, grad_ys)
        self.actor.copy_weights()
        self.critic1.copy_weights()
        self.critic2.copy_weights()
        return grad_ys, actor_output

    def update_critic(self, states, actions, q_values):
        loss_names, loss_values = self.critic1.train_on_batch(
            states, actions, q_values)
        self.critic2.train_on_batch(states, actions, q_values)
        return loss_names, loss_values

    def save_weights(self, path):
        self.actor.save(path)
        self.critic1.save(path)
        self.critic2.save(path)

    def save_model(self, path, file):
        self.actor.model.save(
            os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic1.model.save(
            os.path.join(path, self.prefix + '_critic1_' + file + '.h5'))
        self.critic2.model.save(
            os.path.join(path, self.prefix + '_critic2_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            delete_actor = os.path.join(
                path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            delete_critic1 = os.path.join(
                path, self.prefix + '_critic1_' + to_delete[1] + '.h5')
            delete_critic2 = os.path.join(
                path, self.prefix + '_critic2_' + to_delete[1] + '.h5')
            os.remove(delete_actor)
            os.remove(delete_critic1)
            os.remove(delete_critic2)
        if need_save:
            self.save_model(path, signature)

    def train(self,
              args,
              summary_writer,
              train_data=None,
              val_data=None,
              test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data) if val_data is not None else None
        val_rate, top_k = 0.0, 0  # defaults for logging before/without validation data
        # First, gather experience
        tqdm_e = tqdm(range(args.batchs),
                      desc='score',
                      leave=True,
                      unit="epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir,
                                args.reg_pattern,
                                chunksize=args.batch_size)  # each chunk: batch_size rows of n_state features + 1 label, e.g. 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
        # noise = OrnsteinUhlenbeckProcess(size=self.n_action)
        else:
            dataset = Dataset(train_data, 1, shuffle=True)

        warm_up = 20 * args.batch_size
        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)

            a = self.select_action(states)  #(batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)  # clipped log-likelihood ratio of the action
            # rewards = np.where(labels==1, llr.ravel(), -llr.ravel())  #(batch,)
            rewards = np.where(labels == 1,
                               np.where(llr > 0, llr.ravel(), 2 * llr.ravel()),
                               np.where(llr < 0, -llr.ravel(),
                                        -2 * llr.ravel()))  #(batch,)
            # print(rewards)

            # a_ = self.actor.target_predict(next_states)
            # noise = np.clip(np.random.normal(0, self.smooth_noise_std), 0, self.clip)
            # a_ = a_ + noise
            # q_next1 = self.critic1.target_predict(new_states, a_)
            # q_next2 = self.critic2.target_predict(new_states,a_)
            # q_nexts = np.where(q_next1<q_next2, q_next1, q_next2)
            self.memorize(states, a, rewards, True, None)
            if e < warm_up:
                continue

            states, a, rewards, _, _, _ = self.sample_batch(args.batch_size)
            # print(states.shape, a.shape, rewards.shape)

            q_ = self.bellman_q_value(rewards=rewards,
                                      q_nexts=0,
                                      dones=[True] *
                                      rewards.shape[0])  #(batch,)

            loss_names, loss_values = self.update_critic(
                states, a, q_.reshape(-1, 1))

            if e % self.policy_freq == 0 or e == warm_up:
                grad_ys, actor_output = self.update_actor(states)

            if e % self.noise_decay_steps == 0 or e == warm_up:  # decay exploration noise periodically
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)
            if e % self.assess_interval == 0 or e == args.batchs - 1 or e == warm_up:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    # print(val_pred.shape,val_pred[:10])
                    # print(val_y.shape, val_y[:10])
                    val_rate, top_k = top_ratio_hit_rate(
                        val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate
                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(
                        test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

            score = rewards.mean()
            summary_writer.add_summary(tf_summary(['mean-reward'], [score]),
                                       global_step=e)
            summary_writer.add_summary(tf_summary(loss_names, [loss_values]),
                                       global_step=e)
            merge = keras.backend.get_session().run(
                self.merge,
                feed_dict={
                    self.critic1.model.input[0]: states,
                    self.critic1.model.input[1]: a,
                    self.actor.model.input: states
                })
            summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)

            self.logger.log_tabular(
                'dQ/da', '%.4f+%.4f' %
                (grad_ys.mean(), grad_ys.std()))  # grad_ys (batch,act_dim)
            self.logger.log_tabular(
                'aout',
                '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward',
                                    '%.4f+%.4f' % (score, rewards.std()))
            self.logger.dump_tabular()
            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4}'.format(self.noise_std),
                               max_val_rate='{:.4}'.format(max_val_rate),
                               val_rate='{:.4}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()

        return results
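
The commented-out block in TD3.train() hints at the usual clipped double-Q target with target-policy smoothing. A rough standalone sketch of that computation follows, using placeholder critic outputs rather than the project's actual Critic API; the smoothing noise added to the target action is omitted for brevity:

import numpy as np

rewards  = np.array([1.0, -0.5])
dones    = np.array([False, True])
q1_next  = np.array([2.0, 3.0])      # placeholder for critic1.target_predict(new_states, a_)
q2_next  = np.array([1.5, 2.5])      # placeholder for critic2.target_predict(new_states, a_)
discount = 0.99
q_nexts  = np.minimum(q1_next, q2_next)                    # clipped double Q: take the smaller estimate
q_target = np.where(dones, rewards, rewards + discount * q_nexts)
print(q_target)                                            # [2.485, -0.5]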