Code Example #1
 def learn(self, **kwargs):
     episode = kwargs['episode']
     self.calculate_statistics()
     for _ in range(self.epoch):
         s, visual_s, a, dc_r = self.get_sample_data()
         summaries, _ = self.sess.run([self.summaries, self.train_op], feed_dict={
             self.pl_visual_s: visual_s,
             self.pl_s: s,
             self.pl_a: a if self.action_type == 'continuous' else sth.action_index2one_hot(a, self.a_dim_or_list),
             self.dc_r: dc_r,
             self.episode: episode,
             self.sigma_offset: np.full(self.a_counts, 0.01)
         })
         self.recorder.writer.add_summary(summaries, self.sess.run(self.global_step))
     self.recorder.writer_summary(
         x=episode,
         ys=[{
             'tag': 'REWARD/discounted_reward',
             'value': self.data.discounted_reward.values[0].mean()
         },
             {
             'tag': 'REWARD/reward',
             'value': self.data.total_reward.values[0].mean()
         }])
     self.clear()
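Every discrete-action branch in these examples routes the integer action indices through sth.action_index2one_hot before they are fed to the network or stored. The helper itself is not shown on this page; the following is only a minimal stand-in written for illustration (the argument name a_dim_or_list is taken from the calls above, everything else is an assumption), concatenating one one-hot block per action branch:

 import numpy as np

 def action_index2one_hot(a, a_dim_or_list):
     # Assumed behaviour: `a` holds integer indices with shape (batch,) or
     # (batch, n_branches); `a_dim_or_list` gives the number of choices per
     # branch. The result stacks one one-hot block per branch -> (batch, sum(dims)).
     dims = a_dim_or_list if isinstance(a_dim_or_list, (list, tuple)) else [a_dim_or_list]
     a = np.asarray(a, dtype=np.int32).reshape(-1, len(dims))
     return np.concatenate([np.eye(d, dtype=np.float32)[a[:, i]]
                            for i, d in enumerate(dims)], axis=-1)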
Code Example #2
File: trpo.py Project: Abluceli/RLs
 def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
     assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
     assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
     assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"
     if not self.is_continuous:
         a = sth.action_index2one_hot(a, self.a_dim_or_list)
     _data = {
         's': s,
         'visual_s': visual_s,
         'a': a,
         'r': r,
         'done': done,
         'value': np.squeeze(self._get_value(s, visual_s).numpy()),
         'log_prob': self._get_log_prob(s, visual_s, a).numpy() + 1e-10
     }
     if self.is_continuous:
         _data.update({'old_mu': self.actor_net(s, visual_s).numpy()})
         _data.update({'old_log_std': self.log_std.numpy()})
     else:
         _data.update({'old_logp_all': tf.nn.log_softmax(self.actor_net(s, visual_s)).numpy()})
     self.data = self.data.append(_data, ignore_index=True)
     self.s_ = s_
     self.visual_s_ = visual_s_
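For the continuous branch, this example keeps the policy mean (old_mu) and log standard deviation (old_log_std) so the old policy's density can be re-evaluated later. _get_log_prob is not shown on this page; for a diagonal Gaussian policy it presumably reduces to the standard log-density below (a sketch with illustrative names):

 import numpy as np

 def gaussian_log_prob(a, mu, log_std):
     # Log-density of a diagonal Gaussian, summed over action dimensions,
     # returned with a trailing axis of size 1 to match the stored arrays.
     std = np.exp(log_std)
     log_p = -0.5 * (((a - mu) / std) ** 2 + 2.0 * log_std + np.log(2.0 * np.pi))
     return np.sum(log_p, axis=-1, keepdims=True)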
Code Example #3
File: ac.py Project: hititan/RLs
 def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
     assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
     assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
     assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"
     old_log_prob = np.ones_like(r)
     if not self.is_continuous:
         a = sth.action_index2one_hot(a, self.a_dim_or_list)
     self.data.add(s, visual_s, a, old_log_prob[:, np.newaxis], r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis])
Code Example #4
File: ppo.py Project: familywei/RLs
    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
        assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
        assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"

        self.data = self.data.append(
            {
                's': s,
                'visual_s': visual_s,
                'a': a,
                'r': r,
                's_': s_,
                'visual_s_': visual_s_,
                'done': done,
                'value': np.squeeze(self.sess.run(self.value, feed_dict={
                    self.pl_visual_s: visual_s,
                    self.pl_s: s,
                    self.sigma_offset: np.full(self.a_counts, 0.01)
                })),
                'next_value': np.squeeze(self.sess.run(self.value, feed_dict={
                    self.pl_visual_s: visual_s_,
                    self.pl_s: s_,
                    self.sigma_offset: np.full(self.a_counts, 0.01)
                })),
                'prob': self.sess.run(self.new_prob, feed_dict={
                    self.pl_visual_s: visual_s,
                    self.pl_s: s,
                    self.pl_a: a if self.action_type == 'continuous' else sth.action_index2one_hot(a, self.a_dim_or_list),
                    self.sigma_offset: np.full(self.a_counts, 0.01)
                }) + 1e-10
            },
            ignore_index=True)
Code Example #5
File: ac.py Project: hititan/RLs
 def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
     assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
     assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
     assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"
     if not self.is_continuous:
         a = sth.action_index2one_hot(a, self.a_dim_or_list)
     old_log_prob = self._get_log_prob(s, visual_s, a).numpy()
     self.data.add(s, visual_s, a, old_log_prob,
                   r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis])
Code Example #6
 def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
     old_prob = self.sess.run(self.prob, feed_dict={
         self.pl_visual_s: visual_s,
         self.pl_s: s,
         self.pl_a: a if self.action_type == 'continuous' else sth.action_index2one_hot(a, self.a_dim_or_list),
         self.sigma_offset: np.full(self.a_counts, 0.01)
     })
     assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
     assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
     assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"
     self.data.add(s, visual_s, a, old_prob, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis])
Code Example #7
File: ddqn.py Project: familywei/RLs
 def learn(self, episode):
     if self.data.is_lg_batch_size:
         s, visual_s, a, r, s_, visual_s_, done = self.data.sample()
         _a = sth.action_index2one_hot(a, self.a_dim_or_list)
         self.global_step.assign_add(1)
         q_loss = self.train(s, visual_s, _a, r, s_, visual_s_, done)
         if self.global_step % self.assign_interval == 0:
             self.update_target_net_weights(self.q_target_net.weights,
                                            self.q_net.weights)
         tf.summary.experimental.set_step(self.global_step)
         tf.summary.scalar('LOSS/loss', tf.reduce_mean(q_loss))
         tf.summary.scalar('LEARNING_RATE/lr', tf.reduce_mean(self.lr))
         self.recorder.writer.flush()
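The call to update_target_net_weights above hard-copies the online q_net weights into q_target_net every assign_interval steps. Its body is not part of this example; a minimal module-level sketch operating on lists of tf.Variable (the tau parameter is hypothetical, added so the same helper could also express a soft/Polyak update) might be:

 def update_target_net_weights(target_weights, source_weights, tau=1.0):
     # tau == 1.0 reproduces the hard copy used in the DDQN example above;
     # tau < 1.0 would give a Polyak-averaged (soft) target update instead.
     for t, s in zip(target_weights, source_weights):
         t.assign(tau * s + (1.0 - tau) * t)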
Code Example #8
File: off_policy.py Project: Abluceli/HRG-SAC
    def no_op_store_gcn(self, adj, x, visual_s, a, r, adj_, x_, visual_s_,
                        done):
        assert isinstance(a, np.ndarray), "no_op_store need action type is np.ndarray"
        assert isinstance(r, np.ndarray), "no_op_store need reward type is np.ndarray"
        assert isinstance(done, np.ndarray), "no_op_store need done type is np.ndarray"

        if not self.is_continuous:
            a = sth.action_index2one_hot(a, self.a_dim_or_list)

        self.data.add(adj, x, visual_s, a, r[:, np.newaxis], adj_, x_,
                      visual_s_, done[:, np.newaxis])
Code Example #9
 def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
     """
     for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
     """
     assert isinstance(a, np.ndarray), "store need action type is np.ndarray"
     assert isinstance(r, np.ndarray), "store need reward type is np.ndarray"
     assert isinstance(done, np.ndarray), "store need done type is np.ndarray"
     if not self.is_continuous:
         a = sth.action_index2one_hot(a, self.a_dim_or_list)
     self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_,
                   done[:, np.newaxis])
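All of these store_* methods expect batched NumPy arrays with a leading agent/environment dimension; the per-agent scalars r and done are widened with [:, np.newaxis] so the buffer holds column vectors. A purely hypothetical call for two agents with a 3-dimensional vector observation, no camera observation, and a discrete action index could look like the following (agent and the zero-width visual_s are assumptions made for illustration):

 import numpy as np

 s        = np.zeros((2, 3), dtype=np.float32)      # vector observations, one row per agent
 visual_s = np.zeros((2, 0), dtype=np.float32)      # no visual observations in this sketch
 a        = np.array([1, 0])                        # discrete action indices, one-hot encoded inside
 r        = np.array([0.5, -1.0], dtype=np.float32)
 done     = np.array([False, True])
 agent.store_data(s, visual_s, a, r, s_=s, visual_s_=visual_s, done=done)  # agent: one of the classes above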
Code Example #10
File: a2c.py Project: familywei/RLs
 def learn(self, episode):
     self.calculate_statistics()
     s, visual_s, a, dc_r = self.get_sample_data()
     self.global_step.assign_add(1)
     if not self.action_type == 'continuous':
         a = sth.action_index2one_hot(a, self.a_dim_or_list)
     actor_loss, critic_loss, entropy = self.train(s, visual_s, a, dc_r)
     tf.summary.experimental.set_step(self.global_step)
     if entropy is not None:
         tf.summary.scalar('LOSS/entropy', entropy)
     tf.summary.scalar('LOSS/actor_loss', actor_loss)
     tf.summary.scalar('LOSS/critic_loss', critic_loss)
     tf.summary.scalar('LEARNING_RATE/lr', self.lr)
     self.recorder.writer.flush()
     self.clear()
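In the on-policy learn() methods here and in Code Example #1, calculate_statistics() / get_sample_data() supply dc_r, the discounted cumulative reward of each stored transition. Those helpers are not reproduced on this page; a minimal NumPy sketch of the usual backward recursion (function name and the bootstrap argument are illustrative assumptions) is:

 import numpy as np

 def discounted_returns(r, done, gamma=0.99, bootstrap=0.0):
     # Walk the rollout backwards; the running return is reset at episode
     # boundaries (done == 1) and seeded with a value estimate of the last state.
     dc_r = np.zeros_like(r, dtype=np.float32)
     running = bootstrap
     for t in reversed(range(len(r))):
         running = r[t] + gamma * (1.0 - done[t]) * running
         dc_r[t] = running
     return dc_r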
Code Example #11
File: a2c.py Project: kasimte/RLs
 def learn(self, **kwargs):
     episode = kwargs['episode']
     self.calculate_statistics()
     for _ in range(self.epoch):
         s, visual_s, a, dc_r = self.get_sample_data()
         summaries, _ = self.sess.run([self.summaries, self.train_sequence], feed_dict={
             self.pl_visual_s: visual_s,
             self.pl_s: s,
             self.pl_a: a if self.action_type == 'continuous' else sth.action_index2one_hot(a, self.a_dim_or_list),
             self.dc_r: dc_r,
             self.episode: episode,
             self.sigma_offset: np.full(self.a_counts, 0.01)
         })
         self.recorder.writer.add_summary(summaries, self.sess.run(self.global_step))
     self.clear()
Code Example #12
File: ddqn.py Project: familywei/RLs
 def learn(self, episode):
     s, visual_s, a, r, s_, visual_s_, done = self.data.sample()
     _a = sth.action_index2one_hot(a, self.a_dim_or_list)
     summaries, _ = self.sess.run([self.summaries, self.train_q], feed_dict={
         self.pl_visual_s: visual_s,
         self.pl_s: s,
         self.pl_a: _a,
         self.pl_r: r,
         self.pl_visual_s_: visual_s_,
         self.pl_s_: s_,
         self.pl_done: done,
         self.episode: episode
     })
     if self.sess.run(self.global_step) % self.assign_interval == 0:
         self.sess.run(self.assign_q_target)
     self.recorder.writer.add_summary(summaries, self.sess.run(self.global_step))
Code Example #13
 def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
     assert isinstance(a, np.ndarray), "no_op_store need action type is np.ndarray"
     assert isinstance(r, np.ndarray), "no_op_store need reward type is np.ndarray"
     assert isinstance(done, np.ndarray), "no_op_store need done type is np.ndarray"
     if self.policy_mode == 'OFF':
         if not self.action_type == 'continuous':
             a = sth.action_index2one_hot(a, self.a_dim_or_list)
         self.data.add(
             s.astype(np.float32),
             visual_s.astype(np.float32),
             a.astype(np.float32),
             r[:, np.newaxis].astype(np.float32),
             s_.astype(np.float32),
             visual_s_.astype(np.float32),
             done[:, np.newaxis].astype(np.float32)
         )
Code Example #14
File: ppo.py Project: dongf17/RLs
 def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
     assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
     assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
     assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"
     if not self.is_continuous:
         a = sth.action_index2one_hot(a, self.a_dim_or_list)
     self.data = self.data.append({
         's': s,
         'visual_s': visual_s,
         'a': a,
         'r': r,
         'done': done,
         'value': np.squeeze(self._get_value(s, visual_s).numpy()),
         'log_prob': self._get_log_prob(s, visual_s, a).numpy() + 1e-10
     }, ignore_index=True)
     self.s_ = s_
     self.visual_s_ = visual_s_
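The log_prob stored here is re-used by the later PPO update as the old-policy term of the importance ratio; that update step is not shown on this page. A standard clipped-surrogate sketch built from the stored quantity (function name and arguments are assumptions, epsilon is the usual clip range) would be:

 import tensorflow as tf

 def ppo_clip_loss(new_log_prob, old_log_prob, advantage, epsilon=0.2):
     # Importance ratio between the current and the stored (old) policy,
     # clipped to [1 - epsilon, 1 + epsilon] as in the PPO surrogate objective.
     ratio = tf.exp(new_log_prob - old_log_prob)
     clipped = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
     return -tf.reduce_mean(tf.minimum(ratio * advantage, clipped * advantage))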
Code Example #15
 def learn(self, **kwargs):
     episode = kwargs['episode']
     for i in range(kwargs['step']):
         s, visual_s, a, old_prob, r, s_, visual_s_, done = self.data.sample()
         summaries, _ = self.sess.run([self.summaries, self.train_sequence], feed_dict={
             self.pl_visual_s: visual_s,
             self.pl_s: s,
             self.pl_a: a if self.action_type == 'continuous' else sth.action_index2one_hot(a, self.a_dim_or_list),
             self.old_prob: old_prob,
             self.pl_r: r,
             self.pl_visual_s_: visual_s_,
             self.pl_s_: s_,
             self.pl_done: done,
             self.episode: episode,
             self.sigma_offset: np.full(self.a_counts, 0.01)
         })
         self.recorder.writer.add_summary(summaries, self.sess.run(self.global_step))
Code Example #16
 def off_store(self, s, visual_s, a, r, s_, visual_s_, done):
     """
     for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
     """
     assert isinstance(a, np.ndarray), "off_store need action type is np.ndarray"
     assert isinstance(r, np.ndarray), "off_store need reward type is np.ndarray"
     assert isinstance(done, np.ndarray), "off_store need done type is np.ndarray"
     if not self.action_type == 'continuous':
         a = sth.action_index2one_hot(a, self.a_dim_or_list)
     self.data.add(
         s.astype(np.float32),
         visual_s.astype(np.float32),
         a.astype(np.float32),
         r.astype(np.float32),
         s_.astype(np.float32),
         visual_s_.astype(np.float32),
         done.astype(np.float32)
     )
Code Example #17
 def on_store(self, s, visual_s, a, r, s_, visual_s_, done):
     """
     for on-policy training, use this function to store <s, a, r, s_, done> into DataFrame of Pandas.
     """
     assert isinstance(a, np.ndarray), "on_store need action type is np.ndarray"
     assert isinstance(r, np.ndarray), "on_store need reward type is np.ndarray"
     assert isinstance(done, np.ndarray), "on_store need done type is np.ndarray"
     if not self.action_type == 'continuous':
         a = sth.action_index2one_hot(a, self.a_dim_or_list)
     self.data = self.data.append({
         's': s.astype(np.float32),
         'visual_s': visual_s.astype(np.float32),
         'a': a.astype(np.float32),
         'r': r.astype(np.float32),
         's_': s_.astype(np.float32),
         'visual_s_': visual_s_.astype(np.float32),
         'done': done.astype(np.float32)
     }, ignore_index=True)