Beispiel #1
0
 def get_reward(self, obs, acs):
     """Run the discriminator's reward op on an observation/action batch.

     A single sample (rank-1 ``obs`` or ``acs``) is promoted to a
     batch of one before being fed to the graph.
     """
     session = U.get_session()
     # Rank-1 inputs are single samples; add a leading batch axis.
     if obs.ndim == 1:
         obs = obs[None, ...]
     if acs.ndim == 1:
         acs = acs[None, ...]
     feeds = {
         self.generator_obs_ph: obs,
         self.generator_acs_ph: acs,
     }
     return session.run(self.reward_op, feeds)
Beispiel #2
0
 def add_all_summary(self, writer, values, iter):
     """Evaluate all registered summaries and write them for iteration ``iter``.

     Note: the order of ``values`` must match the order of the
     ``scalar_keys`` given in ``__init__``.
     (``iter`` shadows the builtin, but renaming it would break
     keyword callers, so the name is kept.)
     """
     # Skip the write entirely if any incoming value is NaN.
     # (Clearer equivalent of the original `np.sum(np.isnan(values) + 0) != 0`.)
     if np.any(np.isnan(values)):
         return
     sess = U.get_session()
     keys = self.scalar_summaries_ph + self.histogram_summaries_ph
     # Pair each placeholder with its value in one step instead of a
     # manual update() loop.
     feed_dict = dict(zip(keys, values))
     summaries_str = sess.run(self.summaries, feed_dict)
     writer.add_summary(summaries_str, iter)
Beispiel #3
0
    def get_reward(self, obs, acs, last_acs, num_actions=524):
        """Run the discriminator reward op for obs + (one-hot encoded) actions.

        ``acs`` / ``last_acs`` may be either a scalar action index or an
        ndarray of action indices; either way they are one-hot encoded to
        width ``num_actions`` (default 524, the original hard-coded action
        space size) before being fed to the graph.

        Rewards for "trivial" actions (indices 0-4 and 274 — presumably
        no-ops/camera moves, TODO confirm against the action table) are
        halved.
        """
        sess = U.get_session()

        def _one_hot(actions):
            # One-hot encode a scalar index or an ndarray of indices.
            if isinstance(actions, np.ndarray):
                batch = actions.size
                encoded = np.zeros((batch, num_actions))
                encoded[np.arange(batch), actions] = 1
                return encoded
            encoded = np.zeros(num_actions)
            encoded[actions] = 1
            return [encoded]

        feed_dict = {
            self.generator_obs_ph: obs,
            self.generator_acs_ph: _one_hot(acs),
            self.generator_last_action_ph: _one_hot(last_acs)
        }
        reward = sess.run(self.reward_op, feed_dict)

        trivial_actions = (0, 1, 2, 3, 4, 274)
        if isinstance(acs, np.ndarray) and acs.size > 1:
            # BUG FIX: the original `acs in [0, 1, ...]` raised
            # "truth value of an array is ambiguous" (ValueError) for
            # multi-element arrays. Apply the discount per element instead;
            # assumes reward's leading axis is the batch axis — TODO confirm.
            reward = np.asarray(reward)
            mask = np.isin(acs.reshape(-1), trivial_actions)
            reward[mask] /= 2
        elif acs in trivial_actions:
            # Scalar (or size-1 array) path: identical to the original check.
            reward /= 2

        return reward
Beispiel #4
0
 def get_reward(self, trajs, trajs_len, dropout_keep_prob=1.0):
     """Run the rewards op for a batch of trajectories.

     A single trajectory (rank-2 ``trajs``) and a scalar ``trajs_len``
     are each promoted to a batch of one before being fed.
     """
     session = U.get_session()
     # Rank-2 trajs means one trajectory; add a leading batch axis.
     if trajs.ndim == 2:
         trajs = trajs[None, ...]
     # A dimensionless length is a scalar; wrap it in a length-1 array.
     if np.ndim(trajs_len) == 0:
         trajs_len = np.expand_dims(trajs_len, 0)
     feeds = {
         self.generator_traj_ph: trajs,
         self.generator_traj_seq_len: trajs_len,
         self.dropout_keep_prob: dropout_keep_prob,
     }
     return session.run(self.rewards_op, feeds)