Example #1
    def _calc_updated_vals(self, idx):
        r = self.replay_buffer.get('rewards', idx)

        if self.discount == 0:
            new_V = r
        else:
            next_idx = self.replay_buffer.get_next_idx(idx)
            s_next = self.replay_buffer.get('states', next_idx)
            g_next = self.replay_buffer.get(
                'goals', next_idx) if self.has_goal() else None
            #####################[
            wt_next = (self.replay_buffer.get('wts', next_idx)
                       if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2)
                       else None)
            #####################]

            is_end = self.replay_buffer.is_path_end(idx)
            is_fail = self.replay_buffer.check_terminal_flag(
                idx, Env.Terminate.Fail)
            is_succ = self.replay_buffer.check_terminal_flag(
                idx, Env.Terminate.Succ)
            is_fail = np.logical_and(is_end, is_fail)
            is_succ = np.logical_and(is_end, is_succ)

            V_next = self._eval_critic(s_next, g_next, wt_next)
            V_next[is_fail] = self.val_fail
            V_next[is_succ] = self.val_succ

            new_V = r + self.discount * V_next
        return new_V
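
For reference, _calc_updated_vals computes a one-step TD target, new_V = r + discount * V(s_next), where the bootstrapped value is replaced by val_fail or val_succ on terminating transitions. Below is a minimal standalone sketch of the same arithmetic on dummy NumPy arrays; the values and names are illustrative, not taken from the agent:

import numpy as np

# dummy batch: rewards, bootstrapped next-state values, terminal flags
r = np.array([1.0, 0.5, 0.2])
V_next = np.array([10.0, 8.0, 6.0])
is_fail = np.array([False, True, False])
is_succ = np.array([False, False, True])
discount, val_fail, val_succ = 0.95, 0.0, 1.0

V_next[is_fail] = val_fail      # failed paths bootstrap from the failure value
V_next[is_succ] = val_succ      # successful paths bootstrap from the success value
new_V = r + discount * V_next   # one-step TD target, as in _calc_updated_vals above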
Example #2
    def _update_actor(self):
        key = self.EXP_ACTION_FLAG
        idx = self.replay_buffer.sample_filtered(self._local_mini_batch_size,
                                                 key)
        has_goal = self.has_goal()

        s = self.replay_buffer.get('states', idx)
        g = self.replay_buffer.get('goals', idx) if has_goal else None
        #####################[
        wt = (self.replay_buffer.get('wts', idx)
              if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2)
              else None)
        #####################]
        a = self.replay_buffer.get('actions', idx)

        V_new = self._calc_updated_vals(idx)
        V_old = self._eval_critic(s, g, wt)
        adv = V_new - V_old

        feed = {
            self.s_tf: s,
            self.g_tf: g,
            self.wt_tf: wt,
            self.a_tf: a,
            self.adv_tf: adv
        }

        loss, grads = self.sess.run([self.actor_loss_tf, self.actor_grad_tf],
                                    feed)
        self.actor_solver.update(grads)

        return loss
Example #3
    def _build_net_critic(self, net_name):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]

        #####################[
        if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
            # add a CNN branch over the wavelet-transform input as an extra critic input
            cnn_network = NetBuilder.build_net(
                MyCNN.NAME, self.wt_tf, self.my_memory_buffer.channel_count)
            input_tfs += [cnn_network]
        #####################]

        h = NetBuilder.build_net(net_name, input_tfs)
        norm_val_tf = tf.layers.dense(
            inputs=h,
            units=1,
            activation=None,
            kernel_initializer=TFUtil.xavier_initializer)

        norm_val_tf = tf.reshape(norm_val_tf, [-1])
        val_tf = self.val_norm.unnormalize_tf(norm_val_tf)
        return val_tf
Example #4
    def _build_net_actor(self, net_name, init_output_scale):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]

        #####################[
        if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
            # add a CNN branch over the wavelet-transform input as an extra actor input
            cnn_network = NetBuilder.build_net(
                MyCNN.NAME, self.wt_tf, self.my_memory_buffer.channel_count)
            input_tfs += [cnn_network]
        #####################]

        h = NetBuilder.build_net(net_name, input_tfs)
        norm_a_tf = tf.layers.dense(
            inputs=h,
            units=self.get_action_size(),
            activation=None,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-init_output_scale, maxval=init_output_scale))

        a_tf = self.a_norm.unnormalize_tf(norm_a_tf)
        return a_tf
Example #5
    def _build_nets(self, json_data):
        assert self.ACTOR_NET_KEY in json_data
        assert self.CRITIC_NET_KEY in json_data

        actor_net_name = json_data[self.ACTOR_NET_KEY]
        critic_net_name = json_data[self.CRITIC_NET_KEY]
        actor_init_output_scale = (1 if self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data
                                   else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY])

        s_size = self.get_state_size()
        g_size = self.get_goal_size()
        a_size = self.get_action_size()

        # setup input tensors
        self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size],
                                   name="s")  # observations
        self.tar_val_tf = tf.placeholder(tf.float32,
                                         shape=[None],
                                         name="tar_val")  # target values
        self.adv_tf = tf.placeholder(tf.float32, shape=[None],
                                     name="adv")  # advantage
        self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size],
                                   name="a")  # target actions
        self.g_tf = tf.placeholder(
            tf.float32,
            shape=([None, g_size] if self.has_goal() else None),
            name="g")  # goals

        #####################[
        if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
            # scalogram input: [batch, scales, buffer length, channels]
            self.wt_tf = tf.placeholder(tf.float32,
                                        shape=[
                                            None, self.my_wt.scale_count,
                                            self.my_memory_buffer.length,
                                            self.my_memory_buffer.channel_count
                                        ],
                                        name="wt")
        else:
            self.wt_tf = tf.placeholder(tf.float32, shape=None, name="wt")
        #####################]

        with tf.variable_scope('main'):
            with tf.variable_scope('actor'):
                self.actor_tf = self._build_net_actor(actor_net_name,
                                                      actor_init_output_scale)
            with tf.variable_scope('critic'):
                self.critic_tf = self._build_net_critic(critic_net_name)

        if (self.actor_tf != None):
            Logger.print('Built actor net: ' + actor_net_name)

        if (self.critic_tf != None):
            Logger.print('Built critic net: ' + critic_net_name)

        return
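
The wt placeholder above expects a batch of per-channel scalograms in the CWT+CNN modes. A minimal sketch of building a matching dummy feed with NumPy, assuming illustrative sizes for the scale count, buffer length and channel count (the real values come from my_wt and my_memory_buffer, and "agent" stands in for an instance of this class):

import numpy as np

batch_size = 32
scale_count, buffer_length, channel_count = 16, 16, 20   # illustrative sizes

# one scalogram per sample: [scales, time steps in the buffer, channels]
wt_batch = np.zeros((batch_size, scale_count, buffer_length, channel_count),
                    dtype=np.float32)
# feed = {agent.wt_tf: wt_batch, ...}   # together with states, goals, actions, etc.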
Example #6
    def _update_new_action(self):
        s = self._record_state()
        g = self._record_goal()

        #####################[
        #self.my_update_array.append(self.my_update_count)
        #self.my_update_count = 0
        wt = [[]]
        if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
            # store the last channel_count state entries in the rolling buffer and
            # compute the wavelet transform over the buffered window
            self.my_memory_buffer.save(
                s[-self.my_memory_buffer.channel_count:])
            wt = [self.my_wt.calculate_cwt(self.my_memory_buffer)]
        #####################]

        if not (self._is_first_step()):
            r = self._record_reward()
            self.path.rewards.append(r)

        a, logp = self._decide_action(s=s, g=g, wt=wt)
        assert len(np.shape(a)) == 1
        assert len(np.shape(logp)) <= 1

        flags = self._record_flags()
        self._apply_action(a)

        self.path.states.append(s)
        self.path.goals.append(g)
        #####################[
        self.path.wts.append(wt[0])
        #####################]
        self.path.actions.append(a)
        self.path.logps.append(logp)
        self.path.flags.append(flags)

        if self._enable_draw():
            self._log_val(s, g, wt)

        return
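
MyMemoryBuffer and MyWT are not shown in these examples. The sketch below only illustrates the general pattern used above, a rolling window over the last few state channels turned into per-channel scalograms, and uses PyWavelets purely as a stand-in for the actual wavelet code; buffer size, scale range and wavelet name are assumptions:

import numpy as np
import pywt
from collections import deque

buffer_length, channel_count = 16, 3               # illustrative sizes
window = deque(maxlen=buffer_length)               # rolling window of recent samples

def save(sample):
    # sample: the last channel_count entries of the recorded state
    window.append(np.asarray(sample, dtype=np.float32))

def calculate_cwt(scales=np.arange(1, buffer_length + 1)):
    data = np.stack(window, axis=0)                # [time, channels]
    scalograms = []
    for c in range(data.shape[1]):
        coef, _ = pywt.cwt(data[:, c], scales, 'morl')   # [scales, time]
        scalograms.append(coef)
    return np.stack(scalograms, axis=-1)           # [scales, time, channels]

for _ in range(buffer_length):                     # fill the window with dummy samples
    save(np.random.randn(channel_count))
wt = [calculate_cwt()]                             # wrapped in a list, as in the agent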
Example #7
    def end_episode(self):
        if (self.path.pathlength() > 0):
            self._end_path()

            if (self._mode == self.Mode.TRAIN
                    or self._mode == self.Mode.TRAIN_END):
                if (self.enable_training and self.path.pathlength() > 0):
                    self._store_path(self.path)
            elif (self._mode == self.Mode.TEST):
                self._update_test_return(self.path)
            else:
                assert False, Logger.print("Unsupported RL agent mode: " +
                                           str(self._mode))

            self._update_mode()

        #####################[
        if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
            self.my_memory_buffer.reset()
        #####################]
        return
Example #8
    def _end_path(self):
        s = self._record_state()
        g = self._record_goal()
        #####################[
        wt = []
        if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
            # save only a reduced set of state variables: the last channel_count entries
            self.my_memory_buffer.save(
                s[-self.my_memory_buffer.channel_count:])
            wt = self.my_wt.calculate_cwt(self.my_memory_buffer)
        #####################]
        r = self._record_reward()

        self.path.rewards.append(r)
        self.path.states.append(s)
        self.path.goals.append(g)
        #####################[
        self.path.wts.append(wt)
        #####################]
        self.path.terminate = self.world.env.check_terminate(self.id)

        return
Example #9
    def _update_critic(self):
        idx = self.replay_buffer.sample(self._local_mini_batch_size)
        s = self.replay_buffer.get('states', idx)
        g = self.replay_buffer.get('goals', idx) if self.has_goal() else None
        #####################[
        wt = (self.replay_buffer.get('wts', idx)
              if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2)
              else None)
        #####################]

        tar_V = self._calc_updated_vals(idx)
        tar_V = np.clip(tar_V, self.val_min, self.val_max)

        feed = {
            self.s_tf: s,
            self.g_tf: g,
            self.wt_tf: wt,
            self.tar_val_tf: tar_V
        }

        loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf],
                                    feed)
        self.critic_solver.update(grads)
        return loss
Example #10
    def _train(self):
        samples = self.replay_buffer.total_count
        self._total_sample_count = int(MPIUtil.reduce_sum(samples))
        end_training = False

        if (self.replay_buffer_initialized):
            if (self._valid_train_step()):
                prev_iter = self.iter
                iters = self._get_iters_per_update()
                avg_train_return = MPIUtil.reduce_avg(self.train_return)

                for i in range(iters):
                    curr_iter = self.iter
                    wall_time = time.time() - self.start_time
                    wall_time /= 60 * 60  # store time in hours

                    has_goal = self.has_goal()
                    s_mean = np.mean(self.s_norm.mean)
                    s_std = np.mean(self.s_norm.std)
                    g_mean = np.mean(self.g_norm.mean) if has_goal else 0
                    g_std = np.mean(self.g_norm.std) if has_goal else 0

                    self.logger.log_tabular("Iteration", self.iter)
                    self.logger.log_tabular("Wall_Time", wall_time)
                    self.logger.log_tabular("Samples",
                                            self._total_sample_count)
                    self.logger.log_tabular("Train_Return", avg_train_return)
                    self.logger.log_tabular("Test_Return",
                                            self.avg_test_return)
                    self.logger.log_tabular("State_Mean", s_mean)
                    self.logger.log_tabular("State_Std", s_std)
                    self.logger.log_tabular("Goal_Mean", g_mean)
                    self.logger.log_tabular("Goal_Std", g_std)
                    self._log_exp_params()

                    self._update_iter(self.iter + 1)
                    self._train_step()

                    Logger.print("Agent " + str(self.id))
                    self.logger.print_tabular()
                    Logger.print("")

                    if (self._enable_output()
                            and curr_iter % self.int_output_iters == 0):
                        self.logger.dump_tabular()

                if (prev_iter // self.int_output_iters !=
                        self.iter // self.int_output_iters):
                    end_training = self.enable_testing()

                #####################[
                if Settings.use_babe_support() and self.is_baby_support_on:
                    # turn the baby support off once the average test return
                    # exceeds the threshold
                    if self.avg_test_return > self.baby_support_threshold:
                        self._set_off_baby_support()
                #####################]
        else:
            Logger.print("Agent " + str(self.id))
            Logger.print("Samples: " + str(self._total_sample_count))
            Logger.print("")

            if (self._total_sample_count >= self.init_samples):
                self.replay_buffer_initialized = True
                end_training = self.enable_testing()

        if self._need_normalizer_update:
            self._update_normalizers()
            self._need_normalizer_update = self.normalizer_samples > self._total_sample_count

        if end_training:
            self._init_mode_train_end()

        return
Example #11
    def __init__(self, world, id, json_data):
        self.world = world
        self.id = id
        self.logger = Logger()
        self._mode = self.Mode.TRAIN

        assert self._check_action_space(), \
            Logger.print("Invalid action space, got {:s}".format(str(self.get_action_space())))

        self._enable_training = True
        self.path = Path()
        self.iter = int(0)
        self.start_time = time.time()
        self._update_counter = 0

        self.update_period = 1.0  # simulated time (seconds) before each training update
        self.iters_per_update = int(1)
        self.discount = 0.95
        self.mini_batch_size = int(32)
        self.replay_buffer_size = int(50000)
        self.init_samples = int(1000)
        self.normalizer_samples = np.inf
        self._local_mini_batch_size = self.mini_batch_size  # batch size for each worker when using multiprocessing
        self._need_normalizer_update = True
        self._total_sample_count = 0

        self._output_dir = ""
        self._int_output_dir = ""
        self.output_iters = 100
        self.int_output_iters = 100

        self.train_return = 0.0
        self.test_episodes = int(0)
        self.test_episode_count = int(0)
        self.test_return = 0.0
        self.avg_test_return = 0.0

        self.exp_anneal_samples = 320000
        self.exp_params_beg = ExpParams()
        self.exp_params_end = ExpParams()
        self.exp_params_curr = ExpParams()

        self._load_params(json_data)
        self._build_replay_buffer(self.replay_buffer_size)
        self._build_normalizers()
        self._build_bounds()
        self.reset()

        #####################[
        self.is_baby_support_on = False
        # baby_support_threshold = baby_support_max_value * policy frequency (query_rate) * max_step_train_time
        self.baby_support_threshold = 0.8 * 30 * 20
        if Settings.use_babe_support():
            self._set_on_baby_support()

        if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
            channels = [
                "r_hip_w", "r_hip_x", "r_hip_y", "r_hip_z", "r_knee_rot",
                "r_shoulder_w", "r_shoulder_x", "r_shoulder_y", "r_shoulder_z",
                "r_elbow_rot", "l_hip_w", "l_hip_x", "l_hip_y", "l_hip_z",
                "r_knee_rot", "r_shoulder_w", "r_shoulder_x", "r_shoulder_y",
                "r_shoulder_z", "l_elbow_rot"
            ]
            memory_buffer_size = 16
            cache_size = 1
            self.my_memory_buffer = MyMemoryBuffer(channels,
                                                   memory_buffer_size,
                                                   cache_size)

            scale_min = 1
            scale_max = memory_buffer_size
            scale_count = memory_buffer_size  # square scalogram
            self.my_wt = MyWT(scale_min, scale_max, scale_count)

            # get test data
            #self.my_rawStateData = MyRawStateData(197, 1200)
        #####################]
        return
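
As an illustrative check of the sizes implied by this configuration, assuming channel_count equals the number of listed channels (20 here): each stored wt entry is a 16 x 16 x 20 scalogram, which matches the [None, scale_count, length, channel_count] placeholder built in _build_nets, and the baby-support threshold works out to 480:

channel_count = 20                        # len(channels) above
scale_count = buffer_length = 16          # memory_buffer_size; square scalogram
wt_shape = (scale_count, buffer_length, channel_count)   # (16, 16, 20) per step
baby_support_threshold = 0.8 * 30 * 20                    # 480.0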