    def update_extended_observation(self):
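        # refresh self.obs from the robot; when training the dynamics (conf) policy,
        # the frozen behavior policy's action is appended to the observation below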
        self.obs = self.robot.get_robot_observation()

        # self.obs = np.concatenate((self.obs, [np.minimum(self.timer / 500, self.max_tar_vel)]))  # TODO

        if self.obs_noise:
            self.obs = utils.perturb(self.obs, 0.1, self.np_random)

        if self.train_dyn:
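            # training the conf/dynamics policy: roll out the frozen behavior policy
            # on the current observation and append its action to self.obs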
            self.behavior_obs_len = len(self.obs)

            obs_nn = utils.wrap(self.obs, is_cuda=self.cuda_env)
            with torch.no_grad():
                _, action_nn, _, self.recurrent_hidden_states = self.go_actor_critic.act(
                    obs_nn,
                    self.recurrent_hidden_states,
                    self.masks,
                    deterministic=False  # TODO: consider using a deterministic behavior policy here
                )
            action = utils.unwrap(action_nn, is_cuda=self.cuda_env)

            if self.enlarge_act_range:
                # add 15% noise, relative to the [-1, 1] action range
                action = utils.perturb(action, 0.15, self.np_random)

            self.behavior_act_len = len(action)

            self.obs = np.concatenate((self.obs, action))
    def update_extended_observation(self):
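        # variant that includes velocities and the averaged, normalised
        # foot contact normal forces in the observation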
        self.obs = self.robot.get_robot_observation(with_vel=True)

        # note: forces are logged every other sim step, so this window arguably
        # should be control_skip // 2 (downsampled)
        self.obs.extend(
            list(np.mean(self.normal_forces[-self.control_skip:], axis=0)))

        if self.obs_noise:
            self.obs = utils.perturb(self.obs, 0.1, self.np_random)

        if self.train_dyn:
            obs_behavior = self.feat_select_func(self.obs)
            obs_nn = utils.wrap(obs_behavior, is_cuda=self.cuda_env)
            with torch.no_grad():
                _, action_nn, _, self.recurrent_hidden_states = self.go_actor_critic.act(
                    obs_nn,
                    self.recurrent_hidden_states,
                    self.masks,
                    deterministic=False)
            action = utils.unwrap(action_nn, is_cuda=self.cuda_env)

            # add noise of magnitude self.enlarge_act_range (e.g. 25%), relative to the [-1, 1] action range
            action = utils.perturb(action, self.enlarge_act_range,
                                   self.np_random)
            action = np.clip(action, -1.0, 1.0)
            self.behavior_act_len = len(action)

            self.obs = list(self.obs) + list(action)
    def get_extended_observation(self):
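        # build the behavior-policy observation from a window of past states;
        # when training the dynamics (conf) policy, also roll out the behavior
        # policy and return the generator observation instead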

        # current robot state, without velocity terms
        cur_state = self.robot.get_robot_observation(with_vel=False)

        if self.obs_noise:
            cur_state = utils.perturb(cur_state, 0.1, self.np_random)

        # then update past obs
        utils.push_recent_value(self.past_obs_array, cur_state)

        # then construct behavior obs
        b_obs_all = utils.select_and_merge_from_s_a(
            s_mt=list(self.past_obs_array),
            a_mt=list(self.past_bact_array),
            s_idx=self.behavior_past_obs_t_idx,
            a_idx=np.array([]))
        # if training the motor (behavior) policy, the behavior observation is all we need
        if not self.train_dyn:
            return b_obs_all

        # otherwise we are training the dynamics (conf) policy:
        # roll out the frozen behavior policy b_pi
        obs_nn = utils.wrap(b_obs_all, is_cuda=self.cuda_env)
        with torch.no_grad():
            _, action_nn, _, self.recurrent_hidden_states = self.go_actor_critic.act(
                obs_nn,
                self.recurrent_hidden_states,
                self.masks,
                deterministic=False)
        b_cur_act = list(utils.unwrap(action_nn, is_cuda=self.cuda_env))
        b_cur_act = utils.perturb(b_cur_act, self.enlarge_act_range,
                                  self.np_random)
        b_cur_act = np.tanh(b_cur_act)

        # Store action after tanh (-1,1)
        utils.push_recent_value(self.past_bact_array, b_cur_act)

        # construct the G (conf policy) obs from the updated past obs & behavior actions
        # g_obs_all = utils.select_and_merge_from_s_a(
        #     s_mt=list(self.past_obs_array),
        #     a_mt=list(self.past_bact_array),
        #     s_idx=self.generator_past_obs_t_idx,
        #     a_idx=self.generator_past_act_t_idx
        # )

        # instead, build the 4-way (per-foot) split observation here
        g_obs_all = self.get_all_feet_local_obs(b_cur_act)

        return g_obs_all
    def step(self, a):
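        # one control step: decide robo_action / env_action depending on which
        # policy is being trained, run control_skip sim steps, then compute
        # reward and termination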
        # TODO: currently for Laika, env_action is 12D: a 3D vector per foot (4 feet), without wrench terms
        if self.train_dyn:
            env_action = a
            robo_action = self.obs[-self.behavior_act_len:]
        else:
            robo_action = a
            robo_action = np.clip(robo_action, -1.0, 1.0)
            env_pi_obs = np.concatenate((self.obs, robo_action))
            env_pi_obs_nn = utils.wrap(env_pi_obs, is_cuda=self.cuda_env)
            with torch.no_grad():
                _, env_action_nn, _, self.recurrent_hidden_states = self.dyn_actor_critic.act(
                    env_pi_obs_nn,
                    self.recurrent_hidden_states,
                    self.masks,
                    deterministic=False)
            env_action = utils.unwrap(env_action_nn, is_cuda=self.cuda_env)

            # self.ratios = np.append(self.ratios, [env_action / robo_action], axis=0)
            #
            # env_pi_obs_feat = self.feat_select_func(self.obs)
            # dis_state = np.concatenate((env_pi_obs_feat, robo_action))
            # dis_state = utils.wrap(dis_state, is_cuda=self.cuda_env)

        root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
        x_0 = root_pos[0]

        # TODO
        # this is post-hoc (unseen) noise, distinct from the seen diversification via self.enlarge_act_range
        if self.act_noise:
            robo_action = utils.perturb(robo_action, 0.05, self.np_random)

        # # TODO
        # if self.pretrain_dyn:
        #     # self.state_id = self._p.saveState()
        #     self.img_obs, pre_s_i = self.rollout_one_step_imaginary()     # takes the old self.obs
        #     # img_obs = self.rollout_one_step_imaginary_same_session()
        #     # self._p.restoreState(self.state_id)
        #     pre_s = self.robot.get_robot_raw_state_vec()
        #     # print(pre_s_i)
        #     # print(pre_s)
        #     assert np.allclose(pre_s, pre_s_i, atol=1e-5)

        for _ in range(self.control_skip):
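            # apply the behavior action and the conf-policy output, then advance the physics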
            self.robot.apply_action(robo_action)
            self.apply_scale_clip_conf_from_pi_new(env_action)
            self._p.stepSimulation()
            if self.render:
                time.sleep(self._ts * 0.5)
            self.timer += 1

        self.update_extended_observation()

        root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
        x_1 = root_pos[0]
        self.velx = (x_1 - x_0) / (self.control_skip * self._ts)

        y_1 = root_pos[1]
        height = root_pos[2]
        q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs)
        # print(np.max(np.abs(dq)))
        in_support = self.robot.is_root_com_in_support()

        if not self.pretrain_dyn:
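            # reward: alive bonus + clipped velocity tracking, minus energy,
            # joints-at-limit, joint-velocity, posture and lateral-deviation penalties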
            reward = self.ab  # alive bonus
            tar = np.minimum(self.timer / 500, self.max_tar_vel)
            reward += np.minimum(self.velx, tar) * self.vel_r_weight
            # print("v", self.velx, "tar", tar)
            reward += -self.energy_weight * np.square(robo_action).sum()
            # print("act norm", -self.energy_weight * np.square(a).sum())

            pos_mid = 0.5 * (self.robot.ll + self.robot.ul)
            q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll)
            joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97)
            reward += -self.jl_weight * joints_at_limit
            # print("jl", -self.jl_weight * joints_at_limit)

            reward += -np.minimum(
                np.sum(np.square(dq)) * self.dq_pen_weight, 5.0)
            weight = np.array([2.0, 1.0, 1.0] * 4)
            reward += -np.minimum(
                np.sum(np.square(q - self.robot.init_q) * weight) *
                self.q_pen_weight, 5.0)
            # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0))
            # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0))

            y_1 = root_pos[1]
            reward += -y_1 * 0.5
            # print("dev pen", -y_1*0.5)
        else:
            # reward = self.calc_obs_dist_pretrain(self.img_obs[:-4], self.obs[:len(self.img_obs[:-4])])
            reward = 0  # TODO

        # print("______")
        # print(in_support)

        # print("h", height)
        # print("dq.", np.abs(dq))
        # print((np.abs(dq) < 50).all())

        # print("------")
        # conf policy will not have body-in-contact flag
        not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0)
        # not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0) and in_support
        # not_done = (abs(y_1) < 5.0) and (height > 0.1) and (height < 1.0) and (rpy[2] > 0.1)
        # not_done = True
        #
        # if not not_done:
        #     print(self.ratios.shape)
        #     print(np.mean(self.ratios, axis=0))

        # if not self.train_dyn:
        #     dis_action = self.feat_select_func(self.obs)
        #     dis_action = utils.wrap(dis_action, is_cuda=self.cuda_env)
        #     d_score = self.discri.predict_prob_single_step(dis_state, dis_action)
        #     self.d_scores.append(utils.unwrap(d_score, is_cuda=self.cuda_env))
        #     # if len(self.d_scores) > 20 and np.mean(self.d_scores[-20:]) < 0.4:
        #     #     not_done = False
        #     # if not not_done or self.timer==1000:
        #     #     print(np.mean(self.d_scores))

        if not self.train_dyn:
            obs_behavior = self.feat_select_func(self.obs)
            return obs_behavior, reward, not not_done, {}
        return self.obs, reward, not not_done, {}
    def step(self, a):
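        # history-based variant: behavior actions are tanh-squashed and pushed into
        # past_bact_array, and a past-trajectory window is returned in info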
        # TODO: currently for laika, env_action is 12D, 4 feet 3D without wrench
        if self.train_dyn:
            env_action = a
            robo_action = self.past_bact_array[0]  # after tanh
        else:
            robo_action = a
            robo_action = np.tanh(robo_action)
            # update past_bact after tanh
            utils.push_recent_value(self.past_bact_array, robo_action)

            env_pi_obs = utils.select_and_merge_from_s_a(
                s_mt=list(self.past_obs_array),
                a_mt=list(self.past_bact_array),
                s_idx=self.generator_past_obs_t_idx,
                a_idx=self.generator_past_act_t_idx)
            env_pi_obs_nn = utils.wrap(env_pi_obs, is_cuda=self.cuda_env)
            with torch.no_grad():
                _, env_action_nn, _, self.recurrent_hidden_states = self.dyn_actor_critic.act(
                    env_pi_obs_nn,
                    self.recurrent_hidden_states,
                    self.masks,
                    deterministic=False)
            env_action = utils.unwrap(env_action_nn, is_cuda=self.cuda_env)

        # if self.ratio is None:
        #     self.ratio = np.array([env_action / robo_action])
        # else:
        #     self.ratio = np.append(self.ratio, [env_action / robo_action], axis=0)
        # self.ratios = np.append(self.ratios, [env_action / robo_action], axis=0)
        #
        # env_pi_obs_feat = self.feat_select_func(self.obs)
        # dis_state = np.concatenate((env_pi_obs_feat, robo_action))
        # dis_state = utils.wrap(dis_state, is_cuda=self.cuda_env)

        root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
        x_0 = root_pos[0]

        # this is post-hoc (unseen) noise, distinct from the seen diversification via self.enlarge_act_range
        if self.act_noise:
            robo_action = utils.perturb(robo_action, 0.05, self.np_random)

        # build the info window before stepping the sim, as in v4 (s_t+1 is appended after the step);
        # info is used outside the env to construct the discriminator (D) input
        past_info = self.construct_past_traj_window()

        # # TODO
        # if self.pretrain_dyn:
        #     # self.state_id = self._p.saveState()
        #     self.img_obs, pre_s_i = self.rollout_one_step_imaginary()     # takes the old self.obs
        #     # img_obs = self.rollout_one_step_imaginary_same_session()
        #     # self._p.restoreState(self.state_id)
        #     pre_s = self.robot.get_robot_raw_state_vec()
        #     # print(pre_s_i)
        #     # print(pre_s)
        #     assert np.allclose(pre_s, pre_s_i, atol=1e-5)

        for _ in range(self.control_skip):
            self.robot.apply_action(robo_action)
            self.apply_scale_clip_conf_from_pi_new(env_action)
            self._p.stepSimulation()
            if self.render:
                time.sleep(self._ts * 1.0)
            self.timer += 1

        obs_new = self.get_extended_observation()  # and update past_obs_array
        past_info += [self.past_obs_array[0]]  # s_t+1
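        # past_info (the s-a-s window) is returned in info as "sas_window" so the
        # discriminator input can be constructed outside the env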

        root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
        x_1 = root_pos[0]
        self.velx = (x_1 - x_0) / (self.control_skip * self._ts)

        y_1 = root_pos[1]
        height = root_pos[2]
        q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs)
        # print(np.max(np.abs(dq)))
        # in_support = self.robot.is_root_com_in_support()

        if not self.pretrain_dyn:
            reward = self.ab  # alive bonus
            tar = np.minimum(self.timer / 500, self.max_tar_vel)
            reward += np.minimum(self.velx, tar) * self.vel_r_weight
            # print("v", self.velx, "tar", tar)
            reward += -self.energy_weight * np.square(robo_action).sum()
            # print("act norm", -self.energy_weight * np.square(a).sum())

            pos_mid = 0.5 * (self.robot.ll + self.robot.ul)
            q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll)
            joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97)
            reward += -self.jl_weight * joints_at_limit
            # print("jl", -self.jl_weight * joints_at_limit)

            reward += -np.minimum(
                np.sum(np.square(dq)) * self.dq_pen_weight, 5.0)
            weight = np.array([2.0, 1.0, 1.0] * 4)
            reward += -np.minimum(
                np.sum(np.square(q - self.robot.init_q) * weight) *
                self.q_pen_weight, 5.0)
            # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0))
            # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0))

            y_1 = root_pos[1]
            reward += -y_1 * 0.5
            # print("dev pen", -y_1*0.5)
        else:
            # reward = self.calc_obs_dist_pretrain(self.img_obs[:-4], self.obs[:len(self.img_obs[:-4])])
            reward = 0  # TODO

        # print("______")
        # print(in_support)

        # print("h", height)
        # print("dq.", np.abs(dq))
        # print((np.abs(dq) < 50).all())

        # print("------")
        # conf policy will not have body-in-contact flag
        not_done = (np.abs(dq) < 90).all() and (height > 0.2) and (height < 1.0)
        # not_done = (abs(y_1) < 5.0) and (height > 0.1) and (height < 1.0) and (rpy[2] > 0.1)
        # not_done = True
        #
        # if not not_done:
        #     print(self.ratio.shape)
        #     labels = list("123456789ABC")
        #     data = self.ratio
        #     from matplotlib import pyplot as plt
        #     width = 0.4
        #     fig, ax = plt.subplots()
        #     for i, l in enumerate(labels):
        #         x = np.ones(data.shape[0]) * i + (np.random.rand(data.shape[0]) * width - width / 2.)
        #         ax.scatter(x, data[:, i], s=25)
        #         median = np.median(data[:, i])
        #         ax.plot([i - width / 2., i + width / 2.], [median, median], color="k")
        #
        #     plt.ylim(-5, 5)
        #     ax.set_xticks(range(len(labels)))
        #     ax.set_xticklabels(labels)
        #     plt.show()
        #     self.ratio = None

        # if not self.train_dyn:
        #     dis_action = self.feat_select_func(self.obs)
        #     dis_action = utils.wrap(dis_action, is_cuda=self.cuda_env)
        #     d_score = self.discri.predict_prob_single_step(dis_state, dis_action)
        #     self.d_scores.append(utils.unwrap(d_score, is_cuda=self.cuda_env))
        #     # if len(self.d_scores) > 20 and np.mean(self.d_scores[-20:]) < 0.4:
        #     #     not_done = False
        #     # if not not_done or self.timer==1000:
        #     #     print(np.mean(self.d_scores))

        return obs_new, reward, not not_done, {"sas_window": past_info}
    def step(self, a):
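        # variant where the conf policy's 12D output is applied directly as the
        # joint command; per-foot contact forces are logged for the next observation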
        # TODO: in this env, env_action is 12D: per-joint scaling factors for the torque command
        if self.train_dyn:
            env_action = a
            robo_action = self.obs[-self.behavior_act_len:]
        else:
            robo_action = a
            robo_action = np.clip(robo_action, -1.0, 1.0)
            env_pi_obs = np.concatenate((self.obs, robo_action))
            env_pi_obs_nn = utils.wrap(env_pi_obs, is_cuda=self.cuda_env)
            with torch.no_grad():
                _, env_action_nn, _, self.recurrent_hidden_states = self.dyn_actor_critic.act(
                    env_pi_obs_nn,
                    self.recurrent_hidden_states,
                    self.masks,
                    deterministic=False)
            env_action = utils.unwrap(env_action_nn, is_cuda=self.cuda_env)

        # if self.ratio is None:
        #     self.ratio = np.array([env_action / robo_action])
        # else:
        #     self.ratio = np.append(self.ratio, [env_action / robo_action], axis=0)
        # self.ratios = np.append(self.ratios, [env_action / robo_action], axis=0)
        #
        # env_pi_obs_feat = self.feat_select_func(self.obs)
        # dis_state = np.concatenate((env_pi_obs_feat, robo_action))
        # dis_state = utils.wrap(dis_state, is_cuda=self.cuda_env)

        root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
        x_0 = root_pos[0]

        # TODO
        # # this is post noise (unseen), different from seen diversify of self.enlarge_act_scale
        # if self.act_noise:
        #     robo_action = utils.perturb(robo_action, 0.05, self.np_random)

        # # TODO
        # if self.pretrain_dyn:
        #     # self.state_id = self._p.saveState()
        #     self.img_obs, pre_s_i = self.rollout_one_step_imaginary()     # takes the old self.obs
        #     # img_obs = self.rollout_one_step_imaginary_same_session()
        #     # self._p.restoreState(self.state_id)
        #     pre_s = self.robot.get_robot_raw_state_vec()
        #     # print(pre_s_i)
        #     # print(pre_s)
        #     assert np.allclose(pre_s, pre_s_i, atol=1e-5)

        for _ in range(self.control_skip):
            self.robot.apply_action(env_action)
            self._p.stepSimulation()
            if self.render:
                time.sleep(self._ts * 0.5)
            self.timer += 1

            if self.timer % 2 == 0:
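                # sample foot contact normal forces every other sim step (2x downsampled)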
                cur_forces = []
                # sum normal forces over each foot's contact points (scaled by 1/500)
                for foot in self.robot.feet:
                    cps = self._p.getContactPoints(self.robot.go_id,
                                                   self.floor_id, foot, -1)
                    normal_f_sum = 0
                    # print(len(cps))   # 0 or 1 or 2??
                    for cp in cps:
                        normal_f_sum += cp[9]  # normal force
                    cur_forces.extend([normal_f_sum / 500.0])
                self.normal_forces = np.append(self.normal_forces,
                                               [cur_forces],
                                               axis=0)

        self.update_extended_observation()

        root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
        x_1 = root_pos[0]
        self.velx = (x_1 - x_0) / (self.control_skip * self._ts)

        y_1 = root_pos[1]
        height = root_pos[2]
        q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs)
        # print(np.max(np.abs(dq)))
        in_support = self.robot.is_root_com_in_support()

        if not self.pretrain_dyn:
            reward = self.ab  # alive bonus
            tar = np.minimum(self.timer / 500, self.max_tar_vel)
            reward += np.minimum(self.velx, tar) * self.vel_r_weight
            # print("v", self.velx, "tar", tar)
            reward += -self.energy_weight * np.square(robo_action).sum()
            # print("act norm", -self.energy_weight * np.square(a).sum())

            pos_mid = 0.5 * (self.robot.ll + self.robot.ul)
            q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll)
            joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97)
            reward += -self.jl_weight * joints_at_limit
            # print("jl", -self.jl_weight * joints_at_limit)

            reward += -np.minimum(
                np.sum(np.square(dq)) * self.dq_pen_weight, 5.0)
            weight = np.array([2.0, 1.0, 1.0] * 4)
            reward += -np.minimum(
                np.sum(np.square(q - self.robot.init_q) * weight) *
                self.q_pen_weight, 5.0)
            # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0))
            # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0))

            y_1 = root_pos[1]
            reward += -y_1 * 0.5
            # print("dev pen", -y_1*0.5)
        else:
            # reward = self.calc_obs_dist_pretrain(self.img_obs[:-4], self.obs[:len(self.img_obs[:-4])])
            reward = 0  # TODO

        # print("______")
        # print(in_support)

        # print("h", height)
        # print("dq.", np.abs(dq))
        # print((np.abs(dq) < 50).all())

        # print("------")
        # conf policy will not have body-in-contact flag
        not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0) and in_support
        # not_done = (abs(y_1) < 5.0) and (height > 0.1) and (height < 1.0) and (rpy[2] > 0.1)
        # not_done = True
        #
        # if not not_done:
        #     print(self.ratio.shape)
        #     labels = list("123456789ABC")
        #     data = self.ratio
        #     from matplotlib import pyplot as plt
        #     width = 0.4
        #     fig, ax = plt.subplots()
        #     for i, l in enumerate(labels):
        #         x = np.ones(data.shape[0]) * i + (np.random.rand(data.shape[0]) * width - width / 2.)
        #         ax.scatter(x, data[:, i], s=25)
        #         median = np.median(data[:, i])
        #         ax.plot([i - width / 2., i + width / 2.], [median, median], color="k")
        #
        #     plt.ylim(-5, 5)
        #     ax.set_xticks(range(len(labels)))
        #     ax.set_xticklabels(labels)
        #     plt.show()
        #     self.ratio = None

        # if not self.train_dyn:
        #     dis_action = self.feat_select_func(self.obs)
        #     dis_action = utils.wrap(dis_action, is_cuda=self.cuda_env)
        #     d_score = self.discri.predict_prob_single_step(dis_state, dis_action)
        #     self.d_scores.append(utils.unwrap(d_score, is_cuda=self.cuda_env))
        #     # if len(self.d_scores) > 20 and np.mean(self.d_scores[-20:]) < 0.4:
        #     #     not_done = False
        #     # if not not_done or self.timer==1000:
        #     #     print(np.mean(self.d_scores))

        if not self.train_dyn:
            obs_behavior = self.feat_select_func(self.obs)
            return obs_behavior, reward, not not_done, {}
        return self.obs, reward, not not_done, {}
    def step(self, a):
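        # step either the PyBullet sim or, if use_gen_dyn, a learned generative
        # dynamics network that predicts the next observation directly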
        if self.act_noise and a is not None:
            a = utils.perturb(a, 0.05, self.np_random)

        if not self.use_gen_dyn:
            for _ in range(self.control_skip):
                # the raw action is not guaranteed to be in [-1, 1], so clip it
                if a is not None:
                    self.act = np.clip(a, -1.0, 1.0)
                    self.robot.apply_action(self.act)
                self._p.stepSimulation()
                if self.render:
                    time.sleep(self._ts * 0.5)
                self.timer += 1
            self.robot.update_x()
            self.update_extended_observation()
            obs_unnorm = np.array(self.obs) / self.robot.obs_scaling
        else:
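            # predict the next observation with the learned generative dynamics model,
            # conditioned on the current obs, the action, and a 14-D Gaussian noise vector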

            # gen_input = list(self.obs) + list(a) + [0.0] * (11+3)

            gen_input = list(self.obs) + list(a) + list(
                np.random.normal(0, 1, 14))  # TODO
            gen_input = utils.wrap(gen_input, is_cuda=False)  # TODO
            gen_output = self.gen_dyn(gen_input)
            gen_output = utils.unwrap(gen_output, is_cuda=False)

            self.obs = gen_output
            obs_unnorm = np.array(self.obs) / self.robot.obs_scaling
            self.robot.last_x = self.robot.x
            self.robot.x += obs_unnorm[5] * (self.control_skip * self._ts)

            if self.render:
                all_qs = [self.robot.x] + list(obs_unnorm[:5])
                all_qs[1] -= 1.5
                for ind in range(self.robot.n_total_dofs):
                    self._p.resetJointState(self.robot.hopper_id, ind,
                                            all_qs[ind], 0.0)
                time.sleep(self._ts * 5.0)

            if self.obs_noise:
                self.obs = utils.perturb(self.obs, 0.1, self.np_random)
            self.timer += self.control_skip

        reward = 2.0  # alive bonus
        reward += self.get_ave_dx()
        # print("v", self.get_ave_dx())
        reward += -0.1 * np.square(a).sum()
        # print("act norm", -0.1 * np.square(a).sum())

        q = np.array(obs_unnorm[2:5])
        pos_mid = 0.5 * (self.robot.ll + self.robot.ul)
        q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll)
        joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97)
        reward += -2.0 * joints_at_limit
        # print("jl", -2.0 * joints_at_limit)

        dq = np.array(obs_unnorm[8:11])
        reward -= np.minimum(np.sum(np.abs(dq)) * 0.02, 5.0)  # almost like /23
        # print("vel pen", np.minimum(np.sum(np.abs(dq)) * 0.02, 5.0))

        height = obs_unnorm[0]
        # ang = self._p.getJointState(self.robot.hopper_id, 2)[0]

        # print(joints_dq)
        # print(height)
        # print("ang", ang)
        not_done = (np.abs(dq) < 50).all() and (height > .3) and (height < 1.8)

        return self.obs, reward, not not_done, {}