Example #1
    def step(self, action, **kwargs):
        r = 0.0
        merge_info = {}
        for k in range(self.frame_skip):
            self.frame_count += 1
            obs, reward, done, info = self.env.step(action, **kwargs)
            r += reward

            for key in info.keys():
                if 'reward' in key:
                    # make sure we don't silently ignore other reward keys;
                    # if a new reward key is added, handle its accumulation here
                    assert (key == 'shaping_reward') or (
                        key == 'env_reward') or (key == 'x_offset_reward')
                    merge_info[key] = merge_info.get(key, 0.0) + info[key]
                else:
                    merge_info[key] = info[key]

            if info['target_changed']:
                logger.warn("[FrameSkip] early break since target was changed")
                break

            if done:
                break
        merge_info['frame_count'] = self.frame_count
        return obs, r, done, merge_info
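The loop above sums the env reward over `frame_skip` frames and, inside `merge_info`, accumulates every info entry whose key contains 'reward' while keeping only the latest value for all other keys. A minimal, self-contained illustration of that merging rule (the per-frame info dicts below are made up for the sketch):

# Reward-like keys are summed across skipped frames; other keys keep the
# value from the most recent frame. The dicts below are illustrative only.
merge_info = {}
frames = [
    {'env_reward': 1.0, 'shaping_reward': 0.5, 'target_changed': False},
    {'env_reward': 2.0, 'shaping_reward': 0.25, 'target_changed': False},
]
for info in frames:
    for key in info:
        if 'reward' in key:
            merge_info[key] = merge_info.get(key, 0.0) + info[key]
        else:
            merge_info[key] = info[key]
print(merge_info)
# {'env_reward': 3.0, 'shaping_reward': 0.75, 'target_changed': False}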
Example #2
    def load(self, pathname):
        data = np.load(pathname)
        other = data['other']
        if int(other[0]) > self.max_size:
            logger.warn('loading from a replay memory larger than max_size!')
        self._curr_size = min(int(other[0]), self.max_size)
        self._curr_pos = min(int(other[1]), self.max_size - 1)

        self.obs[:self._curr_size] = data['obs'][:self._curr_size]
        self.action[:self._curr_size] = data['action'][:self._curr_size]
        self.reward[:self._curr_size] = data['reward'][:self._curr_size]
        self.terminal[:self._curr_size] = data['terminal'][:self._curr_size]
        self.next_obs[:self._curr_size] = data['next_obs'][:self._curr_size]
        logger.info("[load rpm]memory loade from {}".format(pathname))
Example #3
    def __init__(self, args):
        if machine_info.is_gpu_available():
            assert get_gpu_count() == 1, (
                'Only single-GPU training is supported. Please set the environment '
                'variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]`.')

        else:
            cpu_num = os.environ.get('CPU_NUM')
            assert cpu_num is not None and cpu_num == '1', (
                'Only single-CPU training is supported. Please set the environment '
                'variable: `export CPU_NUM=1`.')

        model = OpenSimModel(OBS_DIM, VEL_DIM, ACT_DIM)
        algorithm = parl.algorithms.DDPG(
            model,
            gamma=GAMMA,
            tau=TAU,
            actor_lr=ACTOR_LR,
            critic_lr=CRITIC_LR)
        self.agent = OpenSimAgent(algorithm, OBS_DIM, ACT_DIM)

        self.evaluate_result = []

        self.lock = threading.Lock()
        self.model_lock = threading.Lock()
        self.model_queue = queue.Queue()

        self.best_shaping_reward = 0
        self.best_env_reward = 0

        if args.offline_evaluate:
            self.offline_evaluate()
        else:
            t = threading.Thread(target=self.online_evaluate)
            t.start()

        with self.lock:
            while True:
                model_path = self.model_queue.get()
                if not args.offline_evaluate:
                    # online evaluate
                    while not self.model_queue.empty():
                        model_path = self.model_queue.get()
                try:
                    self.agent.restore(model_path)
                    break
                except Exception as e:
                    logger.warn("Agent restore Exception: {} ".format(e))

            self.cur_model = model_path

        self.create_actors()
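Both here and in Example #5, the restore loop drains `model_queue` so that, during online evaluation, only the most recently pushed checkpoint path is kept before `agent.restore` is attempted. The draining step on its own, as a small runnable sketch (the queue contents are illustrative):

import queue

def latest_from_queue(model_queue):
    # Block for one item, then discard everything older than the newest;
    # this mirrors the pattern used in Examples #3 and #5.
    model_path = model_queue.get()
    while not model_queue.empty():
        model_path = model_queue.get()
    return model_path

# Illustrative usage with made-up checkpoint paths:
q = queue.Queue()
for path in ['model_step_100', 'model_step_200', 'model_step_300']:
    q.put(path)
print(latest_from_queue(q))  # -> 'model_step_300'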
Example #4
    def step(self, action, **kwargs):
        self.step_count += 1
        obs, r, done, info = self.env.step(action, **kwargs)
        info = self.reward_shaping(obs, r, done, action)
        if info['target_vel'] > 2.75:
            rate = math.sqrt((2.75**2) / (info['target_vel']**2))
            logger.warn('Changing target, original target: {}'.format(
                obs['target_vel']))
            obs['target_vel'][0] = obs['target_vel'][0] * rate
            obs['target_vel'][2] = obs['target_vel'][2] * rate
            logger.warn('Changing target, new target: {}'.format(
                obs['target_vel']))
            info['target_vel'] = 2.75
        if info['target_vel'] < -0.25:
            rate = math.sqrt(((-0.25)**2) / (info['target_vel']**2))
            logger.warn('Changing target, original target: {}'.format(
                obs['target_vel']))
            obs['target_vel'][0] = obs['target_vel'][0] * rate
            obs['target_vel'][2] = obs['target_vel'][2] * rate
            logger.warn('Changing target, new target: {}'.format(
                obs['target_vel']))
            info['target_vel'] = -0.25

        delta = 0
        if self.last_target_vel is not None:
            delta = np.absolute(
                np.array(self.last_target_vel) - np.array(obs['target_vel']))
        if (self.last_target_vel is None) or np.all(delta < 1e-5):
            info['target_changed'] = False
        else:
            info['target_changed'] = True
            logger.info("[env_wrapper] target_changed, vx:{}   vz:{}".format(
                obs['target_vel'][0], obs['target_vel'][2]))
            self.last_target_change_step = self.step_count
            self.target_change_times += 1
        info['target_change_times'] = self.target_change_times
        self.last_target_vel = obs['target_vel']

        assert 'shaping_reward' in info
        timeout = False
        if self.step_count >= MAXTIME_LIMIT:
            timeout = True
        if done and not timeout:
            # penalty for falling down
            info['shaping_reward'] += FALL_PENALTY
        info['timeout'] = timeout
        self.pre_state_desc = obs
        return obs, r, done, info
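The two branches above clamp the reported target speed to the range [-0.25, 2.75] by rescaling the x and z components of `obs['target_vel']` with `rate = sqrt(bound**2 / speed**2)`. The same rule as a stand-alone function (the bounds come from the snippet; the function itself is illustrative):

import math

def clamp_target_speed(vx, vz, speed, low=-0.25, high=2.75):
    # Stand-alone restatement of the clamping in Example #4; the bounds
    # are the ones hard-coded in the snippet above.
    if speed > high:
        rate = math.sqrt(high**2 / speed**2)
        vx, vz, speed = vx * rate, vz * rate, high
    elif speed < low:
        rate = math.sqrt(low**2 / speed**2)
        vx, vz, speed = vx * rate, vz * rate, low
    return vx, vz, speed

# A 3.0 m/s target gets scaled down to roughly 2.75 m/s:
print(clamp_target_speed(3.0, 0.0, 3.0))  # ~ (2.75, 0.0, 2.75)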
Example #5
    def _parse_memory(self, actor_state):
        mem = actor_state.memory
        n = len(mem)
        episode_shaping_reward = np.sum(
            [exp.info['shaping_reward'] for exp in mem])
        episode_env_reward = np.sum([exp.info['env_reward'] for exp in mem])

        with self.lock:
            if actor_state.model_name == self.cur_model:
                self.evaluate_result.append({
                    'shaping_reward': episode_shaping_reward,
                    'env_reward': episode_env_reward,
                    'episode_length': mem[-1].info['frame_count'],
                    'falldown': not mem[-1].info['timeout'],
                })
                logger.info('{}, finish_cnt: {}'.format(
                    self.cur_model, len(self.evaluate_result)))
                logger.info('{}'.format(self.evaluate_result[-1]))
                if len(self.evaluate_result) >= args.evaluate_times:
                    mean_value = {}
                    for key in self.evaluate_result[0].keys():
                        mean_value[key] = np.mean(
                            [x[key] for x in self.evaluate_result])
                    logger.info('Model: {}, mean_value: {}'.format(
                        self.cur_model, mean_value))

                    eval_num = len(self.evaluate_result)
                    falldown_num = len(
                        [x for x in self.evaluate_result if x['falldown']])
                    falldown_rate = falldown_num / eval_num
                    logger.info('Falldown rate: {}'.format(falldown_rate))
                    for key in self.evaluate_result[0].keys():
                        mean_value[key] = np.mean([
                            x[key] for x in self.evaluate_result
                            if not x['falldown']
                        ])
                    logger.info(
                        'Model: {}, Exclude falldown, mean_value: {}'.format(
                            self.cur_model, mean_value))
                    if mean_value['shaping_reward'] > self.best_shaping_reward:
                        self.best_shaping_reward = mean_value['shaping_reward']
                        copy2(self.cur_model, './model_zoo')
                        logger.info(
                            "[best shaping reward updated:{}] path:{}".format(
                                self.best_shaping_reward, self.cur_model))
                    if (mean_value['env_reward'] > self.best_env_reward
                            and falldown_rate < 0.3):
                        self.best_env_reward = mean_value['env_reward']
                        copy2(self.cur_model, './model_zoo')
                        logger.info(
                            "[best env reward updated:{}] path:{}, falldown rate: {}"
                            .format(self.best_env_reward, self.cur_model,
                                    falldown_num / eval_num))

                    self.evaluate_result = []
                    while True:
                        model_path = self.model_queue.get()
                        if not args.offline_evaluate:
                            # online evaluate
                            while not self.model_queue.empty():
                                model_path = self.model_queue.get()
                        try:
                            self.agent.restore(model_path)
                            break
                        except Exception as e:
                            logger.warn(
                                "Agent restore Exception: {} ".format(e))
                    self.cur_model = model_path
            else:
                actor_state.model_name = self.cur_model
        actor_state.reset()
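Once enough episodes have been collected (`args.evaluate_times`), the block above computes mean metrics over all episodes, the fall-down rate, and the means restricted to episodes that did not fall. The aggregation in isolation, with made-up numbers that only mirror the fields `_parse_memory` appends to `self.evaluate_result`:

import numpy as np

# Illustrative episode records; the values are invented for this sketch.
evaluate_result = [
    {'shaping_reward': 120.0, 'env_reward': 95.0, 'episode_length': 1000, 'falldown': False},
    {'shaping_reward': 60.0, 'env_reward': 40.0, 'episode_length': 420, 'falldown': True},
    {'shaping_reward': 118.0, 'env_reward': 97.0, 'episode_length': 1000, 'falldown': False},
]

mean_value = {key: np.mean([x[key] for x in evaluate_result])
              for key in evaluate_result[0]}
falldown_rate = len([x for x in evaluate_result if x['falldown']]) / len(evaluate_result)
mean_no_fall = {key: np.mean([x[key] for x in evaluate_result if not x['falldown']])
                for key in evaluate_result[0]}
print(mean_value)      # means over all episodes (the 'falldown' mean is a rate)
print(falldown_rate)   # 1/3 of the episodes fell
print(mean_no_fall)    # means over the episodes that did not fall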