def step(self, action, **kwargs):
    r = 0.0
    merge_info = {}
    for k in range(self.frame_skip):
        self.frame_count += 1
        obs, reward, done, info = self.env.step(action, **kwargs)
        r += reward

        for key in info.keys():
            if 'reward' in key:
                # Make sure no reward key is silently ignored; if a new
                # reward is added, handle its accumulation here.
                assert key in ('shaping_reward', 'env_reward',
                               'x_offset_reward')
                merge_info[key] = merge_info.get(key, 0.0) + info[key]
            else:
                merge_info[key] = info[key]

        if info['target_changed']:
            logger.warn("[FrameSkip] early break since target was changed")
            break

        if done:
            break

    merge_info['frame_count'] = self.frame_count
    return obs, r, done, merge_info
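# Note: the step() above belongs to a frame-skip wrapper. A minimal sketch of
# the surrounding class (the constructor arguments here are an assumption, not
# taken from the original file) that would make the method runnable:
class FrameSkip(object):
    def __init__(self, env, frame_skip=4):
        self.env = env
        self.frame_skip = frame_skip  # number of env steps per agent action
        self.frame_count = 0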
def load(self, pathname):
    data = np.load(pathname)
    other = data['other']
    if int(other[0]) > self.max_size:
        logger.warn('loading from a replay memory with a bigger size!')
    self._curr_size = min(int(other[0]), self.max_size)
    self._curr_pos = min(int(other[1]), self.max_size - 1)

    self.obs[:self._curr_size] = data['obs'][:self._curr_size]
    self.action[:self._curr_size] = data['action'][:self._curr_size]
    self.reward[:self._curr_size] = data['reward'][:self._curr_size]
    self.terminal[:self._curr_size] = data['terminal'][:self._curr_size]
    self.next_obs[:self._curr_size] = data['next_obs'][:self._curr_size]
    logger.info("[load rpm] memory loaded from {}".format(pathname))
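# A minimal sketch of the matching save() method (an assumption, inferred only
# from the npz keys read by load() above): 'other' stores (_curr_size,
# _curr_pos), and the remaining arrays mirror the replay-memory buffers.
def save(self, pathname):
    other = np.array([self._curr_size, self._curr_pos], dtype=np.int32)
    np.savez(
        pathname,
        obs=self.obs,
        action=self.action,
        reward=self.reward,
        terminal=self.terminal,
        next_obs=self.next_obs,
        other=other)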
def __init__(self, args):
    if machine_info.is_gpu_available():
        assert get_gpu_count() == 1, (
            'Only single-GPU training is supported. Please set the '
            'environment variable: '
            '`export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]`.')
    else:
        cpu_num = os.environ.get('CPU_NUM')
        assert cpu_num is not None and cpu_num == '1', (
            'Only single-CPU training is supported. Please set the '
            'environment variable: `export CPU_NUM=1`.')

    model = OpenSimModel(OBS_DIM, VEL_DIM, ACT_DIM)
    algorithm = parl.algorithms.DDPG(
        model,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    self.agent = OpenSimAgent(algorithm, OBS_DIM, ACT_DIM)

    self.evaluate_result = []
    self.lock = threading.Lock()

    self.model_lock = threading.Lock()
    self.model_queue = queue.Queue()

    self.best_shaping_reward = 0
    self.best_env_reward = 0

    if args.offline_evaluate:
        self.offline_evaluate()
    else:
        t = threading.Thread(target=self.online_evaluate)
        t.start()

    with self.lock:
        while True:
            model_path = self.model_queue.get()
            if not args.offline_evaluate:
                # In online evaluation, only the latest model matters;
                # drain anything that queued up in the meantime.
                while not self.model_queue.empty():
                    model_path = self.model_queue.get()
            try:
                self.agent.restore(model_path)
                break
            except Exception as e:
                logger.warn("Agent restore Exception: {}".format(e))
        self.cur_model = model_path

    self.create_actors()
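# Environment setup assumed by the assertions above (illustrative): fix the
# device configuration before constructing the evaluator, e.g.
#     export CUDA_VISIBLE_DEVICES=0   # exactly one visible GPU
#     export CPU_NUM=1                # or a single CPU worker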
def step(self, action, **kwargs):
    self.step_count += 1
    obs, r, done, info = self.env.step(action, **kwargs)
    info = self.reward_shaping(obs, r, done, action)

    # Clamp the target speed to [-0.25, 2.75] m/s by rescaling the x/z
    # components, preserving the target direction.
    if info['target_vel'] > 2.75:
        rate = math.sqrt((2.75**2) / (info['target_vel']**2))
        logger.warn('Changing targets, original targets: {}'.format(
            obs['target_vel']))
        obs['target_vel'][0] = obs['target_vel'][0] * rate
        obs['target_vel'][2] = obs['target_vel'][2] * rate
        logger.warn('Changing targets, new targets: {}'.format(
            obs['target_vel']))
        info['target_vel'] = 2.75
    if info['target_vel'] < -0.25:
        rate = math.sqrt(((-0.25)**2) / (info['target_vel']**2))
        logger.warn('Changing targets, original targets: {}'.format(
            obs['target_vel']))
        obs['target_vel'][0] = obs['target_vel'][0] * rate
        obs['target_vel'][2] = obs['target_vel'][2] * rate
        logger.warn('Changing targets, new targets: {}'.format(
            obs['target_vel']))
        info['target_vel'] = -0.25

    # Detect whether the target velocity changed since the last step.
    delta = 0
    if self.last_target_vel is not None:
        delta = np.absolute(
            np.array(self.last_target_vel) - np.array(obs['target_vel']))
    if (self.last_target_vel is None) or np.all(delta < 1e-5):
        info['target_changed'] = False
    else:
        info['target_changed'] = True
        logger.info("[env_wrapper] target_changed, vx:{} vz:{}".format(
            obs['target_vel'][0], obs['target_vel'][2]))
        self.last_target_change_step = self.step_count
        self.target_change_times += 1
    info['target_change_times'] = self.target_change_times
    self.last_target_vel = obs['target_vel']

    assert 'shaping_reward' in info
    timeout = False
    if self.step_count >= MAXTIME_LIMIT:
        timeout = True
    if done and not timeout:
        # Penalty for falling down before the time limit.
        info['shaping_reward'] += FALL_PENALTY
    info['timeout'] = timeout
    self.pre_state_desc = obs
    return obs, r, done, info
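# A small standalone check (illustrative, not from the original file) of the
# rescaling used above: scaling the x/z components by sqrt(v_max**2 / v**2)
# brings the speed down to exactly 2.75 m/s while keeping the direction.
import math
v, vx, vz = 3.0, 2.4, 1.8                 # commanded speed and its components
rate = math.sqrt(2.75**2 / v**2)          # ~0.9167
assert abs(math.hypot(vx * rate, vz * rate) - 2.75) < 1e-6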
def _parse_memory(self, actor_state):
    mem = actor_state.memory
    n = len(mem)

    episode_shaping_reward = np.sum(
        [exp.info['shaping_reward'] for exp in mem])
    episode_env_reward = np.sum([exp.info['env_reward'] for exp in mem])

    with self.lock:
        if actor_state.model_name == self.cur_model:
            self.evaluate_result.append({
                'shaping_reward': episode_shaping_reward,
                'env_reward': episode_env_reward,
                'episode_length': mem[-1].info['frame_count'],
                'falldown': not mem[-1].info['timeout'],
            })
            logger.info('{}, finish_cnt: {}'.format(
                self.cur_model, len(self.evaluate_result)))
            logger.info('{}'.format(self.evaluate_result[-1]))
            if len(self.evaluate_result) >= args.evaluate_times:
                mean_value = {}
                for key in self.evaluate_result[0].keys():
                    mean_value[key] = np.mean(
                        [x[key] for x in self.evaluate_result])
                logger.info('Model: {}, mean_value: {}'.format(
                    self.cur_model, mean_value))

                eval_num = len(self.evaluate_result)
                falldown_num = len(
                    [x for x in self.evaluate_result if x['falldown']])
                falldown_rate = falldown_num / eval_num
                logger.info('Falldown rate: {}'.format(falldown_rate))

                for key in self.evaluate_result[0].keys():
                    mean_value[key] = np.mean([
                        x[key] for x in self.evaluate_result
                        if not x['falldown']
                    ])
                logger.info(
                    'Model: {}, Exclude falldown, mean_value: {}'.format(
                        self.cur_model, mean_value))

                if mean_value['shaping_reward'] > self.best_shaping_reward:
                    self.best_shaping_reward = mean_value['shaping_reward']
                    copy2(self.cur_model, './model_zoo')
                    logger.info(
                        "[best shaping reward updated:{}] path:{}".format(
                            self.best_shaping_reward, self.cur_model))
                if mean_value['env_reward'] > self.best_env_reward and \
                        falldown_rate < 0.3:
                    self.best_env_reward = mean_value['env_reward']
                    copy2(self.cur_model, './model_zoo')
                    logger.info(
                        "[best env reward updated:{}] path:{}, falldown rate: {}"
                        .format(self.best_env_reward, self.cur_model,
                                falldown_rate))

                self.evaluate_result = []
                while True:
                    model_path = self.model_queue.get()
                    if not args.offline_evaluate:
                        # online evaluate
                        while not self.model_queue.empty():
                            model_path = self.model_queue.get()
                    try:
                        self.agent.restore(model_path)
                        break
                    except Exception as e:
                        logger.warn(
                            "Agent restore Exception: {}".format(e))
                self.cur_model = model_path
        else:
            actor_state.model_name = self.cur_model

    actor_state.reset()