def update(self, action_a, reward_a, state_a, done_a):
    '''
    Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net
    '''
    for (e, b), body in util.ndenumerate_nonan(self.body_a):
        body.memory.update(action_a[(e, b)], reward_a[(e, b)], state_a[(e, b)], done_a[(e, b)])
    loss_a = self.algorithm.train()
    loss_a = util.guard_data_a(self, loss_a, 'loss')
    for (e, b), body in util.ndenumerate_nonan(self.body_a):
        body.loss = loss_a[(e, b)]
    explore_var_a = self.algorithm.update()
    explore_var_a = util.guard_data_a(self, explore_var_a, 'explore_var')
    return loss_a, explore_var_a

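# Hypothetical sketch of util.guard_data_a, inferred from its use above (an
# assumption, not the library's actual code): if the algorithm returns a scalar
# loss or explore_var, broadcast it into a per-body data array so downstream
# indexing by (e, b) still works; arrays are passed through unchanged.
import numpy as np

def guard_data_a(agent, data, data_name):
    if np.isscalar(data):
        # assumed agent attributes: agent_space, a, body_a, as used elsewhere in this section
        data_a, = agent.agent_space.aeb_space.init_data_s([data_name], a=agent.a)
        for eb, body in util.ndenumerate_nonan(agent.body_a):
            data_a[eb] = data
        return data_a
    return data
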
def space_step(self, action_e):
    action = action_e[(0, 0)]  # single body
    if self.done:  # space envs run continually without a central reset signal
        return self.space_reset()
    if not self.is_discrete:
        action = np.array([action])
    state, reward, done, _info = self.u_env.step(action)
    reward = guard_reward(reward)
    reward *= self.reward_scale
    if util.to_render():
        self.u_env.render()
    self.done = done = done or self.clock.get('t') > self.max_timestep
    # one-hot encode discrete observations before writing them into state_e
    if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
        state = util.to_one_hot(state, self.u_env.observation_space.n)
    reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for ab, body in util.ndenumerate_nonan(self.body_e):
        reward_e[ab] = reward
        state_e[ab] = state
        done_e[ab] = done
    logger.debug(f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}')
    return reward_e, state_e, done_e

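# Hypothetical sketch of util.to_one_hot, consistent with its use above (an
# assumption, not the library's actual code): encode a discrete state index as
# a one-hot vector of length n, matching observation_space.n.
import numpy as np

def to_one_hot(state, n):
    vec = np.zeros(n)
    vec[int(state)] = 1.0
    return vec
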
def act(self, state_a):
    '''Interface-level agent act method for all its bodies. Resolves state_a into per-body states, gets each body's action, and composes them into action_a.'''
    data_names = ['action']
    action_a, = self.agent.agent_space.aeb_space.init_data_s(data_names, a=self.agent.a)
    for (e, b), body in util.ndenumerate_nonan(self.agent.body_a):
        state = state_a[(e, b)]
        action_a[(e, b)] = self.body_act(body, state)
    return action_a

def update(self, action_a, reward_a, state_a, done_a):
    '''
    Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net
    '''
    for (e, b), body in util.ndenumerate_nonan(self.body_a):
        body.memory.update(action_a[(e, b)], reward_a[(e, b)], state_a[(e, b)], done_a[(e, b)])
    # TODO finer loss and explore_var per body
    loss = self.algorithm.train()
    explore_var = self.algorithm.update()
    data_names = ['loss', 'explore_var']
    loss_a, explore_var_a = self.agent_space.aeb_space.init_data_s(data_names, a=self.a)
    for (e, b), body in util.ndenumerate_nonan(self.body_a):
        loss_a[(e, b)] = loss
        explore_var_a[(e, b)] = explore_var
    return loss_a, explore_var_a

def get_session_data(session):
    '''
    Gather data from session: MDP, Agent, Env data, hashed by aeb.
    @returns {dict} session_data
    '''
    session_data = {}
    for aeb, body in util.ndenumerate_nonan(session.aeb_space.body_space.data):
        session_data[aeb] = body.df.copy()
    return session_data

def space_reset(self):
    _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for ab, body in util.ndenumerate_nonan(self.body_e):
        state = self.u_env.reset()
        state_e[ab] = state
        done_e[ab] = self.done = False
    if util.to_render():
        self.u_env.render()
    logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
    return _reward_e, state_e, done_e

def space_update(self, action_a, reward_a, state_a, done_a):
    '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net'''
    for eb, body in util.ndenumerate_nonan(self.body_a):
        body.action_pd_update()
        body.memory.update(action_a[eb], reward_a[eb], state_a[eb], done_a[eb])
    loss_a = self.algorithm.space_train()
    loss_a = util.guard_data_a(self, loss_a, 'loss')
    for eb, body in util.ndenumerate_nonan(self.body_a):
        if not np.isnan(loss_a[eb]):  # set for log_summary()
            body.loss = loss_a[eb]
    explore_var_a = self.algorithm.space_update()
    explore_var_a = util.guard_data_a(self, explore_var_a, 'explore_var')
    logger.debug(f'Agent {self.a} loss: {loss_a}, explore_var_a {explore_var_a}')
    for eb, body in util.ndenumerate_nonan(self.body_a):
        if body.env.done:
            body.epi_update()
    return loss_a, explore_var_a

def reset(self):
    self.done = False
    env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
    _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), body in util.ndenumerate_nonan(self.body_e):
        env_info_a = self.get_env_info(env_info_dict, a)
        self.check_u_agent_to_body(env_info_a, a)
        state = env_info_a.states[b]
        state_e[(a, b)] = state
        done_e[(a, b)] = self.done
    return _reward_e, state_e, done_e

def space_act(self, state_a):
    '''Interface-level agent act method for all its bodies. Resolves state_a into per-body states, gets each body's action, and composes them into action_a.'''
    data_names = ('action',)
    action_a, = self.agent.agent_space.aeb_space.init_data_s(data_names, a=self.agent.a)
    for eb, body in util.ndenumerate_nonan(self.agent.body_a):
        state = state_a[eb]
        self.body = body
        action_a[eb] = self.act(state)
    # set body reference back to default
    self.body = self.agent.nanflat_body_a[0]
    return action_a

def reset(self):
    self.done = False
    env_info_dict = self.u_env.reset(train_mode=self.train_mode, config=self.spec.get('unity'))
    _reward_e, state_e, _done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), body in util.ndenumerate_nonan(self.body_e):
        env_info_a = self.get_env_info(env_info_dict, a)
        self.check_u_agent_to_body(env_info_a, a)
        state_e[(a, b)] = env_info_a.states[b]
    return _reward_e, state_e, _done_e

def space_reset(self):
    self._check_u_brain_to_agent()
    self.done = False
    env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
    _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), body in util.ndenumerate_nonan(self.body_e):
        env_info_a = self._get_env_info(env_info_dict, a)
        self._check_u_agent_to_body(env_info_a, a)
        state = env_info_a.states[b]
        state_e[(a, b)] = state
        done_e[(a, b)] = self.done
    logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
    return _reward_e, state_e, done_e

def get_session_data(session, body_df_kind='eval', tmp_space_session_sub=False):
    '''
    Gather data from all the bodies of a session.
    Depending on body_df_kind, will use eval_df or train_df.
    '''
    session_data = {}
    for aeb, body in util.ndenumerate_nonan(session.aeb_space.body_space.data):
        aeb_df = body.eval_df if body_df_kind == 'eval' else body.train_df
        # TODO tmp substitution since SpaceSession does not have run_eval_episode yet
        if tmp_space_session_sub:
            aeb_df = body.train_df
        session_data[aeb] = aeb_df.copy()
    return session_data

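# Hypothetical usage sketch (an illustration, not code from the repo): gather the
# per-body eval dataframes of a finished session and concatenate them, keyed by
# their aeb coordinates, for aggregate analysis.
import pandas as pd

def aggregate_session_data(session):
    session_data = get_session_data(session, body_df_kind='eval')
    return pd.concat(session_data.values(), keys=list(session_data.keys()))
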
def reset(self):
    self.done = False
    _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), body in util.ndenumerate_nonan(self.body_e):
        state = self.u_env.reset()
        state_e[(a, b)] = state
        done_e[(a, b)] = self.done
    # TODO internalize render code
    if util.get_lab_mode() == 'dev':
        self.u_env.render()
    non_nan_cnt = util.count_nonan(state_e.flatten())
    assert non_nan_cnt == 1, 'OpenAI Gym supports only single body'
    return _reward_e, state_e, done_e

def reset(self):
    self.done = False
    _reward_e, state_e, _done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), body in util.ndenumerate_nonan(self.body_e):
        state = self.u_env.reset()
        state_e[(a, b)] = state
    # TODO internalize render code
    if not self.train_mode:
        self.u_env.render()
    non_nan_cnt = util.count_nonan(state_e.flatten())
    assert non_nan_cnt == 1, 'OpenAI Gym supports only single body'
    return _reward_e, state_e, _done_e

def step(self, action_e):
    # TODO implement clock_speed: step only if self.clock.to_step()
    if self.done:
        return self.reset()
    action_e = util.nanflatten(action_e)
    env_info_dict = self.u_env.step(action_e)
    reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), body in util.ndenumerate_nonan(self.body_e):
        env_info_a = self.get_env_info(env_info_dict, a)
        reward_e[(a, b)] = env_info_a.rewards[b]
        state_e[(a, b)] = env_info_a.states[b]
        done_e[(a, b)] = env_info_a.local_done[b]
    self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
    return reward_e, state_e, done_e

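# Hypothetical sketch of util.nanflatten, inferred from its use above (an
# assumption, not the library's actual code): flatten an AEB data array and drop
# NaN placeholders so only real per-body values are passed to the env.
import numpy as np

def nanflatten(arr):
    flat = arr.flatten()
    return np.array([v for v in flat if not (isinstance(v, float) and np.isnan(v))])
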
def space_act(self, state_a):
    '''Non-atomizable act to override agent.act(), do a single pass on the entire state_a instead of composing act() via iteration'''
    # gather and flatten
    states = []
    for eb, body in util.ndenumerate_nonan(self.agent.body_a):
        state = state_a[eb]
        if self.normalize_state:
            state = policy_util.update_online_stats_and_normalize_state(body, state)
        states.append(state)
    xs = [torch.from_numpy(state).float() for state in states]
    pdparam = self.calc_pdparam(xs, evaluate=False)
    # use multi-policy. note arg change
    action_a, action_pd_a = self.action_policy(states, self, self.agent.nanflat_body_a, pdparam)
    for idx, body in enumerate(self.agent.nanflat_body_a):
        body.action_tensor, body.action_pd = action_a[idx], action_pd_a[idx]  # used for body.action_pd_update later
    return action_a.cpu().numpy()

def reset(self):
    self.done = False
    _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), body in util.ndenumerate_nonan(self.body_e):
        state = self.u_env.reset()
        state_e[(a, b)] = state
        done_e[(a, b)] = self.done
    if util.get_lab_mode() == 'dev':
        self.u_env.render()
    non_nan_cnt = util.count_nonan(state_e.flatten())
    assert non_nan_cnt == 1, 'OpenAI Gym supports only single body'
    logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
    return _reward_e, state_e, done_e

def update(self, action_a, reward_a, state_a, done_a):
    '''
    Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net
    '''
    for (e, b), body in util.ndenumerate_nonan(self.body_a):
        body.memory.update(action_a[(e, b)], reward_a[(e, b)], state_a[(e, b)], done_a[(e, b)])
        if self.len_state_buffer > 0:
            # keep a rolling window of the last len_state_buffer states per body
            if len(body.state_buffer) == self.len_state_buffer:
                del body.state_buffer[0]
            body.state_buffer.append(state_a[(e, b)])
    loss_a = self.algorithm.train()
    loss_a = util.guard_data_a(self, loss_a, 'loss')
    explore_var_a = self.algorithm.update()
    explore_var_a = util.guard_data_a(self, explore_var_a, 'explore_var')
    return loss_a, explore_var_a

def space_reset(self):
    _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for ab, body in util.ndenumerate_nonan(self.body_e):
        state = self.u_env.reset()
        # one-hot encode discrete observations before storing them in state_e
        if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
            state = util.to_one_hot(state, self.u_env.observation_space.n)
        state_e[ab] = state
        done_e[ab] = self.done = False
    if util.to_render():
        self.u_env.render()
    logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
    return _reward_e, state_e, done_e

def step(self, action_e):
    assert len(action_e) == 1, 'OpenAI Gym supports only single body'
    # TODO implement clock_speed: step only if self.clock.to_step()
    if self.done:  # t will actually be 0
        return self.reset()
    action = action_e[(0, 0)]
    (state, reward, done, _info) = self.u_env.step(action)
    if util.get_lab_mode() == 'dev':
        self.u_env.render()
    reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), body in util.ndenumerate_nonan(self.body_e):
        reward_e[(a, b)] = reward
        state_e[(a, b)] = state
        done_e[(a, b)] = done
    self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
    return reward_e, state_e, done_e

def act(self, state_a):
    '''Non-atomizable act to override agent.act(), do a single pass on the entire state_a instead of composing body_act'''
    # gather and flatten
    states = []
    for (e, b), body in util.ndenumerate_nonan(self.agent.body_a):
        state = state_a[(e, b)]
        states.append(state)
    state = torch.tensor(states).view(-1).unsqueeze_(0).float()
    if torch.cuda.is_available() and self.net.gpu:
        state = state.cuda()
    pdparam = self.calc_pdparam(state, evaluate=False)
    # use multi-policy. note arg change
    action_a, action_pd_a = self.action_policy(pdparam, self, self.body_list)
    for idx, body in enumerate(self.body_list):
        action_pd = action_pd_a[idx]
        body.entropies.append(action_pd.entropy())
        body.log_probs.append(action_pd.log_prob(action_a[idx].float()))
    return action_a.cpu().numpy()

def space_step(self, action_e):
    # TODO implement clock_speed: step only if self.clock.to_step()
    if self.done:
        return self.space_reset()
    action_e = util.nanflatten(action_e)
    env_info_dict = self.u_env.step(action_e)
    reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), body in util.ndenumerate_nonan(self.body_e):
        env_info_a = self._get_env_info(env_info_dict, a)
        reward_e[(a, b)] = env_info_a.rewards[b] * self.reward_scale
        state_e[(a, b)] = env_info_a.states[b]
        done_e[(a, b)] = env_info_a.local_done[b]
    self.done = (util.nonan_all(done_e) or self.clock.t > self.max_t)
    logger.debug(f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}')
    return reward_e, state_e, done_e

def space_act(self, state_a):
    '''Non-atomizable act to override agent.act(), do a single pass on the entire state_a instead of composing act() via iteration'''
    # gather and flatten
    states = []
    for eb, body in util.ndenumerate_nonan(self.agent.body_a):
        state = state_a[eb]
        if self.normalize_state:
            state = policy_util.update_online_stats_and_normalize_state(body, state)
        states.append(state)
    state = torch.tensor(states, device=self.net.device).view(-1).unsqueeze_(0).float()
    pdparam = self.calc_pdparam(state, evaluate=False)
    # use multi-policy. note arg change
    action_a, action_pd_a = self.action_policy(states, self, self.agent.nanflat_body_a, pdparam)
    for idx, body in enumerate(self.agent.nanflat_body_a):
        action_pd = action_pd_a[idx]
        body.entropies.append(action_pd.entropy())
        body.log_probs.append(action_pd.log_prob(action_a[idx].float()))
        assert not torch.isnan(body.log_probs[-1])
    return action_a.cpu().numpy()

def test_ndenumerate_nonan():
    arr = np.full((2, 3), np.nan, dtype=object)
    np.fill_diagonal(arr, 1)
    for (a, b), body in util.ndenumerate_nonan(arr):
        assert a == b
        assert body == 1

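# Hypothetical sketch of util.ndenumerate_nonan, consistent with the test above
# (an assumption, not the library's actual code): enumerate an object ndarray
# like np.ndenumerate, but yield only the (index, value) pairs whose value is
# not a NaN placeholder.
import numpy as np

def ndenumerate_nonan(arr):
    for idx, v in np.ndenumerate(arr):
        if not (isinstance(v, float) and np.isnan(v)):
            yield idx, v
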
def space_reset(self, state_a):
    '''Do agent reset per session, such as memory pointer'''
    logger.debug(f'Agent {self.a} reset')
    for eb, body in util.ndenumerate_nonan(self.body_a):
        body.memory.epi_reset(state_a[eb])

def reset(self, state_a):
    '''Do agent reset per session, such as memory pointer'''
    for (e, b), body in util.ndenumerate_nonan(self.body_a):
        body.memory.epi_reset(state_a[(e, b)])