def post_body_init(self):
    '''
    Run init for components that need bodies to exist first, e.g. memory or architecture.
    Flattens self.body_e, stamps every body with its position in the flat list,
    stores the body count, then logs a self-description.
    '''
    flat_bodies = util.nanflatten(self.body_e)
    self.nanflat_body_e = flat_bodies
    for position, member in enumerate(flat_bodies):
        # each body remembers where it sits in the flattened env-body list
        member.nanflat_e_idx = position
    self.body_num = len(flat_bodies)
    description = util.self_desc(self)
    logger.info(description)
def multi_act_with_epsilon_greedy(nanflat_body_a, state_a, net, nanflat_epsilon_a):
    '''
    Multi-body nanflat_action_a on a single-pass from net. Uses epsilon-greedy but in a batch manner.

    The states are flattened with util.nanflatten and concatenated into one network input.
    Each body is assigned a contiguous slice of the network output of length body.action_dim,
    in the order of nanflat_body_a.

    @param nanflat_body_a: flat sequence of bodies; each must expose `aeb` and `action_dim`
    @param state_a: per-body states, flattened before being concatenated
    @param net: network exposing `wrap_eval(torch_state)`
    @param nanflat_epsilon_a: per-body exploration thresholds, paired with nanflat_body_a by position
    @returns list holding one action per body
    '''
    nanflat_state_a = util.nanflatten(state_a)
    cat_state_a = np.concatenate(nanflat_state_a)
    nanflat_action_a = []
    start_idx = 0
    for body, e in zip(nanflat_body_a, nanflat_epsilon_a):
        logger.debug2(f'body: {body.aeb}, epsilon: {e}')
        end_idx = start_idx + body.action_dim
        if e > np.random.rand():
            # explore: uniform random action for this body
            logger.debug2('Random action')
            action = np.random.randint(body.action_dim)
        else:
            # exploit: argmax over this body's slice of the network output
            logger.debug2('Greedy action')
            cat_state_a = cat_state_a.astype('float')
            torch_state = Variable(torch.from_numpy(cat_state_a).float())
            out = net.wrap_eval(torch_state)
            action = int(torch.max(out[start_idx:end_idx], dim=0)[1][0])
        nanflat_action_a.append(action)
        # log BEFORE advancing start_idx so the message shows the slice actually used
        logger.debug2(f''' body: {body.aeb}, net idx: {start_idx}-{end_idx} action: {action}''')
        start_idx = end_idx
    return nanflat_action_a
def set_body_e(self, body_e):
    '''
    Method called by body_space.init_body_space to complete the necessary
    backward reference needed for EnvSpace to work.
    Stores body_e, its flattened form, each body's flat index and the body count.
    '''
    self.body_e = body_e
    flat_bodies = util.nanflatten(self.body_e)
    self.nanflat_body_e = flat_bodies
    for position, member in enumerate(flat_bodies):
        # tag the body with its index in the flattened list
        member.nanflat_e_idx = position
    self.body_num = len(flat_bodies)
def multi_head_act_with_epsilon_greedy(nanflat_body_a, state_a, net, nanflat_epsilon_a, gpu):
    '''
    Multi-headed body nanflat_action_a on a single-pass from net. Uses epsilon-greedy but in a batch manner.

    Every flattened state becomes its own float tensor with a leading dimension of size 1;
    the list of tensors is handed to net.wrap_eval in one call, and the returned outputs
    are paired with the bodies by position.

    @param nanflat_body_a: flat sequence of bodies; each must expose `aeb` and `action_dim`
    @param state_a: per-body states, flattened with util.nanflatten
    @param net: network exposing `wrap_eval(list_of_tensors)`; assumed to return one output per body (TODO confirm)
    @param nanflat_epsilon_a: per-body exploration thresholds, paired with nanflat_body_a by position
    @param gpu: truthy to move the inputs to CUDA when CUDA is available
    @returns list holding one int action per body
    '''
    nanflat_state_a = util.nanflatten(state_a)
    use_cuda = torch.cuda.is_available() and gpu
    torch_states = []
    for state in nanflat_state_a:
        torch_state = torch.from_numpy(state.astype('float')).float().unsqueeze_(dim=0)
        if use_cuda:
            # .cuda() returns a new tensor, so the result must be kept;
            # rebinding a loop variable over the list would silently discard it
            torch_state = torch_state.cuda()
        torch_states.append(Variable(torch_state))
    outs = net.wrap_eval(torch_states)
    nanflat_action_a = []
    for body, e, output in zip(nanflat_body_a, nanflat_epsilon_a, outs):
        logger.debug2(f'body: {body.aeb}, epsilon: {e}')
        if e > np.random.rand():
            # explore: uniform random action for this body
            logger.debug2('Random action')
            action = np.random.randint(body.action_dim)
        else:
            # exploit: argmax along dim 1 of this head's output;
            # cast to int so both branches yield the same action type
            logger.debug2('Greedy action')
            action = int(torch.max(output, dim=1)[1][0])
        nanflat_action_a.append(action)
        logger.debug2(f'epsilon: {e}, outputs: {output}, action: {action}')
    return nanflat_action_a
def multi_head_act_with_boltzmann(nanflat_body_a, state_a, net, nanflat_tau_a, gpu):
    '''
    Multi-headed body nanflat_action_a on a single-pass from net, sampled with a Boltzmann (softmax) policy.

    Every flattened state becomes its own float tensor with a leading dimension of size 1.
    Each network output is divided by its paired tau, passed through softmax along dim 1,
    and an action is sampled from the resulting probabilities.

    @param nanflat_body_a: flat sequence of bodies; each must expose `aeb` and `action_dim`
    @param state_a: per-body states, flattened with util.nanflatten
    @param net: network exposing `wrap_eval(list_of_tensors)`; assumed to return one output per body (TODO confirm)
    @param nanflat_tau_a: per-body temperatures, paired with the outputs by position
    @param gpu: truthy to move the inputs to CUDA when CUDA is available
    @returns list holding one sampled action per body
    '''
    nanflat_state_a = util.nanflatten(state_a)
    use_cuda = torch.cuda.is_available() and gpu
    torch_states = []
    for state in nanflat_state_a:
        torch_state = torch.from_numpy(state.astype('float')).float().unsqueeze_(dim=0)
        if use_cuda:
            # .cuda() returns a new tensor, so the result must be kept;
            # rebinding a loop variable over the list would silently discard it
            torch_state = torch_state.cuda()
        torch_states.append(Variable(torch_state))
    outs = net.wrap_eval(torch_states)
    out_with_temp = [torch.div(x, t) for x, t in zip(outs, nanflat_tau_a)]
    logger.debug2(f'taus: {nanflat_tau_a}, outs: {outs}, out_with_temp: {out_with_temp}')
    nanflat_action_a = []
    for body, output in zip(nanflat_body_a, out_with_temp):
        # softmax is taken on CPU so the probabilities can be handed to numpy
        probs = F.softmax(Variable(output.cpu()), dim=1).data.numpy()[0]
        # an int first argument samples from np.arange(body.action_dim)
        action = np.random.choice(body.action_dim, p=probs)
        logger.debug3(f''' body: {body.aeb}, output: {output}, probs: {probs}, action: {action}''')
        nanflat_action_a.append(action)
    return nanflat_action_a
def multi_act_with_boltzmann(nanflat_body_a, state_a, net, nanflat_tau_a, gpu):
    '''
    Multi-body nanflat_action_a on a single-pass from net, sampled with a Boltzmann (softmax) policy.
    All flattened states are concatenated into one network input; each body reads a contiguous
    slice of the output of length body.action_dim, divides it by its tau, and samples an action
    from the softmax of that slice.
    '''
    flat_states = util.nanflatten(state_a)
    joined_states = np.concatenate(flat_states).astype(float)
    net_input = torch.from_numpy(joined_states).float()
    if torch.cuda.is_available() and gpu:
        net_input = net_input.cuda()
    net_input = Variable(net_input)
    out = net.wrap_eval(net_input)

    logger.debug2(f'taus: {nanflat_tau_a}')
    nanflat_action_a = []
    start_idx = 0
    for body, tau in zip(nanflat_body_a, nanflat_tau_a):
        end_idx = start_idx + body.action_dim
        body_slice = out[start_idx:end_idx]
        out_with_temp = torch.div(body_slice, tau)
        logger.debug3(f''' tau: {tau}, out: {out}, out select: {out[start_idx: end_idx]}, out with temp: {out_with_temp}''')
        # softmax on CPU so the result can be converted to a numpy array
        cpu_scores = Variable(out_with_temp.cpu())
        probs = F.softmax(cpu_scores, dim=0).data.numpy()
        candidates = list(range(body.action_dim))
        action = np.random.choice(candidates, p=probs)
        logger.debug3(f''' body: {body.aeb}, net idx: {start_idx}-{end_idx} probs: {probs}, action: {action}''')
        nanflat_action_a.append(action)
        # move on to the next body's slice
        start_idx = end_idx
    return nanflat_action_a
def step(self, action_e):
    '''
    Advance the underlying env by one step with the flattened action_e.
    If the env is already done, reset instead and return the result of self.reset().
    Otherwise returns (reward_e, state_e, done_e) filled per (a, b) index of self.body_e,
    and updates self.done.
    '''
    # TODO implement clock_speed: step only if self.clock.to_step()
    if self.done:
        return self.reset()

    flat_actions = util.nanflatten(action_e)
    env_info_dict = self.u_env.step(flat_actions)
    reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), _ in util.ndenumerate_nonan(self.body_e):
        info = self.get_env_info(env_info_dict, a)
        key = (a, b)
        reward_e[key] = info.rewards[b]
        state_e[key] = info.states[b]
        done_e[key] = info.local_done[b]

    # done when every body is done, or when the clock has passed the timestep limit
    every_body_done = util.nonan_all(done_e)
    self.done = every_body_done or self.clock.get('t') > self.max_timestep
    return reward_e, state_e, done_e
def space_step(self, action_e):
    '''
    Advance the underlying env by one step with the flattened action_e.
    If the env is already done, reset instead and return the result of self.space_reset().
    Otherwise returns (reward_e, state_e, done_e) filled per (a, b) index of self.body_e,
    with rewards multiplied by self.reward_scale, and updates self.done.
    '''
    # TODO implement clock_speed: step only if self.clock.to_step()
    if self.done:
        return self.space_reset()

    flat_actions = util.nanflatten(action_e)
    env_info_dict = self.u_env.step(flat_actions)
    reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for (a, b), _ in util.ndenumerate_nonan(self.body_e):
        info = self._get_env_info(env_info_dict, a)
        key = (a, b)
        reward_e[key] = info.rewards[b] * self.reward_scale
        state_e[key] = info.states[b]
        done_e[key] = info.local_done[b]

    # done when every body is done, or when the clock has passed max_t
    every_body_done = util.nonan_all(done_e)
    self.done = every_body_done or self.clock.t > self.max_t
    logger.debug(f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}')
    return reward_e, state_e, done_e
def space_init(self, agent_space, body_a, global_nets):
    '''
    Post init override for space env. Note that aeb is already correct from __init__.
    Wires every body to this agent and gives it a memory, builds the algorithm,
    then copies onto each body any default-body attribute that the body is missing.
    '''
    self.agent_space = agent_space
    self.body_a = body_a
    self.aeb_space = agent_space.aeb_space
    self.nanflat_body_a = util.nanflatten(self.body_a)

    for flat_idx, member in enumerate(self.nanflat_body_a):
        if flat_idx == 0:
            # NOTE set default body
            self.body = member
        member.agent = self
        member.nanflat_a_idx = flat_idx
        memory_name = ps.get(self.agent_spec, 'memory.name')
        memory_cls = getattr(memory, memory_name)
        member.memory = memory_cls(self.agent_spec['memory'], member)
    self.body_num = len(self.nanflat_body_a)

    algorithm_name = ps.get(self.agent_spec, 'algorithm.name')
    algorithm_cls = getattr(algorithm, algorithm_name)
    self.algorithm = algorithm_cls(self, global_nets)

    # after algo init, transfer any missing variables from default body
    for member in self.nanflat_body_a:
        for attr_name, default_value in vars(self.body).items():
            current_value = getattr(member, attr_name, None)
            if util.gen_isnan(current_value):
                setattr(member, attr_name, default_value)
def test_nanflatten(arr, res):
    '''Assert that util.nanflatten applied to arr (as a numpy array) equals res (as a numpy array).'''
    given = np.array(arr)
    expected = np.array(res)
    flattened = util.nanflatten(given)
    assert np.array_equal(flattened, expected)