def evaluate(self, state: np.ndarray, hidden: np.ndarray = None, done=None):
    """Run a gradient-free forward pass and hand the results back via numpy helpers.

    Args:
        state: observation(s), converted with ``totorch`` onto ``self.device``.
        hidden: recurrent state; when not ``None`` it is unpacked and converted
            with ``totorch_many``, otherwise ``None`` is forwarded unchanged.
        done: passed straight through to ``self.forward``.

    Returns:
        ``(policy, value, hidden)``: policy and value converted with
        ``tonumpy``, and the hidden state returned by ``self.forward``
        unpacked into ``tonumpy_many``.
    """
    state_t = totorch(state, self.device)
    if hidden is None:
        hidden_t = None
    else:
        hidden_t = totorch_many(*hidden, device=self.device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        policy, value, next_hidden = self.forward(state_t, hidden_t, done)

    return tonumpy(policy), tonumpy(value), tonumpy_many(*next_hidden)
def eval_state(self, state, loc):
    """Evaluate ``self.model`` on ``state`` at the given locations, without gradients.

    Args:
        state: input converted with ``totorch`` onto ``self.device``.
        loc: iterable of 2-element entries; it is transposed with ``zip`` into
            one sequence of first components and one of second components,
            each turned into a tensor on ``self.device``.

    Returns:
        The model output converted with ``tonumpy``.
    """
    with torch.no_grad():
        xs, ys = zip(*loc)
        x_idx = torch.tensor(xs).to(self.device)
        y_idx = torch.tensor(ys).to(self.device)
        q_values = self.model(totorch(state, self.device), x_idx, y_idx)
    return tonumpy(q_values)
def train(global_model, model, env, nsteps, num_episodes, ID, *,
          lr=1e-3, gamma=0.9, lambda_=0.95):
    """Run a worker loop: collect short rollouts from ``env`` and update after each.

    Actions are sampled from the policy returned by ``model``. A rollout ends
    after ``nsteps`` steps or as soon as the environment reports ``done``.
    Lambda-returns are then computed from the rollout and handed to
    ``update_params`` together with ``model``, ``global_model`` and an RMSprop
    optimiser built over ``global_model.parameters()``. One summary line is
    printed at the end of every episode.

    Args:
        global_model: model whose parameters the optimiser is built over.
        model: callable returning ``(policy, value)`` for a batched state;
            presumably the worker-local copy -- confirm against the caller.
        env: environment exposing ``reset()`` and a ``step(action)`` that
            returns ``(next_state, reward, done, info)``.
        nsteps: maximum number of environment steps per rollout; must be >= 1.
        num_episodes: number of finished episodes after which to return.
        ID: worker identifier, only used in the log line.
        lr: learning rate for the RMSprop optimiser.
        gamma: ``gamma`` value forwarded to ``lambda_return``.
        lambda_: ``lambda_`` value forwarded to ``lambda_return``.

    Raises:
        ValueError: if ``nsteps`` is smaller than 1. No step would ever be
            taken, so no episode could finish and the loop would never end.
    """
    if nsteps < 1:
        raise ValueError(f'nsteps must be at least 1, got {nsteps}')

    opt = torch.optim.RMSprop(global_model.parameters(), lr=lr)
    episode = 0
    episode_steps = 0
    episode_score = 0
    T = 0  # total environment steps taken by this worker
    state = env.reset()
    start = time.time()

    while episode < num_episodes:
        rollout = []
        for t in range(nsteps):
            # state[None] adds a leading axis so the model sees a batch of one.
            with torch.no_grad():
                policy, value = model(totorch(state[None], device='cpu'))
            policy, value = tonumpy(policy), tonumpy(value)

            # Sample an action index from the single row of the policy output.
            action = np.random.choice(policy.shape[1], p=policy[0])
            next_state, reward, done, _ = env.step(action)
            episode_score += reward
            rollout.append((state, action, reward, value, done))
            state = next_state
            T += 1
            episode_steps += 1

            if done or t == nsteps - 1:
                states, actions, rewards, values, dones = stack_many(*zip(*rollout))

                # Value of the state following the rollout, passed to
                # lambda_return as ``last_values``.
                with torch.no_grad():
                    _, last_values = model.forward(
                        totorch(next_state[None], device='cpu'))
                last_values = last_values.cpu().numpy()

                R = lambda_return(rewards, values, last_values, dones,
                                  gamma=gamma, lambda_=lambda_, clip=False)
                update_params(model, global_model, opt, states, actions, R)

                if done:
                    episode += 1
                    state = env.reset()
                    time_taken = time.time() - start
                    # A coarse clock can report zero elapsed time; avoid
                    # dividing by zero in the log line.
                    fps = episode_steps / time_taken if time_taken > 0 else float('inf')
                    print(f'worker {ID}, total worker steps {T:,} local episode {episode}, episode score {episode_score} episode steps {episode_steps}, time taken {time_taken:,.1f}s, fps {fps:.2f}')
                    episode_steps = 0
                    episode_score = 0
                    start = time.time()
                # Start a fresh rollout (a no-op on the final step of the loop).
                break
def intrinsic_reward(self, next_state: np.ndarray, state_mean: np.ndarray, state_std):
    """Compute the intrinsic reward for ``next_state`` without tracking gradients.

    The three inputs are converted together with ``totorch_many`` onto
    ``self.device`` and passed, in the same order, to ``self._intr_reward``.

    Returns:
        The result of ``self._intr_reward`` converted with ``tonumpy``.
    """
    converted = totorch_many(next_state, state_mean, state_std,
                             device=self.device)
    next_state_t, mean_t, std_t = converted

    with torch.no_grad():
        reward = self._intr_reward(next_state_t, mean_t, std_t)

    return tonumpy(reward)
def get_pixel_control(self, state:np.ndarray):
    """Return the auxiliary Q output for ``state``, computed without gradients.

    ``state`` is converted with ``totorch`` onto ``self.device``, encoded by
    ``self.policy.model`` and then fed to ``self.Qaux``; the result is
    converted with ``tonumpy``.
    """
    with torch.no_grad():
        state_t = totorch(state, self.device)
        encoded = self.policy.model(state_t)
        q_aux = self.Qaux(encoded)
    return tonumpy(q_aux)
def evaluate(self, state):
    """Run ``self.forward`` on ``state`` without gradients.

    Returns:
        ``(policy, value_extr, value_intr)`` -- the three outputs of
        ``self.forward``, each converted with ``tonumpy``.
    """
    with torch.no_grad():
        state_t = totorch(state, self.device)
        policy, extrinsic, intrinsic = self.forward(state_t)
    return tonumpy(policy), tonumpy(extrinsic), tonumpy(intrinsic)
def evaluate(self, state: np.ndarray):
    """Query the separate policy and value networks without gradients.

    ``state`` is converted once per network, each time onto that network's
    own ``device``. Only the first of the two outputs of
    ``self.policy.forward`` is used.

    Returns:
        ``(policy, value)``, each converted with ``tonumpy``.
    """
    with torch.no_grad():
        policy_input = totorch(state, self.policy.device)
        policy, _unused = self.policy.forward(policy_input)

        value_input = totorch(state, self.value.device)
        value = self.value.forward(value_input)

    return tonumpy(policy), tonumpy(value)
def get_value(self, state: np.ndarray):
    """Return the output of ``self.value.forward`` for ``state`` via ``tonumpy``.

    The state is converted with ``totorch`` onto ``self.value.device`` and the
    forward pass runs with gradients disabled.
    """
    with torch.no_grad():
        state_t = totorch(state, self.value.device)
        estimate = self.value.forward(state_t)
    return tonumpy(estimate)
def get_policy(self, state: np.ndarray):
    """Return both outputs of ``self.policy.forward`` for ``state``.

    The state is converted with ``totorch`` onto ``self.policy.device`` and
    the forward pass runs with gradients disabled.

    Returns:
        ``(policy, Adv)``, each converted with ``tonumpy``.
    """
    with torch.no_grad():
        state_t = totorch(state, self.policy.device)
        policy, advantage = self.policy.forward(state_t)
    return tonumpy(policy), tonumpy(advantage)
def evaluate(self, state: np.ndarray):
    """Run ``self.forward`` on ``state`` without gradients.

    Returns:
        ``(policy, value)`` -- the two outputs of ``self.forward``, each
        converted with ``tonumpy``.
    """
    state_t = totorch(state, self.device)
    with torch.no_grad():
        outputs = self.forward(state_t)
    policy, value = outputs
    return tonumpy(policy), tonumpy(value)
def get_pixel_control(self, state:np.ndarray, action_reward, hidden):
    """Return the auxiliary Q output computed from the policy's LSTM features.

    Args:
        state: converted with ``totorch`` onto ``self.device``.
        action_reward: converted with ``totorch`` onto ``self.device``.
        hidden: recurrent state; it is unpacked into ``totorch_many``, so it
            must be iterable (``None`` is not accepted here).

    Returns:
        ``self.Qaux`` applied to the first output of
        ``self.policy.lstm_forward`` (called with ``done=None``), converted
        with ``tonumpy``.
    """
    state_t = totorch(state, self.device)
    action_reward_t = totorch(action_reward, self.device)
    hidden_t = totorch_many(*hidden, device=self.device)

    with torch.no_grad():
        features, _unused = self.policy.lstm_forward(
            state_t, action_reward_t, hidden_t, done=None)
        q_aux = self.Qaux(features)

    return tonumpy(q_aux)