def collect_trajectories(self, tmax, nrand=5):
    # number of parallel instances
    n = len(self.envs.ps)

    # initialize returning lists and start the game!
    state_list = []
    reward_list = []
    prob_list = []
    action_list = []

    self.envs.reset()

    # start all parallel agents
    self.envs.step([1] * n)

    # perform nrand random steps
    for _ in range(nrand):
        fr1, re1, _, _ = self.envs.step(
            np.random.choice([pong_utils.RIGHT, pong_utils.LEFT], n))
        fr2, re2, _, _ = self.envs.step([0] * n)

    for t in range(tmax):
        # prepare the input:
        # preprocess_batch converts the two frames into shape (n, 2, 80, 80),
        # the proper input for the PyTorch CNN policy
        batch_input = pong_utils.preprocess_batch([fr1, fr2])

        # probs will only be used as pi_old;
        # no gradient propagation is needed, so we detach
        probs_tensor = self.policy(batch_input).detach()
        m = Categorical(probs_tensor)
        action = m.sample().unsqueeze(1)
        probs = torch.gather(probs_tensor, 1, action)
        action = action.cpu().numpy()

        # advance the game (0 = no action):
        # take one real action, then skip one frame forward
        fr1, re1, is_done, _ = self.envs.step(action + 4)
        fr2, re2, is_done, _ = self.envs.step([0] * n)
        reward = re1 + re2

        # store the result
        state_list.append(batch_input)
        reward_list.append(reward)
        prob_list.append(probs)
        action_list.append(action)

        # stop if any of the trajectories is done;
        # we want all the lists to stay rectangular
        if is_done.any():
            break

    # return pi_theta, states, actions, rewards, probability
    return prob_list, state_list, action_list, reward_list
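Both versions of collect_trajectories lean on pong_utils.preprocess_batch to turn the two raw Atari frames into the (n, 2, 80, 80) tensor the policy expects. Below is a minimal sketch of that kind of preprocessing, assuming the usual Pong recipe (crop the playfield, downsample by two, subtract the background colour, stack the two frames); the function name, crop bounds and background value are illustrative assumptions, not taken from pong_utils itself.

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def preprocess_batch_sketch(images, bkg_color=np.array([144, 72, 17])):
    """Illustrative two-frame preprocessing: crop, downsample, de-colour, stack.

    `images` is a list of two frame batches, each of shape (n, 210, 160, 3).
    Returns a float tensor of shape (n, 2, 80, 80). Crop bounds and the
    background colour are assumed values for Atari Pong.
    """
    frames = np.asarray(images)                      # (2, n, 210, 160, 3)
    # crop the scoreboard/border and downsample by a factor of 2
    frames = frames[:, :, 34:-16:2, ::2]             # (2, n, 80, 80, 3)
    # remove the background colour and collapse the colour channels
    frames = np.mean(frames - bkg_color, axis=-1) / 255.0
    # put the batch dimension first: (n, 2, 80, 80)
    batch = np.swapaxes(frames, 0, 1)
    return torch.from_numpy(batch).float().to(device)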
def act(self, policy):
    batch_input = pong_utils.preprocess_batch([self.f1, self.f2])
    probs = policy(batch_input)
    dist = torch.distributions.Categorical(probs=probs)
    # sample() must be called; referencing dist.sample without parentheses
    # would return the bound method instead of an action
    action = dist.sample().item()
    if action == 0:
        moving = [0, -5]
    elif action == 1:
        moving = [4, -3]
    else:
        moving = [-4, -3]
    return moving
def test_env(vis=False):
    f1 = env.reset()
    f2, _, _, _ = env.step(0)
    if vis:
        env.render()
    done = False
    total_reward = 0
    while not done:
        state = preprocess_batch([f1, f2])
        pi, _ = model(state)
        dist = Categorical(pi)
        # sample one action and repeat it for both frames
        # (the original sampled twice, giving two independent actions)
        action = dist.sample().cpu().numpy()[0]
        f1, r1, done, _ = env.step(action)
        f2, r2, done, _ = env.step(action)
        reward = r1 + r2
        if vis:
            env.render()
        total_reward += reward
    return total_reward
def collect_trajectories(envs, agent, tmax=200, nrand=5):
    '''Collect trajectories of multiple agents of a parallelized environment.'''
    n = len(envs.ps)

    state_list = []
    reward_list = []
    prob_list = []
    action_list = []

    envs.reset()
    envs.step([1] * n)  # start all parallel agents

    # perform nrand random steps
    for _ in range(nrand):
        fr1, re1, _, _ = envs.step(np.random.choice([RIGHT, LEFT], n))
        fr2, re2, _, _ = envs.step([0] * n)

    for t in range(tmax):
        batch_input = preprocess_batch([fr1, fr2])

        # the policy outputs P(RIGHT); sample RIGHT with that probability,
        # LEFT otherwise, and keep the probability of the action actually taken
        probs = agent(batch_input).squeeze().cpu().detach().numpy()
        action = np.where(np.random.rand(n) < probs, RIGHT, LEFT)
        probs = np.where(action == RIGHT, probs, 1.0 - probs)

        # advance the game: one real action, then one no-op frame
        fr1, re1, is_done, _ = envs.step(action)
        fr2, re2, is_done, _ = envs.step([0] * n)
        reward = re1 + re2

        # collect the result
        state_list.append(batch_input)
        reward_list.append(reward)
        prob_list.append(probs)
        action_list.append(action)

        # stop if any of the trajectories is done
        if is_done.any():
            break

    return prob_list, state_list, action_list, reward_list
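The old probabilities returned by collect_trajectories only become useful once they are fed into a surrogate objective. Here is a hedged sketch of a PPO-style clipped surrogate that consumes the four returned lists; it assumes the binary RIGHT/LEFT convention of the function above (policy outputs P(RIGHT), RIGHT is the pong_utils constant), and the function name, discounting scheme, normalisation and hyper-parameters are illustrative, not the author's own loss.

import numpy as np
import torch

def clipped_surrogate_sketch(policy, old_probs, states, actions, rewards,
                             discount=0.995, epsilon=0.1, beta=0.01):
    """Illustrative PPO clipped surrogate for the binary RIGHT/LEFT policy."""
    # future discounted rewards, normalised across the parallel envs
    rewards = np.asarray(rewards, dtype=np.float64)            # (T, n)
    discounts = discount ** np.arange(len(rewards))
    discounted = rewards * discounts[:, np.newaxis]
    future = discounted[::-1].cumsum(axis=0)[::-1]
    mean = future.mean(axis=1, keepdims=True)
    std = future.std(axis=1, keepdims=True) + 1.0e-10
    advantages = torch.tensor((future - mean) / std, dtype=torch.float)

    old_probs = torch.tensor(np.asarray(old_probs), dtype=torch.float)
    actions = torch.tensor(np.asarray(actions), dtype=torch.long)

    # probability of the stored action under the current policy
    new_probs = torch.stack([policy(s).squeeze() for s in states])   # P(RIGHT)
    new_probs = torch.where(actions == RIGHT, new_probs, 1.0 - new_probs)

    # clipped probability ratio times the advantage
    ratio = new_probs / old_probs
    clipped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    surrogate = torch.min(ratio * advantages, clipped * advantages)

    # entropy-style bonus to keep exploration alive
    entropy = -(new_probs * torch.log(old_probs + 1.0e-10)
                + (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.0e-10))

    return torch.mean(surrogate + beta * entropy)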
def test_env(vis=False):
    f1 = env.reset()
    f2, _, _, _ = env.step(env.action_space.sample())
    if vis:
        env.render()
    done = False
    total_reward = 0
    while not done:
        state = preprocess_batch([f1, f2])
        pi, _ = model(state)
        action = Categorical(pi).sample().cpu().numpy()[0]
        f1, r1, done, _ = env.step(action)
        f2, r2, done, _ = env.step(action)
        reward = r1 + r2
        if vis:
            env.render()
        total_reward += reward
    print(total_reward)
    return total_reward
early_stop = False

while not early_stop:
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []
    next_states = []

    # collect experience; the GPU is used here. The state is not a single
    # vector but a batch of 16 (one per parallel env), so the GPU is worth
    # using, and DataParallel might also be possible.
    for _ in range(num_steps):
        state = preprocess_batch(state)
        dist, value = model(state)

        action = dist.sample()
        # note: the rollout does not stop when an env reports done;
        # episode boundaries are tracked via the masks list instead
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)
        log_probs.append(log_prob)
        values.append(value)
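The rollout above accumulates values, rewards and masks but is cut off before the returns are computed. As a hedged illustration of what such an actor-critic rollout is typically followed by, here is a generalized-advantage-estimation (GAE) style return computation; the function name, gamma/tau defaults, and the assumption that rewards, masks and values have been converted to tensors of shape (num_envs, 1) are illustrative, not taken from the source.

import torch

def compute_gae_sketch(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    """Illustrative GAE(lambda) returns for a rollout of tensors shaped (n, 1)."""
    values = values + [next_value]      # bootstrap with the value of the last state
    gae = torch.zeros_like(next_value)
    returns = []
    for step in reversed(range(len(rewards))):
        # one-step TD error, zeroed across episode boundaries by the mask
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns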