# init game done = False state = env.reset() total_reward = 0 step = 1 #while not done: while True: current_state = torch.FloatTensor(state).unsqueeze(0) if USE_CUDA: current_state = current_state.cuda() action = actor_critic.act(current_state) next_state, reward, done, _ = env.step(action.data[0,0]) total_reward += reward state = next_state _, value = actor_critic(current_state) value = value.data.cpu().numpy() image = torch.FloatTensor(state).permute(1,2,0).cpu().numpy() displayImage(image, step, total_reward, value) step += 1 if done: state = env.reset() step = 1 total_reward = 0
# --- Two-agent (self-play) training rollout over vectorized envs ---
# Both agents act on the same shared `state` batch; their per-env actions
# are paired up and stepped through the vectorized `envs`.
# NOTE(review): this chunk is cut off mid-loop (after "# separate rewards");
# the remainder of the inner step-loop body lives outside this view.
episode_rewards2 = torch.zeros(num_envs, 1)
final_rewards2 = torch.zeros(num_envs, 1)
timer.update(time.time())
swich_variable = 0  # NOTE(review): likely a typo for "switch_variable"; may be referenced elsewhere, so left as-is
##### training observation ######
traiobsenv = makeTrainingObservation()
trainobs = traiobsenv.reset()
#################################
for i_update in range(num_frames):
    for step in range(num_steps):
        # actor1 acts in all parallel envs
        action_p1 = agent1.act(make_cuda(state)).squeeze(1).cpu().numpy()
        # actor2 acts in all parallel envs
        action_p2 = agent2.act(make_cuda(state)).squeeze(1).cpu().numpy()
        # separate actions: build one [player1, player2] action pair per env
        action_tuples = []
        for i in range(num_envs):
            actions = []
            actions.append(action_p1[i]) # player1
            actions.append(action_p2[i]) # player2
            action_tuples.append(actions)
        next_observation, reward, finished, _ = envs.step(action_tuples) # pass actions to environments
        # separate rewards
# --- Single-agent A2C-style training rollout over vectorized envs ---
# NOTE(review): this chunk ends inside the step loop; rollout bookkeeping
# (e.g. rollout.insert) presumably follows outside this view.
state = envs.reset()
state = torch.FloatTensor(np.float32(state))
rollout.states[0].copy_(state)
episode_rewards = torch.zeros(num_envs, 1)  # running per-env episode return
final_rewards = torch.zeros(num_envs, 1)    # return of the last completed episode per env
timer.update(time.time())
for i_update in range(num_frames):
    for step in range(num_steps):
        action = actor_critic.act(make_cuda(state))
        next_state, reward, finished, _ = envs.step(action.squeeze(1).cpu().data.numpy())
        reward = torch.FloatTensor(reward).unsqueeze(1)
        episode_rewards += reward
        # finished_masks is 0.0 where an episode just ended, 1.0 otherwise.
        finished_masks = torch.FloatTensor(1-np.array(finished)).unsqueeze(1)
        # For finished envs: move the accumulated return into final_rewards
        # and zero the accumulator; unfinished envs keep accumulating.
        final_rewards *= finished_masks
        final_rewards += (1-finished_masks) * episode_rewards
        episode_rewards *= finished_masks
        finished_masks = make_cuda(finished_masks)
        state = torch.FloatTensor(np.float32(next_state))
# --- A2C training rollout, explicit-CUDA variant ---
# Same mask/reward bookkeeping as the sibling loop, but moves tensors to
# the GPU with explicit .cuda() calls guarded by USE_CUDA.
# NOTE(review): inside the loop `state` is rebuilt as a CPU tensor but is
# passed straight to actor_critic.act(state) on the next iteration without
# being moved back to the GPU — either the model is on CPU here, or the
# .cuda() transfer happens past the end of this (truncated) chunk; confirm.
state = torch.FloatTensor(np.float32(state))
if USE_CUDA:
    state = state.cuda()
rollout.states[0].copy_(state)
episode_rewards = torch.zeros(num_envs, 1)
final_rewards = torch.zeros(num_envs, 1)
timer.update(time.time())
for i_update in range(num_frames):
    for step in range(num_steps):
        action = actor_critic.act(state)
        next_state, reward, finished, _ = envs.step(action.squeeze(1).cpu().data.numpy())
        reward = torch.FloatTensor(reward).unsqueeze(1)
        episode_rewards += reward
        # finished_masks is 0.0 where an episode just ended, 1.0 otherwise.
        finished_masks = torch.FloatTensor(1-np.array(finished)).unsqueeze(1)
        # Bank finished episodes' returns into final_rewards, reset accumulators.
        final_rewards *= finished_masks
        final_rewards += (1-finished_masks) * episode_rewards
        episode_rewards *= finished_masks
        state = torch.FloatTensor(np.float32(next_state))
        if USE_CUDA:
            finished_masks = finished_masks.cuda()
# --- Rollout storage setup + main training loop (with progress bar) ---
# NOTE(review): this chunk ends inside the step loop; the remainder of the
# per-step bookkeeping lives outside this view.
rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
if USE_CUDA:
    actor_critic = actor_critic.cuda()
    rollout.cuda()
rollout.states[0].copy_(state)
episode_rewards = torch.zeros(num_envs, 1)  # running per-env episode return
final_rewards = torch.zeros(num_envs, 1)    # return of the last completed episode per env
writer = new_writer(LABEL, arg)
for i_update in tqdm(range(num_frames)):
    for step in range(num_steps):
        # NOTE(review): .cuda() is called unconditionally here even though
        # other transfers are guarded by USE_CUDA — this would fail on a
        # CPU-only machine; confirm intent.
        action = actor_critic.act(state.cuda())
        next_state, reward, done, _ = envs.step(
            action.squeeze(1).cpu().data.numpy())
        # Reshape/scale raw env rewards according to the current game mode.
        reward = process_reward(reward, MODE_REWARDS[mode])
        reward = torch.FloatTensor(reward).unsqueeze(1)
        episode_rewards += reward
        # masks is 0.0 where an episode just ended, 1.0 otherwise.
        masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
        # Bank finished episodes' returns into final_rewards, reset accumulators.
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks
        if USE_CUDA:
            masks = masks.cuda()
# --- Evaluation / visualization of a pretrained agent ---
# Loads weights from `agentPath` (always via CPU map_location so a
# GPU-trained checkpoint loads on any machine), then runs the policy
# forever, rendering each upscaled frame with the step count, episode
# return and the critic's value estimate.
# Assumes `env`, `actor_critic`, `make_cuda`, `upscale`, `displayImage`
# and `agentPath` are defined earlier in the file.
pretrained_dict = torch.load(agentPath, map_location='cpu')
actor_critic.load_state_dict(pretrained_dict)
actor_critic = make_cuda(actor_critic)

# init game
done = False
state = env.reset()
step = 1
total_reward = 0
while True:
    current_state = torch.FloatTensor(state)
    action = actor_critic.act(make_cuda(current_state.unsqueeze(0)))
    # `action` is presumably a 1x1 tensor; [0][0] extracts the scalar
    # action id expected by env.step() — TODO confirm against act().
    next_state, reward, done, _ = env.step(action.data[0][0])
    total_reward += reward
    state = next_state
    # Extra forward pass solely to obtain the critic's value for display.
    _, value = actor_critic(make_cuda(current_state.unsqueeze(0)))
    value = value.data.cpu().numpy()
    # Upscale the observation and convert CHW -> HWC for rendering.
    image = torch.FloatTensor(upscale(state)).permute(1,2,0).cpu().numpy()
    displayImage(image, step, total_reward, value)
    step += 1
    if done:
        # Episode finished: reset env and per-episode counters.
        # BUGFIX: also reset the step counter, matching the sibling
        # visualization loop earlier in the file — previously `step`
        # kept growing across episodes, so the displayed step count
        # was wrong from the second episode onward.
        total_reward = 0
        step = 1
        state = env.reset()