def displayImage(image, step, reward, value):
    # redraw the live frame with step/reward/value in the title
    clear_output(True)
    title = "step " + str(step) + " reward: " + str(reward) + " Value: " + str(value[0][0])
    plt.title(title)
    im.set_data(image)
    fig.canvas.draw()
    plt.pause(0.1)

# init environment
env = MiniPacman(mode=mode, frame_cap=1000)

# load model
agentPath = "actor_critic_pacman_" + mode
actor_critic = ActorCritic(env.observation_space.shape, env.action_space.n)
pretrained_dict = torch.load(agentPath)
actor_critic.load_state_dict(pretrained_dict)
if USE_CUDA:
    actor_critic = actor_critic.cuda()

# init game
done = False
state = env.reset()
total_reward = 0
step = 1

# figure/image handles that displayImage updates in place (added so the
# function has something to draw on; the transpose assumes channel-first frames)
fig = plt.figure()
im = plt.imshow(state.transpose(1, 2, 0))

#while not done:
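# A minimal sketch of the play loop the commented line above begins. The
# (logit, value) forward signature and the greedy action choice are
# assumptions for illustration, not taken from the source:
while not done:
    current_state = torch.FloatTensor(state).unsqueeze(0)
    if USE_CUDA:
        current_state = current_state.cuda()
    logit, value = actor_critic(current_state)
    action = logit.max(1)[1].item()  # greedy action
    state, reward, done, _ = env.step(action)
    total_reward += reward
    displayImage(state.transpose(1, 2, 0), step, total_reward, value)
    step += 1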
state = envs.reset()
state = FloatTensor(state)

state_shape = envs.observation_space.shape
num_actions = envs.action_space.n
num_rewards = len(MODE_REWARDS[arg.mode])

writer = new_writer(LABEL, arg)

# environment model used for the imagined rollouts
env_model = EnvModel(state_shape, num_pixels, num_rewards=num_rewards)
if USE_CUDA:
    env_model.cuda()

# distilled model-free policy that drives the imagination core
distill_policy = ActorCritic(state_shape, num_actions)
distill_optimizer = optim.Adam(distill_policy.parameters())

ei_i2a = EnvIntegrated_I2A(state_shape, num_actions, hidden_size=256,
                           full_rollout=True, env_model=env_model,
                           mode_reward=MODE_REWARDS[arg.mode])
imagination = ImaginationCore(arg.rollout_depth, state_shape, num_actions,
                              num_rewards, ei_i2a.env_model, distill_policy,
                              full_rollout=True)
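# Sketch of the distillation update that ties distill_policy to the I2A
# agent's action distribution. The 0.01 weight, the (logit, value) forward
# signatures, and F (torch.nn.functional) are assumptions for illustration:
logit, value = ei_i2a(state)
distill_logit, _ = distill_policy(state)
distill_loss = 0.01 * (F.softmax(logit, dim=1).detach() *
                       F.log_softmax(distill_logit, dim=1)).sum(1).mean().neg()
distill_optimizer.zero_grad()
distill_loss.backward()
distill_optimizer.step()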
#a2c hyperparams:
gamma = 0.99
entropy_coef = 0.01
value_loss_coef = 0.5
max_grad_norm = 0.5
num_steps = 5
num_frames = int(10e6)

#rmsprop hyperparams:
lr = 7e-4
eps = 1e-5
alpha = 0.99

#Init a2c and rmsprop (one agent/optimizer/rollout buffer per player)
agent1 = ActorCritic(state_shape, num_actions)
agent2 = ActorCritic(state_shape, num_actions)
optimizer1 = optim.RMSprop(agent1.parameters(), lr, eps=eps, alpha=alpha)
optimizer2 = optim.RMSprop(agent2.parameters(), lr, eps=eps, alpha=alpha)
agent1 = make_cuda(agent1)
agent2 = make_cuda(agent2)

rollout1 = RolloutStorage(num_steps, num_envs, state_shape)
rollout2 = RolloutStorage(num_steps, num_envs, state_shape)
if USE_CUDA:
    rollout1.cuda()
    rollout2.cuda()

all_rewards1 = []
envs = SubprocVecEnv(envs)

state_shape = envs.observation_space.shape
num_actions = envs.action_space.n

#a2c hyperparams:
gamma = 0.99
entropy_coef = 0.01
value_loss_coef = 0.5
max_grad_norm = 0.5
num_steps = 10
num_frames = int(1e6)

#Init a2c agent and Adam optimizer
actor_critic = ActorCritic(state_shape, num_actions)
optimizer = optim.Adam(actor_critic.parameters())
actor_critic = make_cuda(actor_critic)

rollout = RolloutStorage(num_steps, num_envs, state_shape)
if USE_CUDA:
    rollout.cuda()

all_rewards = []
all_losses = []

state = envs.reset()
state = torch.FloatTensor(np.float32(state))
#a2c hyperparams:
gamma = 0.99
entropy_coef = 0.01
value_loss_coef = 0.5
max_grad_norm = 0.5
num_steps = 5
num_frames = int(1e6)

#rmsprop hyperparams:
lr = 7e-4
eps = 1e-5
alpha = 0.99

#Init a2c and rmsprop
actor_critic = ActorCritic(state_shape, num_actions)
optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

rollout = RolloutStorage(num_steps, num_envs, state_shape)
if USE_CUDA:
    actor_critic = actor_critic.cuda()
    rollout.cuda()

all_rewards = []
all_losses = []
all_step_scores = []
num_steps = arg.num_steps          # steps per rollout
num_frames = int(arg.num_frames)   # total training frames
set_random_seed(global_seed)       # global random seed

#rmsprop hyperparams:
#lr = 7e-4  # default
lr = arg.learning_rate
eps = 1e-5
alpha = 0.99

envs = SubprocVecEnv([make_env(env_id, i) for i in range(num_envs)])
state = envs.reset()
state = torch.FloatTensor(np.float32(state))
state_shape = envs.observation_space.shape

#Init a2c and rmsprop
actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)
optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

# Init rollout storage
rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)

if USE_CUDA:
    actor_critic = actor_critic.cuda()
    state = state.cuda()
    rollout.cuda()

rollout.states[0].copy_(state)

episode_rewards = torch.zeros(num_envs, 1)
final_rewards = torch.zeros(num_envs, 1)
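# A condensed sketch of the A2C loop these buffers feed, using the
# hyperparameters (gamma, entropy_coef, value_loss_coef, max_grad_norm)
# defined in the cells above. The helper methods used here (act, insert,
# compute_returns, evaluate_actions, after_update) are assumptions about
# this code base's ActorCritic/RolloutStorage classes, not confirmed API:
num_updates = num_frames // (num_steps * num_envs)
for i_update in range(num_updates):
    # collect num_steps transitions from every parallel env
    for step in range(num_steps):
        action = actor_critic.act(state)
        next_state, reward, done, _ = envs.step(action.squeeze(1).cpu().numpy())
        reward = torch.FloatTensor(reward).unsqueeze(1)
        masks = torch.FloatTensor(1 - np.float32(done)).unsqueeze(1)
        # accumulate per-episode returns; freeze them where an episode ended
        episode_rewards += reward
        final_rewards = final_rewards * masks + episode_rewards * (1 - masks)
        episode_rewards *= masks
        state = torch.FloatTensor(np.float32(next_state))
        if USE_CUDA:
            state = state.cuda()
        rollout.insert(step, state, action.data, reward, masks)

    # bootstrap from the value of the last state, then one synchronous update
    with torch.no_grad():
        _, next_value = actor_critic(rollout.states[-1])
    rollout.compute_returns(next_value, gamma)
    logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
        rollout.states[:-1].view(-1, *state_shape), rollout.actions.view(-1, 1))
    values = values.view(num_steps, num_envs, 1)
    action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
    advantages = rollout.returns[:-1] - values
    value_loss = advantages.pow(2).mean()
    action_loss = -(advantages.detach() * action_log_probs).mean()

    optimizer.zero_grad()
    loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
    loss.backward()
    torch.nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
    optimizer.step()
    rollout.after_update()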
# Nearest-neighbour upscale of the 10x10 frames to 80x80 for display
# (the function name `upscale` is an assumption; the original def line is missing)
def upscale(input):
    shape = list(input.shape)
    shape[-1] = 80
    shape[-2] = 80
    upscaled = np.zeros(shape)
    for ij in np.ndindex(upscaled.shape[-2:]):
        i, j = ij
        upscaled[..., i, j] = input[..., i // 8, j // 8]
    return upscaled


if __name__ == '__main__':
    # init environment
    env = Key_Collect(max_steps=50, num_keys=num_keys)

    # load model
    actor_critic = ActorCritic((3, 10, 10), env.action_space.n)
    pretrained_dict = torch.load(agentPath, map_location='cpu')
    actor_critic.load_state_dict(pretrained_dict)
    actor_critic = make_cuda(actor_critic)

    # init game
    done = False
    state = env.reset()
    step = 1
    total_reward = 0
    while True:
        current_state = torch.FloatTensor(state)
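        # Hedged sketch of how this loop might continue: greedy action from
        # the policy, rendering via the upscale helper above. The (logit,
        # value) forward signature and the displayImage helper (from the
        # earlier cell) are assumptions for illustration, not source code.
        if USE_CUDA:
            current_state = current_state.cuda()
        logit, value = actor_critic(current_state.unsqueeze(0))
        action = logit.max(1)[1].item()
        state, reward, done, _ = env.step(action)
        total_reward += reward
        displayImage(upscale(state).transpose(1, 2, 0), step, total_reward, value)
        step += 1
        if done:
            break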