def adv_training():
    ############## Hyperparameters ##############
    # creating environment
    env = TronEnv()
    state_dim = env.observation_space.shape[0]
    action_dim = 4
    render = True
    solved_reward = 230         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 10           # max training episodes
    max_timesteps = 300         # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 2000      # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    h_memory = Memory()
    a_memory = Memory()

    hero = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    hero.load()
    adv = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    adv.load()

    timestep = 0

    # training loop
    for _ in range(1, max_episodes + 1):
        p_state, e_state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            p_action = hero.policy_old.act(np.array(p_state), h_memory)
            e_action = adv.policy_old.act(np.array(e_state), a_memory)
            p_state, _, _, h_done, _ = env.step(p_action, 0)
            e_state, _, _, a_done, _ = env.step(e_action, 1)

            # if render:
            #     env.render()

            if h_done or a_done:
                env.render()
                break
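# The Memory buffer used throughout these snippets is not shown anywhere in
# this excerpt. The sketch below is an assumption, reconstructed only from how
# the snippets use it: policy_old.act(state, memory) fills it during rollouts,
# the training loops append to memory.rewards and memory.is_terminals, and
# memory.clear_memory() resets it after each PPO update. Some snippets use
# other names (e.g. memory.dones, memory.init_vals), so the real class differs
# per project; anything beyond the fields listed here is hypothetical.
class Memory:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]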
def test():
    torch.set_default_tensor_type('torch.DoubleTensor')
    ############## Hyperparameters ##############
    env_name = "Snake Game"
    # creating environment
    # tell env to initialize game board too
    env = SnakeGameGym(initBoard=True)
    state_dim = env.observation_space.shape[0]
    action_dim = 4
    render = False
    max_timesteps = 500
    n_latent_var = 64           # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 1.00                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    n_episodes = 3
    max_timesteps = 300
    render = True
    save_gif = False

    filename = "PPO_{}.pth".format(env_name)
    directory = ""

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    ppo.policy.load_state_dict(torch.load(directory + filename))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            print(t)
            # So that it doesn't go too fast and goes at a normal snake game pace
            time.sleep(0.05)
            action = ppo.policy.act(state, memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                print("Rendering")
                env.render()
            # if save_gif:
            #     img = env.render(mode='rgb_array')
            #     img = Image.fromarray(img)
            #     img.save('./gif/{}.jpg'.format(t))
            if done:
                print("Done")
                env.cleanup()
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0

    env.close()
def test(): ############## Hyperparameters ############## env_name = "LunarLander-v2" # creating environment env = gym.make(env_name) state_dim = env.observation_space.shape[0] action_dim = 4 render = False max_timesteps = 500 n_latent_var = 64 # number of variables in hidden layer lr = 0.0007 betas = (0.9, 0.999) gamma = 0.99 # discount factor K_epochs = 4 # update policy for K epochs eps_clip = 0.2 # clip parameter for PPO ############################################# n_episodes = 3 max_timesteps = 300 render = True save_gif = False filename = "PPO_{}.pth".format(env_name) directory = "./preTrained/" memory = Memory() ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip) ppo.policy_old.load_state_dict(torch.load(directory + filename)) for ep in range(1, n_episodes + 1): ep_reward = 0 state = env.reset() for t in range(max_timesteps): action = ppo.policy_old.act(state, memory) state, reward, done, _ = env.step(action) ep_reward += reward if render: img = env.render(mode='rgb_array') #print(screen.shape) img = Image.fromarray(img) #plt.ion() plt.imshow(img) #plt.show() ipythondisplay.clear_output(wait=True) ipythondisplay.display(plt.gcf()) if save_gif: img = env.render(mode='rgb_array') img = Image.fromarray(img) img.save('./gif/{}.jpg'.format(t)) if done: break print('Episode: {}\tReward: {}'.format(ep, int(ep_reward))) ep_reward = 0 env.close()
def test(env_name):
    ############## Hyperparameters ##############
    env = make_env(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    n_episodes = 3              # num of episodes to run
    max_timesteps = 1500        # max timesteps in one episode
    render = True               # render the environment
    save_gif = True             # png images are saved in gif folder

    # filename and directory to load model from
    filename = "PPO_continuous_" + env_name + ".pth"
    directory = "./preTrained/"

    action_std = 0.5            # constant std for action distribution (Multivariate Normal)
    K_epochs = 80               # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    gamma = 0.99                # discount factor

    lr = 0.0003                 # parameters for Adam optimizer
    betas = (0.9, 0.999)
    #############################################

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
    ppo.policy_old.load_state_dict(torch.load(directory + filename))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray((img * 255).astype(np.uint8))
                img.save('./gif/' + env_name + '/{}.jpg'.format(t))
            if done:
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0

    env.close()
    createGif(env_name)
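# createGif() is called above but not defined in this excerpt. A minimal
# sketch, assuming the frames were saved as ./gif/<env_name>/<t>.jpg by the
# loop above; it stitches them into one animated GIF with PIL. The output
# path and frame duration are assumptions, not taken from the source.
import glob
from PIL import Image

def createGif(env_name):
    # sort frames numerically by timestep so the animation plays in order
    frame_paths = sorted(glob.glob('./gif/' + env_name + '/*.jpg'),
                         key=lambda p: int(p.split('/')[-1].split('.')[0]))
    frames = [Image.open(p) for p in frame_paths]
    if frames:
        frames[0].save('./gif/' + env_name + '.gif',
                       save_all=True, append_images=frames[1:],
                       duration=50, loop=0)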
def test():
    ############## Hyperparameters ##############
    # creating environment
    rm_ai = game()
    state_dim = rm_ai.state_num
    action_dim = rm_ai.action_num
    render = False
    max_timesteps = 500
    n_latent_var = 64           # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    n_episodes = 3
    max_timesteps = 300
    render = True

    filename = "PPO_{}.pth".format("robomaster")

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    ppo.policy_old.load_state_dict(torch.load(filename))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = rm_ai.reset()
        for t in range(max_timesteps):
            action = ppo.policy_old.act(state, memory)
            print(action)
            state, reward, done, _ = rm_ai.step(action)
            ep_reward += reward
            if render:
                for event in pygame.event.get():
                    if event.type == pygame.QUIT:
                        exit()
            if done:
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0
def test():
    ############## Hyperparameters ##############
    # creating environment
    env = MyEnv()
    env_name = env.env_name
    action_dim = 5
    n_latent_var = 64           # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    n_episodes = 100
    max_timesteps = 5000
    save_gif = False

    filename = "./preTrained/PPO_{}_train2.pth".format(env_name)

    memory = Memory()
    ppo = PPO(64 * 64 * 3, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    ppo.policy_old.load_state_dict(torch.load(filename))

    rewards = []
    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            obs, compass = converter(state)
            action = ppo.policy_old.act(obs=obs, compass=compass, memory=memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            # if render:
            #     env.render()
            if save_gif:
                img = obs.data.numpy()
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break

        rewards.append(ep_reward)
        logging.debug('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))

    np.save('./PPO_ep_rewards_test_{}'.format(env_name), np.array(rewards))
def test(env):
    ############## Hyperparameters ##############
    # creating environment
    state_dim = 11
    action_dim = 5
    n_latent_var = 128          # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    n_episodes = 15
    max_timesteps = 75

    filename = "PPO_{}.pth".format('bitirmeindep')
    directory = "./"

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    ppo.policy_old.load_state_dict(torch.load(directory + filename))

    for ep in range(1, n_episodes + 1):
        env.ResetUnity()
        state, _, _ = env.GetState()
        ep_reward = 0  # was `episode_reward`, which left `ep_reward` undefined below
        for t in range(max_timesteps):
            action = ppo.policy_old.act(state, memory)
            env.PostAction(action)
            state, reward, done = env.GetState()
            ep_reward += reward
            if done:
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0
def test(args):
    env = gym.make('GridExplore-v0')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    state_dim = env.observation_space[0].shape[0]
    action_dim = 5
    render = args.render
    max_timesteps = 500
    n_latent_var = 512          # number of variables in hidden layer
    lr = 0.001
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 2                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    filename = str(input("filename: "))

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(ppo.policy_old.state_dict)
    ppo.policy_old.load_state_dict(torch.load(filename, map_location=torch.device('cpu')))

    avg = 0
    for i in range(10):
        s = env.reset()
        done_n = [False for _ in range(env.n_agents)]
        totalreward = 0
        t = 0
        while not all(done_n):
            t += 1
            actions = []
            env.render()
            if render:
                env.render_graphic()
            state = np.array([s])
            state = torch.from_numpy(state).float().to(device)
            action = ppo.policy_old.act(state, memory)
            # store the new observation in `s` so the policy does not keep
            # acting on the initial state (the original overwrote `state`
            # and never updated `s`)
            s, r, done_n, _ = env.step([action])
            totalreward = totalreward + r
            time.sleep(0.01)
            if t > 500:
                break
        print("REWARDS: ", totalreward)
        avg += totalreward
        if render:
            env.render_graphic()
        env.render()

    env.close()
    print("AVG REWARD: ", avg / 10)
def mptrain(args):
    ############## Hyperparameters ##############
    env_name = "GridExplore-v0"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space[0].shape[0]
    action_dim = 5
    model = ConvNet(action_dim).to(device)
    render = False
    solved_reward = 200         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 500          # max training episodes
    max_timesteps = 500         # max timesteps in one episode
    n_latent_var = 128          # number of variables in hidden layer
    update_timestep = 600       # update policy every n timesteps
    lr = 1e-4
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    mini_batch_size = 32
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # buffer = {key: value for key, value in memory.__dict__.items()
    #           if not key.startswith('__') and not callable(key)}
    num_processes = 4
    multi_envs = [gym.make(env_name) for i in range(num_processes)]
    multi_mem = []
    for i in range(num_processes):
        multi_mem.append(Memory())

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    writer = SummaryWriter("logs/" + time.strftime("%Y%m%d-%H%M%S"))
    q = queue()

    # training loop
    for i_episode in range(1, max_episodes + 1):
        states = [multi_envs[i].reset() for i in range(num_processes)]
        for t in range(max_timesteps):
            timestep += 1
            for k in range(num_processes):
                state = np.array([states[k]])
                outputs = torch.from_numpy(state).float().to(device)
                # Running policy_old:
                action = ppo.policy_old.act(outputs, multi_mem[k])
                state, reward, done, _ = multi_envs[k].step([action])

                # Saving reward and is_terminal:
                multi_mem[k].rewards.append(reward)
                multi_mem[k].dones.append(done[0])
                running_reward += reward[0]

                if all(done):  # `if done:` was always truthy on a non-empty list
                    states[k] = multi_envs[k].reset()
                    avg = q.push(running_reward)

            # update if its time
            if timestep % update_timestep == 0:
                for k in range(num_processes):
                    memory = multi_mem[k]
                    # memory = multi_mem.flatten().tolist()
                    ppo.update(memory)
                # for k in range(num_processes):
                    multi_mem[k].clear_memory()
                timestep = 0

            if render:
                env.render()
            if all(done):
                break

        avg_length += t
        running_reward /= num_processes
        avg = q.push(running_reward)

        # grid = torchvision.utils.make_grid(torch.tensor(env.grid))
        # writer.add_image('images', grid, max_timesteps)

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            torch.save(ppo.policy.state_dict(),
                       './savedmodels/PPO_{}.pth'.format(time.strftime("%Y%m%d-%H%M%S")))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))
            writer.add_scalar('episode/average_reward', avg, i_episode)
            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

    writer.close()
    torch.save(ppo.policy.state_dict(), './PPO_NOTSOLVED_{}.pth'.format(env_name))
    torch.save(ppo.policy.state_dict(),
               './savedmodels/PPO_NOTSOLVED_{}.pth'.format(time.strftime("%Y%m%d-%H%M%S")))
action_constrain = [10, np.pi / 20]
# parameter = [0.1, 0.0009]
# parameter = [0.0000001, 0.5]
# pid = PID( parameter, env.width, env.height )

### for plotting
Reward = []
save_path = './PPO_out/'
if not os.path.exists("./PPO_out"):
    os.makedirs("./PPO_out")

"""
start straightly
"""
evaluations = []

if args.policy_name == 'state':
    from PPO import PPO
    from PPO import Memory
    memory = Memory()
    policy = PPO(state_dim, action_dim, args.action_std, args.lr, args.betas,
                 args.gamma, args.K_epochs, args.eps_clip)
elif args.policy_name == 'rgb_array':
    from PPO_image import PPO
    from PPO_image import Memory
    memory = Memory()
    policy = PPO(img_stack, action_dim, args.action_std, args.lr, args.betas,
                 args.gamma, args.K_epochs, args.eps_clip)

env.total_timesteps = 0
timesteps_since_eval = 0
pid_assist = 0
time_step = 0
done = True
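# The fragment above reads several fields from an `args` namespace that is
# not defined in this excerpt. A minimal argparse sketch covering exactly the
# fields it references; the defaults are assumptions (loosely mirroring the
# constants used elsewhere in these snippets), not values from the source.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--policy_name', default='state',
                        choices=['state', 'rgb_array'])  # selects PPO vs PPO_image
    parser.add_argument('--action_std', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.0003)
    parser.add_argument('--betas', type=float, nargs=2, default=(0.9, 0.999))
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--K_epochs', type=int, default=80)
    parser.add_argument('--eps_clip', type=float, default=0.2)
    return parser.parse_args()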
def main():
    # ############## Hyperparameters ##############
    # env_name = "LunarLander-v2"
    # # creating environment
    # env = gym.make(env_name)
    # state_dim = env.observation_space.shape[0]
    # action_dim = 4
    # render = 'render' in sys.argv
    # solved_reward = 230       # stop training if avg_reward > solved_reward
    # log_interval = 20         # print avg reward in the interval
    # max_episodes = 50000      # max training episodes
    # max_timesteps = 300       # max timesteps in one episode
    # n_latent_var = 64         # number of variables in hidden layer
    # update_timestep = 2000    # update policy every n timesteps
    # lr = 0.002
    # betas = (0.9, 0.999)
    # gamma = 0.99              # discount factor
    # K_epochs = 4              # update policy for K epochs
    # eps_clip = 0.2            # clip parameter for PPO
    # random_seed = None
    # #############################################

    ############## Hyperparameters ##############
    env_name = "SuperMarioBros-v3"
    # creating environment
    env = gym_super_mario_bros.make(env_name)
    state_dim = env.observation_space.shape[2]
    # print('state_dim:', state_dim)
    action_dim = 4
    render = 'render' in sys.argv
    solved_reward = 230         # stop training if avg_reward > solved_reward
    log_interval = 1            # print avg reward in the interval
    max_episodes = 20           # max training episodes
    max_timesteps = 50          # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 256       # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            action = ppo.policy_old.act(state.copy(), memory)
            state, reward, done, _ = env.step(action.cpu())

            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if its time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                state = env.reset()

        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './saved_models/PPO_{}.pth'.format(env_name))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))
            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
def test(): ############## Hyperparameters ############## # env_name = "LunarLander-v2" # creating environment # env = gym.make(env_name) env = Scenario4() state_dim = env.observation_space.shape[0] action_dim = 2 render = args.visualize max_timesteps = 500 n_latent_var = 64 # number of variables in hidden layer lr = 0.0007 betas = (0.9, 0.999) gamma = 0.99 # discount factor K_epochs = 4 # update policy for K epochs eps_clip = 0.2 # clip parameter for PPO ############################################# n_episodes = 100 save_gif = False # filename = "PPO_{}.pth".format(env_name) test_name = args.case.lower() filename = './checkpoints/rl_checkpoint_' + test_name # directory = "./preTrained/" memory = Memory() ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip) nn_model = NN(5, 1) # nn_model.load_weights('./policies/data01_new_all_CoIL') scenario_name = args.scenario.lower() nn_model.load_weights('./policies/' + scenario_name + '_all_CoIL') ppo.policy_old.load_state_dict(torch.load(filename)) success_counter = 0 time_counter = 0 # env.T = 400 * env.dt - env.dt / 2. for ep in range(1, n_episodes + 1): ep_reward = 0 state = env.reset() for t in range(max_timesteps): mode = ppo.policy_old.act(state, memory) state = np.array(state).reshape(1, -1) action = nn_model(state, mode).numpy().reshape(-1) state, reward, done, terminal_time = env.step(action) print("mode is ", mode) if render: env.render() if save_gif: img = env.render(mode='rgb_array') img = Image.fromarray(img) img.save('./gif/{}.jpg'.format(t)) if done: ep_reward = reward if env.target_reached: time_counter += terminal_time success_counter += 1 break print("timesteps", t) print("success number", success_counter) print('Episode: {}\tReward: {}'.format(ep, int(ep_reward))) # ep_reward = 0 # env.close() print('Success Rate = ' + str(float(success_counter) / ep)) print('Mean reach time = ' + str(float(time_counter / success_counter)))
    seed=0)
schedule_generator = complex_schedule_generator()

env = RailEnv(width=env_width,
              height=env_height,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              obs_builder_object=GlobalObsForRailEnv(),
              number_of_agents=agent_num)

env_renderer = RenderTool(env)

render = True
render_sleep_time = 0.0
stuck_break_pont = 20
max_timesteps_in_episode = update_timestep
##################################

memory = Memory()
memory.init_vals(env.number_of_agents)

ppo = PPO.FlatlandPPO(action_space=action_space,
                      hidden_size=hidden_size,
                      in_channels=in_channels,
                      out_channels=out_channels,
                      kernel_size=kernel_size,
                      agent_num=agent_num,
                      num_epochs=num_epochs,
                      rnn_num_layers=rnn_num_layers,
                      gamma=gamma,
                      learning_rate=learning_rate,
                      clip_epsilon=clip_epsilon,
                      c1=c1,
                      c2=c2)
def adv_training():
    ############## Hyperparameters ##############
    # creating environment
    env = TronEnv()
    state_dim = env.observation_space.shape[0]
    action_dim = 4
    render = False
    solved_reward = 230         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 20000        # max training episodes
    max_timesteps = 300         # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 2000      # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    h_memory = Memory()
    a_memory = Memory()

    hero = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    adv = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        p_state, e_state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            p_action = hero.policy_old.act(np.array(p_state), h_memory)
            e_action = adv.policy_old.act(np.array(e_state), a_memory)
            p_state, h_reward1, a_reward1, h_done, _ = env.step(p_action, 0)
            e_state, h_reward2, a_reward2, a_done, _ = env.step(e_action, 1)

            # Saving reward and is_terminal:
            h_memory.rewards.append(h_reward1 + h_reward2)
            h_memory.is_terminals.append(h_done)

            # Saving reward and is_terminal:
            a_memory.rewards.append(a_reward1 + a_reward2)
            a_memory.is_terminals.append(a_done)

            # update if its time
            if timestep % update_timestep == 0:
                hero.update(h_memory)
                h_memory.clear_memory()
                adv.update(a_memory)
                a_memory.clear_memory()
                timestep = 0

            running_reward += (h_reward1 + h_reward2)
            if render:
                env.render()
            if h_done or a_done:
                break

        avg_length += t

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))
            env.render()
            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
            adv.save_all()
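# The PPO.update(memory) calls above consume memory.rewards and
# memory.is_terminals, but the update itself is not part of this excerpt.
# A minimal sketch of the discounted reward-to-go pass such an update
# typically starts with; this is an illustrative assumption, not the
# source's actual implementation.
import torch

def discounted_returns(rewards, is_terminals, gamma=0.99):
    returns = []
    discounted_reward = 0
    # walk the rollout backwards, resetting at episode boundaries
    for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
        if is_terminal:
            discounted_reward = 0
        discounted_reward = reward + gamma * discounted_reward
        returns.insert(0, discounted_reward)
    # normalize returns, a common stabilizer for the PPO surrogate objective
    returns = torch.tensor(returns, dtype=torch.float32)
    return (returns - returns.mean()) / (returns.std() + 1e-5)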