import pommerman


def main():
    agt = PPOAgent()
    agent_list = [agt, DullAgent(), DullAgent(), DullAgent()]

    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    total_time = 0
    for i_episode in range(1000000):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            total_time += 1
        print('Episode {} finished'.format(i_episode))
    env.close()

roll, pitch, yaw, \
    hipx_a_pos, hipx_b_pos, hipx_c_pos, hipx_d_pos, \
    hipx_a_vel, hipx_b_vel, hipx_c_vel, hipx_d_vel, \
    hipy_a_pos, hipy_b_pos, hipy_c_pos, hipy_d_pos, \
    hipy_a_vel, hipy_b_vel, hipy_c_vel, hipy_d_vel, \
    leg_a_pos, leg_b_pos, leg_c_pos, leg_d_pos, \
    leg_a_vel, leg_b_vel, leg_c_vel, leg_d_vel, \
    x, h, y = message

if LOG:
    with open("data.txt", mode="a") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=',')
        csv_writer.writerow(message)

supervisor = ddpg_controller()
agent = PPOAgent(supervisor.observationSpace, supervisor.actionSpace)
agent.load("")
solved = False

# Run the outer loop until the episode limit is reached or the task is solved
while not solved and supervisor.episodeCount < supervisor.episodeLimit:
    observation = supervisor.reset()  # Reset robot and get starting observation
    supervisor.episodeScore = 0
    for step in range(supervisor.stepPerEpisode):
        # print(step)
        # In training mode the agent samples from the probability distribution,
        # naturally implementing exploration
        selectedAction, actionProb = agent.work(observation, type_="selectAction")
        # Step the supervisor to get the current selectedAction's reward, the new
        # observation and whether we reached the done condition
        newObservation, reward, done, info = supervisor.step([selectedAction])

import signal

import torch


def convert_observation_to_input(obs):
    return torch.FloatTensor(obs)


controller = PPOAgent(
    action_space=envs.action_space,
    size_obs=size_obs,
    shape_pic=None,
    size_layers=[256],
    size_cnn_output=2,
    actor_lr=1e-3,
    critic_lr=1e-3,
    value_loss_coeff=1.,
    gamma=0.99,
    gae_lambda=0.95,
    epochs=4,
    horizon=32,
    mini_batch_size=8,
    frames_per_action=1,
    init_wait=1,
    clip=0.2,
    entropy_coeff=0.01,
    log_std=0.,
    use_parallel=True,
    num_parallel=8,
    logs=True,
)

signal.signal(signal.SIGINT, signal.default_int_handler)
state = envs.reset()

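# The snippet above relies on a vectorized environment handle `envs` and a flat
# observation size `size_obs` that are not defined in this excerpt (note
# use_parallel=True, num_parallel=8). A minimal sketch of how they could be
# built with gym's vector API follows; the "CartPole-v1" id and the size_obs
# computation are illustrative assumptions, not the original setup.
import gym
import numpy as np

num_parallel = 8
envs = gym.vector.make("CartPole-v1", num_envs=num_parallel)
size_obs = int(np.prod(envs.single_observation_space.shape))  # flattened obs size
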
from PPOAgent import PPOAgent
import torch

torch.set_default_tensor_type(torch.cuda.FloatTensor)

agent = PPOAgent()
for n in range(1000):
    agent.train_step()

import time

import gym
import numpy as np


def main(args):
    global human_agent_action, img_size, frame, score, scores, skip_frame_rate, pause_seconds, mode

    # set environment
    env = gym.make(args.atari_game)
    env.render()

    # set key listener
    env.unwrapped.viewer.window.on_key_press = key_press
    env.unwrapped.viewer.window.on_key_release = key_release

    print("Press keys w a s d or arrow-keys to move")
    print("Press space to shoot")
    print("No keys pressed is taking action 0 --> no action")
    print("\nGood Luck!")

    input_shape = (args.skip_frame_rate, 84, 84)  # formatted image
    discount_factor = args.discount_factor
    minibatch_size = args.minibatch_size
    replay_memory_size = args.replay_memory_size
    img_size = (84, 84)
    skip_frame_rate = args.skip_frame_rate
    pause_seconds = args.pause_gap
    mode = args.mode

    logger = Logger(args.atari_game, "data/%s/log/" % mode)
    replay_buffer = ReplayBuffer(replay_memory_size, minibatch_size)

    if mode == 'dqn':
        print("Using DQN agent")
        agent = DQNAgent(input_shape, env.action_space.n, discount_factor, replay_buffer, minibatch_size, logger)
    elif mode == 'cnn':
        print("Using CNN agent")
        agent = CNNAgent(input_shape, env.action_space.n, replay_buffer, minibatch_size, logger)
    elif mode == 'ppo':
        print("Using PPO agent")
        agent = PPOAgent(input_shape, env.action_space.n, discount_factor, replay_buffer, minibatch_size, logger)
    else:
        print("Using DDQN agent")
        agent = DDQNAgent(input_shape, env.action_space.n, discount_factor, replay_buffer, minibatch_size, logger)

    agent.load_model(rollout=logger.get_rollouts())
    if logger.get_rollouts() != 0:
        agent.set_rollout(logger.get_rollouts() + 1)
        start = logger.get_rollouts() + 1
    else:
        start = 0

    max_episodes = args.max_episodes
    print("previous rollouts: %d" % start)

    # start algorithm
    for episode in range(start, max_episodes):
        obs = preprocess_observation(env.reset(), img_size)
        current_state = np.maximum(obs, obs)
        replay_buffer.add_experience(current_state, 0, 0, False, initial=True)
        if mode == 'ppo':
            agent.add_experience(False, 0, 0)

        frame = 0
        score = 0

        # let the agent act until it is no longer confident enough
        agent_act(agent, env, current_state, replay_buffer)

        # request an expert demonstration
        print("Need Expert Demonstration in %d seconds!" % pause_seconds)
        sec = args.pause_gap
        # if episode > 0 and episode % 20 == 0:
        #     sec = 600
        while pause_seconds > 0:
            time.sleep(1)
            pause_seconds -= 1
            print(pause_seconds)
        print("Begin!")
        pause_seconds = sec

        # get expert actions until we are done
        for i in range(0, args.max_expert_rollouts):
            if i > 0:
                score = 0
                frame = 0
                obs = preprocess_observation(env.reset(), img_size)
                current_state = np.maximum(obs, obs)
            human_expert_act(replay_buffer, env, current_state, logger, agent)

        evaluate_scores(logger)

        # train on the additional experience
        window_still_open = env.render()
        if window_still_open:
            if mode == 'ppo':
                agent.add_q_value()
            agent.train(train_all=True)

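# `preprocess_observation` is called above but not defined in this excerpt.
# Below is a minimal sketch of the usual Atari preprocessing (grayscale +
# resize to img_size), assuming an RGB uint8 frame from gym; the original
# project's exact cropping and normalization may differ.
import cv2
import numpy as np


def preprocess_observation(observation, img_size=(84, 84)):
    gray = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)                # RGB frame -> grayscale
    resized = cv2.resize(gray, img_size, interpolation=cv2.INTER_AREA)  # downscale to 84x84
    return resized.astype(np.uint8)
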
def __init__(self):
    self.env = gym.make("CartPole-v0")
    self.agent = PPOAgent()

import numpy as np
import tensorflow as tf
from collections import deque


def compute_returns(trajectories, gamma=0.99):
    # Discounted reward-to-go for every trajectory; the default discount
    # factor is an assumption, the original value is not shown in this excerpt.
    for trajectory in trajectories:
        rewards = trajectory['rewards']
        returns = np.zeros_like(rewards, dtype=np.float64)
        g = 0.0
        for t in reversed(range(len(rewards))):
            g = rewards[t] + gamma * g
            returns[t] = g
        trajectory['returns'] = returns


seed = 0
env = PPOEnv()
np.random.seed(seed)
tf.set_random_seed(seed)
env.seed(seed=seed)

obs_dim = env.observation_space.shape[0]
n_act = 7  # config: act_dim  # env.action_space.n
agent = PPOAgent(obs_dim, n_act, epochs=10, hdim=64, lr=3e-4,
                 max_std=1.0, clip_range=0.3, seed=seed)

avg_return_list = deque(maxlen=10)
avg_loss_list = deque(maxlen=10)

episode_size = 1
batch_size = 64
nupdates = 600

for update in range(nupdates + 1):
    trajectories = run_policy(env, agent, episode_size)
    compute_returns(trajectories)
    observes, actions, returns = build_train_set(trajectories)
    pol_loss, kl, entropy = agent.update(observes, actions, returns, batch_size=batch_size)
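
# `run_policy` and `build_train_set` are called in the training loop above but
# are not defined in this excerpt. The sketches below show one common way to
# implement them; the agent.get_action sampling API and the trajectory dict
# keys ('observes', 'actions', 'rewards') are assumptions, not taken from the
# original code.
import numpy as np


def run_policy(env, agent, episode_size):
    # Collect `episode_size` full episodes with the current policy.
    trajectories = []
    for _ in range(episode_size):
        observes, actions, rewards = [], [], []
        obs = env.reset()
        done = False
        while not done:
            action = agent.get_action(obs)              # assumed sampling method
            next_obs, reward, done, _ = env.step(action)
            observes.append(obs)
            actions.append(action)
            rewards.append(reward)
            obs = next_obs
        trajectories.append({'observes': np.asarray(observes),
                             'actions': np.asarray(actions),
                             'rewards': np.asarray(rewards, dtype=np.float64)})
    return trajectories


def build_train_set(trajectories):
    # Flatten all trajectories into single training arrays.
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    returns = np.concatenate([t['returns'] for t in trajectories])
    return observes, actions, returns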