Code Example #1
File: PPOGame.py  Project: jault/pommerman
import pommerman  # the Pommerman "playground" environment package


def main():
    # PPOAgent and DullAgent are defined in this project's own modules
    agt = PPOAgent()

    agent_list = [agt, DullAgent(), DullAgent(), DullAgent()]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    total_time = 0
    for i_episode in range(1000000):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            total_time += 1
        print('Episode {} finished'.format(i_episode))
    env.close()


if __name__ == '__main__':
    main()
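
Note: every entry in agent_list has to implement the Pommerman agent interface. A minimal sketch of such an agent, assuming it subclasses pommerman.agents.BaseAgent (the project's actual PPOAgent and DullAgent will differ):

from pommerman import agents


class StandStillAgent(agents.BaseAgent):
    # Illustrative only: env.act(state) calls act() once per step for each agent.
    def act(self, obs, action_space):
        return 0  # action 0 = stand still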
Code Example #2
        roll, pitch, yaw, \
        hipx_a_pos, hipx_b_pos, hipx_c_pos, hipx_d_pos, \
        hipx_a_vel, hipx_b_vel, hipx_c_vel, hipx_d_vel, \
        hipy_a_pos, hipy_b_pos, hipy_c_pos, hipy_d_pos, \
        hipy_a_vel, hipy_b_vel, hipy_c_vel, hipy_d_vel, \
        leg_a_pos, leg_b_pos, leg_c_pos, leg_d_pos, \
        leg_a_vel, leg_b_vel, leg_c_vel, leg_d_vel, \
        x, h, y = message
        if LOG:
            with open("data.txt", mode="a") as csv_file:
                csv_writer = csv.writer(csv_file, delimiter=',')
                csv_writer.writerow(message)


supervisor = ddpg_controller()
agent = PPOAgent(supervisor.observationSpace, supervisor.actionSpace)
agent.load("")
solved = False

# Run outer loop until the episodes limit is reached or the task is solved
while not solved and supervisor.episodeCount < supervisor.episodeLimit:
    observation = supervisor.reset()  # Reset robot and get starting observation
    supervisor.episodeScore = 0
    for step in range(supervisor.stepPerEpisode):
        #print(step)
        # In training mode the agent samples from the probability distribution, naturally implementing exploration
        selectedAction, actionProb = agent.work(observation,
                                                type_="selectAction")

        # Step the supervisor to get the current selectedAction's reward, the new observation and whether we reached the done condition
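
The comment above notes that in training mode the agent samples an action from the policy's probability distribution rather than acting greedily, which is what provides exploration. A generic sketch of that idea with torch.distributions (not this project's agent.work implementation):

import torch
from torch.distributions import Categorical


def select_action(action_probs, training=True):
    # Sample during training (exploration); act greedily during evaluation.
    dist = Categorical(probs=action_probs)
    action = dist.sample() if training else torch.argmax(action_probs, dim=-1)
    return action.item(), dist.log_prob(action).item()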
Code Example #3
import signal

import torch


def convert_observation_to_input(obs):
    # Turn a raw observation (e.g. a NumPy array) into a float tensor for the networks
    return torch.FloatTensor(obs)


# `envs` and `size_obs` are created earlier in the original script (not shown in this excerpt)
controller = PPOAgent(
    action_space=envs.action_space,
    size_obs=size_obs,
    shape_pic=None,
    size_layers=[256],
    size_cnn_output=2,
    actor_lr=1e-3,
    critic_lr=1e-3,
    value_loss_coeff=1.,
    gamma=0.99,
    gae_lambda=0.95,
    epochs=4,
    horizon=32,
    mini_batch_size=8,
    frames_per_action=1,
    init_wait=1,
    clip=0.2,
    entropy_coeff=0.01,
    log_std=0.,
    use_parallel=True,
    num_parallel=8,
    logs=True,
)

signal.signal(signal.SIGINT, signal.default_int_handler)  # restore the default Ctrl+C handler so KeyboardInterrupt can stop training

state = envs.reset()
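
For reference, gamma, gae_lambda, clip, value_loss_coeff and entropy_coeff above are the standard PPO hyperparameters. A minimal, generic PyTorch sketch of the clipped-surrogate loss they parameterise (not this project's PPOAgent internals):

import torch


def ppo_loss(new_log_probs, old_log_probs, advantages, values, returns, entropy,
             clip=0.2, value_loss_coeff=1.0, entropy_coeff=0.01):
    # Probability ratio between the updated policy and the behaviour policy
    ratio = torch.exp(new_log_probs - old_log_probs)
    # Clipped surrogate objective: take the pessimistic minimum of the two terms
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip, 1.0 + clip) * advantages
    policy_loss = -torch.min(unclipped, clipped).mean()
    # Value-function regression term plus an entropy bonus for exploration
    value_loss = (returns - values).pow(2).mean()
    return policy_loss + value_loss_coeff * value_loss - entropy_coeff * entropy.mean()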
Code Example #4
from PPOAgent import PPOAgent
import torch

# Allocate new tensors on the GPU by default
torch.set_default_tensor_type(torch.cuda.FloatTensor)

agent = PPOAgent()
for n in range(1000):
    agent.train_step()
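
set_default_tensor_type makes every newly created tensor a CUDA float tensor, but the call is deprecated in recent PyTorch releases. On PyTorch 2.x the equivalent setup would be (assuming a CUDA-capable GPU):

import torch

torch.set_default_dtype(torch.float32)
torch.set_default_device("cuda")  # available from PyTorch 2.0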


Code Example #5
File: main.py  Project: nefl1011/ImitationLearning
def main(args):
    global human_agent_action, img_size, frame, score, scores, skip_frame_rate, pause_seconds, mode

    # set environment
    env = gym.make(args.atari_game)
    env.render()

    # set key listener
    env.unwrapped.viewer.window.on_key_press = key_press
    env.unwrapped.viewer.window.on_key_release = key_release

    print("Press keys w a s d or arrow-keys to move")
    print("Press space to shoot")
    print("No keys pressed is taking action 0 --> no action")
    print("\nGood Luck!")

    input_shape = (args.skip_frame_rate, 84, 84)  # formatted image: a stack of skip_frame_rate 84x84 frames
    discount_factor = args.discount_factor
    minibatch_size = args.minibatch_size
    replay_memory_size = args.replay_memory_size
    img_size = (84, 84)
    skip_frame_rate = args.skip_frame_rate
    pause_seconds = args.pause_gap
    mode = args.mode

    logger = Logger(args.atari_game, "data/%s/log/" % mode)
    replay_buffer = ReplayBuffer(replay_memory_size, minibatch_size)

    if mode == 'dqn':
        print("Using DQN agent")
        agent = DQNAgent(input_shape,
                         env.action_space.n,
                         discount_factor,
                         replay_buffer,
                         minibatch_size,
                         logger)
    elif mode == 'cnn':
        print("Using CNN agent")
        agent = CNNAgent(input_shape,
                         env.action_space.n,
                         replay_buffer,
                         minibatch_size,
                         logger)
    elif mode == 'ppo':
        print("Using PPO agent")
        agent = PPOAgent(input_shape,
                         env.action_space.n,
                         discount_factor,
                         replay_buffer,
                         minibatch_size,
                         logger)
    else:
        print("Using DDQN agent")
        agent = DDQNAgent(input_shape,
                          env.action_space.n,
                          discount_factor,
                          replay_buffer,
                          minibatch_size,
                          logger)

    agent.load_model(rollout=logger.get_rollouts())
    if logger.get_rollouts() != 0:
        agent.set_rollout(logger.get_rollouts() + 1)
        start = logger.get_rollouts() + 1
    else:
        start = 0
    max_episodes = args.max_episodes
    print("previous rollouts: %d" % start)

    # start algorithm
    for episode in range(start, max_episodes):
        obs = preprocess_observation(env.reset(), img_size)
        current_state = np.maximum(obs, obs)
        replay_buffer.add_experience(current_state, 0, 0, False, initial=True)
        if mode == 'ppo':
            agent.add_experience(False, 0, 0)
        frame = 0
        score = 0

        # let the agent act until it is no longer confident enough
        agent_act(agent, env, current_state, replay_buffer)

        # request an expert demonstration
        print("Need Expert Demonstration in %d seconds!" % pause_seconds)
        sec = args.pause_gap
        # if episode > 0 and episode % 20 == 0:
            # sec = 600
        while pause_seconds > 0:
            time.sleep(1)
            pause_seconds -= 1
            print(pause_seconds)
        print("Begin!")
        pause_seconds = sec

        # get expert actions until we are done
        for i in range(0, args.max_expert_rollouts):
            if i > 0:
                score = 0
                frame = 0
                obs = preprocess_observation(env.reset(), img_size)
                current_state = np.maximum(obs, obs)

            human_expert_act(replay_buffer, env, current_state, logger, agent)

            evaluate_scores(logger)

        # train additional experience
        window_still_open = env.render()
        if window_still_open:
            if mode == 'ppo':
                agent.add_q_value()
            agent.train(train_all=True)
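
main() reads a number of fields from args, but the argument parser itself is not part of the excerpt. A hedged argparse sketch matching the attribute names used above (the defaults are guesses, not the project's values):

import argparse


def build_arg_parser():
    # Argument names mirror the attributes accessed in main(); all defaults are assumptions.
    parser = argparse.ArgumentParser(description="Imitation learning runner")
    parser.add_argument("--atari_game", default="SpaceInvaders-v0")
    parser.add_argument("--mode", choices=["dqn", "cnn", "ppo", "ddqn"], default="ppo")
    parser.add_argument("--discount_factor", type=float, default=0.99)
    parser.add_argument("--minibatch_size", type=int, default=32)
    parser.add_argument("--replay_memory_size", type=int, default=100000)
    parser.add_argument("--skip_frame_rate", type=int, default=4)
    parser.add_argument("--pause_gap", type=int, default=5)
    parser.add_argument("--max_episodes", type=int, default=100)
    parser.add_argument("--max_expert_rollouts", type=int, default=1)
    return parser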
Code Example #6
    def __init__(self):
        self.env = gym.make("CartPole-v0")
        self.agent = PPOAgent()
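
A hedged sketch of how such a wrapper is typically driven: one CartPole-v0 episode with the classic Gym loop. The select_action name is an assumption, not necessarily what this PPOAgent exposes:

def run_episode(env, agent):
    # Run a single episode and return its total reward (classic Gym API).
    obs = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action = agent.select_action(obs)  # assumed method name
        obs, reward, done, info = env.step(action)
        total_reward += reward
    return total_reward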
Code Example #7
File: PPOMain.py  Project: Geonhee-LEE/rl_sawyer
            g = rewards[t] + gamma*g
            returns[t] = g
        trajectory['returns'] = returns
        

seed = 0
env = PPOEnv()
np.random.seed(seed)
tf.set_random_seed(seed)
env.seed(seed=seed)

obs_dim = env.observation_space.shape[0]
n_act = 7  # action dimension (act_dim) from the config; alternatively env.action_space.n

agent = PPOAgent(obs_dim, n_act, epochs=10,
                 hdim=64, lr=3e-4, max_std=1.0,
                 clip_range=0.3, seed=seed)

avg_return_list = deque(maxlen=10)
avg_loss_list = deque(maxlen=10)

episode_size = 1
batch_size = 64
nupdates = 600

for update in range(nupdates+1):
    trajectories = run_policy(env, agent, episode_size)
    compute_returns(trajectories)
    observes, actions, returns = build_train_set(trajectories)

    pol_loss, kl, entropy = agent.update(observes, actions, returns, batch_size=batch_size)
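
The first three lines of this excerpt are the tail of the compute_returns helper called in the update loop; its header is cut off above. A hedged reconstruction of the full helper (the gamma default is an assumption):

import numpy as np


def compute_returns(trajectories, gamma=0.995):
    # Walk each trajectory backwards, accumulating the discounted return
    # G_t = r_t + gamma * G_{t+1}, and store it alongside the trajectory.
    for trajectory in trajectories:
        rewards = trajectory['rewards']
        returns = np.zeros_like(rewards, dtype=np.float64)
        g = 0.0
        for t in reversed(range(len(rewards))):
            g = rewards[t] + gamma * g
            returns[t] = g
        trajectory['returns'] = returns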