Example #1
def main(env_name, render=False):
    env = gym.make(env_name)
    # Initialize your agent here
    agent = RandomAgent(env)
    for episode_i in range(100000):
        state = env.reset()
        done = False
        while not done:
            if render and episode_i % 10 == 0:
                env.render()
            # Your agent chooses an action here
            action = agent.act(state)
            state, reward, done, info = env.step(action)
Example #2
import time
import gym

from agent import RandomAgent

env = gym.make("CartPole-v1")

agent = RandomAgent(env.action_space)

episode_count = 10
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    while True:
        action = agent.act(ob, reward, done)
        ob, reward, done, info = env.step(action)
        if done:
            print("Game Finished!")
            break
        env.render()
        time.sleep(1 / 30)
env.close()
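The agent module imported above is not shown. A minimal RandomAgent compatible with the act(ob, reward, done) call in this example might look like the following sketch (the class body is an assumption, not the original module):

# agent.py (hypothetical) -- minimal RandomAgent for the example above
class RandomAgent:
    def __init__(self, action_space):
        # Keep a reference to the environment's action space
        self.action_space = action_space

    def act(self, observation, reward, done):
        # A random policy ignores its inputs and samples uniformly
        return self.action_space.sample()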
Example #3
    # set seed
    random.seed(args.seed)
    np.random.seed(args.seed)

    env = MazeEnv(args, args.game_name, args.graph_param, args.game_len,
                  args.gamma)

    # agent
    if args.agent == 'random':
        agent = RandomAgent(args, env)

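    # Evaluate the agent: NUM_ITER episodes on each of NUM_GRAPH task graphs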
    NUM_GRAPH = 100
    NUM_ITER = 32
    ep_rews = []
    for graph_id in range(NUM_GRAPH):
        for _ in range(NUM_ITER):
            ep_rew = 0
            state, info = env.reset(graph_index=graph_id)
            done = False
            while not done:
                action = agent.act(state)
                state, rew, done, info = env.step(action)
                ep_rew += rew
            ep_rews.append(ep_rew)

        string = 'Graph={:02d}/{:02d}, Return={:.4f}'
        print(string.format(graph_id, NUM_GRAPH, sum(ep_rews) / len(ep_rews)))

    print('Avg. Ep Return={:.4f}'.format(sum(ep_rews) / len(ep_rews)))
    print('This should be around 0.0455')
Example #4
def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at the same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in xrange(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6,
                     epsilon_schedule,
                     OBSERVATION_MODE,
                     lr=LR,
                     max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000),
                         OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print "LOADING CHECKPOINT: {}".format(leader_path)
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print "INITIALIZING NEW CHALLENGER AND LEADER"
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the challenger checkpoints from CHALLENGER_DIR
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print "LOADING FROM CHALLENGER_DIR: {}".format(path)
            challenger = try_gpu(
                DQNAgent(6,
                         LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE,
                         lr=LR,
                         max_grad_norm=GRAD_CLIP_NORM,
                         name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0  # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in xrange(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and \
                        frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(LEADER_DIR,
                                            agent.name + "-{}".format(frames))
                        print "SAVING CHECKPOINT TO: {}".format(path)
                        torch.save(agent.state_dict(), path)
                    #path = os.path.join(
                    #        LEADER_DIR, challenger.name + "-{}".format(frames))
                    #torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print "Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN))
            print "Episode reward: {}".format(episode_reward)
            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
Example #5
possible_actions = [0, 1]  # Cooperate or Defect
cooperator = RandomAgent(possible_actions, p=0.9)
defector = RandomAgent(possible_actions, p=0.1)

# Stateless interactions (agents do not have memory)
s = None

n_iter = 1000
for i in range(n_iter):

    # A full episode:
    done = False

    while not done:

        # Agents decide
        a0 = cooperator.act()
        a1 = defector.act()

        # World changes
        new_s, (r0, r1), done, _ = env.step(([a0], [a1]))

        # Agents learn
        cooperator.update(s, (a0, a1), (r0, r1), new_s)
        defector.update(s, (a1, a0), (r1, r0), new_s)

        s = new_s
        print(r0, r1)

    env.reset()
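The RandomAgent used here is likewise not shown. Assuming p is the probability of choosing the first listed action (Cooperate) and that update is a no-op learning hook, a minimal sketch could be:

import random

# Hypothetical biased RandomAgent matching the calls above (assumed, not the original class)
class RandomAgent:
    def __init__(self, possible_actions, p=0.5):
        self.possible_actions = possible_actions
        self.p = p  # probability of choosing the first action

    def act(self, state=None):
        # Pick the first action with probability p, otherwise the second
        if random.random() < self.p:
            return self.possible_actions[0]
        return self.possible_actions[1]

    def update(self, state, actions, rewards, next_state):
        # A random policy does not learn; this only mirrors the update() calls above
        pass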