Example #1
def test_single_batch():
    """
    Test BatchedPlayer when the batch size is 1.
    """
    make_env = lambda: SimpleEnv(9, (1, 2, 3), 'float32')
    make_agent = lambda: SimpleModel((1, 2, 3), stateful=True)
    basic_player = BasicPlayer(make_env(), make_agent(), 3)
    batched_player = BatchedPlayer(batched_gym_env([make_env]), make_agent(),
                                   3)
    for _ in range(50):
        transes1 = basic_player.play()
        transes2 = batched_player.play()
        assert len(transes1) == len(transes2)
        for trans1, trans2 in zip(transes1, transes2):
            assert _transitions_equal(trans1, trans2)
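This test relies on a module-level _transitions_equal helper that is not shown here. A minimal sketch of such a comparator, assuming the transition-dict keys that appear elsewhere in these examples (episode_id, obs, rewards, model_outs, and so on); it is an illustration, not the original helper.

import numpy as np

def _transitions_equal(trans1, trans2):
    """Hypothetical comparator: check that two transition dicts match."""
    # Bookkeeping fields must match exactly.
    for key in ['episode_step', 'episode_id', 'is_last', 'total_reward']:
        if trans1[key] != trans2[key]:
            return False
    # Reward sequences must match exactly.
    if trans1['rewards'] != trans2['rewards']:
        return False
    # Observations must match numerically.
    if not np.allclose(trans1['obs'], trans2['obs']):
        return False
    # Successor observations are None at episode boundaries.
    if (trans1['new_obs'] is None) != (trans2['new_obs'] is None):
        return False
    if trans1['new_obs'] is not None and \
            not np.allclose(trans1['new_obs'], trans2['new_obs']):
        return False
    # Compare the sampled actions from the model outputs.
    return np.allclose(trans1['model_outs']['actions'][0],
                       trans2['model_outs']['actions'][0])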
Example #2
def main():
    """
    Entry-point for the program.
    """
    env = gym.make('CartPole-v0')

    with tf.Session() as sess:
        make_net = lambda name: MLPQNetwork(sess,
                                            env.action_space.n,
                                            gym_space_vectorizer(
                                                env.observation_space),
                                            name,
                                            layer_sizes=[32])
        dqn = DQN(make_net('online'), make_net('target'))
        player = BasicPlayer(env,
                             EpsGreedyQNetwork(dqn.online_net, EPSILON),
                             batch_size=STEPS_PER_UPDATE)
        optimize = dqn.optimize(learning_rate=LEARNING_RATE)

        sess.run(tf.global_variables_initializer())

        dqn.train(num_steps=30000,
                  player=player,
                  replay_buffer=UniformReplayBuffer(BUFFER_SIZE),
                  optimize_op=optimize,
                  target_interval=200,
                  batch_size=64,
                  min_buffer_size=200,
                  handle_ep=lambda _, rew: print('got reward: ' + str(rew)))

    env.close()
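This entry point assumes module-level imports and hyperparameter constants (EPSILON, LEARNING_RATE, BUFFER_SIZE, STEPS_PER_UPDATE) that are not shown. A minimal setup sketch, assuming anyrl-py's usual module layout; the constant values below are illustrative placeholders, not the original settings.

import gym
import tensorflow as tf

from anyrl.algos import DQN
from anyrl.models import MLPQNetwork, EpsGreedyQNetwork
from anyrl.rollouts import BasicPlayer, UniformReplayBuffer
from anyrl.spaces import gym_space_vectorizer

# Illustrative hyperparameters (placeholders, not the original values).
EPSILON = 0.1           # exploration rate for the epsilon-greedy player
LEARNING_RATE = 1e-3    # step size passed to dqn.optimize()
BUFFER_SIZE = 50000     # capacity of the uniform replay buffer
STEPS_PER_UPDATE = 4    # environment steps gathered per training update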
Example #3
def test_mixed_batch():
    """
    Test a batch with a bunch of different
    environments.
    """
    env_fns = [
        lambda s=seed: SimpleEnv(s, (1, 2, 3), 'float32')
        for seed in [5, 8, 1, 9, 3, 2]
    ]
    make_agent = lambda: SimpleModel((1, 2, 3), stateful=True)
    for num_sub in [1, 2, 3]:
        batched_player = BatchedPlayer(
            batched_gym_env(env_fns, num_sub_batches=num_sub), make_agent(), 3)
        expected_eps = []
        for player in [
                BasicPlayer(env_fn(), make_agent(), 3) for env_fn in env_fns
        ]:
            transes = [t for _ in range(50) for t in player.play()]
            expected_eps.extend(_separate_episodes(transes))
        actual_transes = [t for _ in range(50) for t in batched_player.play()]
        actual_eps = _separate_episodes(actual_transes)
        assert len(expected_eps) == len(actual_eps)
        for episode in expected_eps:
            found = False
            for i, actual in enumerate(actual_eps):
                if _episodes_equivalent(episode, actual):
                    del actual_eps[i]
                    found = True
                    break
            assert found
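Both _separate_episodes and _episodes_equivalent are module-level helpers that this example leaves out; the former presumably groups a flat list of transitions by episode, and the latter compares two episodes transition by transition. A rough sketch of the grouping helper, under the assumption that only completed episodes (those containing an is_last transition) are kept:

def _separate_episodes(transes):
    """Hypothetical helper: split transitions into completed episodes."""
    episodes = {}
    for trans in transes:
        episodes.setdefault(trans['episode_id'], []).append(trans)
    # Keep only episodes that reached a terminal transition (assumption),
    # ordered by step within each episode.
    return [sorted(ep, key=lambda t: t['episode_step'])
            for ep in episodes.values()
            if any(t['is_last'] for t in ep)]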
Example #4
def test_nstep_multi_step():
    """
    Test an NStepPlayer in the multi-step case.
    """
    make_env = lambda: SimpleEnv(9, (1, 2, 3), 'float32')
    make_agent = lambda: SimpleModel((1, 2, 3), stateful=True)
    make_basic = lambda: BasicPlayer(make_env(), make_agent(), batch_size=1)
    player1 = make_basic()
    player2 = NStepPlayer(make_basic(), 3)
    raw_trans = [t for _ in range(40) for t in player1.play()]
    nstep_trans = [t for _ in range(40) for t in player2.play()]
    for raw, multi in zip(raw_trans, nstep_trans):
        for key in ['episode_step', 'episode_id', 'is_last']:
            assert raw[key] == multi[key]
        assert np.allclose(raw['model_outs']['actions'][0],
                           multi['model_outs']['actions'][0])
        assert np.allclose(raw['obs'], multi['obs'])
        assert raw['rewards'] == multi['rewards'][:1]
        assert raw['total_reward'] + sum(
            multi['rewards'][1:]) == multi['total_reward']
    for raw, multi in zip(raw_trans[3:], nstep_trans):
        if multi['new_obs'] is not None:
            assert np.allclose(multi['new_obs'], raw['obs'])
        else:
            assert multi['episode_id'] != raw['episode_id']
Example #5
def _gather_transitions(batch_size):
    player = NStepPlayer(
        BasicPlayer(make_env(), make_agent(), batch_size=batch_size), 3)
    transitions = []
    while len(transitions) < 50:
        transitions.extend(player.play())
    # The NStepPlayer is not required to preserve
    # the order of transitions.
    return sorted(transitions,
                  key=lambda t: (t['episode_id'], t['episode_step']))[:50]
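The _gather_transitions helper above is presumably used to check that an NStepPlayer yields the same transitions regardless of the wrapped player's batch size. A hypothetical usage sketch, reusing a _transitions_equal comparator like the one assumed in Example #1:

def test_batch_invariance():
    # Transitions gathered with batch size 1 serve as the reference.
    expected = _gather_transitions(1)
    for batch_size in [2, 3, 8]:
        actual = _gather_transitions(batch_size)
        assert len(actual) == len(expected)
        for trans1, trans2 in zip(expected, actual):
            assert _transitions_equal(trans1, trans2)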
Example #6
def test_nstep_one_step():
    """
    Test an NStepPlayer in the trivial, 1-step case.
    """
    make_env = lambda: SimpleEnv(15, (1, 2, 3), 'float32')
    make_agent = lambda: SimpleModel((1, 2, 3), stateful=True)
    make_basic = lambda: BasicPlayer(make_env(), make_agent(), batch_size=3)
    player1 = make_basic()
    player2 = NStepPlayer(make_basic(), 1)
    for _ in range(100):
        transes1 = player1.play()
        transes2 = player2.play()
        assert len(transes1) == len(transes2)
        for trans1, trans2 in zip(transes1, transes2):
            assert _transitions_equal(trans1, trans2)
Example #7
def finish(self, sess, dqn, optimize=True):
    eps_decay_sched = TFScheduleValue(
        sess,
        LinearTFSchedule(self.args['exploration_timesteps'],
                         self.args['initial_epsilon'],
                         self.args['final_epsilon'])
    ) if self.args['epsilon_decay'] else self.args['epsilon']
    return {
        "player": BasicPlayer(self.env,
                              EpsGreedyQNetwork(dqn.online_net, eps_decay_sched)),
        "optimize_op": (dqn.optimize(learning_rate=self.args['learning_rate'])
                        if optimize else None),
        "replay_buffer": UniformReplayBuffer(1000),
    }
Example #8
def main():
    if local_env:  # Select Random Level if local
        levels = ['SpringYardZone.Act3',
                  'SpringYardZone.Act2',
                  'GreenHillZone.Act3',
                  'GreenHillZone.Act1',
                  'StarLightZone.Act2',
                  'StarLightZone.Act1',
                  'MarbleZone.Act2',
                  'MarbleZone.Act1',
                  'MarbleZone.Act3',
                  'ScrapBrainZone.Act2',
                  'LabyrinthZone.Act2',
                  'LabyrinthZone.Act1',
                  'LabyrinthZone.Act3']
        level_choice = random.randrange(0, 13, 1)
        env = make_env(stack=True, scale_rew=False, local=local_env,
                       level_choice=level_choice)  # -3
    else:
        print('connecting to remote environment')
        env = grc.RemoteEnv('tmp/sock')
        print('starting episode')

    env = AllowBacktracking(env)

    solutions = env.solutions  # Track Solutions
    state_size = env.observation_space
    action_size = env.action_space.n
    print(state_size, action_size)
    env.assist = False
    env.trainer = train  # Begin with mentor-led exploration
    env.reset()

    while env.total_steps_ever <= TOTAL_TIMESTEPS:  # Interact with Retro environment until Total TimeSteps expire.
        while env.trainer:
            print('Entering Self Play')
            keys = getch()
            if keys == 'A':
                env.control(-1)
            if keys == 'B':
                env.control(4)
            if keys == 'C':
                env.control(3)
            if keys == 'D':
                env.control(2)
                buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
                actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
                           ['DOWN', 'B'], ['B']]
            if keys == 'rr':
                env.trainer = False
                continue
            if keys == ' ':
                env.close()
                env = make_env(stack=False, scale_rew=False, local=local_env)
                env = AllowBacktracking(env)
                env.reset()  # Initialize Gaming Environment
                env.trainer = True

        if env.episode % RL_PLAY_PCT == 0:

            tf.reset_default_graph()
            with tf.Session() as sess:
                def make_net(name):
                    return MLPQNetwork(sess,
                                       env.action_space.n,
                                       gym_space_vectorizer(env.observation_space),
                                       name,
                                       layer_sizes=[32])

                dqn = DQN(make_net('online'), make_net('target'))
                bplayer = BasicPlayer(env,
                                      EpsGreedyQNetwork(dqn.online_net, EPSILON),
                                      batch_size=STEPS_PER_UPDATE)
                optimize = dqn.optimize(learning_rate=LEARNING_RATE)

                sess.run(tf.global_variables_initializer())

                env.agent = 'DQN'
                dqn.train(num_steps=TRAINING_STEPS,
                          player=bplayer,
                          replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                          optimize_op=optimize,
                          target_interval=200,
                          batch_size=64,
                          min_buffer_size=200,
                          handle_ep=lambda _, rew: print('Exited DQN with : ' + str(rew) + str(env.steps)))

        new_ep = True  # New Episode Flag
        while new_ep:
            if new_ep:
                if (solutions and
                        random.random() < EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS):
                    new_state, new_rew, done = env.spawn()
                    continue
                else:
                    env.reset()
                    new_ep = False
            env.agent = 'JERK'
            rew, new_ep = move(env, 100)
            if not new_ep and rew <= 0:
                #print('backtracking due to negative reward: %f' % rew)
                _, new_ep = move(env, 70, left=True)
            if new_ep:
                solutions.append(([max(env.reward_history)], env.best_sequence()))
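The getch call in the self-play loop is not defined in this example; it evidently blocks for keyboard input (the 'rr' case suggests the original may return multi-character sequences). A rough single-keypress reader for POSIX terminals, offered only as an assumption about what getch might do:

import sys
import termios
import tty

def getch():
    """Hypothetical helper: read one raw keypress from stdin (POSIX only)."""
    fd = sys.stdin.fileno()
    old_settings = termios.tcgetattr(fd)
    try:
        # Switch the terminal to raw mode so a single key is returned
        # without waiting for Enter.
        tty.setraw(fd)
        ch = sys.stdin.read(1)
    finally:
        termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
    return ch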
Example #9
def main():
    if local_env:  # Select Random Level if local
        from retro_contest.local import make
        levels = [
            'SpringYardZone.Act3', 'SpringYardZone.Act2', 'GreenHillZone.Act3',
            'GreenHillZone.Act1', 'StarLightZone.Act2', 'StarLightZone.Act1',
            'MarbleZone.Act2', 'MarbleZone.Act1', 'MarbleZone.Act3',
            'ScrapBrainZone.Act2', 'LabyrinthZone.Act2', 'LabyrinthZone.Act1',
            'LabyrinthZone.Act3'
        ]
        level_choice = levels[random.randrange(0, 13, 1)]
        env = make(game='SonicTheHedgehog-Genesis', state=level_choice)
    else:
        print('connecting to remote environment')
        env = grc.RemoteEnv('tmp/sock')
        print('starting episode')

    env = TrackedEnv(env)

    solutions = env.solutions  # Track Solutions
    state_size = env.observation_space
    action_size = env.action_space.n
    print(state_size, action_size)
    env.assist = False
    env.trainer = False  # Skip mentor-led exploration
    env.resume_rl(True)  # Begin with RL exploration
    env.reset()

    while env.total_steps_ever <= TOTAL_TIMESTEPS:  # Interact with Retro environment until Total TimeSteps expire.
        while env.trainer:
            print('Entering Self Play')
            keys = getch()
            if keys == 'A':
                env.control(-1)
            if keys == 'B':
                env.control(4)
            if keys == 'C':
                env.control(3)
            if keys == 'D':
                env.control(2)
            if keys == 'rr':
                env.trainer = False
                continue
            if keys == ' ':
                env.close()
                env = make(game='SonicTheHedgehog-Genesis',
                           state=levels[random.randrange(0, 13, 1)])
                env = TrackedEnv(env)
                env.reset()  # Initialize Gaming Environment
                env.trainer = True
            if env.steps > 1:
                print('Prev Rew', env.step_rew_history[-1], 'Curr_Loc',
                      env.reward_history[-1], 'Med Rew',
                      np.median(env.step_rew_history[-3:]))

        if env.episode % RL_PLAY_PCT == 0:

            tf.reset_default_graph()
            with tf.Session() as sess:

                def make_net(name):
                    return MLPQNetwork(sess,
                                       env.action_space.n,
                                       gym_space_vectorizer(
                                           env.observation_space),
                                       name,
                                       layer_sizes=[32])

                dqn = DQN(make_net('online'), make_net('target'))
                bplayer = BasicPlayer(env,
                                      EpsGreedyQNetwork(
                                          dqn.online_net, EPSILON),
                                      batch_size=STEPS_PER_UPDATE)
                optimize = dqn.optimize(learning_rate=LEARNING_RATE)

                sess.run(tf.global_variables_initializer())

                env.agent = 'DQN'
                dqn.train(
                    num_steps=TRAINING_STEPS,
                    player=bplayer,
                    replay_buffer=PrioritizedReplayBuffer(500000,
                                                          0.5,
                                                          0.4,
                                                          epsilon=0.1),
                    optimize_op=optimize,
                    target_interval=200,
                    batch_size=64,
                    min_buffer_size=200,
                    handle_ep=lambda _, rew: print('Exited DQN with : ' + str(
                        rew) + str(env.steps)))

        new_ep = True  # New Episode Flag
        while new_ep:
            if new_ep:
                if (solutions and random.random() <
                        EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS):
                    solutions = sorted(solutions, key=lambda x: np.mean(x[0]))
                    best_pair = solutions[-1]
                    new_rew = exploit(env, best_pair[1])
                    best_pair[0].append(new_rew)
                    print('replayed best with reward %f' % new_rew)
                    print(best_pair[0])
                    continue
                else:
                    env.reset()
                    new_ep = False
            env.agent = 'JERK'
            rew, new_ep = move(env, 100)
            if not new_ep and rew <= 0:
                #print('backtracking due to negative reward: %f' % rew)
                _, new_ep = move(env, 70, left=True)
            if new_ep:
                solutions.append(
                    ([max(env.reward_history)], env.best_sequence()))
Example #10
def make_basic():
    return BasicPlayer(make_env(), make_agent(), batch_size=1)
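As in Example #4, this factory is presumably wrapped in an NStepPlayer so that each transition carries multi-step reward information; a brief, assumed usage:

# Hypothetical usage, mirroring Example #4: wrap the single-step player
# so each transition accumulates up to 3 steps of rewards.
player = NStepPlayer(make_basic(), 3)
transitions = [t for _ in range(40) for t in player.play()]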