def test_single_batch():
    """
    Test BatchedPlayer when the batch size is 1.
    """
    make_env = lambda: SimpleEnv(9, (1, 2, 3), 'float32')
    make_agent = lambda: SimpleModel((1, 2, 3), stateful=True)
    basic_player = BasicPlayer(make_env(), make_agent(), 3)
    batched_player = BatchedPlayer(batched_gym_env([make_env]), make_agent(), 3)
    for _ in range(50):
        transes1 = basic_player.play()
        transes2 = batched_player.play()
        assert len(transes1) == len(transes2)
        for trans1, trans2 in zip(transes1, transes2):
            assert _transitions_equal(trans1, trans2)
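# Hedged sketch (assumption, not taken from this repo): _transitions_equal is a
# test helper defined elsewhere in this module. Judging from the fields that
# test_nstep_multi_step compares below, it presumably checks each transition
# field, roughly along these lines:
#
#   def _transitions_equal(trans1, trans2):
#       for key in ['episode_step', 'episode_id', 'is_last', 'total_reward', 'rewards']:
#           if trans1[key] != trans2[key]:
#           	return False
#       if not np.allclose(trans1['obs'], trans2['obs']):
#           return False
#       return np.allclose(trans1['model_outs']['actions'][0],
#                          trans2['model_outs']['actions'][0])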
def main():
    """
    Entry-point for the program.
    """
    env = gym.make('CartPole-v0')
    with tf.Session() as sess:
        make_net = lambda name: MLPQNetwork(sess,
                                            env.action_space.n,
                                            gym_space_vectorizer(env.observation_space),
                                            name,
                                            layer_sizes=[32])
        dqn = DQN(make_net('online'), make_net('target'))
        player = BasicPlayer(env, EpsGreedyQNetwork(dqn.online_net, EPSILON),
                             batch_size=STEPS_PER_UPDATE)
        optimize = dqn.optimize(learning_rate=LEARNING_RATE)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=30000,
                  player=player,
                  replay_buffer=UniformReplayBuffer(BUFFER_SIZE),
                  optimize_op=optimize,
                  target_interval=200,
                  batch_size=64,
                  min_buffer_size=200,
                  handle_ep=lambda _, rew: print('got reward: ' + str(rew)))
    env.close()
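# Hedged sketch (assumption, not from the source): main() above relies on
# module-level imports and hyperparameter constants along these lines. The
# import paths follow the anyrl package layout and the constant values are
# illustrative only.
#
#   import gym
#   import tensorflow as tf
#   from anyrl.algos import DQN
#   from anyrl.models import MLPQNetwork, EpsGreedyQNetwork
#   from anyrl.rollouts import BasicPlayer, UniformReplayBuffer
#   from anyrl.spaces import gym_space_vectorizer
#
#   LEARNING_RATE = 0.001
#   EPSILON = 0.1
#   BUFFER_SIZE = 50000
#   STEPS_PER_UPDATE = 4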
def test_mixed_batch():
    """
    Test a batch with a bunch of different environments.
    """
    env_fns = [lambda s=seed: SimpleEnv(s, (1, 2, 3), 'float32')
               for seed in [3, 3, 3, 3, 3, 3]]  # [5, 8, 1, 9, 3, 2]
    make_agent = lambda: SimpleModel((1, 2, 3), stateful=True)
    for num_sub in [1, 2, 3]:
        batched_player = BatchedPlayer(batched_gym_env(env_fns, num_sub_batches=num_sub),
                                       make_agent(), 3)
        expected_eps = []
        for player in [BasicPlayer(env_fn(), make_agent(), 3) for env_fn in env_fns]:
            transes = [t for _ in range(50) for t in player.play()]
            expected_eps.extend(_separate_episodes(transes))
        actual_transes = [t for _ in range(50) for t in batched_player.play()]
        actual_eps = _separate_episodes(actual_transes)
        assert len(expected_eps) == len(actual_eps)
        for episode in expected_eps:
            found = False
            for i, actual in enumerate(actual_eps):
                if _episodes_equivalent(episode, actual):
                    del actual_eps[i]
                    found = True
                    break
            assert found
def test_nstep_multi_step():
    """
    Test an NStepPlayer in the multi-step case.
    """
    make_env = lambda: SimpleEnv(9, (1, 2, 3), 'float32')
    make_agent = lambda: SimpleModel((1, 2, 3), stateful=True)
    make_basic = lambda: BasicPlayer(make_env(), make_agent(), batch_size=1)
    player1 = make_basic()
    player2 = NStepPlayer(make_basic(), 3)
    raw_trans = [t for _ in range(40) for t in player1.play()]
    nstep_trans = [t for _ in range(40) for t in player2.play()]
    for raw, multi in zip(raw_trans, nstep_trans):
        for key in ['episode_step', 'episode_id', 'is_last']:
            assert raw[key] == multi[key]
        assert np.allclose(raw['model_outs']['actions'][0],
                           multi['model_outs']['actions'][0])
        assert np.allclose(raw['obs'], multi['obs'])
        assert raw['rewards'] == multi['rewards'][:1]
        assert raw['total_reward'] + sum(multi['rewards'][1:]) == multi['total_reward']
    for raw, multi in zip(raw_trans[3:], nstep_trans):
        if multi['new_obs'] is not None:
            assert np.allclose(multi['new_obs'], raw['obs'])
        else:
            assert multi['episode_id'] != raw['episode_id']
def _gather_transitions(batch_size):
    player = NStepPlayer(BasicPlayer(make_env(), make_agent(), batch_size=batch_size), 3)
    transitions = []
    while len(transitions) < 50:
        transitions.extend(player.play())
    # The NStepPlayer is not required to preserve
    # the order of transitions.
    return sorted(transitions,
                  key=lambda t: (t['episode_id'], t['episode_step']))[:50]
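# Hedged usage sketch (assumption, not from the source): _gather_transitions is
# presumably called with several batch sizes and the sorted transition lists
# compared pairwise, checking that the NStepPlayer's output does not depend on
# how the underlying BasicPlayer batches its steps:
#
#   for batch_size in [2, 3]:
#       for trans1, trans2 in zip(_gather_transitions(1),
#                                 _gather_transitions(batch_size)):
#           assert _transitions_equal(trans1, trans2)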
def test_nstep_one_step():
    """
    Test an NStepPlayer in the trivial, 1-step case.
    """
    make_env = lambda: SimpleEnv(15, (1, 2, 3), 'float32')
    make_agent = lambda: SimpleModel((1, 2, 3), stateful=True)
    make_basic = lambda: BasicPlayer(make_env(), make_agent(), batch_size=3)
    player1 = make_basic()
    player2 = NStepPlayer(make_basic(), 1)
    for _ in range(100):
        transes1 = player1.play()
        transes2 = player2.play()
        assert len(transes1) == len(transes2)
        for trans1, trans2 in zip(transes1, transes2):
            assert _transitions_equal(trans1, trans2)
def finish(self, sess, dqn, optimize=True):
    eps_decay_sched = TFScheduleValue(
        sess,
        LinearTFSchedule(self.args['exploration_timesteps'],
                         self.args['initial_epsilon'],
                         self.args['final_epsilon'])
    ) if self.args['epsilon_decay'] else self.args['epsilon']
    return {
        "player": BasicPlayer(self.env,
                              EpsGreedyQNetwork(dqn.online_net, eps_decay_sched)),
        "optimize_op": dqn.optimize(learning_rate=self.args['learning_rate']) if optimize else None,
        "replay_buffer": UniformReplayBuffer(1000),
    }
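# Hedged usage sketch (assumption, not from the source): the dict returned by
# finish() looks designed to be unpacked into DQN.train() alongside the
# remaining training arguments; 'runner' below is a hypothetical instance of
# the class that defines finish().
#
#   kwargs = runner.finish(sess, dqn)
#   dqn.train(num_steps=30000,
#             target_interval=200,
#             batch_size=64,
#             min_buffer_size=200,
#             **kwargs)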
def main():
    if local_env:  # Select Random Level if local
        levels = ['SpringYardZone.Act3',
                  'SpringYardZone.Act2',
                  'GreenHillZone.Act3',
                  'GreenHillZone.Act1',
                  'StarLightZone.Act2',
                  'StarLightZone.Act1',
                  'MarbleZone.Act2',
                  'MarbleZone.Act1',
                  'MarbleZone.Act3',
                  'ScrapBrainZone.Act2',
                  'LabyrinthZone.Act2',
                  'LabyrinthZone.Act1',
                  'LabyrinthZone.Act3']
        level_choice = random.randrange(0, 13, 1)
        env = make_env(stack=True, scale_rew=False, local=local_env,
                       level_choice=level_choice)  # -3
    else:
        print('connecting to remote environment')
        env = grc.RemoteEnv('tmp/sock')
        print('starting episode')

    env = AllowBacktracking(env)
    solutions = env.solutions  # Track Solutions
    state_size = env.observation_space
    action_size = env.action_space.n
    print(state_size, action_size)
    env.assist = False
    env.trainer = train  # Begin with mentor led exploration
    env.reset()

    # Interact with Retro environment until Total TimeSteps expire.
    while env.total_steps_ever <= TOTAL_TIMESTEPS:
        while env.trainer:
            print('Entering Self Play')
            keys = getch()
            if keys == 'A':
                env.control(-1)
            if keys == 'B':
                env.control(4)
            if keys == 'C':
                env.control(3)
            if keys == 'D':
                env.control(2)
            buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT",
                       "RIGHT", "C", "Y", "X", "Z"]
            actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'],
                       ['DOWN'], ['DOWN', 'B'], ['B']]
            if keys == 'rr':
                env.trainer = False
                continue
            if keys == ' ':
                env.close()
                env = make_env(stack=False, scale_rew=False, local=local_env)
                env = AllowBacktracking(env)
                env.reset()  # Initialize Gaming Environment
                env.trainer = True

        if env.episode % RL_PLAY_PCT == 0:
            tf.reset_default_graph()
            with tf.Session() as sess:
                def make_net(name):
                    return MLPQNetwork(sess,
                                       env.action_space.n,
                                       gym_space_vectorizer(env.observation_space),
                                       name,
                                       layer_sizes=[32])
                dqn = DQN(make_net('online'), make_net('target'))
                bplayer = BasicPlayer(env, EpsGreedyQNetwork(dqn.online_net, EPSILON),
                                      batch_size=STEPS_PER_UPDATE)
                optimize = dqn.optimize(learning_rate=LEARNING_RATE)
                sess.run(tf.global_variables_initializer())
                env.agent = 'DQN'
                dqn.train(num_steps=TRAINING_STEPS,
                          player=bplayer,
                          replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                          optimize_op=optimize,
                          target_interval=200,
                          batch_size=64,
                          min_buffer_size=200,
                          handle_ep=lambda _, rew: print('Exited DQN with : ' + str(rew) + str(env.steps)))

        new_ep = True  # New Episode Flag
        while new_ep:
            if new_ep:
                if (solutions and
                        random.random() < EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS):
                    new_state, new_rew, done = env.spawn()
                    continue
                else:
                    env.reset()
                    new_ep = False
            env.agent = 'JERK'
            rew, new_ep = move(env, 100)
            if not new_ep and rew <= 0:
                # print('backtracking due to negative reward: %f' % rew)
                _, new_ep = move(env, 70, left=True)
            if new_ep:
                solutions.append(([max(env.reward_history)], env.best_sequence()))
def main():
    if local_env:  # Select Random Level if local
        from retro_contest.local import make
        levels = ['SpringYardZone.Act3',
                  'SpringYardZone.Act2',
                  'GreenHillZone.Act3',
                  'GreenHillZone.Act1',
                  'StarLightZone.Act2',
                  'StarLightZone.Act1',
                  'MarbleZone.Act2',
                  'MarbleZone.Act1',
                  'MarbleZone.Act3',
                  'ScrapBrainZone.Act2',
                  'LabyrinthZone.Act2',
                  'LabyrinthZone.Act1',
                  'LabyrinthZone.Act3']
        level_choice = levels[random.randrange(0, 13, 1)]
        env = make(game='SonicTheHedgehog-Genesis', state=level_choice)
    else:
        print('connecting to remote environment')
        env = grc.RemoteEnv('tmp/sock')
        print('starting episode')

    env = TrackedEnv(env)
    solutions = env.solutions  # Track Solutions
    state_size = env.observation_space
    action_size = env.action_space.n
    print(state_size, action_size)
    env.assist = False
    env.trainer = False  # Begin with mentor led exploration
    env.resume_rl(True)  # Begin with RL exploration
    env.reset()

    # Interact with Retro environment until Total TimeSteps expire.
    while env.total_steps_ever <= TOTAL_TIMESTEPS:
        while env.trainer:
            print('Entering Self Play')
            keys = getch()
            if keys == 'A':
                env.control(-1)
            if keys == 'B':
                env.control(4)
            if keys == 'C':
                env.control(3)
            if keys == 'D':
                env.control(2)
            if keys == 'rr':
                env.trainer = False
                continue
            if keys == ' ':
                env.close()
                env = make(game='SonicTheHedgehog-Genesis',
                           state=levels[random.randrange(0, 13, 1)])
                env = TrackedEnv(env)
                env.reset()  # Initialize Gaming Environment
                env.trainer = True

        if env.steps > 1:
            print('Prev Rew', env.step_rew_history[-1],
                  'Curr_Loc', env.reward_history[-1],
                  'Med Rew', np.median(env.step_rew_history[-3:]))

        if env.episode % RL_PLAY_PCT == 0:
            tf.reset_default_graph()
            with tf.Session() as sess:
                def make_net(name):
                    return MLPQNetwork(sess,
                                       env.action_space.n,
                                       gym_space_vectorizer(env.observation_space),
                                       name,
                                       layer_sizes=[32])
                dqn = DQN(make_net('online'), make_net('target'))
                bplayer = BasicPlayer(env, EpsGreedyQNetwork(dqn.online_net, EPSILON),
                                      batch_size=STEPS_PER_UPDATE)
                optimize = dqn.optimize(learning_rate=LEARNING_RATE)
                sess.run(tf.global_variables_initializer())
                env.agent = 'DQN'
                dqn.train(num_steps=TRAINING_STEPS,
                          player=bplayer,
                          replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                          optimize_op=optimize,
                          target_interval=200,
                          batch_size=64,
                          min_buffer_size=200,
                          handle_ep=lambda _, rew: print('Exited DQN with : ' + str(rew) + str(env.steps)))

        new_ep = True  # New Episode Flag
        while new_ep:
            if new_ep:
                if (solutions and
                        random.random() < EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS):
                    solutions = sorted(solutions, key=lambda x: np.mean(x[0]))
                    best_pair = solutions[-1]
                    new_rew = exploit(env, best_pair[1])
                    best_pair[0].append(new_rew)
                    print('replayed best with reward %f' % new_rew)
                    print(best_pair[0])
                    continue
                else:
                    env.reset()
                    new_ep = False
            env.agent = 'JERK'
            rew, new_ep = move(env, 100)
            if not new_ep and rew <= 0:
                # print('backtracking due to negative reward: %f' % rew)
                _, new_ep = move(env, 70, left=True)
            if new_ep:
                solutions.append(([max(env.reward_history)], env.best_sequence()))
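# Hedged sketch (assumption): move() and exploit() are not defined in this file.
# The sketch below is modeled on the public retro-contest JERK baseline and
# matches the call sites above (rew, new_ep = move(env, 100)); the version used
# in this repo may differ. It assumes `import random` and `import numpy as np`.

def move(env, num_steps, left=False, jump_prob=1.0 / 10.0, jump_repeat=4):
    """Move right (or left) for up to num_steps, jumping periodically."""
    total_rew = 0.0
    done = False
    steps_taken = 0
    jumping_steps_left = 0
    while not done and steps_taken < num_steps:
        action = np.zeros((12,), dtype=bool)
        action[6] = left          # LEFT button
        action[7] = not left      # RIGHT button
        if jumping_steps_left > 0:
            action[0] = True      # keep holding the jump button
            jumping_steps_left -= 1
        elif random.random() < jump_prob:
            jumping_steps_left = jump_repeat - 1
            action[0] = True
        _, rew, done, _ = env.step(action)
        total_rew += rew
        steps_taken += 1
    return total_rew, done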
def make_basic():
    return BasicPlayer(make_env(), make_agent(), batch_size=1)