def test_simple_importance_sampling(self):
    """
    Test importance sampling when the buffer is never changed
    after the initial build-up.
    """
    np.random.seed(1337)
    buf = PrioritizedReplayBuffer(capacity=10, alpha=1.5, beta=1.3, epsilon=0.5)
    for i in range(10):
        sample = {
            'obs': 0,
            'action': 0,
            'reward': 0,
            'new_obs': 0,
            'steps': 1,
            'idx': i
        }
        buf.add_sample(sample, init_weight=i)
    weights = np.power(np.arange(10).astype('float64') + 0.5, 1.5)
    weights /= np.sum(weights)
    weights = np.power(weights * len(weights), -1.3)
    weights /= np.max(weights)
    for i in range(1000):
        samples = buf.sample(3)
        for sample in samples:
            self.assertTrue(np.allclose(weights[sample['idx']], sample['weight']))
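# A minimal sketch (not part of the original tests) of the prioritized-replay
# math that the expected `weights` above implement, assuming the usual PER
# definitions: priority p_i = (|error_i| + epsilon) ** alpha, sampling
# probability P(i) = p_i / sum_j p_j, and importance weight
# w_i = (N * P(i)) ** (-beta), normalized by the maximum weight.
import numpy as np

def expected_importance_weights(errors, alpha, beta, epsilon):
    priorities = np.power(np.abs(errors) + epsilon, alpha)
    probs = priorities / np.sum(priorities)
    weights = np.power(probs * len(probs), -beta)
    return weights / np.max(weights)

# With errors 0..9, alpha=1.5, beta=1.3, epsilon=0.5 this reproduces the
# `weights` array computed in the test above.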
def test_prioritized_sampling(self):
    """
    Test the buffer in a simple prioritized setting.
    """
    np.random.seed(1337)
    buf = PrioritizedReplayBuffer(capacity=10, alpha=1.5, beta=1, epsilon=0.5)
    for i in range(10):
        sample = {
            'obs': 0,
            'action': 0,
            'reward': 0,
            'new_obs': 0,
            'steps': 1,
            'idx': i
        }
        buf.add_sample(sample, init_weight=i)
    sampled_idxs = []
    for i in range(50000):
        for sample in buf.sample(3):
            sampled_idxs.append(sample['idx'])
    counts = Counter(sampled_idxs)
    probs = np.power(np.arange(10).astype('float64') + 0.5, 1.5)
    probs /= np.sum(probs)
    for i, prob in enumerate(probs):
        frac = counts[i] / len(sampled_idxs)
        self.assertGreater(frac, prob - 0.01)
        self.assertLess(frac, prob + 0.01)
def test_online_updates():
    """
    Test importance sampling for PrioritizedReplayBuffer when new samples
    and errors are inserted.
    """
    buf = PrioritizedReplayBuffer(capacity=10, alpha=1.5, beta=0.5, epsilon=0.5)
    weights = []

    def _random_weight():
        return np.abs(np.random.normal())

    def _add_sample():
        sample = {'obs': 0, 'action': 0, 'reward': 0, 'new_obs': 0, 'steps': 1}
        weight = _random_weight()
        buf.add_sample(sample, init_weight=weight)
        weights.append(weight)

    for _ in range(5):
        _add_sample()
    for _ in range(1000):
        samples = buf.sample(3)
        importance = np.power(np.array(weights) + 0.5, 1.5)
        # Normalize by the sum of the exponentiated priorities, as in the
        # tests above, rather than by the raw weights.
        importance /= np.sum(importance)
        importance = np.power(importance * len(importance), -0.5)
        importance /= np.max(importance)
        new_weights = []
        for sample in samples:
            assert np.allclose(importance[sample['id']], sample['weight'])
            weight = _random_weight()
            weights[sample['id']] = weight
            new_weights.append(weight)
        buf.update_weights(samples, new_weights)
        _add_sample()
        if len(weights) > 10:
            weights = weights[1:]
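# Illustrative only: how the sample/update cycle exercised by the test above
# typically looks inside a training step. `compute_td_errors` is a
# hypothetical helper standing in for a forward pass through the Q-network.
def train_step(buf, batch_size, compute_td_errors):
    batch = buf.sample(batch_size)
    # Each sampled transition carries an importance 'weight' that should
    # scale its loss, and an 'id' used internally by the buffer.
    errors = compute_td_errors(batch)
    # Feed the new absolute TD errors back so that future sampling
    # prioritizes the transitions the model currently gets wrong.
    buf.update_weights(batch, [abs(err) for err in errors])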
def main():
    """Run DQN until the environment throws an exception."""
    env_name = 'MineRLNavigateDense-v0'
    base_env = [SimpleNavigateEnvWrapper(get_env(env_name)) for _ in range(1)]
    env = BatchedFrameStack(BatchedGymEnv([base_env]), num_images=4, concat=True)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        online, target = mine_rainbow_online_target(mine_cnn, sess,
                                                    env.action_space.n,
                                                    gym_space_vectorizer(env.observation_space),
                                                    min_val=-200,
                                                    max_val=200)
        dqn = DQN(online, target)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        buffer_capacity = 5000
        replay_buffer = PrioritizedReplayBuffer(buffer_capacity, 0.5, 0.4, epsilon=0.1)
        # Pre-fill the replay buffer with expert demonstration trajectories.
        expert_data = non_bugged_data_arr(env_name, num_trajs=100)
        expert_player = NStepPlayer(ImitationPlayer(expert_data, 200), 3)
        for traj in expert_player.play():
            replay_buffer.add_sample(traj, init_weight=1)

        print('starting training')
        dqn.train(num_steps=200,
                  player=player,
                  replay_buffer=replay_buffer,
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)

        print('starting eval')
        player._cur_states = None
        score = evaluate(player)
        print(score)
def main():
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-421,
                                  max_val=421))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=64,
                  batch_size=32,
                  min_buffer_size=25000)
def main():
    """Run DQN until the environment throws an exception."""
    env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1')
    env = AllowBacktracking(make_local_env(env, stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        # num_steps is assumed to be defined elsewhere in the module.
        dqn.train(num_steps=num_steps,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)
        print(tf.trainable_variables())
        save_path = '/home/noob/retro-noob/rainbow/params/params'
        utils.save_state(save_path + '_tf_saver')
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
            ps = sess.run(params)
            joblib.dump(ps, save_path + '_joblib')
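# Illustrative only: loading the joblib-dumped parameters back into a graph
# built the same way. The dump above stores a list of numpy arrays in the
# order returned by tf.trainable_variables(), so restoring assigns them back
# in that order. This helper is an assumption, not part of the original code.
def restore_joblib_params(sess, path):
    values = joblib.load(path + '_joblib')
    for var, val in zip(tf.trainable_variables(), values):
        sess.run(tf.assign(var, val))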
def main():
    """Run DQN until the environment throws an exception."""
    base_path = "results/rainbow/6/"
    env = make_env(stack=False,
                   scale_rew=False,
                   render=None,
                   monitor=base_path + "train_monitor",
                   episodic_life=True)  # I think the env itself allows Backtracking
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        saver = tf.train.Saver(name="rainbow")
        sess.run(tf.global_variables_initializer())
        saver.save(sess, base_path + "training", global_step=0)
        try:
            # handle_ep is assumed to be defined elsewhere in the module.
            dqn.train(num_steps=2_000_000,  # Make sure an exception arrives before we stop.
                      player=player,
                      replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                      optimize_op=optimize,
                      train_interval=1,
                      target_interval=8192,
                      batch_size=64,
                      min_buffer_size=20000,
                      handle_ep=handle_ep)
        except KeyboardInterrupt:
            print("keyboard interrupt")
        print("finishing")
        saver.save(sess, base_path + "final", global_step=2_000_000)
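# Illustrative only: the handle_ep callback passed above is not defined in
# this snippet. In the other scripts in this collection it receives the
# episode length and total reward, so a minimal hypothetical version could be:
def handle_ep(steps, rew):
    print('episode finished after %d steps with reward %f' % (steps, rew))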
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        # num_steps is assumed to be defined elsewhere in the module.
        dqn.train(num_steps=num_steps,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        # Docstring of dqn.optimize, kept here for reference:
        #   Create a TF Op that optimizes the objective.
        #   Args:
        #     learning_rate: the Adam learning rate.
        #     epsilon: the Adam epsilon.
        optimize = dqn.optimize(learning_rate=6.25e-5, epsilon=1.5e-4)
        sess.run(tf.global_variables_initializer())
        # Docstring of dqn.train, kept here for reference:
        #   Run an automated training loop.
        #   This is meant to provide a convenient way to run a standard training
        #   loop without any modifications. You may get more flexibility by
        #   writing your own training loop.
        #   Args:
        #     num_steps: the number of timesteps to run.
        #     player: the Player for gathering experience.
        #     replay_buffer: the ReplayBuffer for experience.
        #     optimize_op: a TF Op to optimize the model.
        #     train_interval: timesteps per training step.
        #     target_interval: number of timesteps between target network updates.
        #     batch_size: the size of experience mini-batches.
        #     min_buffer_size: minimum replay buffer size before training is performed.
        #     tf_schedules: a sequence of TFSchedules that are updated with the
        #       number of steps taken.
        #     handle_ep: called with information about every completed episode.
        #     timeout: if set, this is a number of seconds after which the
        #       training loop should exit.
        dqn.train(num_steps=1000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)
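# Illustrative only: a fragment showing how the optional arguments documented
# above (tf_schedules, handle_ep, timeout) can be wired into the same train
# call, reusing the dqn/player/optimize objects built in the main() above.
# LinearTFSchedule is constructed the same way as in the commented-out
# exploration code later in this collection; the exact values are assumptions.
eps_decay = LinearTFSchedule(50000, 1.0, 0.01)

def log_episode(steps, rew):
    print('episode: %d steps, reward=%f' % (steps, rew))

dqn.train(num_steps=1000000,
          player=player,
          replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
          optimize_op=optimize,
          train_interval=1,
          target_interval=8192,
          batch_size=32,
          min_buffer_size=20000,
          tf_schedules=[eps_decay],  # updated with the current step count
          handle_ep=log_episode,     # called after every completed episode
          timeout=3600)              # optional wall-clock limit in seconds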
def test_uniform_sampling(self):
    """
    Test the buffer when it's configured to sample uniformly.
    """
    np.random.seed(1337)
    buf = PrioritizedReplayBuffer(capacity=10, alpha=0, beta=1)
    for i in range(10):
        sample = {
            'obs': 0,
            'action': 0,
            'reward': 0,
            'new_obs': 0,
            'steps': 1,
            'idx': i
        }
        buf.add_sample(sample)
    sampled_idxs = []
    for _ in range(10000):
        samples = buf.sample(3)
        sampled_idxs.extend([s['idx'] for s in samples])
        buf.update_weights(samples, [s['idx'] for s in samples])
    counts = Counter(sampled_idxs)
    for i in range(10):
        frac = counts[i] / len(sampled_idxs)
        self.assertGreater(frac, 0.09)
        self.assertLess(frac, 0.11)
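# Illustrative only: with alpha=0 every priority collapses to
# (error + epsilon) ** 0 == 1, so P(i) = 1/N regardless of the weights the
# test feeds to update_weights, and every importance weight
# (N * P(i)) ** (-beta) equals 1. A quick sketch of that invariant, assuming
# the same imports as the tests above:
import numpy as np

buf = PrioritizedReplayBuffer(capacity=10, alpha=0, beta=1)
for i in range(10):
    buf.add_sample({'obs': 0, 'action': 0, 'reward': 0, 'new_obs': 0,
                    'steps': 1, 'idx': i}, init_weight=i)
assert all(np.isclose(s['weight'], 1) for s in buf.sample(3))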
def finish(self, sess, dqn):
    env = BatchedGymEnv([[self.env]])
    return {
        "player": NStepPlayer(BatchedPlayer(self.env, dqn.online_net), 3),
        "optimize_op": dqn.optimize(learning_rate=0.002),
        "replay_buffer": PrioritizedReplayBuffer(20000, 0.5, 0.4, epsilon=0.2),
    }
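# Illustrative only: consuming the dictionary returned by finish(). `task` is
# a hypothetical instance of the class that defines finish(); the remaining
# keyword arguments are assumptions chosen to match the other scripts here.
parts = task.finish(sess, dqn)
dqn.train(num_steps=100000,
          train_interval=1,
          target_interval=8192,
          batch_size=32,
          min_buffer_size=1000,
          **parts)  # supplies player, optimize_op, and replay_buffer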
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        # The schedule is still required by the tf_schedules argument below,
        # even though the epsilon-greedy players that used it are disabled.
        eps_decay_sched = LinearTFSchedule(50000, 1.0, 0.01)
        # Other exploration schedules:
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, 0.1)), 3)
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)
        #player = NStepPlayer(BatchedPlayer(env, SonicEpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 10 == 0:
                print('%d episodes, %d steps: mean of last 100 episodes=%f' %
                      (len(reward_hist), total_steps,
                       sum(reward_hist[-100:]) / len(reward_hist[-100:])))

        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            tf_schedules=[eps_decay_sched],
            handle_ep=_handle_ep,
            restore_path='./pretrained_model',
            save_interval=None,
        )
def main():
    """Run DQN until the environment throws an exception."""
    env_fns, env_names = create_envs()
    env = BatchedFrameStack(batched_gym_env(env_fns), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)  # Use ADAM
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 1 == 0:
                print('%d episodes, %d steps: mean of last 100 episodes=%f' %
                      (len(reward_hist), total_steps,
                       sum(reward_hist[-100:]) / len(reward_hist[-100:])))

        dqn.train(
            num_steps=2000000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep,
            num_envs=len(env_fns),
            save_interval=10,
        )
                              num_images=4, concat=False)
    player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
    print(i, game, stage)
    print('training steps:', train_steps)
    start = time.time()
    dqn.train(
        num_steps=train_steps,  # Make sure an exception arrives before we stop.
        player=player,
        replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
        optimize_op=optimize,
        train_interval=1,
        target_interval=8192,
        batch_size=32,
        min_buffer_size=10000)
    end = time.time()
    print(end - start)
    print('closing env')
    env.close()
def main(): """Run DQN until the environment throws an exception.""" envs = make_envs(stack=False, scale_rew=False) for i in range(len(envs)): envs[i] = AllowBacktracking(envs[i]) envs[i] = BatchedFrameStack(BatchedGymEnv([[envs[i]]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: online_model, target_model = rainbow_models( sess, envs[0].action_space.n, gym_space_vectorizer(envs[0].observation_space), min_val=-200, max_val=200) replay_buffer = PrioritizedReplayBuffer(400000, 0.5, 0.4, epsilon=0.1) dqn = DQN(online_model, target_model) players = [] for env in envs: player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) players.append(player) optimize = dqn.optimize(learning_rate=1e-4) with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): saver = tf.train.Saver([ tf.get_variable(name) for name in [ 'online/layer_1/conv2d/kernel', 'online/layer_1/conv2d/bias', 'online/layer_2/conv2d/kernel', 'online/layer_2/conv2d/bias', 'online/layer_3/conv2d/kernel', 'online/layer_3/conv2d/bias', 'target/layer_1/conv2d/kernel', 'target/layer_1/conv2d/bias', 'target/layer_2/conv2d/kernel', 'target/layer_2/conv2d/bias', 'target/layer_3/conv2d/kernel', 'target/layer_3/conv2d/bias', ] ]) # or """ sess.run(tf.variables_initializer([tf.get_variable(name) for name in [ 'online/noisy_layer/weight_mu', 'online/noisy_layer/bias_mu', 'online/noisy_layer/weight_sigma', 'online/noisy_layer/bias_sigma', 'online/noisy_layer_1/weight_mu', 'online/noisy_layer_1/bias_mu', 'online/noisy_layer_1/weight_sigma', 'online/noisy_layer_1/bias_sigma', 'online/noisy_layer_2/weight_mu', 'online/noisy_layer_2/bias_mu', 'online/noisy_layer_2/weight_sigma', 'online/noisy_layer_2/bias_sigma', 'target/noisy_layer/weight_mu', 'target/noisy_layer/bias_mu', 'target/noisy_layer/weight_sigma', 'target/noisy_layer/bias_sigma', 'target/noisy_layer_1/weight_mu', 'target/noisy_layer_1/bias_mu', 'target/noisy_layer_1/weight_sigma', 'target/noisy_layer_1/bias_sigma', 'target/noisy_layer_2/weight_mu', 'target/noisy_layer_2/bias_mu', 'target/noisy_layer_2/weight_sigma', 'target/noisy_layer_2/bias_sigma', 'beta1_power', 'beta2_power', 'online/layer_1/conv2d/kernel/Adam', 'online/layer_1/conv2d/kernel/Adam_1', 'online/layer_1/conv2d/bias/Adam', 'online/layer_1/conv2d/bias/Adam_1', 'online/layer_2/conv2d/kernel/Adam', 'online/layer_2/conv2d/kernel/Adam_1', 'online/layer_2/conv2d/bias/Adam', 'online/layer_2/conv2d/bias/Adam_1', 'online/layer_3/conv2d/kernel/Adam', 'online/layer_3/conv2d/kernel/Adam_1', 'online/layer_3/conv2d/bias/Adam', 'online/layer_3/conv2d/bias/Adam_1', 'online/noisy_layer/weight_mu/Adam', 'online/noisy_layer/weight_mu/Adam_1', 'online/noisy_layer/bias_mu/Adam', 'online/noisy_layer/bias_mu/Adam_1', 'online/noisy_layer/weight_sigma/Adam', 'online/noisy_layer/weight_sigma/Adam_1', 'online/noisy_layer/bias_sigma/Adam', 'online/noisy_layer/bias_sigma/Adam_1', 'online/noisy_layer_1/weight_mu/Adam', 'online/noisy_layer_1/weight_mu/Adam_1', 'online/noisy_layer_1/bias_mu/Adam', 'online/noisy_layer_1/bias_mu/Adam_1', 'online/noisy_layer_1/weight_sigma/Adam', 'online/noisy_layer_1/weight_sigma/Adam_1', 'online/noisy_layer_1/bias_sigma/Adam', 'online/noisy_layer_1/bias_sigma/Adam_1', 'online/noisy_layer_2/weight_mu/Adam', 'online/noisy_layer_2/weight_mu/Adam_1', 'online/noisy_layer_2/bias_mu/Adam', 'online/noisy_layer_2/bias_mu/Adam_1', 'online/noisy_layer_2/weight_sigma/Adam', 
'online/noisy_layer_2/weight_sigma/Adam_1', 'online/noisy_layer_2/bias_sigma/Adam', 'online/noisy_layer_2/bias_sigma/Adam_1', ]])) """ #sess.run( tf.initialize_variables( list( tf.get_variable(name) for name in sess.run( tf.report_uninitialized_variables( tf.all_variables( ) ) ) ) ) ) sess.run(tf.global_variables_initializer()) # either saver.restore(sess, '/root/compo/model') # end either for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES): print(i.name) while True: dqn.train(num_steps=16384, players=players, replay_buffer=replay_buffer, optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000) saver.save(sess, '/root/compo/out/model')
def train(batched_env,
          env_count=1,
          batch_size_multiplier=32,
          num_steps=2000000,
          pretrained_model='artifacts/model/model.cpkt',
          output_dir='artifacts/model',
          use_schedules=True):
    """
    Trains on a batched_env using anyrl-py's dqn and rainbow model.

    env_count: The number of envs in batched_env
    batch_size_multiplier: batch_size of the dqn train call will be
        env_count * batch_size_multiplier
    num_steps: The number of steps to run training for
    pretrained_model: Load tf weights from this model file
    output_dir: Save tf weights to this file
    use_schedules: Enables the tf_schedules for the train call. Schedules
        require internet access, so don't include on the retro-contest
        evaluation server.
    """
    env = CollisionMapWrapper(batched_env)
    env = BatchedResizeImageWrapper(env)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        scheduled_saver = ScheduledSaver(save_interval=10000, save_dir=output_dir)
        print('Outputting trained model to', output_dir)

        # Reporting uses BatchedPlayer to get _total_rewards
        batched_player = BatchedPlayer(env, dqn.online_net)
        player = NStepPlayer(batched_player, 3)
        optimize = dqn.optimize(learning_rate=1e-4)

        if pretrained_model is None:
            print('Initializing with random weights')
            sess.run(tf.global_variables_initializer())
        else:
            print('Loading pre-trained model from', pretrained_model)
            scheduled_saver.saver.restore(sess, pretrained_model)

        print('Beginning Training, steps', num_steps)
        tf_schedules = []
        if use_schedules:
            tf_schedules = [
                scheduled_saver,
                LosswiseSchedule(num_steps, batched_player),
                LoadingBar(num_steps)
            ]
        print(env_count * batch_size_multiplier)
        dqn.train(
            num_steps=num_steps,
            player=player,
            replay_buffer=PrioritizedReplayBuffer(300000, 0.5, 0.4, epsilon=0.1),
            optimize_op=optimize,
            train_interval=env_count,
            target_interval=8192,
            batch_size=env_count * batch_size_multiplier,
            min_buffer_size=max(4500, env_count * batch_size_multiplier),
            # min_buffer_size=60,
            tf_schedules=tf_schedules,
            handle_ep=print)
        scheduled_saver.save(sess)
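# Illustrative only: invoking the train() helper above on a small batch of
# environments. `make_sonic_env` is a hypothetical thunk returning a gym env;
# the real project constructs its batched env elsewhere.
def make_sonic_env():
    return make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1')

batched = batched_gym_env([make_sonic_env for _ in range(4)])
train(batched,
      env_count=4,
      batch_size_multiplier=32,  # dqn batch_size becomes 4 * 32 = 128
      num_steps=1000000,
      pretrained_model=None,     # start from randomly initialized weights
      use_schedules=False)       # skip schedules that need internet access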
def main(): if local_env: # Select Random Level if local levels = ['SpringYardZone.Act3', 'SpringYardZone.Act2', 'GreenHillZone.Act3', 'GreenHillZone.Act1', 'StarLightZone.Act2', 'StarLightZone.Act1', 'MarbleZone.Act2', 'MarbleZone.Act1', 'MarbleZone.Act3', 'ScrapBrainZone.Act2', 'LabyrinthZone.Act2', 'LabyrinthZone.Act1', 'LabyrinthZone.Act3'] level_choice = random.randrange(0, 13, 1) env = make_env(stack=True, scale_rew=False, local=local_env, level_choice=level_choice) #-3 else: print('connecting to remote environment') env = grc.RemoteEnv('tmp/sock') print('starting episode') env = AllowBacktracking(env) solutions = env.solutions # Track Solutions state_size = env.observation_space action_size = env.action_space.n print(state_size, action_size) env.assist = False env.trainer = train # Begin with mentor led exploration env.reset() while env.total_steps_ever <= TOTAL_TIMESTEPS: # Interact with Retro environment until Total TimeSteps expire. while env.trainer: print('Entering Self Play') keys = getch() if keys == 'A': env.control(-1) if keys == 'B': env.control(4) if keys == 'C': env.control(3) if keys == 'D': env.control(2) buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"] actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'], ['DOWN', 'B'], ['B']] if keys == 'rr': env.trainer = False continue if keys == ' ': env.close() env = make_env(stack=False, scale_rew=False, local=local_env) env = AllowBacktracking(env) env.reset() # Initialize Gaming Environment env.trainer = True if env.episode % RL_PLAY_PCT == 0: tf.reset_default_graph() with tf.Session() as sess: def make_net(name): return MLPQNetwork(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), name, layer_sizes=[32]) dqn = DQN(make_net('online'), make_net('target')) bplayer = BasicPlayer(env, EpsGreedyQNetwork(dqn.online_net, EPSILON), batch_size=STEPS_PER_UPDATE) optimize = dqn.optimize(learning_rate=LEARNING_RATE) sess.run(tf.global_variables_initializer()) env.agent = 'DQN' dqn.train(num_steps=TRAINING_STEPS, player=bplayer, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, target_interval=200, batch_size=64, min_buffer_size=200, handle_ep=lambda _, rew: print('Exited DQN with : ' + str(rew) + str(env.steps))) new_ep = True # New Episode Flag while new_ep: if new_ep: if (solutions and random.random() < EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS): new_state, new_rew, done = env.spawn() continue else: env.reset() new_ep = False env.agent = 'JERK' rew, new_ep = move(env, 100) if not new_ep and rew <= 0: #print('backtracking due to negative reward: %f' % rew) _, new_ep = move(env, 70, left=True) if new_ep: solutions.append(([max(env.reward_history)], env.best_sequence()))
def main():
    """Run DQN until the environment throws an exception."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore', '-restore', action='store_true',
                        help='restore from checkpoint file')
    parser.add_argument('--record', '-record', action='store_true',
                        help='record bk2 movies')
    args = parser.parse_args()

    env = AllowBacktracking(make_env(stack=False, scale_rew=False, record=args.record))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    checkpoint_dir = os.path.join(os.getcwd(), 'results')
    results_dir = os.path.join(os.getcwd(), 'results',
                               time.strftime("%d-%m-%Y_%H-%M-%S"))
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    summary_writer = tf.summary.FileWriter(results_dir)

    # TODO
    # env = wrappers.Monitor(env, results_dir, force=True)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        saver = tf.train.Saver()
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        # Restore after running the initializer so the restored weights are
        # not overwritten by the random initialization.
        if args.restore:
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            if latest_checkpoint:
                print("Loading model checkpoint {} ...\n".format(latest_checkpoint))
                saver.restore(sess, latest_checkpoint)
            else:
                print("Checkpoint not found")

        reward_hist = []
        total_steps = 0

        # Runs with every completed episode.
        def _handle_ep(steps, rew):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)

            summary_reward = tf.Summary()
            summary_reward.value.add(tag='global/reward', simple_value=rew)
            summary_writer.add_summary(summary_reward, global_step=total_steps)

            print('save model')
            saver.save(sess=sess, save_path=checkpoint_dir + '/model',
                       global_step=total_steps)

            if len(reward_hist) == REWARD_HISTORY:
                print('%d steps: mean=%f' %
                      (total_steps, sum(reward_hist) / len(reward_hist)))
                summary_meanreward = tf.Summary()
                summary_meanreward.value.add(tag='global/mean_reward',
                                             simple_value=sum(reward_hist) / len(reward_hist))
                summary_writer.add_summary(summary_meanreward, global_step=total_steps)
                reward_hist.clear()

        dqn.train(
            num_steps=7000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep)
def main(): """Run DQN until the environment throws an exception.""" # Hyperparameters learning_rate = 2.5e-4 gamma = 0.99 nstep_return = 3 timesteps_per_proc = 50_000_000 train_interval = 4 target_interval = 8192 batch_size = 512 min_buffer_size = 20000 # Parse arguments parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='starpilot') parser.add_argument( '--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=1) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--run_id', type=int, default=1) parser.add_argument('--gpus_id', type=str, default='') parser.add_argument('--level_setup', type=str, default='procgen', choices=["procgen", "oracle"]) parser.add_argument('--mix_mode', type=str, default='nomix', choices=['nomix', 'mixreg']) parser.add_argument('--mix_alpha', type=float, default=0.2) parser.add_argument('--use_l2reg', action='store_true') parser.add_argument('--data_aug', type=str, default='no_aug', choices=['no_aug', 'cutout_color', 'crop']) parser.add_argument('--PER', type=lambda x: bool(strtobool(x)), default=True, help='Whether to use PER') parser.add_argument('--num_envs', type=int, default=64) args = parser.parse_args() # Setup test worker comm = MPI.COMM_WORLD rank = comm.Get_rank() test_worker_interval = args.test_worker_interval is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 num_envs = args.num_envs # Setup env specs if args.level_setup == "procgen": env_name = args.env_name num_levels = 0 if is_test_worker else args.num_levels start_level = args.start_level elif args.level_setup == "oracle": env_name = args.env_name num_levels = 0 start_level = args.start_level # Setup logger log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure( dir=LOG_DIR + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}', format_strs=format_strs) # Create env logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) # Setup Tensorflow logger.info("creating tf session") if args.gpus_id: gpus_id = [x.strip() for x in args.gpus_id.split(',')] os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)] setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() # Setup Rainbow models logger.info("building models") online_net, target_net = rainbow_models( sess, venv.action_space.n, gym_space_vectorizer(venv.observation_space), min_val=REWARD_RANGE_FOR_C51[env_name][0], max_val=REWARD_RANGE_FOR_C51[env_name][1]) dqn = MpiDQN(online_net, target_net, discount=gamma, comm=comm, mpi_rank_weight=mpi_rank_weight, mix_mode=args.mix_mode, mix_alpha=args.mix_alpha, use_l2reg=args.use_l2reg, data_aug=args.data_aug) player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return) optimize = dqn.optimize(learning_rate=learning_rate) # Initialize and sync variables 
    sess.run(tf.global_variables_initializer())
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    if comm.Get_size() > 1:
        sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

    # Training
    logger.info("training")
    if args.PER:
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
    else:
        # Set alpha and beta to 0 for uniform prioritization and no
        # importance sampling.
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0, 0, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
def main():
    """Run DQN until the environment throws an exception."""
    print('creating env')
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    print('starting tf session')
    with tf.Session(config=config) as sess:
        print('creating agent')
        online_net, target_net = rainbow_models(sess,
                                                env.action_space.n,
                                                gym_space_vectorizer(env.observation_space),
                                                min_val=-200,
                                                max_val=200)
        dqn = DQN(online_net, target_net)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        train_steps = 5000
        print('training steps:', train_steps)
        for j in range(1):
            print(j)
            start = time.time()
            dqn.train(
                num_steps=train_steps,  # Make sure an exception arrives before we stop.
                player=player,
                replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                optimize_op=optimize,
                train_interval=1,
                target_interval=8192,
                batch_size=32,
                min_buffer_size=10000)
            end = time.time()
            print(end - start)

        print('done training')
        print('save nn')
        save_path = saver.save(sess, "saved_models/rainbow5.ckpt")
        print("Model saved in path: %s" % save_path)
        tvars = tf.trainable_variables()
        tvars_vals = sess.run(tvars)
        #for var, val in zip(tvars, tvars_vals):
        #    print(var.name, val[0])
        #print(tvars_vals[0][-5:])
        #print('stepping')
        #obs = env.reset()
        #online_net.step(obs, obs)
def main(): if local_env: # Select Random Level if local from retro_contest.local import make levels = [ 'SpringYardZone.Act3', 'SpringYardZone.Act2', 'GreenHillZone.Act3', 'GreenHillZone.Act1', 'StarLightZone.Act2', 'StarLightZone.Act1', 'MarbleZone.Act2', 'MarbleZone.Act1', 'MarbleZone.Act3', 'ScrapBrainZone.Act2', 'LabyrinthZone.Act2', 'LabyrinthZone.Act1', 'LabyrinthZone.Act3' ] level_choice = levels[random.randrange(0, 13, 1)] env = make(game='SonicTheHedgehog-Genesis', state=level_choice) else: print('connecting to remote environment') env = grc.RemoteEnv('tmp/sock') print('starting episode') env = TrackedEnv(env) solutions = env.solutions # Track Solutions state_size = env.observation_space action_size = env.action_space.n print(state_size, action_size) env.assist = False env.trainer = False # Begin with mentor led exploration env.resume_rl(True) # Begin with RL exploration env.reset() while env.total_steps_ever <= TOTAL_TIMESTEPS: # Interact with Retro environment until Total TimeSteps expire. while env.trainer: print('Entering Self Play') keys = getch() if keys == 'A': env.control(-1) if keys == 'B': env.control(4) if keys == 'C': env.control(3) if keys == 'D': env.control(2) if keys == 'rr': env.trainer = False continue if keys == ' ': env.close() env = make(game='SonicTheHedgehog-Genesis', state=levels[random.randrange(0, 13, 1)]) env = TrackedEnv(env) env.reset() # Initialize Gaming Environment env.trainer = True if env.steps > 1: print('Prev Rew', env.step_rew_history[-1], 'Curr_Loc', env.reward_history[-1], 'Med Rew', np.median(env.step_rew_history[-3:])) if env.episode % RL_PLAY_PCT == 0: tf.reset_default_graph() with tf.Session() as sess: def make_net(name): return MLPQNetwork(sess, env.action_space.n, gym_space_vectorizer( env.observation_space), name, layer_sizes=[32]) dqn = DQN(make_net('online'), make_net('target')) bplayer = BasicPlayer(env, EpsGreedyQNetwork( dqn.online_net, EPSILON), batch_size=STEPS_PER_UPDATE) optimize = dqn.optimize(learning_rate=LEARNING_RATE) sess.run(tf.global_variables_initializer()) env.agent = 'DQN' dqn.train( num_steps=TRAINING_STEPS, player=bplayer, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, target_interval=200, batch_size=64, min_buffer_size=200, handle_ep=lambda _, rew: print('Exited DQN with : ' + str( rew) + str(env.steps))) new_ep = True # New Episode Flag while new_ep: if new_ep: if (solutions and random.random() < EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS): solutions = sorted(solutions, key=lambda x: np.mean(x[0])) best_pair = solutions[-1] new_rew = exploit(env, best_pair[1]) best_pair[0].append(new_rew) print('replayed best with reward %f' % new_rew) print(best_pair[0]) continue else: env.reset() new_ep = False env.agent = 'JERK' rew, new_ep = move(env, 100) if not new_ep and rew <= 0: #print('backtracking due to negative reward: %f' % rew) _, new_ep = move(env, 70, left=True) if new_ep: solutions.append( ([max(env.reward_history)], env.best_sequence()))
def main(): discount = os.environ.get('RETRO_DISCOUNT') if discount != None: discount = float(discount) else: discount = 0.99 print("DISCOUNT: %s" % (discount, )) """Run DQN until the environment throws an exception.""" config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 config.log_device_placement = False with tf.Session(config=config) as sess: state_encoder = StateEncoder(sess) env = make_batched_env() env_ids = env.env_ids env = BatchedFrameStack(env, num_images=4, concat=True) env.env_ids = env_ids env = ExplorationBatchedEnv(env, Exploration, state_encoder=state_encoder) if 'RETRO_POLICY_DIR' in os.environ: expert = PolicyExpert(sess, batch_size=1, policy_dir=os.environ['RETRO_POLICY_DIR']) elif not 'RETRO_NOEXPERT' in os.environ: expert = RandomMoveExpert() else: expert = None if os.environ['RETRO_DQN'] == 'soft_noisy_net': dqn = DQN(*soft_noisy_net_models( sess, env.action_space.n, gym_space_vectorizer(env.observation_space), discount=discount, #0.99 expert=expert)) elif os.environ['RETRO_DQN'] == 'soft_rainbow': dqn = DQN(*soft_rainbow_models( sess, env.action_space.n, gym_space_vectorizer(env.observation_space), num_atoms=101, min_val=-1000, #-200 max_val=1000, #200 discount=discount, #0.99 expert=expert)) if "RETRO_CHECKPOINT_DIR" in os.environ: scheduler_saver = ScheduledSaver( sess, os.environ["RETRO_CHECKPOINT_DIR"] + "/tensorflow/") else: scheduler_saver = None player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) if 'RETRO_INIT_DIR' in os.environ: saver = tf.train.Saver(var_list=list( filter( lambda v: not 'sigma' in v.name and not 'dqn_model/noisy_layer_1' in v.name and not 'dqn_model/noisy_layer_2' in v.name, tf.trainable_variables('^dqn_model/')))) latest_checkpoint = tf.train.latest_checkpoint( os.environ['RETRO_INIT_DIR']) print("DQN_INIT_CHECKPOINT: %s" % (latest_checkpoint, )) saver.restore(sess, latest_checkpoint) #from tensorflow.python.tools import inspect_checkpoint as chkp #chkp.print_tensors_in_checkpoint_file(latest_checkpoint,'',all_tensors=True) state_encoder.initialize() if expert: expert.initialize() replay_buffer = PrioritizedReplayBuffer(int( os.environ.get("RETRO_DQN_BUFFER_SIZE", 250000)), 0.5, 0.4, epsilon=0.1) dqn.train( num_steps=1000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=replay_buffer, optimize_op=optimize, train_interval=1, target_interval=int( os.environ.get("RETRO_DQN_TARGET_INTERVAL", 8192)), batch_size=32, min_buffer_size=int( os.environ.get('RETRO_DQN_MIN_BUFFER_SIZE', 20000)), handle_ep=lambda steps, rew: scheduler_saver.handle_episode(steps) if scheduler_saver is not None else None)
def main(): """Run DQN until the environment throws an exception.""" #env = AllowBacktracking(make_env(stack=False, scale_rew=False)) #envs = make_training_envs() #env = BatchedFrameStack(BatchedGymEnv(envs), num_images=4, concat=False) #env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) envs = get_training_envs() game, state = random.choice(envs) env = make_training_env(game, state, stack=False, scale_rew=False) env = prep_env(env) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) loss = dqn.loss train_writer = tf.summary.FileWriter('./logs/multiple/train', sess.graph) tf.summary.scalar("loss", loss) reward = tf.Variable(0., name='reward', trainable=False) tf.summary.scalar('reward', tf.reduce_mean(reward)) steps = tf.Variable(0, name='steps', trainable=False) tf.summary.scalar('steps', tf.reduce_mean(steps)) summary_op = tf.summary.merge_all() sess.run(tf.global_variables_initializer()) print(tf.trainable_variables()) #graph = tf.get_default_graph() #restore_saver = tf.train.Saver({ # 'dense1/bias': graph.get_tensor_by_name('online/dense1/bias:0'), # 'dense1/kernel': graph.get_tensor_by_name('online/dense1/kernel:0'), # 'layer_1/bias': graph.get_tensor_by_name('online/layer_1/bias:0'), # 'layer_1/kernel': graph.get_tensor_by_name('online/layer_1/kernel:0'), # 'layer_2/bias': graph.get_tensor_by_name('online/layer_2/bias:0'), # 'layer_2/kernel': graph.get_tensor_by_name('online/layer_2/kernel:0'), # 'layer_3/bias': graph.get_tensor_by_name('online/layer_3/bias:0'), # 'layer_3/kernel': graph.get_tensor_by_name('online/layer_3/kernel:0'), # 'dense1/bias': graph.get_tensor_by_name('online_1/dense1/bias:0'), # 'dense1/kernel': graph.get_tensor_by_name('online_1/dense1/kernel:0'), # 'layer_1/bias': graph.get_tensor_by_name('online_1/layer_1/bias:0'), # 'layer_1/kernel': graph.get_tensor_by_name('online_1/layer_1/kernel:0'), # 'layer_2/bias': graph.get_tensor_by_name('online_1/layer_2/bias:0'), # 'layer_2/kernel': graph.get_tensor_by_name('online_1/layer_2/kernel:0'), # 'layer_3/bias': graph.get_tensor_by_name('online_1/layer_3/bias:0'), # 'layer_3/kernel': graph.get_tensor_by_name('online_1/layer_3/kernel:0'), # 'dense1/bias': graph.get_tensor_by_name('online_2/dense1/bias:0'), # 'dense1/kernel': graph.get_tensor_by_name('online_2/dense1/kernel:0'), # 'layer_1/bias': graph.get_tensor_by_name('online_2/layer_1/bias:0'), # 'layer_1/kernel': graph.get_tensor_by_name('online_2/layer_1/kernel:0'), # 'layer_2/bias': graph.get_tensor_by_name('online_2/layer_2/bias:0'), # 'layer_2/kernel': graph.get_tensor_by_name('online_2/layer_2/kernel:0'), # 'layer_3/bias': graph.get_tensor_by_name('online_2/layer_3/bias:0'), # 'layer_3/kernel': graph.get_tensor_by_name('online_2/layer_3/kernel:0'), # 'dense1/bias': graph.get_tensor_by_name('target/dense1/bias:0'), # 'dense1/kernel': graph.get_tensor_by_name('target/dense1/kernel:0'), # 'layer_1/bias': graph.get_tensor_by_name('target/layer_1/bias:0'), # 'layer_1/kernel': graph.get_tensor_by_name('target/layer_1/kernel:0'), # 'layer_2/bias': graph.get_tensor_by_name('target/layer_2/bias:0'), # 'layer_2/kernel': graph.get_tensor_by_name('target/layer_2/kernel:0'), # 'layer_3/bias': graph.get_tensor_by_name('target/layer_3/bias:0'), # 
'layer_3/kernel': graph.get_tensor_by_name('target/layer_3/kernel:0'), # }) #restore_saver.restore(sess, './model-images/model.ckpt') #print('model restored') weights = joblib.load('./ppo2_weights_266.joblib') #[<tf.Variable 'model/c1/w:0' shape=(8, 8, 4, 32) dtype=float32_ref>, <tf.Variable 'model/c1/b:0' shape=(1, 32, 1, 1) dtype=float32_ref>, <tf.Variable 'model/c2/w:0' shape=(4, 4, 32, 64) dtype=float32_ref>, <tf.Variable 'model/c2/b:0' shape=(1, 64, 1, 1) dtype=float32_ref>, <tf.Variable 'model/c3/w:0' shape=(3, 3, 64, 64) dtype=float32_ref>, <tf.Variable 'model/c3/b:0' shape=(1, 64, 1, 1) dtype=float32_ref>, <tf.Variable 'model/fc1/w:0' shape=(3136, 512) dtype=float32_ref>, <tf.Variable 'model/fc1/b:0' shape=(512,) dtype=float32_ref>, <tf.Variable 'model/v/w:0' shape=(512, 1) dtype=float32_ref>, <tf.Variable 'model/v/b:0' shape=(1,) dtype=float32_ref>, <tf.Variable 'model/pi/w:0' shape=(512, 7) dtype=float32_ref>, <tf.Variable 'model/pi/b:0' shape=(7,) dtype=float32_ref>] graph = tf.get_default_graph() for model in ['online', 'target']: tensor_names = [ '{}/layer_1/conv2d/kernel:0', '{}/layer_1/conv2d/bias:0', '{}/layer_2/conv2d/kernel:0', '{}/layer_2/conv2d/bias:0', '{}/layer_3/conv2d/kernel:0', '{}/layer_3/conv2d/bias:0', #'{}/dense1/kernel:0', #'{}/dense1/bias:0' ] for i in range(len(tensor_names)): tensor_name = tensor_names[i].format(model) tensor = graph.get_tensor_by_name(tensor_name) weight = weights[i] if 'bias' in tensor_name: weight = np.reshape(weight, tensor.get_shape()) print('about to assign {} value with size {}'.format( tensor_name, weights[i].shape)) sess.run(tf.assign(tensor, weight)) saver = tf.train.Saver() save_path = saver.save(sess, "./model/model.ckpt") print('Saved model') replay_buffer = PrioritizedReplayBuffer(100000, 0.5, 0.4, epsilon=0.1) #replay_buffer = pickle.load(gzip.open('./docker-build/model/replay_buffer.p.gz', 'rb')) #replay_buffer = pickle.load(open('./model/replay_buffer.p', 'rb')) total_steps = 50000000 steps_per_env = 5000 env.close() for i in range(int(total_steps / steps_per_env)): game, state = random.choice(envs) env = make_training_env(game, state, stack=False, scale_rew=False) env = prep_env(env) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) #dqn.train(num_steps=steps_per_env, # Make sure an exception arrives before we stop. # player=player, # replay_buffer=replay_buffer, # optimize_op=optimize, # train_interval=1, # target_interval=8192, # batch_size=32, # min_buffer_size=20000) summary = train( dqn, num_steps= steps_per_env, # Make sure an exception arrives before we stop. player=player, replay_buffer=replay_buffer, optimize_op=optimize, train_interval=4, target_interval=8192, batch_size=32, min_buffer_size=20000, summary_op=summary_op, handle_ep=lambda st, rew: (reward.assign(rew), steps.assign(st)), handle_step=lambda st, rew: (reward.assign(reward + rew), steps.assign(steps + st))) env.close() if summary: train_writer.add_summary(summary, i) else: print('No summary') save_path = saver.save(sess, "./model/model.ckpt") pickle.dump(replay_buffer, open("./model/replay_buffer.p", "wb")) print('Saved model')
def main():
    """Run DQN until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    comm = MPI.COMM_WORLD  # Use MPI for parallel evaluation
    rank = comm.Get_rank()
    size = comm.Get_size()
    env_fns, env_names = create_eval_envs()
    env = AllowBacktracking(env_fns[rank](stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 1 == 0:
                avg_score = sum(reward_hist[-100:]) / len(reward_hist[-100:])
                # Global Score
                global_score = np.zeros(1)
                local_score = np.array(avg_score)
                print("Local Score for " + env_names[rank] + " at episode " +
                      str(len(reward_hist)) + " with timesteps: " +
                      str(total_steps) + ": " + str(local_score))
                comm.Allreduce(local_score, global_score, op=MPI.SUM)
                global_score /= size
                if rank == 0:
                    print("Global Average Score at episode: " +
                          str(len(reward_hist)) + ": " + str(global_score))

        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep,
            save_interval=None,
            restore_path='./checkpoints_rainbow/model-10')  # Model to be evaluated