def main(): """Run DQN until the environment throws an exception.""" # "results/rainbow/2/videos/6" save_dir = "results/rainbow/7/val_monitor/2" env = make_env(stack=False, scale_rew=False, render=60, monitor=save_dir, timelimit=False, episodic_life=False, single_life=True, video=lambda id: True) # env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() with tf.Session(config=config) as sess: saver = tf.train.import_meta_graph( "results/rainbow/7/final-4000000.meta", clear_devices=True) # saver.restore(sess, tf.train.latest_checkpoint('results/rainbow/2')) saver.restore(sess, 'results/rainbow/7/final-4000000') model = LoadedNetwork(sess, gym_space_vectorizer(env.observation_space)) # rebuild the online_net form the saved model # type <anyrl.models.dqn_dist.NatureDistQNetwork object at ???> player = NStepPlayer(BatchedPlayer(env, model), 3) with tf.device("/cpu"): # sess.run(tf.global_variables_initializer()) try: for episode_index in tqdm(range(40), unit="episode"): axes = make_axes() plotter = RewardPlotter(axes, save_period=40, render_period=600, max_entries=600) for i in count(): trajectories = player.play() end_of_episode = False current_total_reward = None for trajectory in trajectories: current_total_reward = trajectory["total_reward"] if trajectory["is_last"]: end_of_episode = True plotter.update(current_total_reward, step=i) if end_of_episode: # plt.show() plotter.render() plotter.save_file("{}/e{}.pdf".format( save_dir, episode_index)) plotter.close() break except KeyboardInterrupt: env.close() plt.close()
def test_batched_stack(concat):
    """
    Test that BatchedFrameStack is equivalent to a regular batched FrameStackEnv.
    """
    envs = [lambda idx=i: SimpleEnv(idx + 2, (3, 2, 5), 'float32') for i in range(6)]
    env1 = BatchedFrameStack(batched_gym_env(envs, num_sub_batches=3, sync=True),
                             concat=concat)
    env2 = batched_gym_env([lambda env=e: FrameStackEnv(env(), concat=concat) for e in envs],
                           num_sub_batches=3, sync=True)
    for j in range(50):
        for i in range(3):
            if j == 0 or (j + i) % 17 == 0:
                env1.reset_start(sub_batch=i)
                env2.reset_start(sub_batch=i)
                obs1 = env1.reset_wait(sub_batch=i)
                obs2 = env2.reset_wait(sub_batch=i)
                assert np.allclose(obs1, obs2)
            actions = [env1.action_space.sample() for _ in range(2)]
            env1.step_start(actions, sub_batch=i)
            env2.step_start(actions, sub_batch=i)
            obs1, rews1, dones1, _ = env1.step_wait(sub_batch=i)
            obs2, rews2, dones2, _ = env2.step_wait(sub_batch=i)
            assert np.allclose(obs1, obs2)
            assert np.array(rews1 == rews2).all()
            assert np.array(dones1 == dones2).all()
def main(): """ Entry-point for the program. """ args = _parse_args() env = batched_gym_env([partial(make_single_env, args.game)] * args.workers) # Using BatchedFrameStack with concat=False is more # memory efficient than other stacking options. env = BatchedFrameStack(env, num_images=4, concat=False) with tf.Session() as sess: def make_net(name): return NatureQNetwork(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), name, dueling=True) dqn = DQN(make_net('online'), make_net('target')) player = BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, args.epsilon)) optimize = dqn.optimize(learning_rate=args.lr) sess.run(tf.global_variables_initializer()) reward_hist = [] total_steps = 0 def _handle_ep(steps, rew): nonlocal total_steps total_steps += steps reward_hist.append(rew) if len(reward_hist) == REWARD_HISTORY: print('%d steps: mean=%f' % (total_steps, sum(reward_hist) / len(reward_hist))) reward_hist.clear() dqn.train(num_steps=int(1e7), player=player, replay_buffer=UniformReplayBuffer(args.buffer_size), optimize_op=optimize, target_interval=args.target_interval, batch_size=args.batch_size, min_buffer_size=args.min_buffer_size, handle_ep=_handle_ep) env.close()
def main(): """Run DQN until the environment throws an exception.""" base_path = "results/rainbow/6/" env = make_env(stack=False, scale_rew=False, render=None, monitor=base_path + "train_monitor", episodic_life=True) # I think the env itself allows Backtracking env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.per_process_gpu_memory_fraction = 0.8 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) saver = tf.train.Saver(name="rainbow") sess.run(tf.global_variables_initializer()) saver.save(sess, base_path + "training", global_step=0) try: dqn.train(num_steps=2_000_000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=64, min_buffer_size=20000, handle_ep=handle_ep) # in seconds except KeyboardInterrupt: print("keyboard interrupt") print("finishing") saver.save(sess, base_path + "final", global_step=2_000_000)
def main():
    with tf.Session() as sess:
        print('Creating environment...')
        env = TFBatchedEnv(sess, Pong(), 1)
        env = BatchedFrameStack(env)
        print('Creating model...')
        model = CNN(sess,
                    gym_space_distribution(env.action_space),
                    gym_space_vectorizer(env.observation_space))
        print('Creating roller...')
        roller = TruncatedRoller(env, model, 1)
        print('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        if os.path.exists('params.pkl'):
            print('Loading parameters...')
            with open('params.pkl', 'rb') as in_file:
                params = pickle.load(in_file)
            for var, val in zip(tf.trainable_variables(), params):
                sess.run(tf.assign(var, val))
        else:
            print('Warning: parameter file does not exist!')
        print('Running agent...')
        viewer = SimpleImageViewer()
        while True:
            for obs in roller.rollouts()[0].step_observations:
                viewer.imshow(obs[..., -3:])
def main(): """Run DQN until the environment throws an exception.""" env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1') env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) dqn.train( num_steps=2000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000)
def main(): """Run DQN until the environment throws an exception.""" env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) dqn.train( num_steps=2000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=StochasticMaxStochasticDeltaDeletionPRB(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000)
def main():
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-421,
                                  max_val=421))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=64,
                  batch_size=32,
                  min_buffer_size=25000)
def main(): """Run DQN until the environment throws an exception.""" env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1') env = AllowBacktracking(make_local_env(env, stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) dqn.train(num_steps=num_steps, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000) print(tf.trainable_variables()) save_path='/home/noob/retro-noob/rainbow/params/params' utils.save_state(save_path+'_tf_saver') with tf.variable_scope('model'): params = tf.trainable_variables() ps = sess.run(params) joblib.dump(ps, save_path + '_joblib')
def main(): """Run DQN until the environment throws an exception.""" env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) """ Create a TF Op that optimizes the objective. Args: learning_rate: the Adam learning rate. epsilon: the Adam epsilon. """ optimize = dqn.optimize(learning_rate=6.25e-5, epsilon=1.5e-4) sess.run(tf.global_variables_initializer()) """ Run an automated training loop. This is meant to provide a convenient way to run a standard training loop without any modifications. You may get more flexibility by writing your own training loop. Args: num_steps: the number of timesteps to run. player: the Player for gathering experience. replay_buffer: the ReplayBuffer for experience. optimize_op: a TF Op to optimize the model. train_interval: timesteps per training step. target_interval: number of timesteps between target network updates. batch_size: the size of experience mini-batches. min_buffer_size: minimum replay buffer size before training is performed. tf_schedules: a sequence of TFSchedules that are updated with the number of steps taken. handle_ep: called with information about every completed episode. timeout: if set, this is a number of seconds after which the training loop should exit. """ dqn.train( num_steps=1000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000)
def main(): """Run DQN until the environment throws an exception.""" # "results/rainbow/2/videos/6" env = make_env(stack=False, scale_rew=False, render=20, monitor=None, timelimit=False) # env = AllowBacktracking(make_env(stack=False, scale_rew=False)) # TODO we might not want to allow backtracking, it kinda hurts in mario env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 config.gpu_options.per_process_gpu_memory_fraction = 0.6 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) # TODO rebuild the online_net form the saved model # type <anyrl.models.dqn_dist.NatureDistQNetwork object at ???> # important methods # model = dqn.online_net player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) with tf.device("/cpu"): # sess.run(tf.global_variables_initializer()) vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) try: for i in tqdm(range(100000)): trajectories = player.play() for trajectori in trajectories: trajectori pass except KeyboardInterrupt: env.close()
def main(): """Run DQN until the environment throws an exception.""" env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) # Other exploration schedules #eps_decay_sched = LinearTFSchedule(50000, 1.0, 0.01) #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, 0.1)), 3) #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3) #player = NStepPlayer(BatchedPlayer(env, SonicEpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) reward_hist = [] total_steps = 0 def _handle_ep(steps, rew, env_rewards): nonlocal total_steps total_steps += steps reward_hist.append(rew) if total_steps % 10 == 0: print('%d episodes, %d steps: mean of last 100 episodes=%f' % (len(reward_hist), total_steps, sum(reward_hist[-100:]) / len(reward_hist[-100:]))) dqn.train( num_steps=2000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000, tf_schedules=[eps_decay_sched], handle_ep=_handle_ep, restore_path='./pretrained_model', save_interval=None, )
def main():
    """Run DQN until the environment throws an exception."""
    env_name = 'MineRLNavigateDense-v0'
    base_env = [SimpleNavigateEnvWrapper(get_env(env_name)) for _ in range(1)]
    env = BatchedFrameStack(BatchedGymEnv([base_env]), num_images=4, concat=True)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        online, target = mine_rainbow_online_target(mine_cnn,
                                                    sess,
                                                    env.action_space.n,
                                                    gym_space_vectorizer(env.observation_space),
                                                    min_val=-200,
                                                    max_val=200)
        dqn = DQN(online, target)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        buffer_capacity = 5000
        replay_buffer = PrioritizedReplayBuffer(buffer_capacity, 0.5, 0.4, epsilon=0.1)
        data_iter = non_bugged_data_arr(env_name, num_trajs=100)
        expert_player = NStepPlayer(ImitationPlayer(data_iter, 200), 3)
        for traj in expert_player.play():
            replay_buffer.add_sample(traj, init_weight=1)

        print('starting training')
        dqn.train(num_steps=200,
                  player=player,
                  replay_buffer=replay_buffer,
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)

        print('starting eval')
        player._cur_states = None
        score = evaluate(player)
        print(score)
def main(): """Run DQN until the environment throws an exception.""" env_fns, env_names = create_envs() env = BatchedFrameStack(batched_gym_env(env_fns), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) # Use ADAM sess.run(tf.global_variables_initializer()) reward_hist = [] total_steps = 0 def _handle_ep(steps, rew, env_rewards): nonlocal total_steps total_steps += steps reward_hist.append(rew) if total_steps % 1 == 0: print('%d episodes, %d steps: mean of last 100 episodes=%f' % (len(reward_hist), total_steps, sum(reward_hist[-100:]) / len(reward_hist[-100:]))) dqn.train( num_steps= 2000000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000, handle_ep=_handle_ep, num_envs=len(env_fns), save_interval=10, )
def main():
    with tf.Session() as sess:
        print('Creating environment...')
        env = TFBatchedEnv(sess, Pong(), 8)
        env = BatchedFrameStack(env)
        print('Creating model...')
        model = CNN(sess,
                    gym_space_distribution(env.action_space),
                    gym_space_vectorizer(env.observation_space))
        print('Creating roller...')
        roller = TruncatedRoller(env, model, 128)
        print('Creating PPO graph...')
        ppo = PPO(model)
        optimize = ppo.optimize(learning_rate=3e-4)
        print('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        print('Training agent...')
        for i in count():
            rollouts = roller.rollouts()
            for rollout in rollouts:
                if not rollout.trunc_end:
                    print('reward=%f steps=%d' %
                          (rollout.total_reward, rollout.total_steps))
            total_steps = sum(r.num_steps for r in rollouts)
            ppo.run_optimize(optimize, rollouts,
                             batch_size=total_steps // 4,
                             num_iter=12,
                             log_fn=print)
            if i % 5 == 0:
                print('Saving...')
                parameters = sess.run(tf.trainable_variables())
                with open('params.pkl', 'wb+') as out_file:
                    pickle.dump(parameters, out_file)
def main(): """Run DQN until the environment throws an exception.""" env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() saver.restore(sess, "/root/compo/model.ckpt") #print('model restored') replay_buffer = pickle.load( gzip.open('/root/compo/replay_buffer.p.gz', 'rb')) replay_buffer.alpha = 0.2 replay_buffer.beta = 0.4 replay_buffer.capacity = 100000 restore_ppo2_weights(sess) dqn.train( num_steps=2000000, # Make sure an exception arrives before we stop. player=player, replay_buffer= replay_buffer, #PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=4, target_interval=8192, batch_size=32, min_buffer_size=20000)
def main(): """ Entry-point for the program. """ args = _parse_args() # batched env = creates gym env, not sure what batched means # make_single_env = GrayscaleEnv > DownsampleEnv # GrayscaleEnv = turns RGB into grayscale # DownsampleEnv = down samples observation by N times where N is the specified variable (e.g. 2x smaller) env = batched_gym_env([partial(make_single_env, args.game)] * args.workers) env_test = make_single_env(args.game) #make_single_env(args.game) print('OBSSSS', env_test.observation_space) #env = CustomWrapper(args.game) # Using BatchedFrameStack with concat=False is more # memory efficient than other stacking options. env = BatchedFrameStack(env, num_images=4, concat=False) with tf.Session() as sess: def make_net(name): return rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200) dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, args.epsilon)) optimize = dqn.optimize(learning_rate=args.lr) sess.run(tf.global_variables_initializer()) reward_hist = [] total_steps = 0 def _handle_ep(steps, rew): nonlocal total_steps total_steps += steps reward_hist.append(rew) if len(reward_hist) == REWARD_HISTORY: print('%d steps: mean=%f' % (total_steps, sum(reward_hist) / len(reward_hist))) reward_hist.clear() dqn.train(num_steps=int(1e7), player=player, replay_buffer=UniformReplayBuffer(args.buffer_size), optimize_op=optimize, target_interval=args.target_interval, batch_size=args.batch_size, min_buffer_size=args.min_buffer_size, handle_ep=_handle_ep) env.close()
def wrap_env(env):
    env = ObsWrapperBatcher(env, DownsampleEnv, 4)
    env = BatchedFrameStack(env, num_images=4, concat=False)
    return env
def main(): """Run DQN until the environment throws an exception.""" envs = make_envs(stack=False, scale_rew=False) for i in range(len(envs)): envs[i] = AllowBacktracking(envs[i]) envs[i] = BatchedFrameStack(BatchedGymEnv([[envs[i]]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: online_model, target_model = rainbow_models( sess, envs[0].action_space.n, gym_space_vectorizer(envs[0].observation_space), min_val=-200, max_val=200) replay_buffer = PrioritizedReplayBuffer(400000, 0.5, 0.4, epsilon=0.1) dqn = DQN(online_model, target_model) players = [] for env in envs: player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) players.append(player) optimize = dqn.optimize(learning_rate=1e-4) with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): saver = tf.train.Saver([ tf.get_variable(name) for name in [ 'online/layer_1/conv2d/kernel', 'online/layer_1/conv2d/bias', 'online/layer_2/conv2d/kernel', 'online/layer_2/conv2d/bias', 'online/layer_3/conv2d/kernel', 'online/layer_3/conv2d/bias', 'target/layer_1/conv2d/kernel', 'target/layer_1/conv2d/bias', 'target/layer_2/conv2d/kernel', 'target/layer_2/conv2d/bias', 'target/layer_3/conv2d/kernel', 'target/layer_3/conv2d/bias', ] ]) # or """ sess.run(tf.variables_initializer([tf.get_variable(name) for name in [ 'online/noisy_layer/weight_mu', 'online/noisy_layer/bias_mu', 'online/noisy_layer/weight_sigma', 'online/noisy_layer/bias_sigma', 'online/noisy_layer_1/weight_mu', 'online/noisy_layer_1/bias_mu', 'online/noisy_layer_1/weight_sigma', 'online/noisy_layer_1/bias_sigma', 'online/noisy_layer_2/weight_mu', 'online/noisy_layer_2/bias_mu', 'online/noisy_layer_2/weight_sigma', 'online/noisy_layer_2/bias_sigma', 'target/noisy_layer/weight_mu', 'target/noisy_layer/bias_mu', 'target/noisy_layer/weight_sigma', 'target/noisy_layer/bias_sigma', 'target/noisy_layer_1/weight_mu', 'target/noisy_layer_1/bias_mu', 'target/noisy_layer_1/weight_sigma', 'target/noisy_layer_1/bias_sigma', 'target/noisy_layer_2/weight_mu', 'target/noisy_layer_2/bias_mu', 'target/noisy_layer_2/weight_sigma', 'target/noisy_layer_2/bias_sigma', 'beta1_power', 'beta2_power', 'online/layer_1/conv2d/kernel/Adam', 'online/layer_1/conv2d/kernel/Adam_1', 'online/layer_1/conv2d/bias/Adam', 'online/layer_1/conv2d/bias/Adam_1', 'online/layer_2/conv2d/kernel/Adam', 'online/layer_2/conv2d/kernel/Adam_1', 'online/layer_2/conv2d/bias/Adam', 'online/layer_2/conv2d/bias/Adam_1', 'online/layer_3/conv2d/kernel/Adam', 'online/layer_3/conv2d/kernel/Adam_1', 'online/layer_3/conv2d/bias/Adam', 'online/layer_3/conv2d/bias/Adam_1', 'online/noisy_layer/weight_mu/Adam', 'online/noisy_layer/weight_mu/Adam_1', 'online/noisy_layer/bias_mu/Adam', 'online/noisy_layer/bias_mu/Adam_1', 'online/noisy_layer/weight_sigma/Adam', 'online/noisy_layer/weight_sigma/Adam_1', 'online/noisy_layer/bias_sigma/Adam', 'online/noisy_layer/bias_sigma/Adam_1', 'online/noisy_layer_1/weight_mu/Adam', 'online/noisy_layer_1/weight_mu/Adam_1', 'online/noisy_layer_1/bias_mu/Adam', 'online/noisy_layer_1/bias_mu/Adam_1', 'online/noisy_layer_1/weight_sigma/Adam', 'online/noisy_layer_1/weight_sigma/Adam_1', 'online/noisy_layer_1/bias_sigma/Adam', 'online/noisy_layer_1/bias_sigma/Adam_1', 'online/noisy_layer_2/weight_mu/Adam', 'online/noisy_layer_2/weight_mu/Adam_1', 'online/noisy_layer_2/bias_mu/Adam', 'online/noisy_layer_2/bias_mu/Adam_1', 'online/noisy_layer_2/weight_sigma/Adam', 
'online/noisy_layer_2/weight_sigma/Adam_1', 'online/noisy_layer_2/bias_sigma/Adam', 'online/noisy_layer_2/bias_sigma/Adam_1', ]])) """ #sess.run( tf.initialize_variables( list( tf.get_variable(name) for name in sess.run( tf.report_uninitialized_variables( tf.all_variables( ) ) ) ) ) ) sess.run(tf.global_variables_initializer()) # either saver.restore(sess, '/root/compo/model') # end either for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES): print(i.name) while True: dqn.train(num_steps=16384, players=players, replay_buffer=replay_buffer, optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000) saver.save(sess, '/root/compo/out/model')
import tensorflow as tf
import numpy as np

from anyrl.envs import BatchedGymEnv
from anyrl.envs.wrappers import BatchedFrameStack
from anyrl.models import rainbow_models
from anyrl.rollouts import BatchedPlayer, PrioritizedReplayBuffer, NStepPlayer
from anyrl.spaces import gym_space_vectorizer, StackedBoxSpace
import gym_remote.exceptions as gre

from sonic_util import AllowBacktracking, make_env

print('creating env')
# z = StackedBoxSpace(np.zeros((84, 84, 1)), 4)
env = AllowBacktracking(make_env(stack=False, scale_rew=False))
env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
# print(env.action_space.n)
# StackedBox(84,84,1)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
print('starting tf session')
with tf.Session(config=config) as sess:
    print('creating agent')
    online_net, target_net = rainbow_models(sess,
                                            env.action_space.n,
                                            gym_space_vectorizer(env.observation_space),
                                            min_val=-200,
                                            max_val=200)
def main(): """Run DQN until the environment throws an exception.""" config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 comm = MPI.COMM_WORLD # Use MPI for parallel evaluation rank = comm.Get_rank() size = comm.Get_size() env_fns, env_names = create_eval_envs() env = AllowBacktracking(env_fns[rank](stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) reward_hist = [] total_steps = 0 def _handle_ep(steps, rew, env_rewards): nonlocal total_steps total_steps += steps reward_hist.append(rew) if total_steps % 1 == 0: avg_score = sum(reward_hist[-100:]) / len(reward_hist[-100:]) # Global Score global_score = np.zeros(1) local_score = np.array(avg_score) print("Local Score for " + env_names[rank] + " at episode " + str(len(reward_hist)) + " with timesteps: " + str(total_steps) + ": " + str(local_score)) comm.Allreduce(local_score, global_score, op=MPI.SUM) global_score /= size if rank == 0: print("Global Average Score at episode: " + str(len(reward_hist)) + ": " + str(global_score)) dqn.train( num_steps=2000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000, handle_ep=_handle_ep, save_interval=None, restore_path= './checkpoints_rainbow/model-10' # Model to be evaluated )
def main():
    """Run DQN until the environment throws an exception."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore', '-restore', action='store_true',
                        help='restore from checkpoint file')
    parser.add_argument('--record', '-record', action='store_true',
                        help='record bk2 movies')
    args = parser.parse_args()

    env = AllowBacktracking(make_env(stack=False, scale_rew=False, record=args.record))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)

    checkpoint_dir = os.path.join(os.getcwd(), 'results')
    results_dir = os.path.join(os.getcwd(), 'results',
                               time.strftime("%d-%m-%Y_%H-%M-%S"))
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    summary_writer = tf.summary.FileWriter(results_dir)
    # TODO
    # env = wrappers.Monitor(env, results_dir, force=True)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        saver = tf.train.Saver()
        if args.restore:
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            if latest_checkpoint:
                print("Loading model checkpoint {} ...\n".format(latest_checkpoint))
                saver.restore(sess, latest_checkpoint)
            else:
                print("Checkpoint not found")

        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        # Runs with every completed episode.
        def _handle_ep(steps, rew):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)

            summary_reward = tf.Summary()
            summary_reward.value.add(tag='global/reward', simple_value=rew)
            summary_writer.add_summary(summary_reward, global_step=total_steps)

            print('save model')
            saver.save(sess=sess, save_path=checkpoint_dir + '/model',
                       global_step=total_steps)

            if len(reward_hist) == REWARD_HISTORY:
                print('%d steps: mean=%f' %
                      (total_steps, sum(reward_hist) / len(reward_hist)))
                summary_meanreward = tf.Summary()
                summary_meanreward.value.add(tag='global/mean_reward',
                                             simple_value=sum(reward_hist) / len(reward_hist))
                summary_writer.add_summary(summary_meanreward, global_step=total_steps)
                reward_hist.clear()

        dqn.train(num_steps=7000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000,
                  handle_ep=_handle_ep)
def main(): """Run DQN until the environment throws an exception.""" print('creating env') env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 print('starting tf session') with tf.Session(config=config) as sess: print('creating agent') online_net, target_net = rainbow_models(sess, env.action_space.n, gym_space_vectorizer( env.observation_space), min_val=-200, max_val=200) dqn = DQN(online_net, target_net) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) train_steps = 5000 print('training steps:', train_steps) for j in range(1): print(j) start = time.time() dqn.train( num_steps= train_steps, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=10000) end = time.time() print(end - start) print('done training') print('save nn') save_path = saver.save(sess, "saved_models/rainbow5.ckpt") print("Model saved in path: %s" % save_path) tvars = tf.trainable_variables() tvars_vals = sess.run(tvars) #for var, val in zip(tvars, tvars_vals): # print(var.name, val[0]) #print(tvars_vals[0][-5:]) #print('stepping') #obs = env.reset() #online_net.step(obs, obs) '''
def prep_env(env):
    env = AllowBacktracking(env)
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    return env
def main():
    """Run DQN until the environment throws an exception."""
    discount = os.environ.get('RETRO_DISCOUNT')
    if discount is not None:
        discount = float(discount)
    else:
        discount = 0.99
    print("DISCOUNT: %s" % (discount,))

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    config.log_device_placement = False
    with tf.Session(config=config) as sess:
        state_encoder = StateEncoder(sess)
        env = make_batched_env()
        env_ids = env.env_ids
        env = BatchedFrameStack(env, num_images=4, concat=True)
        env.env_ids = env_ids
        env = ExplorationBatchedEnv(env, Exploration, state_encoder=state_encoder)

        if 'RETRO_POLICY_DIR' in os.environ:
            expert = PolicyExpert(sess, batch_size=1,
                                  policy_dir=os.environ['RETRO_POLICY_DIR'])
        elif 'RETRO_NOEXPERT' not in os.environ:
            expert = RandomMoveExpert()
        else:
            expert = None

        if os.environ['RETRO_DQN'] == 'soft_noisy_net':
            dqn = DQN(*soft_noisy_net_models(
                sess,
                env.action_space.n,
                gym_space_vectorizer(env.observation_space),
                discount=discount,  # 0.99
                expert=expert))
        elif os.environ['RETRO_DQN'] == 'soft_rainbow':
            dqn = DQN(*soft_rainbow_models(
                sess,
                env.action_space.n,
                gym_space_vectorizer(env.observation_space),
                num_atoms=101,
                min_val=-1000,  # -200
                max_val=1000,  # 200
                discount=discount,  # 0.99
                expert=expert))

        if "RETRO_CHECKPOINT_DIR" in os.environ:
            scheduler_saver = ScheduledSaver(
                sess, os.environ["RETRO_CHECKPOINT_DIR"] + "/tensorflow/")
        else:
            scheduler_saver = None

        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        if 'RETRO_INIT_DIR' in os.environ:
            saver = tf.train.Saver(var_list=list(
                filter(
                    lambda v: 'sigma' not in v.name
                    and 'dqn_model/noisy_layer_1' not in v.name
                    and 'dqn_model/noisy_layer_2' not in v.name,
                    tf.trainable_variables('^dqn_model/'))))
            latest_checkpoint = tf.train.latest_checkpoint(os.environ['RETRO_INIT_DIR'])
            print("DQN_INIT_CHECKPOINT: %s" % (latest_checkpoint,))
            saver.restore(sess, latest_checkpoint)
            # from tensorflow.python.tools import inspect_checkpoint as chkp
            # chkp.print_tensors_in_checkpoint_file(latest_checkpoint, '', all_tensors=True)

        state_encoder.initialize()
        if expert:
            expert.initialize()

        replay_buffer = PrioritizedReplayBuffer(
            int(os.environ.get("RETRO_DQN_BUFFER_SIZE", 250000)), 0.5, 0.4, epsilon=0.1)

        dqn.train(num_steps=1000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=replay_buffer,
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=int(os.environ.get("RETRO_DQN_TARGET_INTERVAL", 8192)),
                  batch_size=32,
                  min_buffer_size=int(os.environ.get('RETRO_DQN_MIN_BUFFER_SIZE', 20000)),
                  handle_ep=lambda steps, rew: scheduler_saver.handle_episode(steps)
                  if scheduler_saver is not None else None)