def nature_cnn(sess, env, stochastic=False, virtual_bn=False):
    """Create a CNN policy for a game environment."""
    if not virtual_bn:
        return CNN(sess,
                   gym_space_distribution(env.action_space),
                   gym_space_vectorizer(env.observation_space),
                   stochastic)
    return NormalizedCNN(sess,
                         gym_space_distribution(env.action_space),
                         gym_space_vectorizer(env.observation_space),
                         stochastic,
                         env)
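# Usage sketch for the factory above; `build_env` is a hypothetical stand-in
# for whatever environment constructor the surrounding project provides.
def demo_nature_cnn(build_env):
    env = build_env()
    with tf.Session() as sess:
        # Pass virtual_bn=True to get the NormalizedCNN variant instead.
        return nature_cnn(sess, env, stochastic=True)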
def make_net(name):
    # Relies on `sess` and `env` from the enclosing scope.
    return MLPQNetwork(sess,
                       env.action_space.n,
                       gym_space_vectorizer(env.observation_space),
                       name,
                       layer_sizes=[32])
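# Usage sketch: this closure is typically called twice to build matched
# online/target networks under distinct variable scopes, the same pattern
# used by the CartPole DQN entry-point later in this file:
#
#     dqn = DQN(make_net('online'), make_net('target'))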
def main(): """Run DQN until the environment throws an exception.""" base_path = "results/rainbow/6/" env = make_env(stack=False, scale_rew=False, render=None, monitor=base_path + "train_monitor", episodic_life=True) # I think the env itself allows Backtracking env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.per_process_gpu_memory_fraction = 0.8 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) saver = tf.train.Saver(name="rainbow") sess.run(tf.global_variables_initializer()) saver.save(sess, base_path + "training", global_step=0) try: dqn.train(num_steps=2_000_000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=64, min_buffer_size=20000, handle_ep=handle_ep) # in seconds except KeyboardInterrupt: print("keyboard interrupt") print("finishing") saver.save(sess, base_path + "final", global_step=2_000_000)
def learn_cartpole():
    """Train an agent."""
    env = gym.make('CartPole-v0')
    try:
        agent = ActorCritic(gym_space_distribution(env.action_space),
                            gym_space_vectorizer(env.observation_space))
        with tf.Session() as sess:
            a2c = A2C(sess, agent, target_kl=0.03)
            roller = BasicRoller(env, agent, min_episodes=8, min_steps=1024)
            while True:
                with agent.frozen():
                    rollouts = roller.rollouts()
                print('mean=%f' % (mean_total_reward(rollouts),))
                agent.actor.extend(
                    a2c.policy_update(rollouts, STEP_SIZE, NUM_STEPS, min_leaf=30))
                agent.critic.extend(
                    a2c.value_update(rollouts, VAL_STEP, NUM_STEPS, min_leaf=30))
    finally:
        env.close()
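# STEP_SIZE, NUM_STEPS, and VAL_STEP are module-level hyperparameters not
# shown in this snippet. Plausible placeholder values (assumptions, not the
# original settings):
STEP_SIZE = 0.1   # shrinkage for actor (policy) updates
VAL_STEP = 0.1    # shrinkage for critic (value) updates
NUM_STEPS = 10    # update iterations per batch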
def main():
    with tf.Session() as sess:
        print('Creating environment...')
        env = TFBatchedEnv(sess, Pong(), 1)
        env = BatchedFrameStack(env)
        print('Creating model...')
        model = CNN(sess,
                    gym_space_distribution(env.action_space),
                    gym_space_vectorizer(env.observation_space))
        print('Creating roller...')
        roller = TruncatedRoller(env, model, 1)
        print('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        if os.path.exists('params.pkl'):
            print('Loading parameters...')
            with open('params.pkl', 'rb') as in_file:
                params = pickle.load(in_file)
            for var, val in zip(tf.trainable_variables(), params):
                sess.run(tf.assign(var, val))
        else:
            print('Warning: parameter file does not exist!')
        print('Running agent...')
        viewer = SimpleImageViewer()
        while True:
            for obs in roller.rollouts()[0].step_observations:
                viewer.imshow(obs[..., -3:])
def main(): """Run DQN until the environment throws an exception.""" env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1') env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) dqn.train( num_steps=2000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000)
def main(): """Run DQN until the environment throws an exception.""" env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) dqn.train( num_steps=2000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=StochasticMaxStochasticDeltaDeletionPRB(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000)
def main():
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-421,
                                  max_val=421))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=64,
                  batch_size=32,
                  min_buffer_size=25000)
def main(): """Run DQN until the environment throws an exception.""" env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1') env = AllowBacktracking(make_local_env(env, stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) dqn.train(num_steps=num_steps, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000) print(tf.trainable_variables()) save_path='/home/noob/retro-noob/rainbow/params/params' utils.save_state(save_path+'_tf_saver') with tf.variable_scope('model'): params = tf.trainable_variables() ps = sess.run(params) joblib.dump(ps, save_path + '_joblib')
def run_ppo():
    """Run a training worker."""
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)
    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])
        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)
        roller = BasicRoller(env, model, min_episodes=30)
        ppo = PPO(model)
        optimizer = MPIOptimizer(tf.train.AdamOptimizer(learning_rate=1e-3),
                                 -ppo.objective)
        sess.run(tf.global_variables_initializer())
        optimizer.sync_from_root(sess)
        for i in range(50):
            rollouts = roller.rollouts()
            # pylint: disable=E1101
            print('batch %d: rank=%d mean=%f' %
                  (i, MPI.COMM_WORLD.Get_rank(), mean_total_reward(rollouts)))
            mpi_ppo(ppo, optimizer, rollouts, log_fn=print)
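# This worker is intended to run once per MPI rank; a typical launch (the
# script name is an assumption):
#
#     mpirun -n 4 python run_ppo.py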
def simple_mlp(sess, env, stochastic=False):
    """Create a simple MLP policy for the environment."""
    return MLP(sess,
               gym_space_distribution(env.action_space),
               gym_space_vectorizer(env.observation_space),
               stochastic,
               (32, 32))
def main(): """ Entry-point for the program. """ env = gym.make('CartPole-v0') with tf.Session() as sess: make_net = lambda name: MLPQNetwork(sess, env.action_space.n, gym_space_vectorizer( env.observation_space), name, layer_sizes=[32]) dqn = DQN(make_net('online'), make_net('target')) player = BasicPlayer(env, EpsGreedyQNetwork(dqn.online_net, EPSILON), batch_size=STEPS_PER_UPDATE) optimize = dqn.optimize(learning_rate=LEARNING_RATE) sess.run(tf.global_variables_initializer()) dqn.train(num_steps=30000, player=player, replay_buffer=UniformReplayBuffer(BUFFER_SIZE), optimize_op=optimize, target_interval=200, batch_size=64, min_buffer_size=200, handle_ep=lambda _, rew: print('got reward: ' + str(rew))) env.close()
def create_model(args, sess):
    act_space = gym.spaces.MultiBinary(args.act_size)
    obs_space = gym.spaces.Box(low=0, high=0xff,
                               shape=[args.obs_size] * 2 + [3],
                               dtype='uint8')
    return CNN(sess,
               gym_space_distribution(act_space),
               gym_space_vectorizer(obs_space))
def learn_setup(env_id=None, timesteps=int(5e6), env_name=None, param_scale=1,
                name="test", expnum=0, env=None, n_episodes=None,
                n_steps_per_episode=None, reward_threshold=0, CMA_mu=None,
                CMA_cmean=None, CMA_rankmu=None, CMA_rankone=None,
                log_file=None):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    if env_id is None:
        env_id = env_name
    if env is None:
        env = make_vec_env(env_id, "mujoco", 1, None,
                           reward_scale=1.0, flatten_dict_observations=True)
    if log_file is None:
        log_file = os.path.join('results',
                                "recent" + name + "_" + str(expnum) + ".monitor.csv")
    log_npy = os.path.join('results', "recent" + name + "_" + str(expnum) + ".npy")
    # env = LoggedEnv(env, log_file, log_npy)
    model = ContinuousMLP(sess, env.action_space,
                          gym_space_vectorizer(env.observation_space))
    roller = BasicRoller(env, model, min_episodes=1, min_steps=n_steps_per_episode)
    sess.run(tf.global_variables_initializer())
    trainer = CMATrainer(sess, scale=param_scale, CMA_mu=CMA_mu,
                         CMA_cmean=CMA_cmean, CMA_rankmu=CMA_rankmu,
                         CMA_rankone=CMA_rankone)  # , popsize=n_episodes
    rewards = []
    local_variables = {
        'roller': roller,
        'trainer': trainer,
        'env_id': env_id,
        'reward_threshold': reward_threshold,
        'rewards': rewards,
    }
    return local_variables
def main(): """Run DQN until the environment throws an exception.""" env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) """ Create a TF Op that optimizes the objective. Args: learning_rate: the Adam learning rate. epsilon: the Adam epsilon. """ optimize = dqn.optimize(learning_rate=6.25e-5, epsilon=1.5e-4) sess.run(tf.global_variables_initializer()) """ Run an automated training loop. This is meant to provide a convenient way to run a standard training loop without any modifications. You may get more flexibility by writing your own training loop. Args: num_steps: the number of timesteps to run. player: the Player for gathering experience. replay_buffer: the ReplayBuffer for experience. optimize_op: a TF Op to optimize the model. train_interval: timesteps per training step. target_interval: number of timesteps between target network updates. batch_size: the size of experience mini-batches. min_buffer_size: minimum replay buffer size before training is performed. tf_schedules: a sequence of TFSchedules that are updated with the number of steps taken. handle_ep: called with information about every completed episode. timeout: if set, this is a number of seconds after which the training loop should exit. """ dqn.train( num_steps=1000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000)
def build_network(self, sess, name):
    layer_sizes = [self.args['layer_1_size'], self.args['layer_2_size']]
    if self.args['has_third_layer']:
        layer_sizes.append(geom_mean(self.args['layer_2_size'],
                                     self.env.action_space.n))
    return MLPQNetwork(sess,
                       self.env.action_space.n,
                       gym_space_vectorizer(self.env.observation_space),
                       name,
                       layer_sizes=layer_sizes)
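# `geom_mean` is not defined in this snippet. A minimal sketch (assumed): the
# integer geometric mean, a common heuristic for sizing a layer between two
# known widths.
import math

def geom_mean(a, b):
    return int(round(math.sqrt(a * b)))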
def main(): """Run DQN until the environment throws an exception.""" # "results/rainbow/2/videos/6" save_dir = "results/rainbow/7/val_monitor/2" env = make_env(stack=False, scale_rew=False, render=60, monitor=save_dir, timelimit=False, episodic_life=False, single_life=True, video=lambda id: True) # env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() with tf.Session(config=config) as sess: saver = tf.train.import_meta_graph( "results/rainbow/7/final-4000000.meta", clear_devices=True) # saver.restore(sess, tf.train.latest_checkpoint('results/rainbow/2')) saver.restore(sess, 'results/rainbow/7/final-4000000') model = LoadedNetwork(sess, gym_space_vectorizer(env.observation_space)) # rebuild the online_net form the saved model # type <anyrl.models.dqn_dist.NatureDistQNetwork object at ???> player = NStepPlayer(BatchedPlayer(env, model), 3) with tf.device("/cpu"): # sess.run(tf.global_variables_initializer()) try: for episode_index in tqdm(range(40), unit="episode"): axes = make_axes() plotter = RewardPlotter(axes, save_period=40, render_period=600, max_entries=600) for i in count(): trajectories = player.play() end_of_episode = False current_total_reward = None for trajectory in trajectories: current_total_reward = trajectory["total_reward"] if trajectory["is_last"]: end_of_episode = True plotter.update(current_total_reward, step=i) if end_of_episode: # plt.show() plotter.render() plotter.save_file("{}/e{}.pdf".format( save_dir, episode_index)) plotter.close() break except KeyboardInterrupt: env.close() plt.close()
def __init__(self, *args, **kwargs):
    super(ACTest, self).__init__(*args, **kwargs)
    self.session = tf.Session()
    env = TupleCartPole()
    try:
        action_space = env.action_space
        observation_space = env.observation_space
    finally:
        env.close()
    self.action_dist = gym_space_distribution(action_space)
    self.obs_vectorizer = gym_space_vectorizer(observation_space)
def main(): """Run DQN until the environment throws an exception.""" env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) # Other exploration schedules #eps_decay_sched = LinearTFSchedule(50000, 1.0, 0.01) #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, 0.1)), 3) #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3) #player = NStepPlayer(BatchedPlayer(env, SonicEpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) reward_hist = [] total_steps = 0 def _handle_ep(steps, rew, env_rewards): nonlocal total_steps total_steps += steps reward_hist.append(rew) if total_steps % 10 == 0: print('%d episodes, %d steps: mean of last 100 episodes=%f' % (len(reward_hist), total_steps, sum(reward_hist[-100:]) / len(reward_hist[-100:]))) dqn.train( num_steps=2000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000, tf_schedules=[eps_decay_sched], handle_ep=_handle_ep, restore_path='./pretrained_model', save_interval=None, )
def build_network(self, sess, name):
    # Assumption: `layer_sizes` was undefined in this snippet; the hidden-layer
    # widths are taken from self.args, as in the MLPQNetwork variant of
    # build_network above.
    layer_sizes = [self.args['layer_1_size'], self.args['layer_2_size']]
    return MLPDistQNetwork(sess,
                           self.env.action_space.n,
                           gym_space_vectorizer(self.env.observation_space),
                           name,
                           51, -10, 10,
                           layer_sizes=layer_sizes,
                           dueling=True,
                           dense=partial(noisy_net_dense, sigma0=self.args['sigma0']))
def main():
    """Run DQN until the environment throws an exception."""
    env_name = 'MineRLNavigateDense-v0'
    base_env = [SimpleNavigateEnvWrapper(get_env(env_name)) for _ in range(1)]
    env = BatchedFrameStack(BatchedGymEnv([base_env]), num_images=4, concat=True)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        online, target = mine_rainbow_online_target(mine_cnn, sess,
                                                    env.action_space.n,
                                                    gym_space_vectorizer(env.observation_space),
                                                    min_val=-200,
                                                    max_val=200)
        dqn = DQN(online, target)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        # Pre-fill the replay buffer with expert demonstrations.
        buffer_capacity = 5000
        replay_buffer = PrioritizedReplayBuffer(buffer_capacity, 0.5, 0.4, epsilon=0.1)
        traj_iter = non_bugged_data_arr(env_name, num_trajs=100)
        expert_player = NStepPlayer(ImitationPlayer(traj_iter, 200), 3)
        for traj in expert_player.play():
            replay_buffer.add_sample(traj, init_weight=1)

        print('starting training')
        dqn.train(num_steps=200,
                  player=player,
                  replay_buffer=replay_buffer,
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)
        print('starting eval')
        # Reset the player's cached state before evaluation.
        player._cur_states = None
        score = evaluate(player)
        print(score)
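# `evaluate` is not defined in this snippet. A minimal sketch (assumed): run
# the player until one episode completes and return its total reward, using
# the same trajectory keys ("is_last", "total_reward") seen elsewhere in this
# file.
def evaluate(player):
    while True:
        for traj in player.play():
            if traj["is_last"]:
                return traj["total_reward"]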
def run_ac_test(maker):
    """Run a test given a model constructor."""
    env = TupleCartPole()
    try:
        action_space = env.action_space
        observation_space = env.observation_space
    finally:
        env.close()
    action_dist = gym_space_distribution(action_space)
    obs_vectorizer = gym_space_vectorizer(observation_space)
    ModelTester(lambda sess: maker(sess, action_dist, obs_vectorizer)).test_all()
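# Usage sketch (assumed): any constructor with the
# (sess, action_dist, obs_vectorizer) signature can be tested, e.g. the MLP
# model used elsewhere in this file.
def test_mlp():
    run_ac_test(lambda sess, dist, vec: MLP(sess, dist, vec, layer_sizes=[32]))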
def main(): """Run DQN until the environment throws an exception.""" env_fns, env_names = create_envs() env = BatchedFrameStack(batched_gym_env(env_fns), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) # Use ADAM sess.run(tf.global_variables_initializer()) reward_hist = [] total_steps = 0 def _handle_ep(steps, rew, env_rewards): nonlocal total_steps total_steps += steps reward_hist.append(rew) if total_steps % 1 == 0: print('%d episodes, %d steps: mean of last 100 episodes=%f' % (len(reward_hist), total_steps, sum(reward_hist[-100:]) / len(reward_hist[-100:]))) dqn.train( num_steps= 2000000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000, handle_ep=_handle_ep, num_envs=len(env_fns), save_interval=10, )
def main(): """ Entry-point for the program. """ args = _parse_args() env = batched_gym_env([partial(make_single_env, args.game)] * args.workers) # Using BatchedFrameStack with concat=False is more # memory efficient than other stacking options. env = BatchedFrameStack(env, num_images=4, concat=False) with tf.Session() as sess: make_net = lambda name: NatureQNetwork( sess, env.action_space.n, gym_space_vectorizer(env.observation_space), name, dueling=True) dqn = DQN(make_net('online'), make_net('target')) player = BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, args.epsilon)) optimize = dqn.optimize(learning_rate=args.lr) sess.run(tf.global_variables_initializer()) reward_hist = [] total_steps = 0 def _handle_ep(steps, rew): nonlocal total_steps total_steps += steps reward_hist.append(rew) if len(reward_hist) == REWARD_HISTORY: print('%d steps: mean=%f' % (total_steps, sum(reward_hist) / len(reward_hist))) reward_hist.clear() dqn.train(num_steps=int(1e7), player=player, replay_buffer=UniformReplayBuffer(args.buffer_size), optimize_op=optimize, target_interval=args.target_interval, batch_size=args.batch_size, min_buffer_size=args.min_buffer_size, handle_ep=_handle_ep) env.close()
def main(): """Run DQN until the environment throws an exception.""" # "results/rainbow/2/videos/6" env = make_env(stack=False, scale_rew=False, render=20, monitor=None, timelimit=False) # env = AllowBacktracking(make_env(stack=False, scale_rew=False)) # TODO we might not want to allow backtracking, it kinda hurts in mario env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 config.gpu_options.per_process_gpu_memory_fraction = 0.6 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) # TODO rebuild the online_net form the saved model # type <anyrl.models.dqn_dist.NatureDistQNetwork object at ???> # important methods # model = dqn.online_net player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) with tf.device("/cpu"): # sess.run(tf.global_variables_initializer()) vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) try: for i in tqdm(range(100000)): trajectories = player.play() for trajectori in trajectories: trajectori pass except KeyboardInterrupt: env.close()
def learn_pong():
    """Train an agent."""
    env = batched_gym_env([make_single_env] * NUM_WORKERS)
    try:
        agent = ActorCritic(gym_space_distribution(env.action_space),
                            gym_space_vectorizer(env.observation_space))
        with tf.Session() as sess:
            a2c = A2C(sess, agent, target_kl=TARGET_KL)
            roller = TruncatedRoller(env, agent, HORIZON)
            total_steps = 0
            rewards = []
            print("Training... Don't expect progress for ~400K steps.")
            while True:
                with agent.frozen():
                    rollouts = roller.rollouts()
                for rollout in rollouts:
                    total_steps += rollout.num_steps
                    if not rollout.trunc_end:
                        rewards.append(rollout.total_reward)
                agent.actor.extend(
                    a2c.policy_update(rollouts, POLICY_STEP, NUM_STEPS,
                                      min_leaf=MIN_LEAF, feature_frac=FEATURE_FRAC))
                agent.critic.extend(
                    a2c.value_update(rollouts, VALUE_STEP, NUM_STEPS,
                                     min_leaf=MIN_LEAF, feature_frac=FEATURE_FRAC))
                if rewards:
                    print('%d steps: mean=%f' %
                          (total_steps, sum(rewards[-10:]) / len(rewards[-10:])))
                else:
                    print('%d steps: no episodes complete yet' % total_steps)
    finally:
        env.close()
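# The module-level hyperparameters used above are not shown. Plausible
# placeholder values (assumptions, not the original settings):
NUM_WORKERS = 8      # parallel environments
TARGET_KL = 0.01     # KL threshold for the A2C update
HORIZON = 128        # truncated rollout length
NUM_STEPS = 10       # update iterations per batch
POLICY_STEP = 0.1    # shrinkage for actor updates
VALUE_STEP = 0.1     # shrinkage for critic updates
MIN_LEAF = 30        # minimum samples per tree leaf
FEATURE_FRAC = 0.5   # fraction of features sampled per tree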
def main(): """Run DQN until the environment throws an exception.""" env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() saver.restore(sess, "/root/compo/model.ckpt") #print('model restored') replay_buffer = pickle.load( gzip.open('/root/compo/replay_buffer.p.gz', 'rb')) replay_buffer.alpha = 0.2 replay_buffer.beta = 0.4 replay_buffer.capacity = 100000 restore_ppo2_weights(sess) dqn.train( num_steps=2000000, # Make sure an exception arrives before we stop. player=player, replay_buffer= replay_buffer, #PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=4, target_interval=8192, batch_size=32, min_buffer_size=20000)
def main():
    with tf.Session() as sess:
        print('Creating environment...')
        env = TFBatchedEnv(sess, Pong(), 8)
        env = BatchedFrameStack(env)
        print('Creating model...')
        model = CNN(sess,
                    gym_space_distribution(env.action_space),
                    gym_space_vectorizer(env.observation_space))
        print('Creating roller...')
        roller = TruncatedRoller(env, model, 128)
        print('Creating PPO graph...')
        ppo = PPO(model)
        optimize = ppo.optimize(learning_rate=3e-4)
        print('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        print('Training agent...')
        for i in count():
            rollouts = roller.rollouts()
            for rollout in rollouts:
                if not rollout.trunc_end:
                    print('reward=%f steps=%d' % (rollout.total_reward, rollout.total_steps))
            total_steps = sum(r.num_steps for r in rollouts)
            ppo.run_optimize(optimize, rollouts,
                             batch_size=total_steps // 4,
                             num_iter=12,
                             log_fn=print)
            if i % 5 == 0:
                print('Saving...')
                parameters = sess.run(tf.trainable_variables())
                with open('params.pkl', 'wb+') as out_file:
                    pickle.dump(parameters, out_file)
def run_algorithm(algo_name):
    """Run the specified training algorithm."""
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)
    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])
        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)
        roller = BasicRoller(env, model, min_episodes=30)
        inner_loop = algorithm_inner_loop(algo_name, model)
        sess.run(tf.global_variables_initializer())
        print('running algorithm:', algo_name)
        for i in range(50):
            rollouts = roller.rollouts()
            print('batch %d: mean=%f' % (i, mean_total_reward(rollouts)))
            inner_loop(rollouts)
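# `algorithm_inner_loop` is not defined in this snippet. A rough sketch of a
# compatible factory (assumed; the real mapping of names to update rules may
# differ), reusing the PPO call pattern seen elsewhere in this file:
def algorithm_inner_loop(algo_name, model):
    if algo_name == 'ppo':
        ppo = PPO(model)
        optimize = ppo.optimize(learning_rate=3e-4)
        return lambda rollouts: ppo.run_optimize(optimize, rollouts,
                                                 batch_size=128, num_iter=12)
    raise ValueError('unknown algorithm: ' + algo_name)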
def training_loop(env_id=None, timesteps=int(5e6), param_scale=1, log_file=None):
    """Run CMA on the environment."""
    if log_file is None:
        log_file = os.path.join('results', env_id + '.monitor.csv')
    env = LoggedEnv(gym.make(env_id), log_file)
    with tf.Session() as sess:
        model = ContinuousMLP(sess, env.action_space,
                              gym_space_vectorizer(env.observation_space))
        roller = BasicRoller(env, model, min_episodes=4, min_steps=500)
        sess.run(tf.global_variables_initializer())
        trainer = CMATrainer(sess, scale=param_scale)
        steps = 0
        rewards = []
        while steps < timesteps:
            sub_steps, sub_rewards = trainer.train(roller)
            steps += sub_steps
            rewards.extend(sub_rewards)
            print('%s: steps=%d mean=%f batch_mean=%f' %
                  (env_id, steps, np.mean(rewards), np.mean(sub_rewards)))
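# Usage sketch (the environment ID and budget are assumptions):
if __name__ == '__main__':
    training_loop(env_id='Hopper-v2', timesteps=int(1e6))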