def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None,
              env_processes=True):
    """Recover checkpoint and render videos from it.

    Args:
      logdir: Logging directory of the trained algorithm.
      outdir: Directory to store rendered videos in.
      num_agents: Number of environments to simulate in parallel.
      num_episodes: Total number of episodes to simulate.
      checkpoint: Checkpoint name to load; defaults to most recent.
      env_processes: Whether to step environments in separate processes.
    """
    config = utility.load_config(logdir)
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config, outdir),
            num_agents, env_processes)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        total_steps = num_episodes * config.max_length
        loop = _define_loop(graph, total_steps)
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'global_step'))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint, resume=True)
        for unused_score in loop.run(sess, saver, total_steps):
            pass
    batch_env.close()
def run_setting(config, checkpoint, sampling_index, latency_index, num_episodes):
    """Simulate a trained policy under one sampling/latency setting and return rewards."""
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config, sampling_index, latency_index),
            num_agents=1, env_processes=False)
        # batch_env.sampling_interval = sampling_interval
        # batch_env.latency = latency
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        total_steps = num_episodes * config.max_length
        loop = _define_loop(graph, total_steps)
    saver = utility.define_saver(exclude=(r'.*_temporary.*', r'global_step'))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint, resume=True)
        for unused_score in loop.run(sess, saver, total_steps):
            pass
    tf.reset_default_graph()
    return batch_env.rewards_list
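# Usage sketch (illustrative only, not from the original sources): sweep
# run_setting() over a grid of sampling and latency indices and report the mean
# episode reward of each setting. The logdir path, index ranges, and episode
# count below are placeholder assumptions; utility.load_config() is used as in
# visualize() above.
def _example_run_setting_sweep(logdir='logdir/my-run', num_episodes=5):
    config = utility.load_config(logdir)
    results = {}
    for sampling_index in range(3):
        for latency_index in range(3):
            rewards = run_setting(config, checkpoint=None,
                                  sampling_index=sampling_index,
                                  latency_index=latency_index,
                                  num_episodes=num_episodes)
            results[(sampling_index, latency_index)] = sum(rewards) / len(rewards)
    return results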
def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None,
              env_processes=True):
    """Recover checkpoint and render videos from it.

    Args:
      logdir: Logging directory of the trained algorithm.
      outdir: Directory to store rendered videos in.
      num_agents: Number of environments to simulate in parallel.
      num_episodes: Total number of episodes to simulate.
      checkpoint: Checkpoint name to load; defaults to most recent.
      env_processes: Whether to step environments in separate processes.
    """
    config = utility.load_config(logdir)
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config, outdir),
            num_agents, env_processes)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        total_steps = num_episodes * config.max_length
        loop = _define_loop(graph, total_steps)
    saver = utility.define_saver(
        exclude=(r'.*_temporary/.*', r'global_step'))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint, resume=True)
        for unused_score in loop.run(sess, saver, total_steps):
            pass
    batch_env.close()
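# Usage sketch (paths and counts are placeholder assumptions): render five
# episodes from the most recent checkpoint of a finished training run, stepping
# a single environment in-process so rendering stays in the main process.
def _example_visualize(logdir='logdir/my-run', outdir='videos/my-run'):
    visualize(logdir=logdir, outdir=outdir, num_agents=1, num_episodes=5,
              checkpoint=None, env_processes=False)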
def testing(config, off_data, off_label, def_data, def_label, outdir):
    """Restore a pretrained model and visualize results on train/eval splits.

    Args
    ----
    config : Object providing configurations via attributes.
    off_data, off_label : Offense data and labels.
    def_data, def_label : Defense data and labels.
    outdir : Directory to store the visualization results in.
    """
    # split into train and eval
    off_train_data, off_eval_data = np.split(off_data, [off_data.shape[0] * 9 // 10])
    off_train_label, off_eval_label = np.split(off_label, [off_data.shape[0] * 9 // 10])
    def_train_data, def_eval_data = np.split(def_data, [def_data.shape[0] * 9 // 10])
    def_train_label, def_eval_label = np.split(def_label, [def_data.shape[0] * 9 // 10])
    print(off_train_data.shape)
    print(off_eval_data.shape)
    print(off_train_label.shape)
    print(off_eval_label.shape)
    print(def_train_data.shape)
    print(def_eval_data.shape)
    print(def_train_label.shape)
    print(def_eval_label.shape)
    # graph
    tf.reset_default_graph()
    if FLAGS.config == 'offense':
        model = pretrain_model.PretrainOffense(config)
    elif FLAGS.config == 'defense':
        model = pretrain_model.PretrainDefense(config)
    else:
        raise ValueError('{} is not an available config'.format(FLAGS.config))
    message = 'Graph contains {} trainable variables.'
    tf.logging.info(message.format(tools.count_weights()))
    saver = utility.define_saver()
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        if FLAGS.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type)
        utility.initialize_variables(sess, saver, config.logdir, resume=True)
        vis_result(sess, model, off_train_data, off_train_label, def_train_data,
                   def_train_label, os.path.join(outdir, 'train'), 3)
        vis_result(sess, model, off_eval_data, off_eval_label, def_eval_data,
                   def_eval_label, os.path.join(outdir, 'eval'), 3)
def testing(config, env_processes, outdir):
    """Restore a trained policy and collect evaluation results.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, dummy_env)
    # TF Session
    # NOTE: _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint=FLAGS.checkpoint,
            resume=FLAGS.resume)
        # testing
        collect_results(config, sess.run(graph.algo.D._steps), ppo_policy,
                        graph.algo.D, denormalize_observ)
    batch_env.close()
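# Minimal sketch of the scaling convention behind denormalize_observ() above
# (assuming scalar low/high bounds): values are mapped into [-1, 1] by the
# forward transform used in the training functions, and mapped back to the
# original range by the exact inverse. The helper names and sample numbers are
# placeholders for illustration.
def _normalize(x, low, high):
    return 2.0 * (x - low) / (high - low) - 1.0

def _denormalize(x, low, high):
    return (x + 1.0) * (high - low) / 2.0 + low

# Round trip: _denormalize(_normalize(3.0, 0.0, 10.0), 0.0, 10.0) == 3.0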
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.

    Yields:
      Evaluation scores.
    """
    tf.reset_default_graph()
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        # batch_env = utility.define_batch_env(
        #     lambda: _create_environment(config),
        #     config.num_agents, env_processes)
        config_envs = []
        for i in range(config.num_agents):
            config_envs.append(lambda i: _create_environment2(config, i))
        batch_env = utility.define_batch_env2(config_envs, env_processes)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary.*',))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(sess, saver, config.logdir)
        for score in loop.run(sess, saver, total_steps):
            yield score
    batch_env.close()
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.

    Yields:
      Evaluation scores.
    """
    tf.reset_default_graph()
    with config.unlocked:
        config.network = functools.partial(
            utility.define_network, config.network, config)
        config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
        config.value_optimizer = getattr(tf.train, config.value_optimizer)
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(sess, saver, config.logdir)
        for score in loop.run(sess, saver, total_steps):
            yield score
    batch_env.close()
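# Usage sketch (assumes a config object of the kind produced by the
# accompanying utility/configs module; the logging call is illustrative):
# train() is a generator, so evaluation scores are consumed by iterating it.
def _example_train(config, env_processes=True):
    for score in train(config, env_processes):
        tf.logging.info('Evaluation score: {}'.format(score))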
def _restore_policy(self, network, policy_layers, value_layers, action_size,
                    checkpoint):
    """Restore the PPO policy from a TensorFlow checkpoint.

    Args:
      network: The neural network definition.
      policy_layers: A tuple specifying the number of layers and the number of
        neurons in each layer of the policy network.
      value_layers: A tuple specifying the number of layers and the number of
        neurons in each layer of the value network.
      action_size: The dimension of the action space.
      checkpoint: The checkpoint path.
    """
    observ = self._observ_filter.transform(self.observation_placeholder)
    with tf.variable_scope("network/rnn"):
        self.network = network(policy_layers=policy_layers,
                               value_layers=value_layers,
                               action_size=action_size)
    with tf.variable_scope("temporary"):
        self.last_state = tf.Variable(
            self.network.zero_state(1, tf.float32), False)
        self.sess.run(self.last_state.initializer)
    with tf.variable_scope("network"):
        (mean_action, _, _), new_state = tf.nn.dynamic_rnn(
            self.network, observ[:, None], tf.ones(1), self.last_state,
            tf.float32, swap_memory=True)
        self.mean_action = mean_action
        self.update_state = self.last_state.assign(new_state)
    saver = utility.define_saver(exclude=(r"temporary/.*",))
    saver.restore(self.sess, checkpoint)
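# Usage sketch (hypothetical caller, not part of the original class): once
# _restore_policy() has built self.mean_action and self.update_state, an action
# for a single observation can be obtained by running both tensors together so
# the recurrent state is carried over to the next call. The feed shape assumes
# a batch of one observation.
def _example_act(policy, observation):
    action, _ = policy.sess.run(
        [policy.mean_action, policy.update_state],
        feed_dict={policy.observation_placeholder: observation[None]})
    # mean_action has shape [batch=1, time=1, action_size].
    return action[0, 0]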
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.

    Yields:
      Evaluation scores.
    """
    tf.reset_default_graph()
    with config.unlocked:
        config.network = functools.partial(utility.define_network,
                                           config.network, config)
        config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
        config.value_optimizer = getattr(tf.train, config.value_optimizer)
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes)
        # graph represents the simulation of a single step in all environments.
        graph = utility.define_simulation_graph(batch_env, config.algorithm,
                                                config)
        # Each loop iteration advances the environments by train_steps and then
        # by eval_steps. The multiplier config.max_length ensures that at least
        # config.update_every training episodes (and config.eval_episodes
        # evaluation episodes) are generated per iteration. Both step counts are
        # totals summed over ALL environments.
        train_steps = config.update_every * config.max_length
        eval_steps = config.eval_episodes * config.max_length
        # The loop iterates over mini-batches of training and evaluation
        # episodes, both produced by Monte-Carlo rollouts. Each iteration trains
        # on the training episodes and evaluates the objective on the evaluation
        # episodes. Conceptually:
        #   steps_made = 0
        #   while steps_made < total_steps:
        #     1. Simulate at least config.update_every training episodes and
        #        add the simulated steps to steps_made.
        #     2. Train the model on those training episodes.
        #     3. Simulate at least config.eval_episodes evaluation episodes and
        #        add the simulated steps to steps_made.
        #     4. Evaluate the model on those evaluation episodes.
        loop = _define_loop(graph, config.logdir, train_steps=train_steps,
                            eval_steps=eval_steps, batch_env=batch_env)
        total_steps = int(config.steps / config.update_every *
                          (config.update_every + config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(sess, saver, config.logdir)
        for score in loop.run(sess, saver, total_steps):
            yield score
    batch_env.close()
def train(config, env_processes, outdir):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir)
        graph = utility.define_simulation_graph(batch_env, config.algorithm,
                                                config)
        loop = _define_loop(graph, config.logdir,
                            config.update_every * config.max_length,
                            config.eval_episodes * config.max_length)
        total_steps = int(config.steps / config.update_every *
                          (config.update_every + config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary.*',))
    if FLAGS.off_ckpt and FLAGS.def_ckpt:
        # restore both offense and defense pretrained models
        off_saver = utility.define_saver_with_prefix(
            exclude=(r'.*d_trunk/.*', r'.*value/.*', r'.*two_trunk_gaussian/.*'))
        def_saver = utility.define_saver_with_prefix(
            exclude=(r'.*o_trunk/.*', r'.*value/.*', r'.*two_trunk_gaussian/.*'))
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=config.log_device_placement)
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            if FLAGS.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(
                    sess, ui_type=FLAGS.ui_type)
            utility.initialize_pretrained_variables(
                sess, off_saver, FLAGS.off_ckpt, def_saver, FLAGS.def_ckpt)
            for score in loop.run(sess, saver, total_steps):
                yield score
    else:
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=config.log_device_placement)
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            if FLAGS.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(
                    sess, ui_type=FLAGS.ui_type)
            utility.initialize_variables(sess, saver, config.logdir,
                                         resume=FLAGS.resume)
            print(total_steps)
            for score in loop.run(sess, saver, total_steps):
                yield score
    batch_env.close()
def train(config, data, label, outdir):
    """Training and evaluation entry point for the pretraining models.

    Args
    ----
    config : Object providing configurations via attributes.
    data : Training data.
    label : Training labels.
    outdir : Output directory path.
    """
    # normalization
    env = BBallPretrainEnv()
    min_ = env.observation_space.low
    max_ = env.observation_space.high
    data = 2 * (data - min_) / (max_ - min_) - 1
    # split into train and eval
    train_data, eval_data = np.split(data, [data.shape[0] * 9 // 10])
    train_label, eval_label = np.split(label, [data.shape[0] * 9 // 10])
    print(train_data.shape)
    print(train_label.shape)
    print(eval_data.shape)
    print(eval_label.shape)
    # graph
    tf.reset_default_graph()
    if FLAGS.config == 'offense':
        model = pretrain_model.PretrainOffense(config)
    elif FLAGS.config == 'defense':
        model = pretrain_model.PretrainDefense(config)
    else:
        raise ValueError('{} is not an available config'.format(FLAGS.config))
    # model = config.model(config)
    message = 'Graph contains {} trainable variables.'
    tf.logging.info(message.format(tools.count_weights()))
    saver = utility.define_saver()
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    # summary writers
    train_writter = tf.summary.FileWriter(os.path.join(config.logdir, 'train'),
                                          tf.get_default_graph())
    eval_writter = tf.summary.FileWriter(os.path.join(config.logdir, 'eval'),
                                         tf.get_default_graph())
    with tf.Session(config=sess_config) as sess:
        if FLAGS.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type)
        utility.initialize_variables(sess, saver, config.logdir,
                                     resume=FLAGS.resume)
        for epoch_idx in range(config.num_epochs):
            tf.logging.info('Number of epochs: {}'.format(epoch_idx))
            training(sess, model, train_data, train_label, config, train_writter)
            evaluating(sess, model, eval_data, eval_label, config, eval_writter)
            if (epoch_idx + 1) % config.checkpoint_every == 0:
                tf.gfile.MakeDirs(config.logdir)
                filename = os.path.join(config.logdir, 'model.ckpt')
                saver.save(sess, filename, (epoch_idx + 1) * config.batch_size)
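# Minimal sketch of the 90/10 split used above (array contents are placeholders):
# np.split with a single index returns the leading 90% and the trailing 10% of
# the rows along the first axis.
def _example_split():
    data = np.arange(20).reshape(10, 2)
    train_part, eval_part = np.split(data, [data.shape[0] * 9 // 10])
    return train_part.shape, eval_part.shape  # (9, 2), (1, 2)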
def train(config, env_processes, outdir):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, init_mode=1, fps=config.FPS,
                               if_back_real=False, time_limit=50)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(config.logdir,
                               'gail_testing_{}/'.format(config.train_len)),
        if_back_real=False,
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    # if not os.path.exists(os.path.join(config.logdir, 'gail_testing')):
    #     os.makedirs(os.path.join(config.logdir, 'gail_testing'))
    vanilla_env.data = np.load('bball_strategies/data/GAILEnvData_51.npy')
    # env to generate fake states
    env = gym.make(config.env)
    env = BBallWrapper(env, init_mode=3, fps=config.FPS,
                       if_back_real=config.if_back_real,
                       time_limit=config.max_length)
    env = MonitorWrapper(env,
                         directory=os.path.join(config.logdir, 'gail_training/'),
                         if_back_real=config.if_back_real,
                         # init from dataset in order
                         init_mode=3)
    # Discriminator graph
    with tf.device('/gpu:0'):
        D = Discriminator(config, dummy_env)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir,
            is_gail=config.is_gail)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, env)
    # Data
    all_data = h5py.File(
        'bball_strategies/data/GAILTransitionData_{}.hdf5'.format(
            config.train_len), 'r')
    expert_data, valid_expert_data = np.split(
        all_data['OBS'].value, [all_data['OBS'].value.shape[0] * 9 // 10])
    expert_action, valid_expert_action = np.split(
        all_data['DEF_ACT'].value,
        [all_data['DEF_ACT'].value.shape[0] * 9 // 10])
    print('expert_data', expert_data.shape)
    print('valid_expert_data', valid_expert_data.shape)
    print('expert_action', expert_action.shape)
    print('valid_expert_action', valid_expert_action.shape)
    # TF Session
    # TODO _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0', r'.*Adam.*',
                 r'.*beta.*'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, resume=FLAGS.resume)
        # NOTE: reset variables in the optimizers
        D.reset_optimizer(sess)
        # reset PPO optimizer
        opt_reset = tf.group(
            [v.initializer for v in graph.algo._optimizer.variables()])
        sess.run(opt_reset)
        # visualization stuff
        if FLAGS.tally_only:
            tally_reward_line_chart(config, sess.run(D._global_steps),
                                    ppo_policy, D, denormalize_observ,
                                    normalize_observ, normalize_action)
            exit()
        # GAIL
        cumulate_steps = sess.run(graph.step)
        episode_idx = 0
        valid_episode_idx = 0
        while True:
            if episode_idx > (expert_data.shape[0] -
                              config.episodes_per_batch *
                              config.train_d_per_ppo) or episode_idx == 0:
                episode_idx = 0
                perm_idx = np.random.permutation(expert_data.shape[0])
                expert_data = expert_data[perm_idx]
                expert_action = expert_action[perm_idx]
            if valid_episode_idx > (valid_expert_data.shape[0] -
                                    config.episodes_per_batch) or valid_episode_idx == 0:
                valid_episode_idx = 0
                valid_perm_idx = np.random.permutation(
                    valid_expert_data.shape[0])
                valid_expert_data = valid_expert_data[valid_perm_idx]
                valid_expert_action = valid_expert_action[valid_perm_idx]
            # testing
            if valid_episode_idx % (100 * config.episodes_per_batch) == 0:
                test_policy(config, vanilla_env, sess.run(D._global_steps),
                            ppo_policy, D, denormalize_observ)
            if valid_episode_idx % (1000 * config.episodes_per_batch) == 0:
                tally_reward_line_chart(config, sess.run(D._global_steps),
                                        ppo_policy, D, denormalize_observ,
                                        normalize_observ, normalize_action)
            # train Discriminator
            train_Discriminator(episode_idx, config, expert_data, expert_action,
                                env, ppo_policy, D, normalize_observ,
                                normalize_action)
            if valid_episode_idx % (1000 * config.episodes_per_batch) == 0:
                tally_reward_line_chart(config, sess.run(D._global_steps),
                                        ppo_policy, D, denormalize_observ,
                                        normalize_observ, normalize_action)
            # validate Discriminator
            valid_Discriminator(valid_episode_idx, config, valid_expert_data,
                                valid_expert_action, env, ppo_policy, D,
                                normalize_observ, normalize_action)
            episode_idx += config.episodes_per_batch * config.train_d_per_ppo
            valid_episode_idx += config.episodes_per_batch
            # train PPO
            print('train PPO')
            cumulate_steps += total_steps
            for score in loop.run(sess, saver, cumulate_steps):
                yield score
    batch_env.close()
    vanilla_env.close()
    env.close()
def train(config, env_processes, outdir):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        min_ = dummy_env.observation_space.low[0, 0]
        max_ = dummy_env.observation_space.high[0, 0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(
        vanilla_env,
        data=h5py.File(
            'bball_strategies/data/OrderedGAILTransitionData_522.hdf5', 'r'),
        init_mode=1, fps=config.FPS, time_limit=50)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(
            config.logdir,
            'gail_testing_G{}_D{}/'.format(config.max_length, config.D_len)),
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, dummy_env)
    # summary writer of Discriminator
    summary_writer = tf.summary.FileWriter(config.logdir + '/Disciminator')
    # TF Session
    # NOTE: _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint=FLAGS.checkpoint,
            resume=FLAGS.resume)
        # NOTE: reset optimizer variables between stages of curriculum learning
        opt_reset_D = tf.group(
            [v.initializer for v in graph.algo.D.optimizer.variables()])
        # reset PPO optimizer
        opt_reset = tf.group(
            [v.initializer for v in graph.algo._optimizer.variables()])
        sess.run([opt_reset, opt_reset_D])
        # visualization stuff
        if FLAGS.tally_only:
            tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                    ppo_policy, graph.algo.D, normalize_observ,
                                    normalize_action)
            tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                    ppo_policy, graph.algo.D, normalize_observ,
                                    normalize_action, stochastic=True)
            exit()
        # GAIL
        cumulate_steps = sess.run(graph.step)
        counter = 0
        while True:
            # train Discriminator
            gail_timer = time.time()
            if counter > config.pretrain_d_times:
                num_d_to_train = config.train_d_per_ppo
            else:
                num_d_to_train = config.pretrain_d_per_ppo
            for _ in range(num_d_to_train):
                # train D
                feed_dict = {
                    graph.is_training: True,
                    graph.should_log: True,
                    graph.do_report: True,
                    graph.force_reset: False}
                gail_counter = 0
                while gail_counter < config.gail_steps:
                    gail_summary = sess.run(
                        graph.gail_summary, feed_dict=feed_dict)
                    if gail_summary:
                        summary_writer.add_summary(
                            gail_summary,
                            global_step=sess.run(graph.algo.D._steps))
                    gail_counter += 1
            # testing
            if counter % (config.vis_testing_freq) == 0:
                test_policy(config, vanilla_env,
                            sess.run(graph.algo.D._steps), ppo_policy,
                            graph.algo.D, denormalize_observ)
            if counter % (config.tally_line_chart_freq) == 0:
                tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                        ppo_policy, graph.algo.D,
                                        normalize_observ, normalize_action)
                tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                        ppo_policy, graph.algo.D,
                                        normalize_observ, normalize_action,
                                        stochastic=True)
            counter += 1
            print('Time Cost of Discriminator per Update: {}'.format(
                (time.time() - gail_timer) / num_d_to_train))
            # train PPO
            cumulate_steps += total_steps
            for score in loop.run(sess, saver, cumulate_steps):
                yield score
    batch_env.close()
    vanilla_env.close()
def train(agents_config, env_processes=True, log_dir=None):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. Operations are assigned to the worker device of the current
    distributed task.

    Args:
      agents_config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.
      log_dir: Directory to write checkpoints and summaries to.

    Yields:
      Evaluation scores.
    """
    FLAGS = tf.app.flags.FLAGS
    if log_dir is None and hasattr(FLAGS, 'log_dir'):
        log_dir = FLAGS.log_dir
    run_config = tf.contrib.learn.RunConfig()
    _log_run_config(run_config)
    server = tf.train.Server(run_config.cluster_spec,
                             job_name=run_config.task_type,
                             task_index=run_config.task_id)
    tf.reset_default_graph()
    if agents_config.update_every % agents_config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    worker_device = "/job:%s/replica:0/task:%d" % (run_config.task_type,
                                                   run_config.task_id)
    with tf.device(worker_device):
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device=worker_device,
                    cluster=run_config.cluster_spec)):
            global_step = tf.Variable(0, False, dtype=tf.int32,
                                      name='global_step')
        batch_env = define_batch_env(
            lambda: _create_environment(agents_config),
            agents_config.num_agents, env_processes)
        optimizer = agents_config.optimizer(agents_config.learning_rate)
        if FLAGS.sync_replicas:
            optimizer = tf.train.SyncReplicasOptimizer(
                optimizer,
                replicas_to_aggregate=(run_config.num_worker_replicas),
                total_num_replicas=(run_config.num_worker_replicas))
        with agents_config.unlocked:
            agents_config.optimizer = optimizer
        graph = define_simulation_graph(batch_env, agents_config.algorithm,
                                        agents_config, global_step)
        loop = _define_loop(
            graph, log_dir,
            agents_config.update_every * agents_config.max_length,
            agents_config.eval_episodes * agents_config.max_length)
        total_steps = int(
            agents_config.steps / agents_config.update_every *
            (agents_config.update_every + agents_config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    if FLAGS.log_device_placement:
        sess_config.log_device_placement = True
    sess_config.gpu_options.allow_growth = True
    init_op = tf.global_variables_initializer()
    local_init_op = tf.local_variables_initializer()
    hooks = [tf.train.StopAtStepHook(last_step=total_steps)]
    if FLAGS.sync_replicas:
        opt = graph.algo._optimizer
        sync_replicas_hook = opt.make_session_run_hook(run_config.is_chief)
        hooks.append(sync_replicas_hook)
    scaffold = tf.train.Scaffold(saver=saver, init_op=init_op,
                                 local_init_op=local_init_op)
    # if FLAGS.sync_replicas:
    #     opt = graph.algo._optimizer
    #     local_init_op = opt.local_step_init_op
    #     if run_config.is_chief:
    #         local_init_op = opt.chief_init_op
    #     ready_for_local_init_op = opt.ready_for_local_init_op
    #     # Initial token and chief queue runners required by the sync_replicas mode
    #     chief_queue_runner = opt.get_chief_queue_runner()
    #     sync_init_op = opt.get_init_tokens_op()
    # if FLAGS.sync_replicas:
    #     sv = tf.train.Supervisor(
    #         is_chief=run_config.is_chief,
    #         logdir=log_dir,
    #         init_op=init_op,
    #         local_init_op=local_init_op,
    #         ready_for_local_init_op=ready_for_local_init_op,
    #         recovery_wait_secs=1,
    #         global_step=global_step)
    # else:
    #     sv = tf.train.Supervisor(
    #         is_chief=run_config.is_chief,
    #         logdir=log_dir,
    #         init_op=init_op,
    #         recovery_wait_secs=1,
    #         global_step=global_step)
    # with sv.prepare_or_wait_for_session(server.target, config=sess_config) as sess:
    #     if FLAGS.sync_replicas and is_chief:
    #         # Chief worker will start the chief queue runner and call the init op.
    #         sess.run(sync_init_op)
    #         sv.start_queue_runners(sess, [chief_queue_runner])
    with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=run_config.is_chief,
            checkpoint_dir=log_dir,
            scaffold=scaffold,
            hooks=hooks,
            save_checkpoint_secs=FLAGS.save_checkpoint_secs,
            save_summaries_steps=None,
            save_summaries_secs=None,
            config=sess_config,
            stop_grace_period_secs=120,
            log_step_count_steps=3000) as sess:
        global_step = sess.run(loop._step)
        steps_made = 1
        while not sess.should_stop():
            phase, epoch, steps_in = loop._find_current_phase(global_step)
            phase_step = epoch * phase.steps + steps_in
            if steps_in % phase.steps < steps_made:
                message = '\n' + ('-' * 50) + '\n'
                message += 'Phase {} (phase step {}, global step {}).'
                tf.logging.info(
                    message.format(phase.name, phase_step, global_step))
            phase.feed[loop._reset] = (steps_in < steps_made)
            phase.feed[loop._log] = (phase.writer and loop._is_every_steps(
                phase_step, phase.batch, phase.log_every))
            phase.feed[loop._report] = (loop._is_every_steps(
                phase_step, phase.batch, phase.report_every))
            summary, mean_score, global_step, steps_made = sess.run(
                phase.op, phase.feed)
            if loop._is_every_steps(
                    phase_step, phase.batch,
                    phase.checkpoint_every) and run_config.is_chief:
                loop._store_checkpoint(sess, saver, global_step)
            if loop._is_every_steps(phase_step, phase.batch,
                                    phase.report_every):
                yield mean_score
            # TODO: Potentially integrate summary writing with
            # MonitoredTrainingSession.
            if summary and phase.writer and run_config.is_chief:
                # We want smaller phases to catch up at the beginning of each
                # epoch so that their graphs are aligned.
                longest_phase = max(phase.steps for phase in loop._phases)
                summary_step = epoch * longest_phase + steps_in
                phase.writer.add_summary(summary, summary_step)
    batch_env.close()
def train(config, env_processes, outdir):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, init_mode=1, fps=config.FPS,
                               if_back_real=False, time_limit=50)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(
            config.logdir,
            'gail_testing_G{}_D{}/'.format(config.train_len, config.D_len)),
        if_back_real=False,
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    vanilla_env.data = np.load('bball_strategies/data/GAILEnvData_51.npy')
    # env to generate fake states
    env = gym.make(config.env)
    env = BBallWrapper(env, init_mode=3, fps=config.FPS,
                       if_back_real=config.if_back_real,
                       time_limit=config.max_length)
    env = MonitorWrapper(env,
                         directory=os.path.join(config.logdir, 'gail_training/'),
                         if_back_real=config.if_back_real,
                         # init from dataset in order
                         init_mode=3)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir,
            is_gail=config.is_gail)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, env)
    # Data
    all_data = h5py.File(
        'bball_strategies/data/GAILTransitionData_{}.hdf5'.format(
            config.train_len), 'r')
    expert_data, valid_expert_data = np.split(
        all_data['OBS'].value, [all_data['OBS'].value.shape[0] * 9 // 10])
    expert_action, valid_expert_action = np.split(
        all_data['DEF_ACT'].value,
        [all_data['DEF_ACT'].value.shape[0] * 9 // 10])
    print('expert_data', expert_data.shape)
    print('valid_expert_data', valid_expert_data.shape)
    print('expert_action', expert_action.shape)
    print('valid_expert_action', valid_expert_action.shape)
    # Preprocessing / normalization
    expert_data = normalize_observ(expert_data)
    valid_expert_data = normalize_observ(valid_expert_data)
    expert_action = normalize_action(expert_action)
    valid_expert_action = normalize_action(valid_expert_action)
    # summary writer of Discriminator
    summary_writer = tf.summary.FileWriter(config.logdir + '/Disciminator')
    # TF Session
    # TODO _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0', r'.*Adam.*',
                 r'.*beta.*'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, resume=FLAGS.resume)
        # NOTE: reset variables in the optimizers
        # opt_reset_D = tf.group(
        #     [v.initializer for v in graph.algo.D.optimizer.variables()])
        # # reset PPO optimizer
        # opt_reset = tf.group(
        #     [v.initializer for v in graph.algo._optimizer.variables()])
        # sess.run([opt_reset, opt_reset_D])
        # visualization stuff
        if FLAGS.tally_only:
            tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                    ppo_policy, graph.algo.D,
                                    denormalize_observ, normalize_observ,
                                    normalize_action)
            exit()
        # GAIL
        cumulate_steps = sess.run(graph.step)
        episode_idx = 0
        while True:
            if episode_idx > (expert_data.shape[0] -
                              config.episodes_per_batch *
                              config.train_d_per_ppo) or episode_idx == 0:
                episode_idx = 0
                perm_idx = np.random.permutation(expert_data.shape[0])
                expert_data = expert_data[perm_idx]
                expert_action = expert_action[perm_idx]
            # testing
            if episode_idx % (config.train_d_per_ppo * 100 *
                              config.episodes_per_batch) == 0:
                test_policy(config, vanilla_env,
                            sess.run(graph.algo.D._steps), ppo_policy,
                            graph.algo.D, denormalize_observ)
            if episode_idx % (config.train_d_per_ppo * 1000 *
                              config.episodes_per_batch) == 0:
                tally_reward_line_chart(config,
                                        sess.run(graph.algo.D._steps),
                                        ppo_policy, graph.algo.D,
                                        denormalize_observ, normalize_observ,
                                        normalize_action)
            # train Discriminator
            gail_timer = time.time()
            for _ in range(config.train_d_per_ppo):
                if config.is_double_curiculum:
                    observ = expert_data[
                        episode_idx:episode_idx + config.episodes_per_batch, 1:]
                    action = expert_action[
                        episode_idx:episode_idx + config.episodes_per_batch, :-1]
                    if config.use_padding:
                        # 1. padding with buffer
                        buffer = observ[:, 0, :-1]
                        padded_observ = np.concatenate(
                            [buffer, observ[:, :, -1]], axis=1)
                        padded_act = np.concatenate(
                            [np.zeros(shape=[action.shape[0], 9, 5, 2]), action],
                            axis=1)
                        # 2. split the whole episode into Discriminator training
                        # data with length=config.D_len
                        training_obs = []
                        training_act = []
                        for i in range(config.max_length - config.D_len + 10):
                            training_obs.append(
                                padded_observ[:, i:i + config.D_len])
                            training_act.append(
                                padded_act[:, i:i + config.D_len])
                        training_obs = np.concatenate(training_obs, axis=0)
                        training_act = np.concatenate(training_act, axis=0)
                    else:
                        pass
                else:
                    training_obs = expert_data[
                        episode_idx:episode_idx + config.episodes_per_batch, 1:, -1]
                    training_act = expert_action[
                        episode_idx:episode_idx + config.episodes_per_batch, :-1]
                feed_dict = {
                    graph.is_training: True,
                    graph.should_log: True,
                    graph.do_report: True,
                    graph.force_reset: False,
                    graph.algo.D._expert_s: training_obs,
                    graph.algo.D._expert_a: training_act}
                gail_counter = 0
                while gail_counter < config.gail_steps:
                    gail_summary = sess.run(
                        graph.gail_summary, feed_dict=feed_dict)
                    if gail_summary:
                        summary_writer.add_summary(
                            gail_summary,
                            global_step=sess.run(graph.algo.D._steps))
                    gail_counter += 1
                episode_idx += config.episodes_per_batch
            print('Time Cost of Discriminator per Update: {}'.format(
                (time.time() - gail_timer) / config.train_d_per_ppo))
            # train PPO
            cumulate_steps += total_steps
            for score in loop.run(sess, saver, cumulate_steps):
                yield score
    batch_env.close()
    vanilla_env.close()
    env.close()