def train(config, env_processes, logdir):
    """Run the "sf_repres" stage with one thread per agent.

    Resets the default graph, builds the environments, the global network and
    the agents, restores weights from the checkpoint written under
    ``<logdir>/optimal_policy/models`` and then starts ``agent.play`` in a
    separate thread for every agent.

    Args:
      config: Configuration object. It is unlocked here so that ``logdir``,
        ``stage_logdir`` and the resolved ``network_optimizer`` can be stored
        on it.
      env_processes: Not used by this function.
      logdir: Root logging directory; this stage's own directory is
        ``<logdir>/sf_repres``.

    Raises:
      ValueError: If no checkpoint is found for the previous stage.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    previous_stage_logdir = os.path.join(logdir, "optimal_policy")
    stage_logdir = os.path.join(logdir, "sf_repres")
    tf.gfile.MakeDirs(stage_logdir)

    # NOTE(review): the original indentation of this function was lost, so
    # all of the work is kept inside the three scopes below — confirm the
    # intended extent of `tf.device` and `config.unlocked`.
    with sess, tf.device("/cpu:0"), config.unlocked:
        config.logdir = logdir
        config.stage_logdir = stage_logdir
        # The config stores the optimizer by name; resolve it to the class.
        config.network_optimizer = getattr(tf.train, config.network_optimizer)

        global_step = tf.Variable(
            0, dtype=tf.int32, name='global_step', trainable=False)
        envs = [_create_environment(config) for _ in range(config.num_agents)]
        action_size = envs[0].action_space.n
        # The returned object is not used directly in this function; the call
        # is kept because it was part of the original graph construction.
        global_network = config.network("global", config, action_size, 2)
        agents = [
            config.sf_agent(env, index, global_step, config)
            for index, env in enumerate(envs)
        ]

        saver = utility.define_saver(exclude=(r'.*_temporary/.*', ))
        loader = utility.define_saver(
            exclude=(r'.*_temporary/.*', r'.*sf/.*'))

        sess.run(tf.global_variables_initializer())
        models_dir = os.path.join(previous_stage_logdir, "models")
        ckpt = tf.train.get_checkpoint_state(models_dir)
        # get_checkpoint_state returns None when the directory holds no valid
        # checkpoint; fail with a clear message instead of an AttributeError.
        if ckpt is None or not ckpt.model_checkpoint_path:
            raise ValueError("No checkpoint found in {}".format(models_dir))
        print("Loading Model from {}".format(ckpt.model_checkpoint_path))
        loader.restore(sess, ckpt.model_checkpoint_path)
        sess.run(tf.local_variables_initializer())

        coord = tf.train.Coordinator()
        agent_threads = []
        for agent in agents:
            # Resolve the bound method at thread creation. A lambda that
            # looks up `agent` only when the thread runs can observe a later
            # loop value and run the wrong agent.
            thread = threading.Thread(
                target=agent.play, args=(sess, coord, saver))
            thread.start()
            agent_threads.append(thread)

        if FLAGS.show_training:
            # Render only while workers are still running. An unconditional
            # `while True` here never reaches the join below and busy-spins
            # when rendering is disabled.
            while (not coord.should_stop()
                   and any(t.is_alive() for t in agent_threads)):
                for env in envs:
                    env.render()

        coord.join(agent_threads)
def train(config, env_processes, logdir):
    """Run the "plot_sf_policy" stage: restore a model and plot a heatmap.

    Resets the default graph, builds one environment, the global network and
    a single option agent, restores weights from the checkpoint under
    ``<logdir>/sf_repres/models`` and runs ``agent.plot_heatmap`` in a
    separate thread.

    Args:
      config: Configuration object. It is unlocked here so that ``logdir``,
        ``stage_logdir``, ``matrix_stage_logdir`` and the resolved
        ``network_optimizer`` can be stored on it.
      env_processes: Not used by this function.
      logdir: Root logging directory; this stage's own directory is
        ``<logdir>/plot_sf_policy`` and the matrix directory passed to
        ``get_direction`` is ``<logdir>/sf_matrix``.

    Raises:
      ValueError: If no checkpoint is found for the previous stage.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    previous_stage_logdir = os.path.join(logdir, "sf_repres")
    matrix_stage_logdir = os.path.join(logdir, "sf_matrix")
    stage_logdir = os.path.join(logdir, "plot_sf_policy")
    tf.gfile.MakeDirs(stage_logdir)

    # NOTE(review): the original indentation of this function was lost, so
    # all of the work is kept inside the three scopes below — confirm the
    # intended extent of `tf.device` and `config.unlocked`.
    with sess, tf.device("/cpu:0"), config.unlocked:
        config.logdir = logdir
        config.stage_logdir = stage_logdir
        config.matrix_stage_logdir = matrix_stage_logdir
        # Named so that the builtin `eval` is not shadowed.
        eigval, eigvect = get_direction(config.matrix_stage_logdir)
        # The config stores the optimizer by name; resolve it to the class.
        config.network_optimizer = getattr(tf.train, config.network_optimizer)

        global_step = tf.Variable(
            0, dtype=tf.int32, name='global_step', trainable=False)
        env = _create_environment(config)
        action_size = env.action_space.n
        global_network = config.network("global", config, action_size, 5)
        global_network.option = FLAGS.option
        agent = config.option_agent(
            env, 0, global_step, config, FLAGS.option, eigval, eigvect,
            FLAGS.flip_eigen, 5)

        # Two savers are defined with the same exclude pattern, as before.
        saver = utility.define_saver(exclude=(r'.*_temporary/.*', ))
        loader = utility.define_saver(exclude=(r'.*_temporary/.*', ))

        sess.run(tf.global_variables_initializer())
        models_dir = os.path.join(previous_stage_logdir, "models")
        ckpt = tf.train.get_checkpoint_state(models_dir)
        # get_checkpoint_state returns None when the directory holds no valid
        # checkpoint; fail with a clear message instead of an AttributeError.
        if ckpt is None or not ckpt.model_checkpoint_path:
            raise ValueError("No checkpoint found in {}".format(models_dir))
        print("Loading Model from {}".format(ckpt.model_checkpoint_path))
        loader.restore(sess, ckpt.model_checkpoint_path)
        sess.run(tf.local_variables_initializer())

        coord = tf.train.Coordinator()
        thread = threading.Thread(
            target=agent.plot_heatmap, args=(sess, coord, saver))
        thread.start()
        agent_threads = [thread]
        coord.join(agent_threads)
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. The environments, simulation graph and loop are built
    under the ``/cpu:0`` device scope. The batch environment is closed when
    the generator finishes, is closed early, or fails.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Currently ignored; the batch environment is always
        created with ``env_processes=False``.
        NOTE(review): confirm whether the hard-coded value is intentional.

    Yields:
      Evaluation scores.
    """
    print(config)
    print("train ==================================================== ")
    tf.reset_default_graph()
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')

    with tf.device('/cpu:0'):
        print("1 ==================================================== ")
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents,
            env_processes=False)

    # From here on the environment exists, so make sure it is always closed,
    # including when the consumer abandons the generator or a step raises.
    try:
        with tf.device('/cpu:0'):
            print("2 ==================================================== ")
            graph = utility.define_simulation_graph(
                batch_env, config.algorithm, config)
            print("3 ==================================================== ")
            loop = _define_loop(
                graph, config.logdir,
                config.update_every * config.max_length,
                config.eval_episodes * config.max_length)
            print("4 ==================================================== ")
            total_steps = int(
                config.steps / config.update_every *
                (config.update_every + config.eval_episodes))
            print("5 ==================================================== ")

        # Exclude episode related variables since the Python state of
        # environments is not checkpointed and thus new episodes start after
        # resuming.
        saver = utility.define_saver(exclude=(r'.*_temporary/.*', ))
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True
        print("session ==================================================== ")
        with tf.Session(config=sess_config) as sess:
            utility.initialize_variables(sess, saver, config.logdir)
            #saver.restore(sess, "ant/20200818T151509-pybullet_ant/model.ckpt-2400000")
            for score in loop.run(sess, saver, total_steps):
                yield score
    finally:
        batch_env.close()
def train(config, logdir):
    """Run the "dif" stage: build the agents, optionally resume, and train.

    Resets the default graph, creates the agents via ``initialize_agents``,
    either restores a checkpoint from ``<FLAGS.load_from>/dif/models`` (when
    ``FLAGS.resume`` is set) or initializes all variables, then starts the
    agent threads through ``start_agents`` and joins them.

    Args:
      config: Configuration object. It is unlocked here so that ``logdir``,
        ``stage_logdir`` and the resolved ``network_optimizer`` can be stored
        on it.
      logdir: Root logging directory; this stage's own directory is
        ``<logdir>/dif``.

    Raises:
      ValueError: If resuming is requested but no checkpoint is found.
    """
    tf.reset_default_graph()
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    stage_logdir = os.path.join(logdir, "dif")
    tf.gfile.MakeDirs(stage_logdir)

    # NOTE(review): the original indentation of this function was lost, so
    # all of the work is kept inside both scopes below — confirm the intended
    # extent of `config.unlocked`.
    # with tf.device("/{}:0".format("gpu" if config.agent_type == "dqn" and len(get_available_gpus()) > 0 else "cpu")):
    with sess, config.unlocked:
        config.logdir = logdir
        config.stage_logdir = stage_logdir
        # The config stores the optimizer by name; resolve it to the class.
        config.network_optimizer = getattr(tf.train, config.network_optimizer)
        agents = initialize_agents(config)
        # variables_to_load = get_list_vars_load()

        # The loader gets an exclude pattern for `.*/Q/.*` unless
        # FLAGS.resume_option is set; the saver is defined without one.
        exclude = None if FLAGS.resume_option else (r'.*/Q/.*', )
        loader = utility.define_saver(exclude=exclude)
        saver = utility.define_saver()

        if FLAGS.resume:
            sess.run(tf.global_variables_initializer())
            models_dir = os.path.join(FLAGS.load_from, "dif", "models")
            ckpt = tf.train.get_checkpoint_state(models_dir)
            # get_checkpoint_state returns None when the directory holds no
            # valid checkpoint; fail with a clear message instead of an
            # AttributeError.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise ValueError(
                    "No checkpoint found in {}".format(models_dir))
            print("Loading Model from {}".format(ckpt.model_checkpoint_path))
            loader.restore(sess, ckpt.model_checkpoint_path)
            sess.run(tf.local_variables_initializer())
        else:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
            ])

        coord = tf.train.Coordinator()
        agent_threads = start_agents(agents, config, coord, sess, saver)
        coord.join(agent_threads)
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. The environments, simulation graph and loop are built
    under the ``/cpu:0`` device scope. The batch environment is closed when
    the generator finishes, is closed early, or fails.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes;
        forwarded to ``utility.define_batch_env_multi``.

    Yields:
      Evaluation scores.
    """
    tf.reset_default_graph()
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')

    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env_multi(
            lambda: _create_environment(config),
            lambda: _create_environment_IF_multi_nosie(config),
            num_agents=config.num_agents,
            env_processes=env_processes)
        #batch_env = utility.define_batch_env_multi(lambda: _create_environment(config), num_agents=config.num_agents,
        #                                           env_processes=env_processes)

    # From here on the environment exists, so make sure it is always closed,
    # including when the consumer abandons the generator or a step raises.
    try:
        with tf.device('/cpu:0'):
            graph = utility.define_simulation_graph(
                batch_env, config.algorithm, config)
            # Original author's note: graph can be treated as a dictionary.
            loop = _define_loop(
                graph, config.logdir,
                config.update_every * config.max_length,
                config.eval_episodes * config.max_length)
            total_steps = int(
                config.steps / config.update_every *
                (config.update_every + config.eval_episodes))
            # print('total_steps is:',total_steps)  #total_steps is: 20000000

        # Exclude episode related variables since the Python state of
        # environments is not checkpointed and thus new episodes start after
        # resuming.
        saver = utility.define_saver(exclude=(r'.*_temporary/.*', ))
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            utility.initialize_variables(sess, saver, config.logdir)
            for score in loop.run(sess, saver, total_steps):
                yield score
    finally:
        batch_env.close()
def train(config, env_processes, logdir):
    """Run the "linear_sf" stage with threaded agents.

    Resets the default graph, builds the environments and the global network,
    and then either builds a single agent that runs ``build_matrix1`` (when
    ``FLAGS.task == "matrix"``) or one agent per environment that runs
    ``play``. When ``FLAGS.resume`` is set, weights are restored from
    ``<FLAGS.load_from>/linear_sf/models``; otherwise all variables are
    initialized.

    Args:
      config: Configuration object. It is unlocked here so that ``logdir``,
        ``stage_logdir`` and the resolved ``network_optimizer`` can be stored
        on it.
      env_processes: Not used by this function.
      logdir: Root logging directory; this stage's own directory is
        ``<logdir>/linear_sf``.

    Raises:
      ValueError: If resuming is requested but no checkpoint is found.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    stage_logdir = os.path.join(logdir, "linear_sf")
    tf.gfile.MakeDirs(stage_logdir)

    # NOTE(review): the original indentation of this function was lost, so
    # all of the work is kept inside the three scopes below — confirm the
    # intended extent of `tf.device` and `config.unlocked`.
    with sess, tf.device("/cpu:0"), config.unlocked:
        config.logdir = logdir
        config.stage_logdir = stage_logdir
        # The config stores the optimizer by name; resolve it to the class.
        config.network_optimizer = getattr(tf.train, config.network_optimizer)

        global_step = tf.Variable(
            0, dtype=tf.int32, name='global_step', trainable=False)
        envs = [_create_environment(config) for _ in range(config.num_agents)]
        action_size = envs[0].action_space.n
        nb_states = envs[0].nb_states
        # The returned object is not used directly in this function; the call
        # is kept because it was part of the original graph construction.
        global_network = config.network(
            "global", config, action_size, nb_states)

        # Collect the callables to run as bound methods. Binding happens
        # here, so every thread is tied to its own agent; a lambda created in
        # a loop would look `agent` up late and could run the wrong one.
        if FLAGS.task == "matrix":
            agent = config.linear_sf_agent(envs[0], 0, global_step, config)
            thread_targets = [agent.build_matrix1]
        else:
            agents = [
                config.linear_sf_agent(env, index, global_step, config)
                for index, env in enumerate(envs)
            ]
            thread_targets = [agent.play for agent in agents]

        # One saver object is used for both saving and restoring.
        saver = loader = utility.define_saver(exclude=(r'.*_temporary/.*',))

        if FLAGS.resume:
            sess.run(tf.global_variables_initializer())
            models_dir = os.path.join(FLAGS.load_from, "linear_sf", "models")
            ckpt = tf.train.get_checkpoint_state(models_dir)
            # get_checkpoint_state returns None when the directory holds no
            # valid checkpoint; fail with a clear message instead of an
            # AttributeError.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise ValueError(
                    "No checkpoint found in {}".format(models_dir))
            print("Loading Model from {}".format(ckpt.model_checkpoint_path))
            loader.restore(sess, ckpt.model_checkpoint_path)
            sess.run(tf.local_variables_initializer())
        else:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
            ])

        coord = tf.train.Coordinator()
        agent_threads = []
        for target in thread_targets:
            thread = threading.Thread(
                target=target, args=(sess, coord, saver))
            thread.start()
            agent_threads.append(thread)

        coord.join(agent_threads)
def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None,
              env_processes=True):
    """Recover checkpoint and render videos from it.

    The batch environment is closed when the simulation finishes or fails.

    Args:
      logdir: Logging directory of the trained algorithm; the configuration
        and the checkpoint are both loaded from it.
      outdir: Directory to store rendered videos in.
      num_agents: Number of environments to simulate in parallel.
      num_episodes: Total number of episodes to simulate.
      checkpoint: Checkpoint name to load; defaults to most recent.
      env_processes: Currently ignored; the batch environment is always
        created with ``env_processes=False``.
        NOTE(review): confirm whether the hard-coded value is intentional.
    """
    config = utility.load_config(logdir)

    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config, outdir),
            num_agents,
            env_processes=False)

    # From here on the environment exists, so make sure it is always closed,
    # even when graph construction, restoring or a simulation step raises.
    try:
        with tf.device('/cpu:0'):
            graph = utility.define_simulation_graph(
                batch_env, config.algorithm, config)
            total_steps = num_episodes * config.max_length
            loop = _define_loop(graph, total_steps)

        saver = utility.define_saver(
            exclude=(r'.*_temporary/.*', r'global_step'))
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            # The `logdir` argument is used here rather than `config.logdir`.
            #utility.initialize_variables(sess, saver, config.logdir, checkpoint, resume=True)
            utility.initialize_variables(
                sess, saver, logdir, checkpoint, resume=True)
            # Scores are not needed; the loop is run only to step the
            # environments.
            for unused_score in loop.run(sess, saver, total_steps):
                pass
    finally:
        batch_env.close()
def _restore_policy(self, network, policy_layers, value_layers, action_size,
                    checkpoint):
    """Build the PPO policy graph and load its weights from a checkpoint.

    As a side effect this sets ``self.network``, ``self.last_state``,
    ``self.mean_action`` and ``self.update_state``.

    Args:
      network: The neural network definition; called with ``policy_layers``,
        ``value_layers`` and ``action_size`` as keyword arguments.
      policy_layers: A tuple specifying the number of layers and number of
        neurons of each layer for the policy network.
      value_layers: A tuple specifying the number of layers and number of
        neurons of each layer for the value network.
      action_size: The dimension of the action space.
      checkpoint: The checkpoint path.
    """
    filtered_observ = self._observ_filter.transform(
        self.observation_placeholder)

    with tf.variable_scope("network/rnn"):
        self.network = network(
            policy_layers=policy_layers,
            value_layers=value_layers,
            action_size=action_size)

    # The recurrent state lives under "temporary/", which the saver defined
    # below is told to exclude, so it is initialized here explicitly.
    with tf.variable_scope("temporary"):
        initial_state = self.network.zero_state(1, tf.float32)
        self.last_state = tf.Variable(initial_state, trainable=False)
        self.sess.run(self.last_state.initializer)

    with tf.variable_scope("network"):
        rnn_outputs, next_state = tf.nn.dynamic_rnn(
            self.network,
            filtered_observ[:, None],  # insert an axis at position 1
            sequence_length=tf.ones(1),
            initial_state=self.last_state,
            dtype=tf.float32,
            swap_memory=True)
        # The cell output is a 3-tuple; only its first element is kept.
        mean_action, _, _ = rnn_outputs
        self.mean_action = mean_action
        self.update_state = self.last_state.assign(next_state)

    policy_saver = utility.define_saver(exclude=(r"temporary/.*", ))
    policy_saver.restore(self.sess, checkpoint)