def train(config, env_processes, logdir):
    """Run the "sf_repres" stage: one agent thread per environment.

    Creates ``config.num_agents`` environments and agents plus a network
    named "global", initialises all variables, restores the latest checkpoint
    found under ``<logdir>/optimal_policy/models`` (skipping variables that
    match ``.*_temporary/.*`` or ``.*sf/.*``), and then runs every agent's
    ``play`` method in its own thread.

    Args:
      config: Configuration object. It is unlocked and mutated here:
        ``logdir``, ``stage_logdir`` and ``network_optimizer`` are set, the
        latter being replaced by the ``tf.train`` attribute of that name.
      env_processes: Not used by this function.
      logdir: Root log directory; this stage writes to ``<logdir>/sf_repres``.

    Raises:
      ValueError: If no checkpoint is found for the previous stage.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    previous_stage_logdir = os.path.join(logdir, "optimal_policy")
    stage_logdir = os.path.join(logdir, "sf_repres")
    tf.gfile.MakeDirs(stage_logdir)
    with sess:
        with tf.device("/cpu:0"):
            with config.unlocked:
                config.logdir = logdir
                config.stage_logdir = stage_logdir
                config.network_optimizer = getattr(tf.train,
                                                   config.network_optimizer)
                global_step = tf.Variable(0,
                                          dtype=tf.int32,
                                          name='global_step',
                                          trainable=False)
                envs = [
                    _create_environment(config)
                    for _ in range(config.num_agents)
                ]
                action_size = envs[0].action_space.n
                # The return value is not used below; the call is kept so the
                # network named "global" is still constructed.
                global_network = config.network("global", config, action_size,
                                                2)
                agents = [
                    config.sf_agent(env, i, global_step, config)
                    for i, env in enumerate(envs)
                ]

            saver = utility.define_saver(exclude=(r'.*_temporary/.*', ))
            loader = utility.define_saver(exclude=(r'.*_temporary/.*',
                                                   r'.*sf/.*'))
            sess.run(tf.global_variables_initializer())
            models_dir = os.path.join(previous_stage_logdir, "models")
            ckpt = tf.train.get_checkpoint_state(models_dir)
            # get_checkpoint_state returns None when the directory holds no
            # checkpoint; fail with a clear message instead of AttributeError.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise ValueError(
                    "No checkpoint found in {}".format(models_dir))
            print("Loading Model from {}".format(ckpt.model_checkpoint_path))
            loader.restore(sess, ckpt.model_checkpoint_path)
            sess.run(tf.local_variables_initializer())

            coord = tf.train.Coordinator()

            agent_threads = []
            for agent in agents:
                # Pass the bound method and its arguments explicitly. A
                # ``lambda: agent.play(...)`` would look up ``agent`` only when
                # the thread runs, by which time the loop variable may already
                # refer to a later agent.
                thread = threading.Thread(target=agent.play,
                                          args=(sess, coord, saver))
                thread.start()
                agent_threads.append(thread)

            if FLAGS.show_training:
                # Render only while training is in progress, so the loop ends
                # once a stop is requested or every agent thread has finished.
                while (not coord.should_stop()
                       and any(t.is_alive() for t in agent_threads)):
                    for env in envs:
                        env.render()

            coord.join(agent_threads)
Ejemplo n.º 2
0
def train(config, env_processes, logdir):
    """Run the "plot_sf_policy" stage: plot a heatmap with a single agent.

    Builds one environment, a network named "global" and one option agent,
    restores the latest checkpoint found under ``<logdir>/sf_repres/models``
    (skipping variables that match ``.*_temporary/.*``), and runs the agent's
    ``plot_heatmap`` method in a worker thread until it finishes.

    Args:
      config: Configuration object. It is unlocked and mutated here:
        ``logdir``, ``stage_logdir``, ``matrix_stage_logdir`` and
        ``network_optimizer`` are set, the latter being replaced by the
        ``tf.train`` attribute of that name.
      env_processes: Not used by this function.
      logdir: Root log directory; this stage writes to
        ``<logdir>/plot_sf_policy`` and reads ``<logdir>/sf_matrix``.

    Raises:
      ValueError: If no checkpoint is found for the previous stage.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    previous_stage_logdir = os.path.join(logdir, "sf_repres")
    matrix_stage_logdir = os.path.join(logdir, "sf_matrix")
    stage_logdir = os.path.join(logdir, "plot_sf_policy")
    tf.gfile.MakeDirs(stage_logdir)
    with sess:
        with tf.device("/cpu:0"):
            with config.unlocked:
                config.logdir = logdir
                config.stage_logdir = stage_logdir
                config.matrix_stage_logdir = matrix_stage_logdir
                # Trailing underscore avoids shadowing the builtin ``eval``.
                eval_, evect = get_direction(config.matrix_stage_logdir)
                config.network_optimizer = getattr(tf.train,
                                                   config.network_optimizer)
                global_step = tf.Variable(0,
                                          dtype=tf.int32,
                                          name='global_step',
                                          trainable=False)
                env = _create_environment(config)
                action_size = env.action_space.n
                global_network = config.network("global", config, action_size,
                                                5)
                global_network.option = FLAGS.option
                agent = config.option_agent(env, 0, global_step, config,
                                            FLAGS.option, eval_, evect,
                                            FLAGS.flip_eigen, 5)

            saver = utility.define_saver(exclude=(r'.*_temporary/.*', ))
            loader = utility.define_saver(exclude=(r'.*_temporary/.*', ))
            sess.run(tf.global_variables_initializer())
            models_dir = os.path.join(previous_stage_logdir, "models")
            ckpt = tf.train.get_checkpoint_state(models_dir)
            # get_checkpoint_state returns None when the directory holds no
            # checkpoint; fail with a clear message instead of AttributeError.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise ValueError(
                    "No checkpoint found in {}".format(models_dir))
            print("Loading Model from {}".format(ckpt.model_checkpoint_path))
            loader.restore(sess, ckpt.model_checkpoint_path)
            sess.run(tf.local_variables_initializer())

            coord = tf.train.Coordinator()

            thread = threading.Thread(target=agent.plot_heatmap,
                                      args=(sess, coord, saver))
            thread.start()
            agent_threads = [thread]

            coord.join(agent_threads)
Ejemplo n.º 3
0
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU. The batch
    environment is closed when the generator finishes, fails, or is closed
    early by the consumer.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.
        NOTE(review): currently ignored; the batch environment is always
        created with ``env_processes=False``. Confirm whether that is
        intentional.

    Yields:
      Evaluation scores.
    """

    print(config)

    print("train ==================================================== ")
    tf.reset_default_graph()
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')

    with tf.device('/cpu:0'):
        print("1 ==================================================== ")
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents,
            env_processes=False)
        print("2 ==================================================== ")
        graph = utility.define_simulation_graph(batch_env, config.algorithm,
                                                config)
        print("3 ==================================================== ")
        loop = _define_loop(graph, config.logdir,
                            config.update_every * config.max_length,
                            config.eval_episodes * config.max_length)
        print("4 ==================================================== ")
        total_steps = int(config.steps / config.update_every *
                          (config.update_every + config.eval_episodes))
        print("5 ==================================================== ")
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary/.*', ))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    print("session ==================================================== ")

    # try/finally so the environments are released even when the loop raises
    # or the caller stops iterating before the generator is exhausted.
    try:
        with tf.Session(config=sess_config) as sess:
            utility.initialize_variables(sess, saver, config.logdir)
            for score in loop.run(sess, saver, total_steps):
                yield score
    finally:
        batch_env.close()
Ejemplo n.º 4
0
def train(config, logdir):
    """Run the "dif" stage: build agents, optionally resume, and train.

    Agents are created with ``initialize_agents`` and started with
    ``start_agents``; this function blocks until their threads are joined.
    When ``FLAGS.resume`` is set, the latest checkpoint under
    ``<FLAGS.load_from>/dif/models`` is restored after global initialisation.
    Unless ``FLAGS.resume_option`` is set, variables matching ``.*/Q/.*`` are
    excluded from that restore.

    Args:
      config: Configuration object. It is unlocked and mutated here:
        ``logdir``, ``stage_logdir`` and ``network_optimizer`` are set, the
        latter being replaced by the ``tf.train`` attribute of that name.
      logdir: Root log directory; this stage writes to ``<logdir>/dif``.

    Raises:
      ValueError: If resuming is requested but no checkpoint is found.
    """
    tf.reset_default_graph()
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    stage_logdir = os.path.join(logdir, "dif")
    tf.gfile.MakeDirs(stage_logdir)
    with sess:
        with config.unlocked:
            config.logdir = logdir
            config.stage_logdir = stage_logdir
            config.network_optimizer = getattr(tf.train,
                                               config.network_optimizer)
            agents = initialize_agents(config)

        exclude = None if FLAGS.resume_option else (r'.*/Q/.*', )
        loader = utility.define_saver(exclude=exclude)
        saver = utility.define_saver()
        if FLAGS.resume:
            sess.run(tf.global_variables_initializer())
            models_dir = os.path.join(FLAGS.load_from, "dif", "models")
            ckpt = tf.train.get_checkpoint_state(models_dir)
            # get_checkpoint_state returns None when the directory holds no
            # checkpoint; fail with a clear message instead of AttributeError.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise ValueError(
                    "No checkpoint found in {}".format(models_dir))
            print("Loading Model from {}".format(ckpt.model_checkpoint_path))
            loader.restore(sess, ckpt.model_checkpoint_path)
            sess.run(tf.local_variables_initializer())
        else:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer()
            ])

        coord = tf.train.Coordinator()
        agent_threads = start_agents(agents, config, coord, sess, saver)
        coord.join(agent_threads)
Ejemplo n.º 5
0
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU. The batch
    environment is closed when the generator finishes, fails, or is closed
    early by the consumer.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.

    Yields:
      Evaluation scores.
    """
    tf.reset_default_graph()
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        # Two environment constructors are handed to the batch-env factory.
        batch_env = utility.define_batch_env_multi(
            lambda: _create_environment(config),
            lambda: _create_environment_IF_multi_nosie(config),
            num_agents=config.num_agents,
            env_processes=env_processes)

        # Per the original author's note, the graph can be treated as a
        # dictionary.
        graph = utility.define_simulation_graph(batch_env, config.algorithm,
                                                config)

        loop = _define_loop(graph, config.logdir,
                            config.update_every * config.max_length,
                            config.eval_episodes * config.max_length)

        total_steps = int(config.steps / config.update_every *
                          (config.update_every + config.eval_episodes))

    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary/.*', ))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    # try/finally so the environments are released even when the loop raises
    # or the caller stops iterating before the generator is exhausted.
    try:
        with tf.Session(config=sess_config) as sess:
            utility.initialize_variables(sess, saver, config.logdir)
            for score in loop.run(sess, saver, total_steps):
                yield score
    finally:
        batch_env.close()
def train(config, env_processes, logdir):
  """Run the "linear_sf" stage, either building a matrix or training agents.

  Creates ``config.num_agents`` environments and a network named "global".
  When ``FLAGS.task == "matrix"`` a single agent is created on the first
  environment and its ``build_matrix1`` method runs in one thread; otherwise
  one agent per environment is created and each runs ``play`` in its own
  thread. When ``FLAGS.resume`` is set, the latest checkpoint under
  ``<FLAGS.load_from>/linear_sf/models`` is restored first.

  Args:
    config: Configuration object. It is unlocked and mutated here:
      ``logdir``, ``stage_logdir`` and ``network_optimizer`` are set, the
      latter being replaced by the ``tf.train`` attribute of that name.
    env_processes: Not used by this function.
    logdir: Root log directory; this stage writes to ``<logdir>/linear_sf``.

  Raises:
    ValueError: If resuming is requested but no checkpoint is found.
  """
  tf.reset_default_graph()
  sess = tf.Session()
  stage_logdir = os.path.join(logdir, "linear_sf")
  tf.gfile.MakeDirs(stage_logdir)
  matrix_task = FLAGS.task == "matrix"
  with sess:
    with tf.device("/cpu:0"):
      with config.unlocked:
        config.logdir = logdir
        config.stage_logdir = stage_logdir
        config.network_optimizer = getattr(tf.train, config.network_optimizer)
        global_step = tf.Variable(0, dtype=tf.int32, name='global_step', trainable=False)
        envs = [_create_environment(config) for _ in range(config.num_agents)]
        action_size = envs[0].action_space.n
        nb_states = envs[0].nb_states
        # The return value is not used below; the call is kept so the network
        # named "global" is still constructed.
        global_network = config.network("global", config, action_size, nb_states)
        if matrix_task:
          agent = config.linear_sf_agent(envs[0], 0, global_step, config)
        else:
          agents = [config.linear_sf_agent(env, i, global_step, config)
                    for i, env in enumerate(envs)]

      saver = loader = utility.define_saver(exclude=(r'.*_temporary/.*',))
      if FLAGS.resume:
        sess.run(tf.global_variables_initializer())
        models_dir = os.path.join(FLAGS.load_from, "linear_sf", "models")
        ckpt = tf.train.get_checkpoint_state(models_dir)
        # get_checkpoint_state returns None when the directory holds no
        # checkpoint; fail with a clear message instead of AttributeError.
        if ckpt is None or not ckpt.model_checkpoint_path:
          raise ValueError("No checkpoint found in {}".format(models_dir))
        print("Loading Model from {}".format(ckpt.model_checkpoint_path))
        loader.restore(sess, ckpt.model_checkpoint_path)
        sess.run(tf.local_variables_initializer())
      else:
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

      coord = tf.train.Coordinator()

      # Bound methods are passed with explicit args rather than wrapped in a
      # lambda: a ``lambda: agent.play(...)`` created in a loop looks up
      # ``agent`` only when the thread runs, so several threads could end up
      # driving the same (last) agent.
      if matrix_task:
        targets = [agent.build_matrix1]
      else:
        targets = [worker.play for worker in agents]

      agent_threads = []
      for target in targets:
        thread = threading.Thread(target=target, args=(sess, coord, saver))
        thread.start()
        agent_threads.append(thread)

      coord.join(agent_threads)
Ejemplo n.º 7
0
def visualize(logdir,
              outdir,
              num_agents,
              num_episodes,
              checkpoint=None,
              env_processes=True):
    """Recover checkpoint and render videos from it.

    The batch environment is closed even if restoring the checkpoint or
    running the simulation loop raises.

    Args:
      logdir: Logging directory of the trained algorithm.
      outdir: Directory to store rendered videos in.
      num_agents: Number of environments to simulate in parallel.
      num_episodes: Total number of episodes to simulate.
      checkpoint: Checkpoint name to load; defaults to most recent.
      env_processes: Whether to step environments in separate processes.
        NOTE(review): currently ignored; the batch environment is always
        created with ``env_processes=False``. Confirm whether that is
        intentional.
    """

    config = utility.load_config(logdir)

    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config, outdir),
            num_agents,
            env_processes=False)
        graph = utility.define_simulation_graph(batch_env, config.algorithm,
                                                config)
        total_steps = num_episodes * config.max_length
        loop = _define_loop(graph, total_steps)

    saver = utility.define_saver(exclude=(r'.*_temporary/.*', r'global_step'))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    # try/finally so the environments are released on failure as well.
    try:
        with tf.Session(config=sess_config) as sess:
            # Variables are restored from ``logdir`` as passed by the caller.
            utility.initialize_variables(sess,
                                         saver,
                                         logdir,
                                         checkpoint,
                                         resume=True)
            # The loop is run only for its side effects; scores are discarded.
            for unused_score in loop.run(sess, saver, total_steps):
                pass
    finally:
        batch_env.close()
Ejemplo n.º 8
0
    def _restore_policy(self, network, policy_layers, value_layers,
                        action_size, checkpoint):
        """Build the policy graph and load its weights from a checkpoint.

        Sets ``self.network``, ``self.last_state``, ``self.mean_action`` and
        ``self.update_state`` as a side effect.

        Args:
          network: Callable returning the network; called with the keyword
            arguments ``policy_layers``, ``value_layers`` and ``action_size``.
          policy_layers: A tuple specifying the number of layers and number
            of neurons of each layer for the policy network.
          value_layers: A tuple specifying the number of layers and number of
            neurons of each layer for the value network.
          action_size: The dimension of the action space.
          checkpoint: The checkpoint path.
        """
        filtered_observ = self._observ_filter.transform(
            self.observation_placeholder)

        with tf.variable_scope("network/rnn"):
            self.network = network(policy_layers=policy_layers,
                                   value_layers=value_layers,
                                   action_size=action_size)

        # The recurrent state lives under "temporary/" so that it is skipped
        # by the saver defined at the end of this method.
        with tf.variable_scope("temporary"):
            initial_state = self.network.zero_state(1, tf.float32)
            self.last_state = tf.Variable(initial_state, False)
            self.sess.run(self.last_state.initializer)

        with tf.variable_scope("network"):
            # Insert an axis at position 1 of the observation before feeding
            # it to the RNN.
            rnn_inputs = filtered_observ[:, None]
            sequence_length = tf.ones(1)
            (self.mean_action, _, _), next_state = tf.nn.dynamic_rnn(
                self.network,
                rnn_inputs,
                sequence_length,
                self.last_state,
                tf.float32,
                swap_memory=True)
            self.update_state = self.last_state.assign(next_state)

        restorer = utility.define_saver(exclude=(r"temporary/.*", ))
        restorer.restore(self.sess, checkpoint)