def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None,
              env_processes=True):
    """Recover checkpoint and render videos from it.

    Args:
      logdir: Logging directory of the trained algorithm.
      outdir: Directory to store rendered videos in.
      num_agents: Number of environments to simulate in parallel.
      num_episodes: Total number of episodes to simulate.
      checkpoint: Checkpoint name to load; defaults to most recent.
      env_processes: Whether to step environments in separate processes.
    """
    config = utility.load_config(logdir)
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config, outdir),
            num_agents, env_processes)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        total_steps = num_episodes * config.max_length
        loop = _define_loop(graph, total_steps)
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'global_step'))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint, resume=True)
        for unused_score in loop.run(sess, saver, total_steps):
            pass
    batch_env.close()
def run_setting(config, checkpoint, sampling_index, latency_index, num_episodes):
    """Simulate a trained policy under one sampling/latency setting and return rewards."""
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config, sampling_index, latency_index),
            num_agents=1, env_processes=False)
        # batch_env.sampling_interval = sampling_interval
        # batch_env.latency = latency
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        total_steps = num_episodes * config.max_length
        loop = _define_loop(graph, total_steps)
    saver = utility.define_saver(exclude=(r'.*_temporary.*', r'global_step'))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint, resume=True)
        for unused_score in loop.run(sess, saver, total_steps):
            pass
    tf.reset_default_graph()
    return batch_env.rewards_list
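# Usage sketch (illustrative only, not from the original sources): sweep
# run_setting() over a grid of sampling and latency indices and report the mean
# episode reward of each setting. The logdir path, index ranges, and episode
# count below are placeholder assumptions; utility.load_config() is used as in
# visualize() above.
def _example_run_setting_sweep(logdir='logdir/my-run', num_episodes=5):
    config = utility.load_config(logdir)
    results = {}
    for sampling_index in range(3):
        for latency_index in range(3):
            rewards = run_setting(config, checkpoint=None,
                                  sampling_index=sampling_index,
                                  latency_index=latency_index,
                                  num_episodes=num_episodes)
            results[(sampling_index, latency_index)] = sum(rewards) / len(rewards)
    return results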
def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None,
              env_processes=True):
    """Recover checkpoint and render videos from it.

    Args:
      logdir: Logging directory of the trained algorithm.
      outdir: Directory to store rendered videos in.
      num_agents: Number of environments to simulate in parallel.
      num_episodes: Total number of episodes to simulate.
      checkpoint: Checkpoint name to load; defaults to most recent.
      env_processes: Whether to step environments in separate processes.
    """
    config = utility.load_config(logdir)
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config, outdir),
            num_agents, env_processes)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        total_steps = num_episodes * config.max_length
        loop = _define_loop(graph, total_steps)
    saver = utility.define_saver(
        exclude=(r'.*_temporary/.*', r'global_step'))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint, resume=True)
        for unused_score in loop.run(sess, saver, total_steps):
            pass
    batch_env.close()
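# Usage sketch (paths and counts are placeholder assumptions): render five
# episodes from the most recent checkpoint of a finished training run, stepping
# a single environment in-process so rendering stays in the main process.
def _example_visualize(logdir='logdir/my-run', outdir='videos/my-run'):
    visualize(logdir=logdir, outdir=outdir, num_agents=1, num_episodes=5,
              checkpoint=None, env_processes=False)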
def testing(config, off_data, off_label, def_data, def_label, outdir):
    """Restore a pretrained model and visualize results on train/eval splits.

    Args
    ----
    config : Object providing configurations via attributes.
    off_data, off_label : Offense data and labels.
    def_data, def_label : Defense data and labels.
    outdir : Directory to store the visualization results in.
    """
    # split into train and eval
    off_train_data, off_eval_data = np.split(off_data, [off_data.shape[0] * 9 // 10])
    off_train_label, off_eval_label = np.split(off_label, [off_data.shape[0] * 9 // 10])
    def_train_data, def_eval_data = np.split(def_data, [def_data.shape[0] * 9 // 10])
    def_train_label, def_eval_label = np.split(def_label, [def_data.shape[0] * 9 // 10])
    print(off_train_data.shape)
    print(off_eval_data.shape)
    print(off_train_label.shape)
    print(off_eval_label.shape)
    print(def_train_data.shape)
    print(def_eval_data.shape)
    print(def_train_label.shape)
    print(def_eval_label.shape)
    # graph
    tf.reset_default_graph()
    if FLAGS.config == 'offense':
        model = pretrain_model.PretrainOffense(config)
    elif FLAGS.config == 'defense':
        model = pretrain_model.PretrainDefense(config)
    else:
        raise ValueError('{} is not an available config'.format(FLAGS.config))
    message = 'Graph contains {} trainable variables.'
    tf.logging.info(message.format(tools.count_weights()))
    saver = utility.define_saver()
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        if FLAGS.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type)
        utility.initialize_variables(sess, saver, config.logdir, resume=True)
        vis_result(sess, model, off_train_data, off_train_label, def_train_data,
                   def_train_label, os.path.join(outdir, 'train'), 3)
        vis_result(sess, model, off_eval_data, off_eval_label, def_eval_data,
                   def_eval_label, os.path.join(outdir, 'eval'), 3)
def testing(config, env_processes, outdir):
    """Restore a trained policy and collect evaluation results.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, dummy_env)
    # TF Session
    # NOTE: _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint=FLAGS.checkpoint,
            resume=FLAGS.resume)
        # testing
        collect_results(config, sess.run(graph.algo.D._steps), ppo_policy,
                        graph.algo.D, denormalize_observ)
    batch_env.close()
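# Minimal sketch of the scaling convention behind denormalize_observ() above
# (assuming scalar low/high bounds): values are mapped into [-1, 1] by the
# forward transform used in the training functions, and mapped back to the
# original range by the exact inverse. The helper names and sample numbers are
# placeholders for illustration.
def _normalize(x, low, high):
    return 2.0 * (x - low) / (high - low) - 1.0

def _denormalize(x, low, high):
    return (x + 1.0) * (high - low) / 2.0 + low

# Round trip: _denormalize(_normalize(3.0, 0.0, 10.0), 0.0, 10.0) == 3.0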
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.

    Yields:
      Evaluation scores.
    """
    tf.reset_default_graph()
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        # batch_env = utility.define_batch_env(
        #     lambda: _create_environment(config),
        #     config.num_agents, env_processes)
        config_envs = []
        for i in range(config.num_agents):
            config_envs.append(lambda i: _create_environment2(config, i))
        batch_env = utility.define_batch_env2(config_envs, env_processes)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary.*',))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(sess, saver, config.logdir)
        for score in loop.run(sess, saver, total_steps):
            yield score
    batch_env.close()
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.

    Yields:
      Evaluation scores.
    """
    tf.reset_default_graph()
    with config.unlocked:
        config.network = functools.partial(
            utility.define_network, config.network, config)
        config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
        config.value_optimizer = getattr(tf.train, config.value_optimizer)
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(sess, saver, config.logdir)
        for score in loop.run(sess, saver, total_steps):
            yield score
    batch_env.close()
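# Usage sketch (assumes a config object of the kind produced by the
# accompanying utility/configs module; the logging call is illustrative):
# train() is a generator, so evaluation scores are consumed by iterating it.
def _example_train(config, env_processes=True):
    for score in train(config, env_processes):
        tf.logging.info('Evaluation score: {}'.format(score))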
def _restore_policy(self, network, policy_layers, value_layers, action_size,
                    checkpoint):
    """Restore the PPO policy from a TensorFlow checkpoint.

    Args:
      network: The neural network definition.
      policy_layers: A tuple specifying the number of layers and the number of
        neurons in each layer of the policy network.
      value_layers: A tuple specifying the number of layers and the number of
        neurons in each layer of the value network.
      action_size: The dimension of the action space.
      checkpoint: The checkpoint path.
    """
    observ = self._observ_filter.transform(self.observation_placeholder)
    with tf.variable_scope("network/rnn"):
        self.network = network(policy_layers=policy_layers,
                               value_layers=value_layers,
                               action_size=action_size)
    with tf.variable_scope("temporary"):
        self.last_state = tf.Variable(
            self.network.zero_state(1, tf.float32), False)
        self.sess.run(self.last_state.initializer)
    with tf.variable_scope("network"):
        (mean_action, _, _), new_state = tf.nn.dynamic_rnn(
            self.network, observ[:, None], tf.ones(1), self.last_state,
            tf.float32, swap_memory=True)
        self.mean_action = mean_action
        self.update_state = self.last_state.assign(new_state)
    saver = utility.define_saver(exclude=(r"temporary/.*",))
    saver.restore(self.sess, checkpoint)
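# Usage sketch (hypothetical caller, not part of the original class): once
# _restore_policy() has built self.mean_action and self.update_state, an action
# for a single observation can be obtained by running both tensors together so
# the recurrent state is carried over to the next call. The feed shape assumes
# a batch of one observation.
def _example_act(policy, observation):
    action, _ = policy.sess.run(
        [policy.mean_action, policy.update_state],
        feed_dict={policy.observation_placeholder: observation[None]})
    # mean_action has shape [batch=1, time=1, action_size].
    return action[0, 0]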
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args:
      config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.

    Yields:
      Evaluation scores.
    """
    tf.reset_default_graph()
    with config.unlocked:
        config.network = functools.partial(utility.define_network,
                                           config.network, config)
        config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
        config.value_optimizer = getattr(tf.train, config.value_optimizer)
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes)
        # graph represents the simulation of a single step in all environments.
        graph = utility.define_simulation_graph(batch_env, config.algorithm,
                                                config)
        # Each loop iteration advances the environments by train_steps and then
        # by eval_steps. The multiplier config.max_length ensures that at least
        # config.update_every training episodes (and config.eval_episodes
        # evaluation episodes) are generated per iteration. Both step counts are
        # totals summed over ALL environments.
        train_steps = config.update_every * config.max_length
        eval_steps = config.eval_episodes * config.max_length
        # The loop iterates over mini-batches of training and evaluation
        # episodes, both produced by Monte-Carlo rollouts. Each iteration trains
        # on the training episodes and evaluates the objective on the evaluation
        # episodes. Conceptually:
        #   steps_made = 0
        #   while steps_made < total_steps:
        #     1. Simulate at least config.update_every training episodes and
        #        add the simulated steps to steps_made.
        #     2. Train the model on those training episodes.
        #     3. Simulate at least config.eval_episodes evaluation episodes and
        #        add the simulated steps to steps_made.
        #     4. Evaluate the model on those evaluation episodes.
        loop = _define_loop(graph, config.logdir, train_steps=train_steps,
                            eval_steps=eval_steps, batch_env=batch_env)
        total_steps = int(config.steps / config.update_every *
                          (config.update_every + config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(sess, saver, config.logdir)
        for score in loop.run(sess, saver, total_steps):
            yield score
    batch_env.close()
def train(config, env_processes, outdir):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir)
        graph = utility.define_simulation_graph(batch_env, config.algorithm,
                                                config)
        loop = _define_loop(graph, config.logdir,
                            config.update_every * config.max_length,
                            config.eval_episodes * config.max_length)
        total_steps = int(config.steps / config.update_every *
                          (config.update_every + config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary.*',))
    if FLAGS.off_ckpt and FLAGS.def_ckpt:
        # restore both offense and defense pretrained models
        off_saver = utility.define_saver_with_prefix(
            exclude=(r'.*d_trunk/.*', r'.*value/.*', r'.*two_trunk_gaussian/.*'))
        def_saver = utility.define_saver_with_prefix(
            exclude=(r'.*o_trunk/.*', r'.*value/.*', r'.*two_trunk_gaussian/.*'))
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=config.log_device_placement)
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            if FLAGS.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(
                    sess, ui_type=FLAGS.ui_type)
            utility.initialize_pretrained_variables(
                sess, off_saver, FLAGS.off_ckpt, def_saver, FLAGS.def_ckpt)
            for score in loop.run(sess, saver, total_steps):
                yield score
    else:
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=config.log_device_placement)
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            if FLAGS.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(
                    sess, ui_type=FLAGS.ui_type)
            utility.initialize_variables(sess, saver, config.logdir,
                                         resume=FLAGS.resume)
            print(total_steps)
            for score in loop.run(sess, saver, total_steps):
                yield score
    batch_env.close()
def train(config, data, label, outdir):
    """Training and evaluation entry point for the pretraining models.

    Args
    ----
    config : Object providing configurations via attributes.
    data : Training data.
    label : Training labels.
    outdir : Output directory path.
    """
    # normalization
    env = BBallPretrainEnv()
    min_ = env.observation_space.low
    max_ = env.observation_space.high
    data = 2 * (data - min_) / (max_ - min_) - 1
    # split into train and eval
    train_data, eval_data = np.split(data, [data.shape[0] * 9 // 10])
    train_label, eval_label = np.split(label, [data.shape[0] * 9 // 10])
    print(train_data.shape)
    print(train_label.shape)
    print(eval_data.shape)
    print(eval_label.shape)
    # graph
    tf.reset_default_graph()
    if FLAGS.config == 'offense':
        model = pretrain_model.PretrainOffense(config)
    elif FLAGS.config == 'defense':
        model = pretrain_model.PretrainDefense(config)
    else:
        raise ValueError('{} is not an available config'.format(FLAGS.config))
    # model = config.model(config)
    message = 'Graph contains {} trainable variables.'
    tf.logging.info(message.format(tools.count_weights()))
    saver = utility.define_saver()
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    # summary writers
    train_writter = tf.summary.FileWriter(os.path.join(config.logdir, 'train'),
                                          tf.get_default_graph())
    eval_writter = tf.summary.FileWriter(os.path.join(config.logdir, 'eval'),
                                         tf.get_default_graph())
    with tf.Session(config=sess_config) as sess:
        if FLAGS.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type)
        utility.initialize_variables(sess, saver, config.logdir,
                                     resume=FLAGS.resume)
        for epoch_idx in range(config.num_epochs):
            tf.logging.info('Number of epochs: {}'.format(epoch_idx))
            training(sess, model, train_data, train_label, config, train_writter)
            evaluating(sess, model, eval_data, eval_label, config, eval_writter)
            if (epoch_idx + 1) % config.checkpoint_every == 0:
                tf.gfile.MakeDirs(config.logdir)
                filename = os.path.join(config.logdir, 'model.ckpt')
                saver.save(sess, filename, (epoch_idx + 1) * config.batch_size)
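# Minimal sketch of the 90/10 split used above (array contents are placeholders):
# np.split with a single index returns the leading 90% and the trailing 10% of
# the rows along the first axis.
def _example_split():
    data = np.arange(20).reshape(10, 2)
    train_part, eval_part = np.split(data, [data.shape[0] * 9 // 10])
    return train_part.shape, eval_part.shape  # (9, 2), (1, 2)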
def train(config, env_processes, outdir):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, init_mode=1, fps=config.FPS,
                               if_back_real=False, time_limit=50)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(config.logdir,
                               'gail_testing_{}/'.format(config.train_len)),
        if_back_real=False,
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    # if not os.path.exists(os.path.join(config.logdir, 'gail_testing')):
    #     os.makedirs(os.path.join(config.logdir, 'gail_testing'))
    vanilla_env.data = np.load('bball_strategies/data/GAILEnvData_51.npy')
    # env to generate fake states
    env = gym.make(config.env)
    env = BBallWrapper(env, init_mode=3, fps=config.FPS,
                       if_back_real=config.if_back_real,
                       time_limit=config.max_length)
    env = MonitorWrapper(env,
                         directory=os.path.join(config.logdir, 'gail_training/'),
                         if_back_real=config.if_back_real,
                         # init from dataset in order
                         init_mode=3)
    # Discriminator graph
    with tf.device('/gpu:0'):
        D = Discriminator(config, dummy_env)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir,
            is_gail=config.is_gail)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, env)
    # Data
    all_data = h5py.File(
        'bball_strategies/data/GAILTransitionData_{}.hdf5'.format(
            config.train_len), 'r')
    expert_data, valid_expert_data = np.split(
        all_data['OBS'].value, [all_data['OBS'].value.shape[0] * 9 // 10])
    expert_action, valid_expert_action = np.split(
        all_data['DEF_ACT'].value,
        [all_data['DEF_ACT'].value.shape[0] * 9 // 10])
    print('expert_data', expert_data.shape)
    print('valid_expert_data', valid_expert_data.shape)
    print('expert_action', expert_action.shape)
    print('valid_expert_action', valid_expert_action.shape)
    # TF Session
    # TODO _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0', r'.*Adam.*',
                 r'.*beta.*'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, resume=FLAGS.resume)
        # NOTE: reset variables in the optimizers
        D.reset_optimizer(sess)
        # reset PPO optimizer
        opt_reset = tf.group(
            [v.initializer for v in graph.algo._optimizer.variables()])
        sess.run(opt_reset)
        # visualization stuff
        if FLAGS.tally_only:
            tally_reward_line_chart(config, sess.run(D._global_steps),
                                    ppo_policy, D, denormalize_observ,
                                    normalize_observ, normalize_action)
            exit()
        # GAIL
        cumulate_steps = sess.run(graph.step)
        episode_idx = 0
        valid_episode_idx = 0
        while True:
            if episode_idx > (expert_data.shape[0] -
                              config.episodes_per_batch *
                              config.train_d_per_ppo) or episode_idx == 0:
                episode_idx = 0
                perm_idx = np.random.permutation(expert_data.shape[0])
                expert_data = expert_data[perm_idx]
                expert_action = expert_action[perm_idx]
            if valid_episode_idx > (valid_expert_data.shape[0] -
                                    config.episodes_per_batch) or valid_episode_idx == 0:
                valid_episode_idx = 0
                valid_perm_idx = np.random.permutation(
                    valid_expert_data.shape[0])
                valid_expert_data = valid_expert_data[valid_perm_idx]
                valid_expert_action = valid_expert_action[valid_perm_idx]
            # testing
            if valid_episode_idx % (100 * config.episodes_per_batch) == 0:
                test_policy(config, vanilla_env, sess.run(D._global_steps),
                            ppo_policy, D, denormalize_observ)
            if valid_episode_idx % (1000 * config.episodes_per_batch) == 0:
                tally_reward_line_chart(config, sess.run(D._global_steps),
                                        ppo_policy, D, denormalize_observ,
                                        normalize_observ, normalize_action)
            # train Discriminator
            train_Discriminator(episode_idx, config, expert_data, expert_action,
                                env, ppo_policy, D, normalize_observ,
                                normalize_action)
            if valid_episode_idx % (1000 * config.episodes_per_batch) == 0:
                tally_reward_line_chart(config, sess.run(D._global_steps),
                                        ppo_policy, D, denormalize_observ,
                                        normalize_observ, normalize_action)
            # validate Discriminator
            valid_Discriminator(valid_episode_idx, config, valid_expert_data,
                                valid_expert_action, env, ppo_policy, D,
                                normalize_observ, normalize_action)
            episode_idx += config.episodes_per_batch * config.train_d_per_ppo
            valid_episode_idx += config.episodes_per_batch
            # train PPO
            print('train PPO')
            cumulate_steps += total_steps
            for score in loop.run(sess, saver, cumulate_steps):
                yield score
    batch_env.close()
    vanilla_env.close()
    env.close()
def train(config, env_processes, outdir):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        min_ = dummy_env.observation_space.low[0, 0]
        max_ = dummy_env.observation_space.high[0, 0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(
        vanilla_env,
        data=h5py.File(
            'bball_strategies/data/OrderedGAILTransitionData_522.hdf5', 'r'),
        init_mode=1, fps=config.FPS, time_limit=50)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(
            config.logdir,
            'gail_testing_G{}_D{}/'.format(config.max_length, config.D_len)),
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, dummy_env)
    # summary writer of Discriminator
    summary_writer = tf.summary.FileWriter(config.logdir + '/Disciminator')
    # TF Session
    # NOTE: _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint=FLAGS.checkpoint,
            resume=FLAGS.resume)
        # NOTE: reset optimizer variables between stages of curriculum learning
        opt_reset_D = tf.group(
            [v.initializer for v in graph.algo.D.optimizer.variables()])
        # reset PPO optimizer
        opt_reset = tf.group(
            [v.initializer for v in graph.algo._optimizer.variables()])
        sess.run([opt_reset, opt_reset_D])
        # visualization stuff
        if FLAGS.tally_only:
            tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                    ppo_policy, graph.algo.D, normalize_observ,
                                    normalize_action)
            tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                    ppo_policy, graph.algo.D, normalize_observ,
                                    normalize_action, stochastic=True)
            exit()
        # GAIL
        cumulate_steps = sess.run(graph.step)
        counter = 0
        while True:
            # train Discriminator
            gail_timer = time.time()
            if counter > config.pretrain_d_times:
                num_d_to_train = config.train_d_per_ppo
            else:
                num_d_to_train = config.pretrain_d_per_ppo
            for _ in range(num_d_to_train):
                # train D
                feed_dict = {
                    graph.is_training: True,
                    graph.should_log: True,
                    graph.do_report: True,
                    graph.force_reset: False}
                gail_counter = 0
                while gail_counter < config.gail_steps:
                    gail_summary = sess.run(
                        graph.gail_summary, feed_dict=feed_dict)
                    if gail_summary:
                        summary_writer.add_summary(
                            gail_summary,
                            global_step=sess.run(graph.algo.D._steps))
                    gail_counter += 1
            # testing
            if counter % (config.vis_testing_freq) == 0:
                test_policy(config, vanilla_env,
                            sess.run(graph.algo.D._steps), ppo_policy,
                            graph.algo.D, denormalize_observ)
            if counter % (config.tally_line_chart_freq) == 0:
                tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                        ppo_policy, graph.algo.D,
                                        normalize_observ, normalize_action)
                tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                        ppo_policy, graph.algo.D,
                                        normalize_observ, normalize_action,
                                        stochastic=True)
            counter += 1
            print('Time Cost of Discriminator per Update: {}'.format(
                (time.time() - gail_timer) / num_d_to_train))
            # train PPO
            cumulate_steps += total_steps
            for score in loop.run(sess, saver, cumulate_steps):
                yield score
    batch_env.close()
    vanilla_env.close()
def train(agents_config, env_processes=True, log_dir=None):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. Operations are assigned to the worker device of the current
    distributed task.

    Args:
      agents_config: Object providing configurations via attributes.
      env_processes: Whether to step environments in separate processes.
      log_dir: Directory to write checkpoints and summaries to.

    Yields:
      Evaluation scores.
    """
    FLAGS = tf.app.flags.FLAGS
    if log_dir is None and hasattr(FLAGS, 'log_dir'):
        log_dir = FLAGS.log_dir
    run_config = tf.contrib.learn.RunConfig()
    _log_run_config(run_config)
    server = tf.train.Server(run_config.cluster_spec,
                             job_name=run_config.task_type,
                             task_index=run_config.task_id)
    tf.reset_default_graph()
    if agents_config.update_every % agents_config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    worker_device = "/job:%s/replica:0/task:%d" % (run_config.task_type,
                                                   run_config.task_id)
    with tf.device(worker_device):
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device=worker_device,
                    cluster=run_config.cluster_spec)):
            global_step = tf.Variable(0, False, dtype=tf.int32,
                                      name='global_step')
        batch_env = define_batch_env(
            lambda: _create_environment(agents_config),
            agents_config.num_agents, env_processes)
        optimizer = agents_config.optimizer(agents_config.learning_rate)
        if FLAGS.sync_replicas:
            optimizer = tf.train.SyncReplicasOptimizer(
                optimizer,
                replicas_to_aggregate=(run_config.num_worker_replicas),
                total_num_replicas=(run_config.num_worker_replicas))
        with agents_config.unlocked:
            agents_config.optimizer = optimizer
        graph = define_simulation_graph(batch_env, agents_config.algorithm,
                                        agents_config, global_step)
        loop = _define_loop(
            graph, log_dir,
            agents_config.update_every * agents_config.max_length,
            agents_config.eval_episodes * agents_config.max_length)
        total_steps = int(
            agents_config.steps / agents_config.update_every *
            (agents_config.update_every + agents_config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    if FLAGS.log_device_placement:
        sess_config.log_device_placement = True
    sess_config.gpu_options.allow_growth = True
    init_op = tf.global_variables_initializer()
    local_init_op = tf.local_variables_initializer()
    hooks = [tf.train.StopAtStepHook(last_step=total_steps)]
    if FLAGS.sync_replicas:
        opt = graph.algo._optimizer
        sync_replicas_hook = opt.make_session_run_hook(run_config.is_chief)
        hooks.append(sync_replicas_hook)
    scaffold = tf.train.Scaffold(saver=saver, init_op=init_op,
                                 local_init_op=local_init_op)
    # if FLAGS.sync_replicas:
    #     opt = graph.algo._optimizer
    #     local_init_op = opt.local_step_init_op
    #     if run_config.is_chief:
    #         local_init_op = opt.chief_init_op
    #     ready_for_local_init_op = opt.ready_for_local_init_op
    #     # Initial token and chief queue runners required by the sync_replicas mode
    #     chief_queue_runner = opt.get_chief_queue_runner()
    #     sync_init_op = opt.get_init_tokens_op()
    # if FLAGS.sync_replicas:
    #     sv = tf.train.Supervisor(
    #         is_chief=run_config.is_chief,
    #         logdir=log_dir,
    #         init_op=init_op,
    #         local_init_op=local_init_op,
    #         ready_for_local_init_op=ready_for_local_init_op,
    #         recovery_wait_secs=1,
    #         global_step=global_step)
    # else:
    #     sv = tf.train.Supervisor(
    #         is_chief=run_config.is_chief,
    #         logdir=log_dir,
    #         init_op=init_op,
    #         recovery_wait_secs=1,
    #         global_step=global_step)
    # with sv.prepare_or_wait_for_session(server.target, config=sess_config) as sess:
    #     if FLAGS.sync_replicas and is_chief:
    #         # Chief worker will start the chief queue runner and call the init op.
    #         sess.run(sync_init_op)
    #         sv.start_queue_runners(sess, [chief_queue_runner])
    with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=run_config.is_chief,
            checkpoint_dir=log_dir,
            scaffold=scaffold,
            hooks=hooks,
            save_checkpoint_secs=FLAGS.save_checkpoint_secs,
            save_summaries_steps=None,
            save_summaries_secs=None,
            config=sess_config,
            stop_grace_period_secs=120,
            log_step_count_steps=3000) as sess:
        global_step = sess.run(loop._step)
        steps_made = 1
        while not sess.should_stop():
            phase, epoch, steps_in = loop._find_current_phase(global_step)
            phase_step = epoch * phase.steps + steps_in
            if steps_in % phase.steps < steps_made:
                message = '\n' + ('-' * 50) + '\n'
                message += 'Phase {} (phase step {}, global step {}).'
                tf.logging.info(
                    message.format(phase.name, phase_step, global_step))
            phase.feed[loop._reset] = (steps_in < steps_made)
            phase.feed[loop._log] = (phase.writer and loop._is_every_steps(
                phase_step, phase.batch, phase.log_every))
            phase.feed[loop._report] = (loop._is_every_steps(
                phase_step, phase.batch, phase.report_every))
            summary, mean_score, global_step, steps_made = sess.run(
                phase.op, phase.feed)
            if loop._is_every_steps(
                    phase_step, phase.batch,
                    phase.checkpoint_every) and run_config.is_chief:
                loop._store_checkpoint(sess, saver, global_step)
            if loop._is_every_steps(phase_step, phase.batch,
                                    phase.report_every):
                yield mean_score
            # TODO: Potentially integrate summary writing with
            # MonitoredTrainingSession.
            if summary and phase.writer and run_config.is_chief:
                # We want smaller phases to catch up at the beginning of each
                # epoch so that their graphs are aligned.
                longest_phase = max(phase.steps for phase in loop._phases)
                summary_step = epoch * longest_phase + steps_in
                phase.writer.add_summary(summary, summary_step)
    batch_env.close()
def train(config, env_processes, outdir):
    """Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, init_mode=1, fps=config.FPS,
                               if_back_real=False, time_limit=50)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(
            config.logdir,
            'gail_testing_G{}_D{}/'.format(config.train_len, config.D_len)),
        if_back_real=False,
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    vanilla_env.data = np.load('bball_strategies/data/GAILEnvData_51.npy')
    # env to generate fake states
    env = gym.make(config.env)
    env = BBallWrapper(env, init_mode=3, fps=config.FPS,
                       if_back_real=config.if_back_real,
                       time_limit=config.max_length)
    env = MonitorWrapper(env,
                         directory=os.path.join(config.logdir, 'gail_training/'),
                         if_back_real=config.if_back_real,
                         # init from dataset in order
                         init_mode=3)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir,
            is_gail=config.is_gail)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, env)
    # Data
    all_data = h5py.File(
        'bball_strategies/data/GAILTransitionData_{}.hdf5'.format(
            config.train_len), 'r')
    expert_data, valid_expert_data = np.split(
        all_data['OBS'].value, [all_data['OBS'].value.shape[0] * 9 // 10])
    expert_action, valid_expert_action = np.split(
        all_data['DEF_ACT'].value,
        [all_data['DEF_ACT'].value.shape[0] * 9 // 10])
    print('expert_data', expert_data.shape)
    print('valid_expert_data', valid_expert_data.shape)
    print('expert_action', expert_action.shape)
    print('valid_expert_action', valid_expert_action.shape)
    # Preprocessing / normalization
    expert_data = normalize_observ(expert_data)
    valid_expert_data = normalize_observ(valid_expert_data)
    expert_action = normalize_action(expert_action)
    valid_expert_action = normalize_action(valid_expert_action)
    # summary writer of Discriminator
    summary_writer = tf.summary.FileWriter(config.logdir + '/Disciminator')
    # TF Session
    # TODO _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0', r'.*Adam.*',
                 r'.*beta.*'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, resume=FLAGS.resume)
        # NOTE: reset variables in the optimizers
        # opt_reset_D = tf.group(
        #     [v.initializer for v in graph.algo.D.optimizer.variables()])
        # # reset PPO optimizer
        # opt_reset = tf.group(
        #     [v.initializer for v in graph.algo._optimizer.variables()])
        # sess.run([opt_reset, opt_reset_D])
        # visualization stuff
        if FLAGS.tally_only:
            tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                    ppo_policy, graph.algo.D,
                                    denormalize_observ, normalize_observ,
                                    normalize_action)
            exit()
        # GAIL
        cumulate_steps = sess.run(graph.step)
        episode_idx = 0
        while True:
            if episode_idx > (expert_data.shape[0] -
                              config.episodes_per_batch *
                              config.train_d_per_ppo) or episode_idx == 0:
                episode_idx = 0
                perm_idx = np.random.permutation(expert_data.shape[0])
                expert_data = expert_data[perm_idx]
                expert_action = expert_action[perm_idx]
            # testing
            if episode_idx % (config.train_d_per_ppo * 100 *
                              config.episodes_per_batch) == 0:
                test_policy(config, vanilla_env,
                            sess.run(graph.algo.D._steps), ppo_policy,
                            graph.algo.D, denormalize_observ)
            if episode_idx % (config.train_d_per_ppo * 1000 *
                              config.episodes_per_batch) == 0:
                tally_reward_line_chart(config,
                                        sess.run(graph.algo.D._steps),
                                        ppo_policy, graph.algo.D,
                                        denormalize_observ, normalize_observ,
                                        normalize_action)
            # train Discriminator
            gail_timer = time.time()
            for _ in range(config.train_d_per_ppo):
                if config.is_double_curiculum:
                    observ = expert_data[
                        episode_idx:episode_idx + config.episodes_per_batch, 1:]
                    action = expert_action[
                        episode_idx:episode_idx + config.episodes_per_batch, :-1]
                    if config.use_padding:
                        # 1. padding with buffer
                        buffer = observ[:, 0, :-1]
                        padded_observ = np.concatenate(
                            [buffer, observ[:, :, -1]], axis=1)
                        padded_act = np.concatenate(
                            [np.zeros(shape=[action.shape[0], 9, 5, 2]), action],
                            axis=1)
                        # 2. split the whole episode into Discriminator training
                        # data with length=config.D_len
                        training_obs = []
                        training_act = []
                        for i in range(config.max_length - config.D_len + 10):
                            training_obs.append(
                                padded_observ[:, i:i + config.D_len])
                            training_act.append(
                                padded_act[:, i:i + config.D_len])
                        training_obs = np.concatenate(training_obs, axis=0)
                        training_act = np.concatenate(training_act, axis=0)
                    else:
                        pass
                else:
                    training_obs = expert_data[
                        episode_idx:episode_idx + config.episodes_per_batch, 1:, -1]
                    training_act = expert_action[
                        episode_idx:episode_idx + config.episodes_per_batch, :-1]
                feed_dict = {
                    graph.is_training: True,
                    graph.should_log: True,
                    graph.do_report: True,
                    graph.force_reset: False,
                    graph.algo.D._expert_s: training_obs,
                    graph.algo.D._expert_a: training_act}
                gail_counter = 0
                while gail_counter < config.gail_steps:
                    gail_summary = sess.run(
                        graph.gail_summary, feed_dict=feed_dict)
                    if gail_summary:
                        summary_writer.add_summary(
                            gail_summary,
                            global_step=sess.run(graph.algo.D._steps))
                    gail_counter += 1
                episode_idx += config.episodes_per_batch
            print('Time Cost of Discriminator per Update: {}'.format(
                (time.time() - gail_timer) / config.train_d_per_ppo))
            # train PPO
            cumulate_steps += total_steps
            for score in loop.run(sess, saver, cumulate_steps):
                yield score
    batch_env.close()
    vanilla_env.close()
    env.close()