Example #1
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 n_v=1,
                 scope_name="model",
                 policy_config=None):
        sess = tf.Session()
        self.ob_space = ob_space
        self.ac_space = ac_space
        policy_config = {} if policy_config is None else policy_config

        def create_policy():
            return policy(ob_space,
                          ac_space,
                          n_v=n_v,
                          nbatch=1,
                          scope_name=scope_name,
                          **policy_config)

        with sess.as_default():
            if 'use_xla' in policy_config and policy_config['use_xla']:
                try:
                    with tf.xla.experimental.jit_scope(True):
                        self.policy = create_policy()
                except Exception:
                    logger.warn(
                        "using tf.xla in PPOAgent requires tf version>=1.15.")
                    self.policy = create_policy()
            else:
                self.policy = create_policy()
        self._state = self.policy.initial_state
        params = tf.trainable_variables(scope=scope_name)
        new_params = [
            tf.placeholder(p.dtype, shape=p.get_shape()) for p in params
        ]
        param_assign_ops = [
            p.assign(new_p) for p, new_p in zip(params, new_params)
        ]
        tf.global_variables_initializer().run(session=sess)

        def load_model(loaded_params):
            sess.run(
                param_assign_ops[:len(loaded_params)],
                feed_dict={p: v
                           for p, v in zip(new_params, loaded_params)})

        self.load_model = load_model
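
The placeholder/assign pattern at the end of this constructor (one tf.placeholder and one assign op per trainable variable, all fed through a single sess.run) is how load_model hot-swaps weights without rebuilding the graph. A minimal self-contained sketch of the same technique, assuming TF 1.x graph mode as in the snippet; the toy variable and names are illustrative only:

import numpy as np
import tensorflow as tf  # assumes TF 1.x graph mode

sess = tf.Session()
with tf.variable_scope("toy_model"):
    w = tf.get_variable("w", shape=[2, 3])

params = tf.trainable_variables(scope="toy_model")
# one placeholder and one assign op per variable, built once and reused
new_params = [tf.placeholder(p.dtype.base_dtype, shape=p.get_shape())
              for p in params]  # base_dtype strips a possible ref dtype
assign_ops = [p.assign(new_p) for p, new_p in zip(params, new_params)]
sess.run(tf.global_variables_initializer())

# push externally produced numpy weights into the graph in a single run call
numpy_weights = [np.ones((2, 3), dtype=np.float32)]
sess.run(assign_ops, feed_dict=dict(zip(new_params, numpy_weights)))
print(sess.run(w))  # now all ones
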
Example #2
    def __init__(self,
                 league_mgr_addr,
                 model_pool_addrs,
                 learner_ports,
                 rm_size,
                 batch_size,
                 ob_space,
                 ac_space,
                 policy,
                 gpu_id,
                 policy_config=None,
                 ent_coef=1e-2,
                 distill_coef=1e-2,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 rwd_shape=False,
                 pub_interval=500,
                 log_interval=100,
                 save_interval=0,
                 total_timesteps=5e7,
                 burn_in_timesteps=0,
                 learner_id='',
                 batch_worker_num=4,
                 pull_worker_num=2,
                 unroll_length=32,
                 rollout_length=1,
                 use_mixed_precision=False,
                 use_sparse_as_dense=True,
                 adam_beta1=0.9,
                 adam_beta2=0.999,
                 adam_eps=1e-5,
                 data_type=PGData,
                 data_server_version='v1',
                 decode=False,
                 log_infos_interval=20,
                 **kwargs):
        super(PGLearner, self).__init__(league_mgr_addr, model_pool_addrs,
                                        learner_ports, learner_id)

        self.LR = tf.placeholder(tf.float32, [])
        """Learning Rate"""

        self.CLIPRANGE = tf.placeholder(tf.float32, [])
        """Learning Rate Clip Range"""

        self.ep_loss_coef = {}
        """Coefficients for those losses from the endpoints. Override it in derived
     class."""

        policy_config = {} if policy_config is None else policy_config
        self._init_const(total_timesteps, burn_in_timesteps, batch_size,
                         unroll_length, rwd_shape, ent_coef, vf_coef,
                         pub_interval, log_interval, save_interval, policy,
                         distill_coef, policy_config, rollout_length)

        # allow_soft_placement=True works around ops that cannot be placed on a
        # GPU in tf-1.8.0; tf-1.13.1 does not have this issue
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(gpu_id)
        self.sess = tf.Session(config=config)
        self.rank = hvd.rank() if has_hvd else 0

        # Prepare dataset
        ds = data_type(ob_space,
                       ac_space,
                       self.n_v,
                       use_lstm=self.rnn,
                       hs_len=self.hs_len,
                       distillation=self.distillation,
                       version='v2')
        self._data_server = DataServer(self._pull_data,
                                       rm_size,
                                       unroll_length,
                                       batch_size,
                                       ds,
                                       gpu_id_list=(0, ),
                                       batch_worker_num=batch_worker_num,
                                       pull_worker_num=pull_worker_num,
                                       rollout_length=rollout_length,
                                       prefetch_buffer_size=2,
                                       version=data_server_version,
                                       decode=decode,
                                       log_infos_interval=log_infos_interval)

        # prepare net config
        net_config = policy.net_config_cls(ob_space, ac_space, **policy_config)
        net_config.clip_range = self.CLIPRANGE
        if rwd_shape:
            # make the reward-shaping weights a tf.placeholder so they can be
            # changed during training.
            # NOTE: assumes net_config has a reward_weights_shape field
            # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
            reward_weights_shape = net_config.reward_weights_shape
            self.rwd_weights = tf.placeholder(tf.float32, reward_weights_shape)
            net_config.reward_weights = self.rwd_weights
        if hasattr(net_config, 'lam'):
            # make the TD(lambda) coefficient a tf.placeholder so it can be
            # changed during training.
            # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
            self.LAM = tf.placeholder(tf.float32, [])
            net_config.lam = self.LAM
        else:
            self.LAM = None

        # build the policy net
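        # (the empty variable_scope below only captures the scope object, which is
        #  then handed to net_build_fun via create_policy)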
        with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
            pass

        def create_policy(inputs, nc):
            return policy.net_build_fun(inputs=inputs,
                                        nc=nc,
                                        scope=model_scope)

        device = '/gpu:{}'.format(0)
        with tf.device(device):
            input_data = self._data_server.input_datas[0]
            if 'use_xla' in policy_config and policy_config['use_xla']:
                try:
                    # Use TensorFlow's accelerated linear algebra (XLA) JIT compilation
                    with tf.xla.experimental.jit_scope(True):
                        model = create_policy(input_data, net_config)
                except Exception:
                    logger.log(
                        "WARNING: using tf.xla requires tf version>=1.15.")
                    model = create_policy(input_data, net_config)
            else:
                model = create_policy(input_data, net_config)
            loss, vf_loss, losses = self.build_loss(model, input_data)
        if has_hvd:
            self.losses = [hvd.allreduce(loss) for loss in losses]
        else:
            self.losses = list(losses)
        self.params = tf.trainable_variables(scope='model')
        self.params_vf = tf.trainable_variables(scope='model/vf')
        self.param_norm = tf.global_norm(self.params)

        self.trainer = tf.train.AdamOptimizer(learning_rate=self.LR,
                                              beta1=adam_beta1,
                                              beta2=adam_beta2,
                                              epsilon=adam_eps)
        self.burn_in_trainer = tf.train.AdamOptimizer(
            learning_rate=self.LR, epsilon=1e-5)  # same as default and IL
        if use_mixed_precision:
            try:
                self.trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    self.trainer)
                self.burn_in_trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    self.burn_in_trainer)
            except Exception:
                logger.warn(
                    "using tf mixed_precision requires tf version>=1.15.")
        if has_hvd:
            self.trainer = hvd.DistributedOptimizer(
                self.trainer, sparse_as_dense=use_sparse_as_dense)
            self.burn_in_trainer = hvd.DistributedOptimizer(
                self.burn_in_trainer, sparse_as_dense=use_sparse_as_dense)
        grads_and_vars = self.trainer.compute_gradients(loss, self.params)
        grads_and_vars_vf = self.burn_in_trainer.compute_gradients(
            vf_loss, self.params_vf)
        clip_vars = model.vars.lstm_vars
        grads_and_vars, self.clip_grad_norm, self.nonclip_grad_norm = self.clip_grads_vars(
            grads_and_vars, clip_vars, max_grad_norm)
        grads_and_vars_vf, self.clip_grad_norm_vf, self.nonclip_grad_norm_vf = self.clip_grads_vars(
            grads_and_vars_vf, clip_vars, max_grad_norm)

        self._train_batch = self.trainer.apply_gradients(grads_and_vars)
        self._burn_in = self.burn_in_trainer.apply_gradients(grads_and_vars_vf)
        self.loss_endpoints_names = model.loss.loss_endpoints.keys()
        self._build_ops()
        if has_hvd:
            barrier_op = hvd.allreduce(tf.Variable(0.))
            broadcast_op = hvd.broadcast_global_variables(0)
        tf.global_variables_initializer().run(session=self.sess)
        self.sess.graph.finalize()

        self.barrier = lambda: self.sess.run(barrier_op) if has_hvd else None
        self.broadcast = lambda: self.sess.run(broadcast_op) if has_hvd else None
        self.broadcast()
        # logging stuff
        format_strs = ['stdout', 'log', 'tensorboard', 'csv']
        logger.configure(dir='training_log/{}rank{}'.format(
            self._learner_id, self.rank),
                         format_strs=format_strs)
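
The distributed pieces of PGLearner above (both Adam optimizers wrapped in hvd.DistributedOptimizer, an allreduce barrier, and a rank-0 broadcast right after variable initialization) follow the usual Horovod-for-TF1 recipe. A minimal single-process sketch of that recipe, assuming horovod.tensorflow is installed and TF 1.x graph mode; the toy loss is illustrative only:

import tensorflow as tf            # assumes TF 1.x graph mode
import horovod.tensorflow as hvd   # assumes Horovod built with TF support

hvd.init()
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
# (the learner above additionally pins one GPU per rank via visible_device_list)
sess = tf.Session(config=config)

x = tf.get_variable("x", shape=[], initializer=tf.ones_initializer())
loss = tf.square(x - 1.0)  # toy loss, illustrative only
opt = tf.train.AdamOptimizer(learning_rate=1e-3, epsilon=1e-5)
opt = hvd.DistributedOptimizer(opt, sparse_as_dense=True)  # averages grads across ranks
train_op = opt.minimize(loss)

sess.run(tf.global_variables_initializer())
# start every rank from rank 0's weights, as PGLearner's broadcast() does
sess.run(hvd.broadcast_global_variables(0))
sess.run(train_op)
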
    def __init__(self,
                 ports,
                 gpu_id,
                 replay_filelist,
                 batch_size,
                 min_train_sample_num,
                 min_val_sample_num,
                 rm_size,
                 learning_rate,
                 print_interval,
                 checkpoint_interval,
                 num_val_batches,
                 replay_converter_type,
                 policy,
                 policy_config,
                 converter_config=None,
                 policy_config_type=None,
                 model_pool_addrs=None,
                 rollout_length=1,
                 checkpoints_dir=None,
                 restore_checkpoint_path=None,
                 train_generator_worker_num=4,
                 val_generator_worker_num=2,
                 pull_worker_num=2,
                 num_sgd_updates=int(1e30),
                 repeat_training_task=False,
                 unroll_length=32,
                 pub_interval=50,
                 max_clip_grad_norm=1,
                 after_loading_init_scope=None,
                 use_mixed_precision=False,
                 use_sparse_as_dense=False,
                 enable_validation=True,
                 post_process_data=None):
        assert len(ports) == 2
        self.use_hvd = has_hvd and hvd.size() > 1
        self.rank = 0 if not self.use_hvd else hvd.rank()
        self.model_key = 'IL-model'
        self.pub_interval = pub_interval
        self.rnn = (False if 'use_lstm' not in policy_config else
                    policy_config['use_lstm'])
        self.hs_len = None
        # overwrite it using the batch_size for training
        policy_config['batch_size'] = batch_size
        if self.rnn:
            assert model_pool_addrs is not None
            self._model_pool_apis = ModelPoolAPIs(model_pool_addrs)
            self._model_pool_apis.check_server_set_up()
            policy_config['rollout_len'] = rollout_length
            # infer hidden state length (size)
            if 'hs_len' in policy_config:
                self.hs_len = policy_config['hs_len']
            elif 'nlstm' in policy_config:
                self.hs_len = 2 * policy_config['nlstm']
            else:
                self.hs_len = 128

        self.should_push_model = (self.rnn and self.rank == 0)
        use_gpu = (gpu_id >= 0)
        converter_config = {} if converter_config is None else converter_config
        train_replay_filelist, val_replay_filelist = _get_local_replays(
            replay_filelist)
        replay_converter = replay_converter_type(**converter_config)
        ob_space, ac_space = replay_converter.space.spaces
        if post_process_data is not None:
            ob_space, ac_space = post_process_data(ob_space, ac_space)
        self.data_pool = ImDataServer(
            ports=ports,
            train_replay_filelist=train_replay_filelist,
            val_replay_filelist=val_replay_filelist,
            batch_size=batch_size,
            min_train_sample_num=min_train_sample_num,
            min_val_sample_num=min_val_sample_num,
            ob_space=ob_space,
            ac_space=ac_space,
            train_generator_worker_num=train_generator_worker_num,
            val_generator_worker_num=val_generator_worker_num,
            pull_worker_num=pull_worker_num,
            rm_size=rm_size,
            repeat_training_task=repeat_training_task,
            unroll_length=unroll_length,
            rollout_length=rollout_length,
            lstm=self.rnn,
            hs_len=self.hs_len,
            use_gpu=use_gpu)
        self._enable_validation = enable_validation

        config = tf.ConfigProto(allow_soft_placement=True)
        if use_gpu:
            config.gpu_options.visible_device_list = str(gpu_id)
            config.gpu_options.allow_growth = True
        self._sess = tf.Session(config=config)

        net_config = policy_config_type(ob_space, ac_space, **policy_config)
        net_config_val = deepcopy(net_config)
        with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
            pass

        def create_policy(inputs, nc):
            return policy(inputs=inputs, nc=nc, scope=model_scope)

        if hasattr(net_config, 'endpoints_verbosity'):
            # intentionally disables endpoints during training
            net_config.endpoints_verbosity = 0
        device = '/gpu:0' if use_gpu else '/cpu:0'
        with tf.device(device):
            if 'use_xla' in policy_config and policy_config['use_xla']:
                try:
                    # Use TensorFlow's accelerated linear algebra (XLA) JIT compilation
                    with tf.xla.experimental.jit_scope(True):
                        model = create_policy(self.data_pool.train_batch_input,
                                              net_config)
                except Exception:
                    logger.log(
                        "WARNING: using tf.xla requires tf version>=1.15.")
                    model = create_policy(self.data_pool.train_batch_input,
                                          net_config)
            else:
                model = create_policy(self.data_pool.train_batch_input,
                                      net_config)

        model_val = create_policy(self.data_pool.val_batch_input,
                                  net_config_val)
        params = tf.trainable_variables(scope='model')
        param_norm = tf.global_norm(params)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           epsilon=1e-5)
        if use_mixed_precision:
            try:
                optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    optimizer)
            except Exception:
                logger.warn(
                    "using tf mixed_precision requires tf version>=1.15.")
        if self.use_hvd:
            optimizer = hvd.DistributedOptimizer(
                optimizer, sparse_as_dense=use_sparse_as_dense)
            barrier_op = hvd.allreduce(tf.Variable(0.))
            self.barrier = lambda: self._sess.run(barrier_op)
        train_loss = tf.reduce_mean(model.loss.total_il_loss *
                                    self.data_pool.train_batch_weight)
        val_loss = tf.reduce_mean(model_val.loss.total_il_loss *
                                  self.data_pool.val_batch_weight)
        if hasattr(net_config, 'weight_decay') and not net_config.weight_decay:
            # None or 0.0
            total_loss = train_loss
        else:
            total_loss = train_loss + model.loss.total_reg_loss
        grads_and_vars = optimizer.compute_gradients(total_loss, params)
        clip_vars = model.vars.lstm_vars
        clip_grads = [grad for grad, var in grads_and_vars if var in clip_vars]
        nonclip_grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                                  if var not in clip_vars]
        if max_clip_grad_norm > 0:
            clip_grads, clip_grad_norm = tf.clip_by_global_norm(
                clip_grads, max_clip_grad_norm)
        else:
            clip_grad_norm = tf.global_norm(clip_grads)
        clip_grads_and_var = list(zip(clip_grads, clip_vars))
        grads_and_vars = clip_grads_and_var + nonclip_grads_and_vars
        grad_norm = tf.global_norm(list(zip(*grads_and_vars))[0])

        train_op = optimizer.apply_gradients(grads_and_vars)
        tf.global_variables_initializer().run(session=self._sess)

        self.new_params = [
            tf.placeholder(p.dtype, shape=p.get_shape()) for p in params
        ]
        self.param_assign_ops = [
            p.assign(new_p) for p, new_p in zip(params, self.new_params)
        ]
        opt_params = optimizer.variables()
        self.new_opt_params = [
            tf.placeholder(p.dtype, shape=p.get_shape()) for p in opt_params
        ]
        self.opt_param_assign_ops = [
            p.assign(new_p)
            for p, new_p in zip(opt_params, self.new_opt_params)
        ]

        def read_params():
            return self._sess.run(params)

        def read_opt_params():
            return self._sess.run(opt_params)

        def load_model(np_new_params):
            self._sess.run(
                self.param_assign_ops,
                feed_dict={
                    p: np_p
                    for p, np_p in zip(self.new_params, np_new_params)
                })

        def restore_optimizer(np_new_opt_params):
            self._sess.run(
                self.opt_param_assign_ops,
                feed_dict={
                    p: np_p
                    for p, np_p in zip(self.new_opt_params, np_new_opt_params)
                })

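        # NOTE: the *_aggregated tensors referenced in the two closures below are
        # created later in this __init__; they are only looked up when the
        # closures are eventually called, after graph construction is complete.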
        def _train_step():
            return self._sess.run([
                train_loss_aggregated, *train_other_losses_aggregated,
                grad_norm, clip_grad_norm, param_norm, train_op
            ], {})[:-1]

        def _val_step():
            # maximal_feat = [tf.reduce_max(tf.cast(x, tf.float32))
            # for x in self.data_pool.val_batch_input.X]
            # print(self._sess.run(maximal_feat, {}))
            return self._sess.run([
                val_loss_aggregated, *val_other_losses_aggregated,
                *endpoints_aggregated
            ], {})

        self._saver = ChkptsFromSelf(read_params, load_model, self.model_key)

        if restore_checkpoint_path is not None:
            self._saver._restore_model_checkpoint(restore_checkpoint_path)

        if after_loading_init_scope is not None:
            var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=after_loading_init_scope)
            logger.log('perform after loading init for vars')
            for v in var_list:
                logger.log(v)
            tf.variables_initializer(var_list).run(session=self._sess)

        if self.use_hvd:
            hvd.broadcast_global_variables(0).run(session=self._sess)

        _allreduce = lambda x: x if not self.use_hvd else hvd.allreduce(x)
        train_loss_aggregated = _allreduce(train_loss)
        train_other_loss_names = model.loss.loss_endpoints.keys()
        train_other_losses_aggregated = [
            _allreduce(tf.reduce_mean(l * self.data_pool.train_batch_weight))
            for l in model.loss.loss_endpoints.values()
        ]
        val_loss_aggregated = _allreduce(val_loss)
        val_other_loss_names = model_val.loss.loss_endpoints.keys()
        val_other_losses_aggregated = [
            _allreduce(tf.reduce_mean(l * self.data_pool.val_batch_weight))
            for l in model_val.loss.loss_endpoints.values()
        ]
        endpoints_names = model_val.endpoints.keys()
        endpoints_aggregated = [
            _allreduce(tf.reduce_mean(l))
            for l in model_val.endpoints.values()
        ]
        self._sess.graph.finalize()
        self._total_samples = lambda: [
            self.data_pool._num_train_samples, self.data_pool._num_val_samples
        ]
        self._train_log_names = (['loss'] + list(train_other_loss_names) +
                                 ['grad_norm', 'clip_grad_norm', 'param_norm'])
        self._val_log_names = (['loss'] + list(val_other_loss_names) +
                               list(endpoints_names))
        self._batch_size = batch_size
        self._train_step = _train_step
        self._val_step = _val_step
        self._print_interval = print_interval
        self._checkpoint_interval = checkpoint_interval
        self._num_val_batches = num_val_batches
        self._checkpoints_dir = checkpoints_dir if self.rank == 0 else None
        self._num_sgd_updates = num_sgd_updates
        self.load_model = load_model
        self.restore_optimizer = restore_optimizer
        self.read_params = read_params
        self.read_opt_params = read_opt_params

        format_strs = ['log', 'tensorboard', 'csv']
        logger.configure(dir='training_log/rank{}'.format(self.rank),
                         format_strs=['stdout'] + format_strs)
        with logger.scoped_configure(dir='validation_log/rank{}'.format(
                self.rank),
                                     format_strs=['stderr'] + format_strs):
            self.val_logger = logger.Logger.CURRENT
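
Both learners clip only the gradients of the recurrent (LSTM) variables by global norm and leave the remaining gradients untouched, as in the grads_and_vars handling above. A self-contained sketch of that selective clipping, assuming TF 1.x graph mode; the two toy variables stand in for model.vars.lstm_vars and the other parameters:

import tensorflow as tf  # assumes TF 1.x graph mode

lstm_w = tf.get_variable("lstm_w", shape=[4, 4])  # stands in for model.vars.lstm_vars
fc_w = tf.get_variable("fc_w", shape=[4, 4])      # a non-recurrent parameter
loss = tf.reduce_sum(tf.matmul(lstm_w, fc_w))     # toy loss, illustrative only

optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-5)
grads_and_vars = optimizer.compute_gradients(loss, [lstm_w, fc_w])

clip_vars = [lstm_w]
clip_grads = [g for g, v in grads_and_vars if v in clip_vars]
nonclip_grads_and_vars = [(g, v) for g, v in grads_and_vars if v not in clip_vars]

max_clip_grad_norm = 1.0
clip_grads, clip_grad_norm = tf.clip_by_global_norm(clip_grads, max_clip_grad_norm)
train_op = optimizer.apply_gradients(
    list(zip(clip_grads, clip_vars)) + nonclip_grads_and_vars)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
_, norm = sess.run([train_op, clip_grad_norm])
print(norm)  # global norm of the recurrent-variable gradients before clipping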