Example no. 1
    def __init__(self,
                 env,
                 policy,
                 league_mgr_addr,
                 model_pool_addrs,
                 learner_addr=None,
                 unroll_length=32,
                 update_model_freq=32,
                 n_v=1,
                 verbose=0,
                 rwd_shape=True,
                 log_interval_steps=51):
        super(PPOActor, self).__init__(league_mgr_addr,
                                       model_pool_addrs,
                                       learner_addr,
                                       verbose=verbose,
                                       log_interval_steps=log_interval_steps)
        logger.configure(dir=None, format_strs=['stdout'])
        logger.set_level(verbose)

        self.env = env
        self.env.reset()
        sp = self.env.observation_space.spaces[0]
        sp = spaces.Box(low=0, high=1, shape=sp.shape)
        self.obs_space = spaces.Tuple([sp] * 2)
        self.agents = [
            PPOAgent(policy,
                     ob_space,
                     ac_space,
                     n_v=n_v,
                     scope_name=scope_name)
            for ob_space, ac_space, scope_name in zip(
                self.env.observation_space.spaces,
                self.env.action_space.spaces, ["self"] + _get_oppo_names(env))
        ]
        self.env.close()
        self._learning_agent_id = 0
        self._enable_push = learner_addr is not None
        self._update_model_freq = update_model_freq
        self._unroll_length = unroll_length
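        # discount factor and GAE(lambda) coefficient, presumably used when the
        # collected unrolls are turned into return/advantage targets (assumed
        # from standard PPO usage; the computation itself is not shown here)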
        self._gamma = 0.95
        self._lam = 0.9
        self._reward_weights = None
        self.n_v = n_v  # reward/value length
        self.models = [None, None]
        self.rwd_shape = rwd_shape
        self.should_log_info = True  # TODO(pengsun): make it an argument
        if self._enable_push:
            self._data_queue = Queue(unroll_length)
            self._push_thread = Thread(target=self._push_data,
                                       args=(self._data_queue, ))
            self._push_thread.daemon = True
            self._push_thread.start()
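
Usage sketch for Example no. 1 (illustrative only: the import path, the environment builder and the policy module are assumptions; the keyword arguments come from the constructor signature above):

# from tleague.actors.ppo_actor import PPOActor   # assumed import path
# env = make_env()              # assumed: multi-agent env with Tuple obs/action spaces
# policy = my_policy_module     # assumed: module exposing the policy network
actor = PPOActor(env,
                 policy,
                 league_mgr_addr='localhost:10005',           # assumed address
                 model_pool_addrs=['localhost:10003:10004'],  # assumed addresses
                 learner_addr='localhost:10001:10002',        # None disables data pushing
                 unroll_length=32,
                 update_model_freq=32,
                 n_v=1,
                 rwd_shape=True)
actor.run()  # assumed entry point of the actor loop
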
Example no. 2
    def __init__(self,
                 league_mgr_addr,
                 model_pool_addrs,
                 learner_addr=None,
                 verbose=0,
                 log_interval_steps=51):
        ip, hostname = get_ip_hostname()
        self._actor_id = hostname + '@' + ip + ':' + str(uuid.uuid1())[:8]
        self._learner_id = None
        self._league_mgr_apis = LeagueMgrAPIs(league_mgr_addr)
        self._model_pool_apis = ModelPoolAPIs(model_pool_addrs)
        if learner_addr:
            self._learner_apis = LearnerAPIs(learner_addr)
            self._learner_id = self._learner_apis.request_learner_id()

        self._log_interval_steps = log_interval_steps
        logger.configure(dir=None, format_strs=['stdout'])
        logger.set_level(verbose)
        self.task = None
        self._steps = 0
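
For reference, the actor id assembled above has the form <hostname>@<ip>:<8-char uuid prefix>. A standalone sketch of the same construction, with the standard library standing in for get_ip_hostname (the real helper returns (ip, hostname), as unpacked above):

import socket
import uuid

# stand-in for get_ip_hostname(); for illustration only
hostname = socket.gethostname()
ip = socket.gethostbyname(hostname)
actor_id = hostname + '@' + ip + ':' + str(uuid.uuid1())[:8]
print(actor_id)  # e.g. "myhost@10.0.0.5:1a2b3c4d"
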
Example no. 3
    def __init__(self,
                 league_mgr_addr,
                 model_pool_addrs,
                 learner_ports,
                 rm_size,
                 batch_size,
                 ob_space,
                 ac_space,
                 policy,
                 gpu_id,
                 policy_config={},
                 ent_coef=1e-2,
                 distill_coef=1e-2,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 rwd_shape=False,
                 pub_interval=500,
                 log_interval=100,
                 save_interval=0,
                 total_timesteps=5e7,
                 burn_in_timesteps=0,
                 learner_id='',
                 batch_worker_num=4,
                 pull_worker_num=2,
                 unroll_length=32,
                 rollout_length=1,
                 use_mixed_precision=False,
                 use_sparse_as_dense=True,
                 adam_beta1=0.9,
                 adam_beta2=0.999,
                 adam_eps=1e-5,
                 data_type=PGData,
                 data_server_version='v1',
                 decode=False,
                 log_infos_interval=20,
                 **kwargs):
        super(PGLearner, self).__init__(league_mgr_addr, model_pool_addrs,
                                        learner_ports, learner_id)

        self.LR = tf.placeholder(tf.float32, [])
        """Learning Rate"""

        self.CLIPRANGE = tf.placeholder(tf.float32, [])
        """Learning Rate Clip Range"""

        self.ep_loss_coef = {}
        """Coefficients for those losses from the endpoints. Override it in derived
     class."""

        # TODO(pengsun): fix the policy_config default value
        self._init_const(total_timesteps, burn_in_timesteps, batch_size,
                         unroll_length, rwd_shape, ent_coef, vf_coef,
                         pub_interval, log_interval, save_interval, policy,
                         distill_coef, policy_config, rollout_length)

        # allow_soft_placement=True works around ops that cannot be placed on the
        # GPU under tf-1.8.0; tf-1.13.1 does not have this issue
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(gpu_id)
        self.sess = tf.Session(config=config)
        self.rank = hvd.rank() if has_hvd else 0

        # Prepare dataset
        ds = data_type(ob_space,
                       ac_space,
                       self.n_v,
                       use_lstm=self.rnn,
                       hs_len=self.hs_len,
                       distillation=self.distillation,
                       version='v2')
        self._data_server = DataServer(self._pull_data,
                                       rm_size,
                                       unroll_length,
                                       batch_size,
                                       ds,
                                       gpu_id_list=(0, ),
                                       batch_worker_num=batch_worker_num,
                                       pull_worker_num=pull_worker_num,
                                       rollout_length=rollout_length,
                                       prefetch_buffer_size=2,
                                       version=data_server_version,
                                       decode=decode,
                                       log_infos_interval=log_infos_interval)

        # prepare net config
        net_config = policy.net_config_cls(ob_space, ac_space, **policy_config)
        net_config.clip_range = self.CLIPRANGE
        if rwd_shape:
            # Make the reward-shaping weights in net_config a tf.placeholder so
            # they can be changed during training.
            # NOTE: Assume there is reward_weights_shape in net_config
            # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
            reward_weights_shape = net_config.reward_weights_shape
            self.rwd_weights = tf.placeholder(tf.float32, reward_weights_shape)
            net_config.reward_weights = self.rwd_weights
        if hasattr(net_config, 'lam'):
            # Make the TD(lambda) coefficient in net_config a tf.placeholder so
            # it can be changed during training.
            # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
            self.LAM = tf.placeholder(tf.float32, [])
            net_config.lam = self.LAM
        else:
            self.LAM = None

        # build the policy net
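        # NOTE: the empty `with` block below exists only to create the 'model'
        # scope once (with reuse=tf.AUTO_REUSE) and capture it as `model_scope`;
        # every later create_policy(...) call then builds or reuses variables
        # inside that same scope.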
        with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
            pass

        def create_policy(inputs, nc):
            return policy.net_build_fun(inputs=inputs,
                                        nc=nc,
                                        scope=model_scope)

        device = '/gpu:{}'.format(0)
        with tf.device(device):
            input_data = self._data_server.input_datas[0]
            if 'use_xla' in policy_config and policy_config['use_xla']:
                try:
                    # Use TensorFlow's Accelerated Linear Algebra (XLA) JIT compilation
                    with tf.xla.experimental.jit_scope(True):
                        model = create_policy(input_data, net_config)
                except:
                    logger.log(
                        "WARNING: using tf.xla requires tf version>=1.15.")
                    model = create_policy(input_data, net_config)
            else:
                model = create_policy(input_data, net_config)
            loss, vf_loss, losses = self.build_loss(model, input_data)
        if has_hvd:
            self.losses = [hvd.allreduce(loss) for loss in losses]
        else:
            self.losses = list(losses)
        self.params = tf.trainable_variables(scope='model')
        self.params_vf = tf.trainable_variables(scope='model/vf')
        self.param_norm = tf.global_norm(self.params)

        self.trainer = tf.train.AdamOptimizer(learning_rate=self.LR,
                                              beta1=adam_beta1,
                                              beta2=adam_beta2,
                                              epsilon=adam_eps)
        self.burn_in_trainer = tf.train.AdamOptimizer(
            learning_rate=self.LR, epsilon=1e-5)  # same as default and IL
        if use_mixed_precision:
            try:
                self.trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    self.trainer)
                self.burn_in_trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    self.burn_in_trainer)
            except:
                logger.warn(
                    "using tf mixed_precision requires tf version>=1.15.")
        if has_hvd:
            self.trainer = hvd.DistributedOptimizer(
                self.trainer, sparse_as_dense=use_sparse_as_dense)
            self.burn_in_trainer = hvd.DistributedOptimizer(
                self.burn_in_trainer, sparse_as_dense=use_sparse_as_dense)
        grads_and_vars = self.trainer.compute_gradients(loss, self.params)
        grads_and_vars_vf = self.burn_in_trainer.compute_gradients(
            vf_loss, self.params_vf)
        clip_vars = model.vars.lstm_vars
        grads_and_vars, self.clip_grad_norm, self.nonclip_grad_norm = self.clip_grads_vars(
            grads_and_vars, clip_vars, max_grad_norm)
        grads_and_vars_vf, self.clip_grad_norm_vf, self.nonclip_grad_norm_vf = self.clip_grads_vars(
            grads_and_vars_vf, clip_vars, max_grad_norm)

        self._train_batch = self.trainer.apply_gradients(grads_and_vars)
        self._burn_in = self.burn_in_trainer.apply_gradients(grads_and_vars_vf)
        self.loss_endpoints_names = model.loss.loss_endpoints.keys()
        self._build_ops()
        if has_hvd:
            barrier_op = hvd.allreduce(tf.Variable(0.))
            broadcast_op = hvd.broadcast_global_variables(0)
        tf.global_variables_initializer().run(session=self.sess)
        self.sess.graph.finalize()

        self.barrier = lambda: self.sess.run(barrier_op) if has_hvd else None
        self.broadcast = lambda: self.sess.run(broadcast_op) if has_hvd else None
        self.broadcast()
        # logging stuff
        format_strs = ['stdout', 'log', 'tensorboard', 'csv']  # same for every rank
        logger.configure(dir='training_log/{}rank{}'.format(
            self._learner_id, self.rank),
                         format_strs=format_strs)
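
Usage sketch for Example no. 3 (illustrative only: the import path, the spaces, the policy module and the addresses are assumptions; the keyword arguments follow the signature above):

# from tleague.learners.pg_learner import PGLearner   # assumed import path
learner = PGLearner(league_mgr_addr='localhost:10005',           # assumed address
                    model_pool_addrs=['localhost:10003:10004'],  # assumed addresses
                    learner_ports=[10001, 10002],                # two learner ports (roles assumed)
                    rm_size=64 * 1024,
                    batch_size=32,
                    ob_space=ob_space,      # assumed gym-style spaces, e.g. from the env
                    ac_space=ac_space,
                    policy=policy_module,   # assumed: exposes net_config_cls / net_build_fun
                    gpu_id=0,
                    policy_config={'use_lstm': True, 'nlstm': 64},
                    unroll_length=32,
                    rollout_length=1)
learner.run()  # assumed training entry point
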
Example no. 4
    def __init__(self,
                 ports,
                 gpu_id,
                 replay_filelist,
                 batch_size,
                 min_train_sample_num,
                 min_val_sample_num,
                 rm_size,
                 learning_rate,
                 print_interval,
                 checkpoint_interval,
                 num_val_batches,
                 replay_converter_type,
                 policy,
                 policy_config,
                 converter_config=None,
                 policy_config_type=None,
                 model_pool_addrs=None,
                 rollout_length=1,
                 checkpoints_dir=None,
                 restore_checkpoint_path=None,
                 train_generator_worker_num=4,
                 val_generator_worker_num=2,
                 pull_worker_num=2,
                 num_sgd_updates=int(1e30),
                 repeat_training_task=False,
                 unroll_length=32,
                 pub_interval=50,
                 max_clip_grad_norm=1,
                 after_loading_init_scope=None,
                 use_mixed_precision=False,
                 use_sparse_as_dense=False,
                 enable_validation=True,
                 post_process_data=None):
        assert len(ports) == 2
        self.use_hvd = has_hvd and hvd.size() > 1
        self.rank = 0 if not self.use_hvd else hvd.rank()
        self.model_key = 'IL-model'
        self.pub_interval = pub_interval
        self.rnn = (False if 'use_lstm' not in policy_config else
                    policy_config['use_lstm'])
        self.hs_len = None
        # overwrite it using the batch_size for training
        policy_config['batch_size'] = batch_size
        if self.rnn:
            assert model_pool_addrs is not None
            self._model_pool_apis = ModelPoolAPIs(model_pool_addrs)
            self._model_pool_apis.check_server_set_up()
            policy_config['rollout_len'] = rollout_length
            # infer hidden state length (size)
            if 'hs_len' in policy_config:
                self.hs_len = policy_config['hs_len']
            elif 'nlstm' in policy_config:
                self.hs_len = 2 * policy_config['nlstm']
            else:
                self.hs_len = 128

        self.should_push_model = (self.rnn and self.rank == 0)
        use_gpu = (gpu_id >= 0)
        converter_config = {} if converter_config is None else converter_config
        train_replay_filelist, val_replay_filelist = _get_local_replays(
            replay_filelist)
        replay_converter = replay_converter_type(**converter_config)
        ob_space, ac_space = replay_converter.space.spaces
        if post_process_data is not None:
            ob_space, ac_space = post_process_data(ob_space, ac_space)
        self.data_pool = ImDataServer(
            ports=ports,
            train_replay_filelist=train_replay_filelist,
            val_replay_filelist=val_replay_filelist,
            batch_size=batch_size,
            min_train_sample_num=min_train_sample_num,
            min_val_sample_num=min_val_sample_num,
            ob_space=ob_space,
            ac_space=ac_space,
            train_generator_worker_num=train_generator_worker_num,
            val_generator_worker_num=val_generator_worker_num,
            pull_worker_num=pull_worker_num,
            rm_size=rm_size,
            repeat_training_task=repeat_training_task,
            unroll_length=unroll_length,
            rollout_length=rollout_length,
            lstm=self.rnn,
            hs_len=self.hs_len,
            use_gpu=use_gpu)
        self._enable_validation = enable_validation

        config = tf.ConfigProto(allow_soft_placement=True)
        if use_gpu:
            config.gpu_options.visible_device_list = str(gpu_id)
            config.gpu_options.allow_growth = True
        self._sess = tf.Session(config=config)

        net_config = policy_config_type(ob_space, ac_space, **policy_config)
        net_config_val = deepcopy(net_config)
        with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
            pass

        def create_policy(inputs, nc):
            return policy(inputs=inputs, nc=nc, scope=model_scope)

        if hasattr(net_config, 'endpoints_verbosity'):
            # intentionally disables endpoints during training
            net_config.endpoints_verbosity = 0
        device = '/gpu:0' if use_gpu else '/cpu:0'
        with tf.device(device):
            if 'use_xla' in policy_config and policy_config['use_xla']:
                try:
                    # Use TensorFlow's Accelerated Linear Algebra (XLA) JIT compilation
                    with tf.xla.experimental.jit_scope(True):
                        model = create_policy(self.data_pool.train_batch_input,
                                              net_config)
                except:
                    logger.log(
                        "WARNING: using tf.xla requires tf version>=1.15.")
                    model = create_policy(self.data_pool.train_batch_input,
                                          net_config)
            else:
                model = create_policy(self.data_pool.train_batch_input,
                                      net_config)

        model_val = create_policy(self.data_pool.val_batch_input,
                                  net_config_val)
        params = tf.trainable_variables(scope='model')
        param_norm = tf.global_norm(params)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           epsilon=1e-5)
        if use_mixed_precision:
            try:
                optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    optimizer)
            except:
                logger.warn(
                    "using tf mixed_precision requires tf version>=1.15.")
        if self.use_hvd:
            optimizer = hvd.DistributedOptimizer(
                optimizer, sparse_as_dense=use_sparse_as_dense)
            barrier_op = hvd.allreduce(tf.Variable(0.))
            self.barrier = lambda: self._sess.run(barrier_op)
        train_loss = tf.reduce_mean(model.loss.total_il_loss *
                                    self.data_pool.train_batch_weight)
        val_loss = tf.reduce_mean(model_val.loss.total_il_loss *
                                  self.data_pool.val_batch_weight)
        if hasattr(net_config, 'weight_decay') and not net_config.weight_decay:
            # None or 0.0
            total_loss = train_loss
        else:
            total_loss = train_loss + model.loss.total_reg_loss
        grads_and_vars = optimizer.compute_gradients(total_loss, params)
        clip_vars = model.vars.lstm_vars
        clip_grads = [grad for grad, var in grads_and_vars if var in clip_vars]
        nonclip_grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                                  if var not in clip_vars]
        if max_clip_grad_norm > 0:
            clip_grads, clip_grad_norm = tf.clip_by_global_norm(
                clip_grads, max_clip_grad_norm)
        else:
            clip_grad_norm = tf.global_norm(clip_grads)
        clip_grads_and_var = list(zip(clip_grads, clip_vars))
        grads_and_vars = clip_grads_and_var + nonclip_grads_and_vars
        grad_norm = tf.global_norm(list(zip(*grads_and_vars))[0])

        train_op = optimizer.apply_gradients(grads_and_vars)
        tf.global_variables_initializer().run(session=self._sess)

        self.new_params = [
            tf.placeholder(p.dtype, shape=p.get_shape()) for p in params
        ]
        self.param_assign_ops = [
            p.assign(new_p) for p, new_p in zip(params, self.new_params)
        ]
        opt_params = optimizer.variables()
        self.new_opt_params = [
            tf.placeholder(p.dtype, shape=p.get_shape()) for p in opt_params
        ]
        self.opt_param_assign_ops = [
            p.assign(new_p)
            for p, new_p in zip(opt_params, self.new_opt_params)
        ]

        def read_params():
            return self._sess.run(params)

        def read_opt_params():
            return self._sess.run(opt_params)

        def load_model(np_new_params):
            self._sess.run(
                self.param_assign_ops,
                feed_dict={
                    p: np_p
                    for p, np_p in zip(self.new_params, np_new_params)
                })

        def restore_optimizer(np_new_opt_params):
            self._sess.run(
                self.opt_param_assign_ops,
                feed_dict={
                    p: np_p
                    for p, np_p in zip(self.new_opt_params, np_new_opt_params)
                })

        def _train_step():
            return self._sess.run([
                train_loss_aggregated, *train_other_losses_aggregated,
                grad_norm, clip_grad_norm, param_norm, train_op
            ], {})[:-1]

        def _val_step():
            # maximal_feat = [tf.reduce_max(tf.cast(x, tf.float32))
            # for x in self.data_pool.val_batch_input.X]
            # print(self._sess.run(maximal_feat, {}))
            return self._sess.run([
                val_loss_aggregated, *val_other_losses_aggregated,
                *endpoints_aggregated
            ], {})

        self._saver = ChkptsFromSelf(read_params, load_model, self.model_key)

        if restore_checkpoint_path is not None:
            self._saver._restore_model_checkpoint(restore_checkpoint_path)

        if after_loading_init_scope is not None:
            var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=after_loading_init_scope)
            logger.log('perform after loading init for vars')
            for v in var_list:
                logger.log(v)
            tf.variables_initializer(var_list).run(session=self._sess)

        if self.use_hvd:
            hvd.broadcast_global_variables(0).run(session=self._sess)

        _allreduce = lambda x: x if not self.use_hvd else hvd.allreduce(x)
        train_loss_aggregated = _allreduce(train_loss)
        train_other_loss_names = model.loss.loss_endpoints.keys()
        train_other_losses_aggregated = [
            _allreduce(tf.reduce_mean(l * self.data_pool.train_batch_weight))
            for l in model.loss.loss_endpoints.values()
        ]
        val_loss_aggregated = _allreduce(val_loss)
        val_other_loss_names = model_val.loss.loss_endpoints.keys()
        val_other_losses_aggregated = [
            _allreduce(tf.reduce_mean(l * self.data_pool.val_batch_weight))
            for l in model_val.loss.loss_endpoints.values()
        ]
        endpoints_names = model_val.endpoints.keys()
        endpoints_aggregated = [
            _allreduce(tf.reduce_mean(l))
            for l in model_val.endpoints.values()
        ]
        self._sess.graph.finalize()
        self._total_samples = lambda: [
            self.data_pool._num_train_samples, self.data_pool._num_val_samples
        ]
        self._train_log_names = (['loss'] + list(train_other_loss_names) +
                                 ['grad_norm', 'clip_grad_norm', 'param_norm'])
        self._val_log_names = (['loss'] + list(val_other_loss_names) +
                               list(endpoints_names))
        self._batch_size = batch_size
        self._train_step = _train_step
        self._val_step = _val_step
        self._print_interval = print_interval
        self._checkpoint_interval = checkpoint_interval
        self._num_val_batches = num_val_batches
        self._checkpoints_dir = checkpoints_dir if self.rank == 0 else None
        self._num_sgd_updates = num_sgd_updates
        self.load_model = load_model
        self.restore_optimizer = restore_optimizer
        self.read_params = read_params
        self.read_opt_params = read_opt_params

        format_strs = ['log', 'tensorboard', 'csv']
        logger.configure(dir='training_log/rank{}'.format(self.rank),
                         format_strs=['stdout'] + format_strs)
        with logger.scoped_configure(dir='validation_log/rank{}'.format(
                self.rank),
                                     format_strs=['stderr'] + format_strs):
            self.val_logger = logger.Logger.CURRENT
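
The read_params/load_model pair above is the usual TF1 recipe for hot-swapping weights without rebuilding the graph: one placeholder and one assign op per variable, fed with numpy arrays. A self-contained sketch of the same pattern on toy variables (TF1 graph mode; under TF2 it would need tf.compat.v1 with eager execution disabled):

import numpy as np
import tensorflow as tf

params = [tf.Variable(np.zeros((2, 3), np.float32)),
          tf.Variable(np.zeros(3, np.float32))]
# one placeholder and one assign op per variable (base_dtype guards against
# TF1 reference dtypes)
new_params = [tf.placeholder(p.dtype.base_dtype, shape=p.get_shape()) for p in params]
assign_ops = [p.assign(new_p) for p, new_p in zip(params, new_params)]

sess = tf.Session()
sess.run(tf.global_variables_initializer())

def read_params():
    return sess.run(params)  # graph -> list of numpy arrays

def load_model(np_new_params):
    sess.run(assign_ops, feed_dict=dict(zip(new_params, np_new_params)))

load_model([np.ones((2, 3), np.float32), np.ones(3, np.float32)])
assert all((w == 1.0).all() for w in read_params())
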
Example no. 5
    def __init__(self,
                 port,
                 model_pool_addrs,
                 mutable_hyperparam_type,
                 hyperparam_config_name=None,
                 restore_checkpoint_dir=None,
                 save_checkpoint_root=None,
                 save_interval_secs=3600,
                 game_mgr_type='tleague.game_mgr.game_mgrs.RandomGameMgr',
                 game_mgr_config=None,
                 mute_actor_msg=False,
                 verbose=0,
                 init_model_paths=None,
                 save_learner_meta=False):
        super(LeagueMgr, self).__init__(port,
                                        model_pool_addrs,
                                        restore_checkpoint_dir,
                                        save_checkpoint_root,
                                        save_interval_secs,
                                        mute_actor_msg,
                                        save_learner_meta,
                                        verbose=verbose)
        logger.set_level(verbose)
        logger.configure(dir='league_log/', format_strs=['stdout', 'log'])

        self._game_mgr_type = game_mgr_type
        game_mgr_cls = import_module_or_data(game_mgr_type)
        logger.log('__init__: game_mgr_type: {}'.format(game_mgr_type))
        game_mgr_config = game_mgr_config or {}
        game_mgr_config['pgn_file'] = (game_mgr_config.get('pgn_file', None)
                                       or 'example.pgn')
        game_mgr_config['verbose'] = (game_mgr_config.get('verbose', None)
                                      or verbose)
        logger.log('__init__: game_mgr_config: {}'.format(game_mgr_config))
        self.game_mgr = game_mgr_cls(**game_mgr_config)

        logger.log(
            '__init__: hyperparam_mgr: {}, hyperparam_config: {}'.format(
                mutable_hyperparam_type, hyperparam_config_name))
        self._hyper_mgr = HyperparamMgr(self._model_pool_apis,
                                        mutable_hyperparam_type,
                                        hyperparam_config_name)

        self.init_model_keys = []
        if init_model_paths is not None:
            assert isinstance(init_model_paths, list)
            logger.log(
                '__init__: init_model from paths {}:'.format(init_model_paths))
            for idx, key_path in enumerate(init_model_paths):
                im_key, model_path = key_path
                with open(model_path, 'rb') as f:
                    model = pickle.load(f)
                    if not im_key.startswith('None:'):
                        key = 'None:' + im_key
                    else:
                        key = im_key
                    if hasattr(model, 'key'):
                        logger.log(
                            '__init__: init_model key {} stored in its model '
                            'has been renamed as {}'.format(model.key, key))
                    if hasattr(model, 'model'):
                        model = model.model
                    hyperparam = None
                    # specify init_model's hyperparam if possible
                    if 'lrn_id_list' in game_mgr_config:
                        hyperparam = self._hyper_mgr._default_hyperparam(
                            learner_id=game_mgr_config['lrn_id_list'][idx])
                        logger.log(
                            '__init__: init model {} has been bound with '
                            'hyperparam {}'.format(key, hyperparam))
                    t = time.strftime('%Y%m%d%H%M%S')
                    self._model_pool_apis.push_model(model, hyperparam, key, t,
                                                     t, t)
                logger.log(
                    '__init__: done pushing {} to model pool'.format(key))
                self.game_mgr.add_player(p=key, parent_p=None)
                logger.log(
                    '__init__: done adding player {} to game mgr'.format(key))
                self.init_model_keys.append(key)
        else:
            logger.log('__init__: init_model is None.')
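
Usage sketch for Example no. 5 (illustrative only: the import path, the port, the addresses and the hyperparam type name are assumptions; the keyword arguments follow the signature above):

# from tleague.league_mgrs.league_mgr import LeagueMgr   # assumed import path
league_mgr = LeagueMgr(port='10005',                                 # assumed port
                       model_pool_addrs=['localhost:10003:10004'],   # assumed addresses
                       mutable_hyperparam_type='MutableHyperparam',  # assumed type name
                       game_mgr_type='tleague.game_mgr.game_mgrs.RandomGameMgr',
                       game_mgr_config={'verbose': 0},
                       save_checkpoint_root='./league_ckpts',
                       save_interval_secs=3600,
                       verbose=0)
league_mgr.run()  # assumed serving loop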