Example #1
class Policy(Base):
    def __init__(self,
                 a_dim_or_list,
                 action_type,
                 base_dir,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 gamma,
                 max_episode,
                 policy_mode=None,
                 batch_size=1,
                 buffer_size=1,
                 use_priority=False,
                 n_step=False):
        super().__init__(a_dim_or_list=a_dim_or_list,
                         action_type=action_type,
                         base_dir=base_dir)
        self.s_dim = s_dim
        self.visual_sources = visual_sources
        self.visual_dim = [visual_sources, *visual_resolution
                           ] if visual_sources else [0]
        self.a_dim_or_list = a_dim_or_list
        self.gamma = gamma
        self.max_episode = max_episode
        self.policy_mode = policy_mode
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.use_priority = use_priority
        self.n_step = n_step
        self.init_data_memory()
        self.init_placeholders()

    def init_data_memory(self):
        '''
        The biggest difference between the policy modes ('ON' and 'OFF') is that 'OFF' mode needs
        to raise the dimension of 'r' and 'done'.
        'ON' mode means the program will call on_store and use a pandas DataFrame to store data.
        'OFF' mode will call off_store and use a replay buffer to store data.
        '''
        if self.policy_mode == 'ON':
            self.data = pd.DataFrame(columns=['s', 'a', 'r', 'done'])
        elif self.policy_mode == 'OFF':
            if self.use_priority:
                if self.n_step:
                    print('N-Step PER')
                    self.data = NStepPrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        gamma=self.gamma,
                        alpha=er_config['nper_config']['alpha'],
                        beta=er_config['nper_config']['beta'],
                        epsilon=er_config['nper_config']['epsilon'],
                        agents_num=er_config['nper_config']['max_agents'],
                        n=er_config['nper_config']['n'],
                        global_v=er_config['nper_config']['global_v'])
                else:
                    print('PER')
                    self.data = PrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        alpha=er_config['per_config']['alpha'],
                        beta=er_config['per_config']['beta'],
                        epsilon=er_config['per_config']['epsilon'],
                        global_v=er_config['per_config']['global_v'])
            else:
                if self.n_step:
                    print('N-Step ER')
                    self.data = NStepExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        gamma=self.gamma,
                        agents_num=er_config['ner_config']['max_agents'],
                        n=er_config['ner_config']['n'])
                else:
                    print('ER')
                    self.data = ExperienceReplay(self.batch_size,
                                                 self.buffer_size)
        else:
            raise Exception('Please specify a policy mode!')

    def init_placeholders(self):
        with self.graph.as_default():
            self.pl_s = tf.placeholder(tf.float32, [None, self.s_dim],
                                       'vector_observation')
            self.pl_a = tf.placeholder(tf.float32, [None, self.a_counts],
                                       'pl_action')
            self.pl_r = tf.placeholder(tf.float32, [None, 1], 'reward')
            self.pl_s_ = tf.placeholder(tf.float32, [None, self.s_dim],
                                        'next_state')
            self.pl_done = tf.placeholder(tf.float32, [None, 1], 'done')
            self.pl_visual_s = tf.placeholder(tf.float32,
                                              [None] + self.visual_dim,
                                              'visual_observation')
            self.pl_visual_s_ = tf.placeholder(tf.float32,
                                               [None] + self.visual_dim,
                                               'next_visual_observation')

    def on_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for on-policy training, use this function to store <s, a, r, s_, done> into DataFrame of Pandas.
        """
        assert isinstance(
            a, np.ndarray), "on_store requires action to be a np.ndarray"
        assert isinstance(
            r, np.ndarray), "on_store requires reward to be a np.ndarray"
        assert isinstance(
            done, np.ndarray), "on_store requires done to be a np.ndarray"
        self.data = self.data.append(
            {
                's': s,
                'visual_s': visual_s,
                'a': a,
                'r': r,
                's_': s_,
                'visual_s_': visual_s_,
                'done': done
            },
            ignore_index=True)

    def off_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
        """
        assert isinstance(
            a, np.ndarray), "off_store requires action to be a np.ndarray"
        assert isinstance(
            r, np.ndarray), "off_store requires reward to be a np.ndarray"
        assert isinstance(
            done, np.ndarray), "off_store requires done to be a np.ndarray"
        self.data.add(s, visual_s, a, r, s_, visual_s_, done)

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "no_op_store requires action to be a np.ndarray"
        assert isinstance(
            r, np.ndarray), "no_op_store requires reward to be a np.ndarray"
        assert isinstance(
            done, np.ndarray), "no_op_store requires done to be a np.ndarray"
        if self.policy_mode == 'OFF':
            self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_,
                          done[:, np.newaxis])

    def clear(self):
        """
        clear the DataFrame.
        """
        self.data.drop(self.data.index, inplace=True)

    def get_max_episode(self):
        """
        get the max episode of this training model.
        """
        return self.max_episode
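
For reference, a minimal, self-contained sketch of the buffer-selection logic in init_data_memory above; the stand-in class and helper names here are illustrative only and are not part of the repository.

# Minimal sketch of Policy.init_data_memory's branching (illustrative stand-ins only).
import pandas as pd


class _StubBuffer:
    """Placeholder for the ExperienceReplay / PER / N-step buffers used above."""
    def __init__(self, kind, **kwargs):
        self.kind, self.kwargs = kind, kwargs


def select_data_memory(policy_mode, use_priority=False, n_step=False,
                       batch_size=128, buffer_size=10000):
    # 'ON' (on-policy): a pandas DataFrame collects whole trajectories.
    if policy_mode == 'ON':
        return pd.DataFrame(columns=['s', 'a', 'r', 'done'])
    # 'OFF' (off-policy): the two flags pick one of four replay-buffer variants.
    if policy_mode == 'OFF':
        if use_priority and n_step:
            return _StubBuffer('NStepPER', batch_size=batch_size, buffer_size=buffer_size)
        if use_priority:
            return _StubBuffer('PER', batch_size=batch_size, buffer_size=buffer_size)
        if n_step:
            return _StubBuffer('NStepER', batch_size=batch_size, buffer_size=buffer_size)
        return _StubBuffer('ER', batch_size=batch_size, buffer_size=buffer_size)
    raise ValueError('policy_mode must be "ON" or "OFF"')


assert select_data_memory('OFF', use_priority=True, n_step=True).kind == 'NStepPER'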
Example #2
class Agent:
    def __init__(self, env_args: Config, model_args: Config,
                 buffer_args: Config, train_args: Config):
        # print("89898989")
        self.env_args = env_args
        self.model_args = model_args
        self.buffer_args = buffer_args
        self.train_args = train_args
        self.use_GCN = False
        self.model_index = str(self.train_args.get('index'))
        self.all_learner_print = bool(
            self.train_args.get('all_learner_print', False))
        if '-' not in self.train_args['name']:
            self.train_args['name'] += f'-{self.model_index}'
        if self.model_args['load'] is None:
            self.train_args['load_model_path'] = os.path.join(
                self.train_args['base_dir'], self.train_args['name'])
        else:
            if '/' in self.model_args['load'] or '\\' in self.model_args[
                    'load']:  # explicit path: every training process initializes from this model path
                self.train_args['load_model_path'] = self.model_args['load']
            elif '-' in self.model_args['load']:
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'])  # name and index both given: every training process initializes from this relative path
            else:  # only the run name is given; this process's index is appended automatically
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'] + f'-{self.model_index}')

        # ENV

        self.env = make_env(self.env_args.to_dict, self.use_GCN)

        # ALGORITHM CONFIG
        Model, algorithm_config, _policy_mode = get_model_info(
            self.model_args['algo'])

        self.model_args['policy_mode'] = _policy_mode
        if self.model_args['algo_config'] is not None:
            algorithm_config = UpdateConfig(algorithm_config,
                                            self.model_args['algo_config'],
                                            'algo')
        ShowConfig(algorithm_config)

        # BUFFER
        if _policy_mode == 'off-policy':
            self.buffer_args['batch_size'] = algorithm_config['batch_size']
            self.buffer_args['buffer_size'] = algorithm_config['buffer_size']
            if self.model_args['algo'] in ['drqn', 'drdqn']:
                self.buffer_args['type'] = 'EpisodeER'
            else:
                _use_priority = algorithm_config.get('use_priority', False)
                _n_step = algorithm_config.get('n_step', False)
                if _use_priority and _n_step:
                    self.buffer_args['type'] = 'NstepPER'
                    self.buffer_args['NstepPER'][
                        'max_episode'] = self.train_args['max_episode']
                    self.buffer_args['NstepPER']['gamma'] = algorithm_config[
                        'gamma']
                    algorithm_config['gamma'] = pow(
                        algorithm_config['gamma'], self.buffer_args['NstepPER']
                        ['n'])  # update gamma for n-step training.
                elif _use_priority:
                    self.buffer_args['type'] = 'PER'
                    self.buffer_args['PER']['max_episode'] = self.train_args[
                        'max_episode']
                elif _n_step:
                    self.buffer_args['type'] = 'NstepER'
                    self.buffer_args['NstepER']['gamma'] = algorithm_config[
                        'gamma']
                    algorithm_config['gamma'] = pow(
                        algorithm_config['gamma'],
                        self.buffer_args['NstepER']['n'])
                else:
                    self.buffer_args['type'] = 'ER'
        else:
            self.buffer_args['type'] = 'Pandas'

        # MODEL
        base_dir = os.path.join(
            self.train_args['base_dir'], self.train_args['name']
        )  # train_args['base_dir'] DIR/ENV_NAME/ALGORITHM_NAME
        if 'batch_size' in algorithm_config.keys() and train_args['fill_in']:
            self.train_args['pre_fill_steps'] = algorithm_config['batch_size']

        if self.env_args['type'] == 'gym':
            self.eval_env_args = deepcopy(self.env_args)
            self.eval_env_args.env_num = 1
            self.eval_env = make_env(self.eval_env_args.to_dict)
            # buffer ------------------------------
            if 'Nstep' in self.buffer_args[
                    'type'] or 'Episode' in self.buffer_args['type']:
                self.buffer_args[self.buffer_args['type']][
                    'agents_num'] = self.env_args['env_num']
            self.buffer = get_buffer(self.buffer_args)
            # buffer ------------------------------

            # model -------------------------------
            model_params = {
                's_dim': self.env.s_dim,
                'visual_sources': self.env.visual_sources,
                'visual_resolution': self.env.visual_resolution,
                'a_dim_or_list': self.env.a_dim_or_list,
                'is_continuous': self.env.is_continuous,
                'max_episode': self.train_args.max_episode,
                'base_dir': base_dir,
                'logger2file': self.model_args.logger2file,
                'seed': self.model_args.seed
            }
            self.model = Model(**model_params, **algorithm_config)
            self.model.set_buffer(self.buffer)
            self.model.init_or_restore(self.train_args['load_model_path'])
            # model -------------------------------

            self.train_args['begin_episode'] = self.model.get_init_episode()
            if not self.train_args['inference']:
                records_dict = {
                    'env': self.env_args.to_dict,
                    'model': self.model_args.to_dict,
                    'buffer': self.buffer_args.to_dict,
                    'train': self.train_args.to_dict,
                    'algo': algorithm_config
                }
                save_config(os.path.join(base_dir, 'config'), records_dict)
        else:
            # buffer -----------------------------------
            self.buffer_args_s = []
            for i in range(self.env.brain_num):
                _bargs = deepcopy(self.buffer_args)
                if 'Nstep' in _bargs['type'] or 'Episode' in _bargs['type']:
                    _bargs[_bargs['type']][
                        'agents_num'] = self.env.brain_agents[i]
                self.buffer_args_s.append(_bargs)
            buffers = [
                get_buffer(self.buffer_args_s[i])
                for i in range(self.env.brain_num)
            ]
            # buffer -----------------------------------

            # model ------------------------------------
            self.model_args_s = []
            for i in range(self.env.brain_num):
                _margs = deepcopy(self.model_args)
                _margs['seed'] = self.model_args['seed'] + i * 10
                self.model_args_s.append(_margs)
            model_params = [
                {
                    's_dim': self.env.s_dim[i],
                    'a_dim_or_list': self.env.a_dim_or_list[i],
                    'visual_sources': self.env.visual_sources[i],
                    'visual_resolution': self.env.visual_resolutions[i],
                    'is_continuous': self.env.is_continuous[i],
                    'max_episode': self.train_args.max_episode,
                    'base_dir': os.path.join(base_dir, b),
                    'logger2file': self.model_args_s[i].logger2file,
                    'seed': self.model_args_s[i].seed,  # 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100
                } for i, b in enumerate(self.env.brain_names)
            ]

            # multi agent training------------------------------------
            if self.model_args['algo'][:3] == 'ma_':
                self.ma = True
                assert self.env.brain_num > 1, 'if using ma_* algorithms, the number of brains must be larger than 1'
                self.ma_data = ExperienceReplay(batch_size=10, capacity=1000)
                [
                    mp.update({
                        'n': self.env.brain_num,
                        'i': i
                    }) for i, mp in enumerate(model_params)
                ]
            else:
                self.ma = False
            # multi agent training------------------------------------

            self.models = [
                Model(**model_params[i], **algorithm_config)
                for i in range(self.env.brain_num)
            ]

            [
                model.set_buffer(buffer)
                for model, buffer in zip(self.models, buffers)
            ]
            [
                self.models[i].init_or_restore(
                    os.path.join(self.train_args['load_model_path'], b))
                for i, b in enumerate(self.env.brain_names)
            ]
            # model ------------------------------------
            self.train_args['begin_episode'] = self.models[0].get_init_episode(
            )
            if not self.train_args['inference']:
                for i, b in enumerate(self.env.brain_names):
                    records_dict = {
                        'env': self.env_args.to_dict,
                        'model': self.model_args_s[i].to_dict,
                        'buffer': self.buffer_args_s[i].to_dict,
                        'train': self.train_args.to_dict,
                        'algo': algorithm_config
                    }
                    save_config(os.path.join(base_dir, b, 'config'),
                                records_dict)
        # print("21323232323")

    def pwi(self, *args):
        if self.all_learner_print:
            print(f'| Model-{self.model_index} |', *args)
        elif int(self.model_index) == 0:
            print(f'|#ONLY#Model-{self.model_index} |', *args)

    def __call__(self):
        self.train()

    def train(self):
        if self.env_args['type'] == 'gym':
            try:
                self.gym_no_op()
                self.gym_train()
            finally:
                self.model.close()
                self.env.close()
        else:
            try:
                if self.ma:
                    self.ma_unity_no_op()
                    self.ma_unity_train()
                else:
                    self.unity_no_op()
                    self.unity_train()
            finally:
                [model.close() for model in self.models]
                self.env.close()

    def evaluate(self):
        if self.env_args['type'] == 'gym':
            self.gym_inference()
        else:
            if self.ma:
                self.ma_unity_inference()
            else:
                self.unity_inference()

    def init_variables(self, evaluate=False):
        """
        inputs:
            env: Environment
        outputs:
            i: specify which item of state should be modified
            state: [vector_obs, visual_obs]
            newstate: [vector_obs, visual_obs]
        """
        if evaluate:
            env = self.eval_env
        else:
            env = self.env
        i = 1 if env.obs_type == 'visual' else 0
        return i, [np.array([[]] * env.n),
                   np.array([[]] * env.n)
                   ], [np.array([[]] * env.n),
                       np.array([[]] * env.n)]

    def gym_train(self):
        """
        Inputs:
            env:                gym environment
            gym_model:          algorithm model
            begin_episode:      initial episode
            save_frequency:     how often to save checkpoints
            max_step:           maximum number of steps in an episode
            max_episode:        maximum number of episodes in this training task
            render:             specify whether render the env or not
            render_episode:     if 'render' is false, specify from which episode to render the env
            policy_mode:        'on-policy' or 'off-policy'
        """
        begin_episode = int(self.train_args['begin_episode'])
        render = bool(self.train_args['render'])
        render_episode = int(self.train_args.get('render_episode', 50000))
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        eval_while_train = bool(self.train_args['eval_while_train'])
        max_eval_episode = int(self.train_args.get('max_eval_episode'))
        off_policy_step_eval = bool(self.train_args['off_policy_step_eval'])
        off_policy_step_eval_num = int(
            self.train_args.get('off_policy_step_eval_num'))
        policy_mode = str(self.model_args['policy_mode'])
        moving_average_episode = int(self.train_args['moving_average_episode'])
        add_noise2buffer = bool(self.train_args['add_noise2buffer'])
        add_noise2buffer_episode_interval = int(
            self.train_args['add_noise2buffer_episode_interval'])
        add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])

        total_step_control = bool(self.train_args['total_step_control'])
        max_total_step = int(self.train_args['max_total_step'])
        if total_step_control:
            max_episode = max_total_step

        i, state, new_state = self.init_variables()
        sma = SMA(moving_average_episode)
        total_step = 0
        for episode in range(begin_episode, max_episode):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            step = 0
            r = np.zeros(self.env.n)
            last_done_step = -1
            while True:
                step += 1
                r_tem = np.zeros(self.env.n)
                if render or episode > render_episode:
                    self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
                new_state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)[0]
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                r += r_tem
                self.model.store_data(s=state[0],
                                      visual_s=state[1],
                                      a=action,
                                      r=reward,
                                      s_=new_state[0],
                                      visual_s_=new_state[1],
                                      done=done)

                if policy_mode == 'off-policy':
                    self.model.learn(episode=episode, step=1)
                    if off_policy_step_eval:
                        self.gym_step_eval(total_step, self.model,
                                           off_policy_step_eval_num, max_step)
                total_step += 1
                if total_step_control and total_step > max_total_step:
                    return

                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

                if len(self.env.dones_index):  # check whether any parallel env needs a partial reset
                    new_state[i][
                        self.env.dones_index] = self.env.partial_reset()
                state[i] = new_state[i]

            sma.update(r)
            if policy_mode == 'on-policy':
                self.model.learn(episode=episode, step=step)
            self.model.writer_summary(episode,
                                      reward_mean=r.mean(),
                                      reward_min=r.min(),
                                      reward_max=r.max(),
                                      step=last_done_step,
                                      **sma.rs)
            self.pwi('-' * 40)
            self.pwi(
                f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}'
            )
            if episode % save_frequency == 0:
                self.model.save_checkpoint(episode)

            if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                self.gym_random_sample(steps=add_noise2buffer_steps)

            if eval_while_train and self.env.reward_threshold is not None:
                if r.max() >= self.env.reward_threshold:
                    self.pwi(
                        f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                    )
                    self.gym_evaluate()

    def gym_step_eval(self, idx, model, episodes_num, max_step):
        i, state, _ = self.init_variables(evaluate=True)
        ret = 0.
        ave_steps = 0.
        for _ in range(episodes_num):
            state[i] = self.eval_env.reset()
            r = 0.
            step = 0
            while True:
                action = model.choose_action(s=state[0],
                                             visual_s=state[1],
                                             evaluation=True)
                state[i], reward, done, info = self.eval_env.step(action)
                reward = reward[0]
                done = done[0]
                r += reward
                step += 1
                if done or step > max_step:
                    ret += r
                    ave_steps += step
                    break
        model.writer_summary(
            idx,
            eval_return=ret / episodes_num,
            eval_ave_step=ave_steps // episodes_num,
        )

    def gym_random_sample(self, steps):
        i, state, new_state = self.init_variables()
        state[i] = self.env.reset()

        for _ in range(steps):
            action = self.env.sample_actions()
            new_state[i], reward, done, info = self.env.step(action)
            self.model.no_op_store(s=state[0],
                                   visual_s=state[1],
                                   a=action,
                                   r=reward,
                                   s_=new_state[0],
                                   visual_s_=new_state[1],
                                   done=done)
            if len(self.env.dones_index):  # check whether any parallel env needs a partial reset
                new_state[i][self.env.dones_index] = self.env.partial_reset()
            state[i] = new_state[i]
        self.pwi('Noise addition complete.')

    def gym_evaluate(self):
        max_step = int(self.train_args['max_step'])
        max_eval_episode = int(self.train_args['max_eval_episode'])
        i, state, _ = self.init_variables()
        total_r = np.zeros(self.env.n)
        total_steps = np.zeros(self.env.n)
        episodes = max_eval_episode // self.env.n
        for _ in range(episodes):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            steps = np.zeros(self.env.n)
            r = np.zeros(self.env.n)
            while True:
                r_tem = np.zeros(self.env.n)
                action = self.model.choose_action(
                    s=state[0], visual_s=state[1], evaluation=True
                )  # In the future, this method can be combined with choose_action
                state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                steps[unfinished_index] += 1
                r += r_tem
                if all(dones_flag) or any(steps >= max_step):
                    break
            total_r += r
            total_steps += steps
        average_r = total_r.mean() / episodes
        average_step = int(total_steps.mean() / episodes)
        solved = average_r >= self.env.reward_threshold
        self.pwi(
            f'evaluate number: {max_eval_episode:3d} | average step: {average_step} | average reward: {average_r} | SOLVED: {solved}'
        )
        self.pwi(
            '----------------------------------------------------------------------------------------------------------------------------'
        )

    def gym_no_op(self):
        steps = self.train_args['pre_fill_steps']
        choose = self.train_args['prefill_choose']
        assert isinstance(
            steps, int
        ) and steps >= 0, 'no_op steps must be an int greater than or equal to 0'

        i, state, new_state = self.init_variables()

        state[i] = self.env.reset()

        steps = steps // self.env.n + 1

        for step in range(steps):
            self.pwi(f'no op step {step}')
            if choose:
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
            else:
                action = self.env.sample_actions()
            new_state[i], reward, done, info = self.env.step(action)
            self.model.no_op_store(s=state[0],
                                   visual_s=state[1],
                                   a=action,
                                   r=reward,
                                   s_=new_state[0],
                                   visual_s_=new_state[1],
                                   done=done)
            if len(self.env.dones_index):  # check whether any parallel env needs a partial reset
                new_state[i][self.env.dones_index] = self.env.partial_reset()
            state[i] = new_state[i]

    def gym_inference(self):
        i, state, _ = self.init_variables()
        while True:
            state[i] = self.env.reset()
            while True:
                self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1],
                                                  evaluation=True)
                state[i], reward, done, info = self.env.step(action)
                if len(self.env.dones_index):  # check whether any parallel env needs a partial reset
                    state[i][self.env.dones_index] = self.env.partial_reset()

    def unity_train(self):
        """
        Train loop. Execute until episode reaches its maximum or press 'ctrl+c' artificially.
        Inputs:
            env:                    Environment for interaction.
            models:                 all models for this trianing task.
            save_frequency:         how often to save checkpoints.
            reset_config:           configuration to reset for Unity environment.
            max_step:               maximum number of steps for an episode.
            sampler_manager:        sampler configuration parameters for 'reset_config'.
            resampling_interval:    how often to resample parameters for env reset.
        Variables:
            brain_names:    a list of brain names set in Unity.
            state: store    a list of states for each brain. each item contain a list of states for each agents that controlled by the same brain.
            visual_state:   store a list of visual state information for each brain.
            action:         store a list of actions for each brain.
            dones_flag:     store a list of 'done' for each brain. use for judge whether an episode is finished for every agents.
            rewards:        use to record rewards of agents for each brain.
        """
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])
        moving_average_episode = int(self.train_args['moving_average_episode'])
        add_noise2buffer = bool(self.train_args['add_noise2buffer'])
        add_noise2buffer_episode_interval = int(
            self.train_args['add_noise2buffer_episode_interval'])
        add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])

        if self.use_GCN:
            adj, x, visual_state, action, dones_flag, rewards = zeros_initializer(
                self.env.brain_num, 6)
            sma = [
                SMA(moving_average_episode) for i in range(self.env.brain_num)
            ]

            for episode in range(begin_episode, max_episode):
                ObsRewDone = self.env.reset()
                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    dones_flag[i] = np.zeros(self.env.brain_agents[i])
                    rewards[i] = np.zeros(self.env.brain_agents[i])
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs
                step = 0
                last_done_step = -1
                while True:
                    step += 1
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            adj=adj[i], x=x[i], visual_s=visual_state[i])
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)

                    for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                        unfinished_index = np.where(dones_flag[i] == False)[0]
                        dones_flag[i] += _d
                        self.models[i].store_data_gcn(adj=adj[i],
                                                      x=x[i],
                                                      visual_s=visual_state[i],
                                                      a=action[i],
                                                      r=_r,
                                                      adj_=_adj,
                                                      x_=_x,
                                                      visual_s_=_vs,
                                                      done=_d)
                        rewards[i][unfinished_index] += _r[unfinished_index]
                        adj[i] = _adj
                        x[i] = _x
                        visual_state[i] = _vs
                        if policy_mode == 'off-policy':
                            # print("advfdvsdfvfvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
                            self.models[i].learn(episode=episode, step=1)

                    if all([
                            all(dones_flag[i])
                            for i in range(self.env.brain_num)
                    ]):
                        if last_done_step == -1:
                            last_done_step = step
                        if policy_mode == 'off-policy':
                            break

                    if step >= max_step:
                        break

                for i in range(self.env.brain_num):
                    sma[i].update(rewards[i])
                    if policy_mode == 'on-policy':
                        self.models[i].learn(episode=episode, step=step)
                    self.models[i].writer_summary(
                        episode,
                        reward_mean=rewards[i].mean(),
                        reward_min=rewards[i].min(),
                        reward_max=rewards[i].max(),
                        step=last_done_step,
                        **sma[i].rs)
                self.pwi('-' * 40)
                self.pwi(
                    f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
                )
                for i in range(self.env.brain_num):
                    self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
                if episode % save_frequency == 0:
                    for i in range(self.env.brain_num):
                        self.models[i].save_checkpoint(episode)

                if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                    self.unity_random_sample(steps=add_noise2buffer_steps)

        else:
            state, visual_state, action, dones_flag, rewards = zeros_initializer(
                self.env.brain_num, 5)
            sma = [
                SMA(moving_average_episode) for i in range(self.env.brain_num)
            ]

            for episode in range(begin_episode, max_episode):
                ObsRewDone = self.env.reset()
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    dones_flag[i] = np.zeros(self.env.brain_agents[i])
                    rewards[i] = np.zeros(self.env.brain_agents[i])
                    state[i] = _v
                    visual_state[i] = _vs
                step = 0
                last_done_step = -1
                while True:
                    step += 1
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            s=state[i], visual_s=visual_state[i])
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)

                    for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                        unfinished_index = np.where(dones_flag[i] == False)[0]
                        dones_flag[i] += _d
                        self.models[i].store_data(s=state[i],
                                                  visual_s=visual_state[i],
                                                  a=action[i],
                                                  r=_r,
                                                  s_=_v,
                                                  visual_s_=_vs,
                                                  done=_d)
                        rewards[i][unfinished_index] += _r[unfinished_index]
                        state[i] = _v
                        visual_state[i] = _vs
                        if policy_mode == 'off-policy':
                            self.models[i].learn(episode=episode, step=1)

                    if all([
                            all(dones_flag[i])
                            for i in range(self.env.brain_num)
                    ]):
                        if last_done_step == -1:
                            last_done_step = step
                        if policy_mode == 'off-policy':
                            break

                    if step >= max_step:
                        break

                for i in range(self.env.brain_num):
                    sma[i].update(rewards[i])
                    if policy_mode == 'on-policy':
                        self.models[i].learn(episode=episode, step=step)
                    self.models[i].writer_summary(
                        episode,
                        reward_mean=rewards[i].mean(),
                        reward_min=rewards[i].min(),
                        reward_max=rewards[i].max(),
                        step=last_done_step,
                        **sma[i].rs)
                self.pwi('-' * 40)
                self.pwi(
                    f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
                )
                for i in range(self.env.brain_num):
                    self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
                if episode % save_frequency == 0:
                    for i in range(self.env.brain_num):
                        self.models[i].save_checkpoint(episode)

                if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                    self.unity_random_sample(steps=add_noise2buffer_steps)

    def unity_random_sample(self, steps):
        if self.use_GCN:
            adj, x, visual_state = zeros_initializer(self.env.brain_num, 3)

            ObsRewDone = self.env.reset()
            for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                adj[i] = _adj
                x[i] = _x
                visual_state[i] = _vs

            for _ in range(steps):
                action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].store_data_gcn(adj=adj[i],
                                                  x=x[i],
                                                  visual_s=visual_state[i],
                                                  a=action[i],
                                                  r=_r,
                                                  adj_=_adj,
                                                  x_=_x,
                                                  visual_s_=_vs,
                                                  done=_d)
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs
            self.pwi('Noise addition complete.')
        else:
            state, visual_state = zeros_initializer(self.env.brain_num, 2)

            ObsRewDone = self.env.reset()
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                state[i] = _v
                visual_state[i] = _vs

            for _ in range(steps):
                action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].store_data(s=state[i],
                                              visual_s=visual_state[i],
                                              a=action[i],
                                              r=_r,
                                              s_=_v,
                                              visual_s_=_vs,
                                              done=_d)
                    state[i] = _v
                    visual_state[i] = _vs
            self.pwi('Noise addition complete.')

    def unity_no_op(self):
        '''
        Interact with the environment to prepopulate the ReplayBuffer, without updating the model.
        Make sure steps is greater than n if any n-step ReplayBuffer is used.
        '''

        steps = self.train_args['pre_fill_steps']
        choose = self.train_args['prefill_choose']
        assert isinstance(
            steps, int
        ) and steps >= 0, 'no_op steps must be an int greater than or equal to 0'

        if self.use_GCN:
            adj, x, visual_state, action = zeros_initializer(
                self.env.brain_num, 4)
            ObsRewDone = self.env.reset()
            for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                adj[i] = _adj
                x[i] = _x
                visual_state[i] = _vs

            steps = steps // min(self.env.brain_agents) + 1

            for step in range(steps):

                self.pwi(f'no op step {step}')
                if choose:
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            adj=adj[i], x=x[i], visual_s=visual_state[i])
                else:
                    action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].no_op_store_gcn(adj=adj[i],
                                                   x=x[i],
                                                   visual_s=visual_state[i],
                                                   a=action[i],
                                                   r=_r,
                                                   adj_=_adj,
                                                   x_=_x,
                                                   visual_s_=_vs,
                                                   done=_d)
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs

        else:
            state, visual_state, action = zeros_initializer(
                self.env.brain_num, 3)
            ObsRewDone = self.env.reset()
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                state[i] = _v
                visual_state[i] = _vs

            steps = steps // min(self.env.brain_agents) + 1

            for step in range(steps):
                self.pwi(f'no op step {step}')
                if choose:
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            s=state[i], visual_s=visual_state[i])
                else:
                    action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].no_op_store(s=state[i],
                                               visual_s=visual_state[i],
                                               a=action[i],
                                               r=_r,
                                               s_=_v,
                                               visual_s_=_vs,
                                               done=_d)
                    state[i] = _v
                    visual_state[i] = _vs

    def unity_inference(self):
        """
        inference mode. The algorithm model will not be trained; it is only used to show the agents' behavior.
        """
        if self.use_GCN:
            action = zeros_initializer(self.env.brain_num, 1)
            while True:
                ObsRewDone = self.env.reset()
                while True:
                    for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                        action[i] = self.models[i].choose_action(
                            adj=_adj, x=_x, visual_s=_vs, evaluation=True)
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)
        else:
            action = zeros_initializer(self.env.brain_num, 1)
            while True:
                ObsRewDone = self.env.reset()
                while True:
                    for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                        action[i] = self.models[i].choose_action(
                            s=_v, visual_s=_vs, evaluation=True)
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)

    def ma_unity_no_op(self):
        steps = self.train_args['pre_fill_steps']
        choose = self.train_args['prefill_choose']
        assert isinstance(steps, int), 'multi-agent no_op steps must be an int'

        if steps < self.ma_data.batch_size:
            steps = self.ma_data.batch_size
        state, action, reward, next_state, dones = zeros_initializer(
            self.env.brain_num, 5)
        ObsRewDone = self.env.reset(train_mode=False)
        for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
            state[i] = _v

        for i in range(self.env.brain_num):
            # initialize actions to zeros
            if self.env.is_continuous[i]:
                action[i] = np.zeros(
                    (self.env.brain_agents[i], self.env.a_dim_or_list[i][0]),
                    dtype=np.int32)
            else:
                action[i] = np.zeros(
                    (self.env.brain_agents[i], len(self.env.a_dim_or_list[i])),
                    dtype=np.int32)

        a = [np.asarray(e) for e in zip(*action)]
        for step in range(steps):
            self.pwi(f'no op step {step}')
            for i in range(self.env.brain_num):
                if choose:
                    action[i] = self.models[i].choose_action(s=state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(self.env.brain_names)
            }
            ObsRewDone = self.env.step(vector_action=actions)
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                reward[i] = _r[:, np.newaxis]
                next_state[i] = _v
                dones[i] = _d[:, np.newaxis]

            def func(x):
                return [np.asarray(e) for e in zip(*x)]

            s, a, r, s_, done = map(func,
                                    [state, action, reward, next_state, dones])
            self.ma_data.add(s, a, r, s_, done)
            for i in range(self.env.brain_num):
                state[i] = next_state[i]

    def ma_unity_train(self):
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])
        assert policy_mode == 'off-policy', "multi-agent algorithms currently support off-policy only."

        batch_size = self.ma_data.batch_size
        state, action, new_action, next_action, reward, next_state, dones, dones_flag, rewards = zeros_initializer(
            self.env.brain_num, 9)

        for episode in range(begin_episode, max_episode):
            ObsRewDone = self.env.reset()
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                dones_flag[i] = np.zeros(self.env.brain_agents[i])
                rewards[i] = np.zeros(self.env.brain_agents[i])
                state[i] = _v
            step = 0
            last_done_step = -1
            while True:
                step += 1
                for i in range(self.env.brain_num):
                    action[i] = self.models[i].choose_action(s=state[i])
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)

                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    reward[i] = _r[:, np.newaxis]
                    next_state[i] = _v
                    dones[i] = _d[:, np.newaxis]
                    unfinished_index = np.where(dones_flag[i] == False)[0]
                    dones_flag[i] += _d
                    rewards[i][unfinished_index] += _r[unfinished_index]

                def func(x):
                    return [np.asarray(e) for e in zip(*x)]

                s, a, r, s_, done = map(
                    func, [state, action, reward, next_state, dones])
                self.ma_data.add(s, a, r, s_, done)

                for i in range(self.env.brain_num):
                    state[i] = next_state[i]

                s, a, r, s_, done = self.ma_data.sample()
                for i, brain_name in enumerate(self.env.brain_names):
                    next_action[i] = self.models[i].get_target_action(s=s_[:, i])
                    new_action[i] = self.models[i].choose_action(
                        s=s[:, i], evaluation=True)
                a_ = np.asarray([np.asarray(e) for e in zip(*next_action)])
                if policy_mode == 'off-policy':
                    for i in range(self.env.brain_num):
                        self.models[i].learn(
                            episode=episode,
                            ap=np.asarray([
                                np.asarray(e) for e in zip(*next_action[:i])
                            ]).reshape(batch_size, -1) if i != 0 else np.zeros(
                                (batch_size, 0)),
                            al=np.asarray([
                                np.asarray(e) for e in zip(
                                    *next_action[-(self.env.brain_num - i -
                                                   1):])
                            ]).reshape(batch_size, -1)
                            if self.env.brain_num - i != 1 else np.zeros(
                                (batch_size, 0)),
                            ss=s.reshape(batch_size, -1),
                            ss_=s_.reshape(batch_size, -1),
                            aa=a.reshape(batch_size, -1),
                            aa_=a_.reshape(batch_size, -1),
                            s=s[:, i],
                            r=r[:, i])

                if all([all(dones_flag[i])
                        for i in range(self.env.brain_num)]):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

            for i in range(self.env.brain_num):
                self.models[i].writer_summary(episode,
                                              total_reward=rewards[i].mean(),
                                              step=last_done_step)
            self.pwi('-' * 40)
            self.pwi(
                f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
            )
            if episode % save_frequency == 0:
                for i in range(self.env.brain_num):
                    self.models[i].save_checkpoint(episode)

    def ma_unity_inference(self):
        """
        inference mode. The algorithm model will not be trained; it is only used to show the agents' behavior.
        """
        action = zeros_initializer(self.env.brain_num, 1)
        while True:
            ObsRewDone = self.env.reset()
            while True:
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    action[i] = self.models[i].choose_action(s=_v,
                                                             evaluation=True)
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
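
A small, self-contained sketch of the load-path resolution performed in Agent.__init__ above; resolve_load_path is a hypothetical helper written only for this illustration, not part of the project.

# Stand-alone illustration of how Agent.__init__ resolves train_args['load_model_path'].
import os


def resolve_load_path(base_dir, name, load, model_index):
    if load is None:
        # No model to load: every process uses its own run directory.
        return os.path.join(base_dir, name)
    if '/' in load or '\\' in load:
        # An explicit path was given: all training processes start from it as-is.
        return load
    if '-' in load:
        # The run name already carries an index, e.g. 'dqn-3': join it to base_dir.
        return os.path.join(base_dir, load)
    # Bare run name: this process's own index is appended automatically.
    return os.path.join(base_dir, f'{load}-{model_index}')


print(resolve_load_path('./models', 'dqn-0', None, '0'))   # ./models/dqn-0
print(resolve_load_path('./models', 'dqn-0', 'ppo', '2'))  # ./models/ppo-2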
Example #3
class MCTS_POLICY(RL_Policy):
    def __init__(self,
                 state_dim,
                 learning_rate=5.0e-4,
                 buffer_size=10000,
                 batch_size=128,
                 epochs=2,
                 name='wjs_policy',
                 cp_dir='./models'):
        super().__init__(cp_dir=cp_dir)
        self.lr = learning_rate
        self.epochs = epochs
        self.data = ExperienceReplay(batch_size=batch_size,
                                     capacity=buffer_size)
        self.net = PV(state_dim=state_dim, name='pv_net')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)

    @tf.function
    def _get_probs_and_v(self, state):
        with tf.device(self.device):
            state = tf.transpose(state, [0, 2, 3, 1])
            return self.net(state)

    def get_probs_and_v(self, game):
        '''
        Given a state, return the probabilities of the available actions and the estimated value of the current node.
        '''
        state = game.get_current_state().reshape(-1, 4, game.box_size,
                                                 game.box_size)
        log_actions_prob, value = self._get_probs_and_v(state)
        actions_prob = np.exp(log_actions_prob)
        a, b = game.get_available_actions()
        available_actions_prob = zip(a, actions_prob[0][b])
        return available_actions_prob, value

    def learn(self):
        if self.data.is_lg_batch_size:
            s, p, v = self.data.sample()
            for i in range(self.epochs):
                summaries = self.train(s, p, v)
                loss = summaries['LOSS/loss']
                logging.info(f'epoch: {i}, loss: {loss}')
                tf.summary.experimental.set_step(self.global_step)
                self.write_training_summaries(summaries)
                tf.summary.scalar('LEARNING_RATE/lr', self.lr)
                self.writer.flush()

    @tf.function
    def train(self, s, p, v):
        s = tf.cast(s, tf.float32)
        p = tf.cast(p, tf.float32)
        v = tf.cast(v, tf.float32)
        with tf.device(self.device):
            s = tf.transpose(s, [0, 2, 3, 1])
            with tf.GradientTape() as tape:
                log_action_probs, predict_v = self.net(s)
                p_loss = -tf.reduce_mean(
                    tf.reduce_sum(tf.multiply(p, log_action_probs), axis=-1))
                v_loss = tf.reduce_mean((v - predict_v)**2)
                l2_penalty = 1e-4 * tf.add_n([
                    tf.nn.l2_loss(v) for v in self.net.trainable_variables
                    if 'bias' not in v.name.lower()
                ])
                loss = v_loss + p_loss + l2_penalty
            grads = tape.gradient(loss, self.net.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([
                ['LOSS/v_loss', v_loss],
                ['LOSS/p_loss', p_loss],
                ['LOSS/loss', loss],
            ])

    def store(self, data: list):
        for i in data:
            self.data.add(i)

    def store_in_file(self, data, file_name='./data/data'):
        with open(f'{file_name}.data', 'a') as f:
            for i in data:
                json_str = json.dumps([d.tolist() for d in i])  # convert one experience to a list
                f.write(json_str + '\n')  # save one experience

    def _restore_from_file(self, data, file_name='./data/data'):
        with open(f'{file_name}.data') as f:
            for json_str in f:  # each line holds one experience
                if json_str != '':
                    data = json.loads(json_str)
                    data = [np.array(d) for d in data]  # one experience
                    self.data.add(data)  # restore one experience
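

# A minimal self-play sketch (not part of the original example) showing how the
# MCTS_POLICY methods above are typically combined. `collect_episode` is a
# hypothetical helper that plays one game guided by `policy` and returns a list of
# (state, mcts_probs, winner) experiences shaped like those sampled in learn().
def self_play_and_learn(policy, game, episodes=10):
    for _ in range(episodes):
        experience = collect_episode(game, policy)  # hypothetical helper
        policy.store(experience)           # push into the ExperienceReplay buffer
        policy.store_in_file(experience)   # optionally persist to ./data/data.data
        policy.learn()                     # run `epochs` training passes once enough data is buffered
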
class HIRO(make_off_policy_class(mode='no_share')):
    '''
    Data-Efficient Hierarchical Reinforcement Learning, http://arxiv.org/abs/1805.08296
    '''
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            ployak=0.995,
            high_scale=1.0,
            reward_scale=1.0,
            sample_g_nums=100,
            sub_goal_steps=10,
            fn_goal_dim=0,
            intrinsic_reward_mode='os',
            high_batch_size=256,
            high_buffer_size=100000,
            low_batch_size=8,
            low_buffer_size=10000,
            high_actor_lr=1.0e-4,
            high_critic_lr=1.0e-3,
            low_actor_lr=1.0e-4,
            low_critic_lr=1.0e-3,
            hidden_units={
                'high_actor': [64, 64],
                'high_critic': [64, 64],
                'low_actor': [64, 64],
                'low_critic': [64, 64]
            },
            **kwargs):
        assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
        self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)

        self.ployak = ployak
        self.high_scale = np.array(
            high_scale if isinstance(high_scale, list) else [high_scale] *
            self.s_dim,
            dtype=np.float32)
        self.reward_scale = reward_scale
        self.fn_goal_dim = fn_goal_dim
        self.sample_g_nums = sample_g_nums
        self.sub_goal_steps = sub_goal_steps
        self.sub_goal_dim = self.s_dim - self.fn_goal_dim

        self.high_noise = rls.ClippedNormalActionNoise(
            mu=np.zeros(self.sub_goal_dim),
            sigma=self.high_scale * np.ones(self.sub_goal_dim),
            bound=self.high_scale / 2)
        self.low_noise = rls.ClippedNormalActionNoise(mu=np.zeros(self.a_dim),
                                                      sigma=1.0 *
                                                      np.ones(self.a_dim),
                                                      bound=0.5)

        _high_actor_net = lambda: rls.actor_dpg(self.s_dim, self.sub_goal_dim,
                                                hidden_units['high_actor'])
        if self.is_continuous:
            _low_actor_net = lambda: rls.actor_dpg(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
        else:
            _low_actor_net = lambda: rls.actor_discrete(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
            self.gumbel_dist = tfd.Gumbel(0, 1)

        self.high_actor = _high_actor_net()
        self.high_actor_target = _high_actor_net()
        self.low_actor = _low_actor_net()
        self.low_actor_target = _low_actor_net()

        _high_critic_net = lambda: rls.critic_q_one(
            self.s_dim, self.sub_goal_dim, hidden_units['high_critic'])
        _low_critic_net = lambda: rls.critic_q_one(
            self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                'low_critic'])

        self.high_critic = DoubleQ(_high_critic_net)
        self.high_critic_target = DoubleQ(_high_critic_net)
        self.low_critic = DoubleQ(_low_critic_net)
        self.low_critic_target = DoubleQ(_low_critic_net)

        self.update_target_net_weights(
            self.low_actor_target.weights + self.low_critic_target.weights +
            self.high_actor_target.weights + self.high_critic_target.weights,
            self.low_actor.weights + self.low_critic.weights +
            self.high_actor.weights + self.high_critic.weights)

        self.low_actor_lr, self.low_critic_lr = map(
            self.init_lr, [low_actor_lr, low_critic_lr])
        self.high_actor_lr, self.high_critic_lr = map(
            self.init_lr, [high_actor_lr, high_critic_lr])
        self.low_actor_optimizer, self.low_critic_optimizer = map(
            self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
        self.high_actor_optimizer, self.high_critic_optimizer = map(
            self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])

        self.model_recorder(
            dict(high_actor=self.high_actor,
                 high_critic=self.high_critic,
                 low_actor=self.low_actor,
                 low_critic=self.low_critic,
                 low_actor_optimizer=self.low_actor_optimizer,
                 low_critic_optimizer=self.low_critic_optimizer,
                 high_actor_optimizer=self.high_actor_optimizer,
                 high_critic_optimizer=self.high_critic_optimizer))

        self.counts = 0
        self._high_s = [[] for _ in range(self.n_agents)]
        self._noop_subgoal = np.random.uniform(-self.high_scale,
                                               self.high_scale,
                                               size=(self.n_agents,
                                                     self.sub_goal_dim))
        self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)

    def generate_ir_func(self, mode='os'):
        if mode == 'os':
            return lambda last_feat, subgoal, feat: -tf.norm(
                last_feat + subgoal - feat, ord=2, axis=-1, keepdims=True)
        elif mode == 'cos':
            return lambda last_feat, subgoal, feat: tf.expand_dims(
                -tf.keras.losses.cosine_similarity(
                    tf.cast(feat - last_feat, tf.float32),
                    tf.cast(subgoal, tf.float32),
                    axis=-1),
                axis=-1)
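
    # Intrinsic reward for the low-level policy (a description of the lambdas above,
    # not new behaviour): in 'os' mode the reward is the negative L2 distance
    # -||last_feat + subgoal - feat||_2, i.e. how far the agent still is from the
    # targeted state; in 'cos' mode it is the cosine similarity between the achieved
    # displacement (feat - last_feat) and the subgoal direction. Both keep a trailing
    # dimension of 1 so the result broadcasts like a per-agent reward column.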

    def show_logo(self):
        self.recorder.logger.info('''
  xxxxx xxxxx        xxxx        xxxxxxx          xxxxxx    
    xx   xx           xx          xxxxxxx        xxx xxxx   
    xx   xx           xx          xx  xxx       xxx   xxx   
    xx   xx           xx          xx  xxx       xx     xxx  
    xxxxxxx           xx          xxxxxx        xx     xxx  
    xx   xx           xx          xxxxxx        xx     xxx  
    xx   xx           xx          xx xxxx       xx     xxx  
    xx   xx           xx          xx  xxx       xxx   xxx   
  xxxxx xxxxx        xxxx        xxxxx xxxx      xxxxxxx   
        ''')

    def store_high_buffer(self, i):
        eps_len = len(self._high_s[i])
        intervals = list(range(0, eps_len, self.sub_goal_steps))
        if len(intervals) < 1:
            return
        left = intervals[:-1]
        right = intervals[1:]
        s, r, a, g, d, s_ = [], [], [], [], [], []
        for _l, _r in zip(left, right):
            s.append(self._high_s[i][_l:_r])
            r.append(sum(self._high_r[i][_l:_r]) * self.reward_scale)
            a.append(self._high_a[i][_l:_r])
            g.append(self._subgoals[i][_l])
            d.append(self._done[i][_r - 1])
            s_.append(self._high_s_[i][_r - 1])

        right = intervals[-1]
        s.append(self._high_s[i][right:eps_len] + [self._high_s[i][-1]] *
                 (self.sub_goal_steps + right - eps_len))
        r.append(sum(self._high_r[i][right:eps_len]))
        a.append(self._high_a[i][right:eps_len] + [self._high_a[i][-1]] *
                 (self.sub_goal_steps + right - eps_len))
        g.append(self._subgoals[i][right])
        d.append(self._done[i][-1])
        s_.append(self._high_s_[i][-1])
        self.data_high.add(np.array(s),
                           np.array(r)[:, np.newaxis], np.array(a),
                           np.array(g),
                           np.array(d)[:, np.newaxis], np.array(s_))
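
    # store_high_buffer slices one finished episode of agent `i` into chunks of
    # `sub_goal_steps` transitions. Each chunk becomes a single high-level transition:
    # the per-step states and actions, the summed reward, the subgoal issued at the
    # start of the chunk, and the done flag / next state at its end. The final partial
    # chunk is padded by repeating the last state and action so every stored sequence
    # has length `sub_goal_steps`.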

    def reset(self):
        self._c = np.full((self.n_agents, 1), self.sub_goal_steps, np.int32)

        for i in range(self.n_agents):
            self.store_high_buffer(i)
        self._high_r = [[] for _ in range(self.n_agents)]
        self._high_a = [[] for _ in range(self.n_agents)]
        self._high_s = [[] for _ in range(self.n_agents)]
        self._subgoals = [[] for _ in range(self.n_agents)]
        self._done = [[] for _ in range(self.n_agents)]
        self._high_s_ = [[] for _ in range(self.n_agents)]

        self._new_subgoal = np.zeros((self.n_agents, self.sub_goal_dim),
                                     dtype=np.float32)

    def partial_reset(self, done):
        self._c = np.where(
            done[:, np.newaxis],
            np.full((self.n_agents, 1), self.sub_goal_steps, np.int32),
            self._c)
        idx = np.where(done)[0]
        for i in idx:
            self.store_high_buffer(i)
            self._high_s[i] = []
            self._high_a[i] = []
            self._high_s_[i] = []
            self._high_r[i] = []
            self._done[i] = []
            self._subgoals[i] = []

    @tf.function
    def _get_action(self, s, visual_s, subgoal):
        with tf.device(self.device):
            feat = tf.concat([s, subgoal], axis=-1)
            if self.is_continuous:
                mu = self.low_actor(feat)
                pi = tf.clip_by_value(mu + self.low_noise(), -1, 1)
            else:
                logits = self.low_actor(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfd.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi

    def choose_action(self, s, visual_s, evaluation=False):
        self._subgoal = np.where(self._c == self.sub_goal_steps,
                                 self.get_subgoal(s).numpy(),
                                 self._new_subgoal)
        mu, pi = self._get_action(s, visual_s, self._subgoal)
        a = mu.numpy() if evaluation else pi.numpy()
        return a
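
    # Subgoal scheduling: `_c` counts down from `sub_goal_steps` per agent. A fresh
    # subgoal is sampled from the high-level actor only when the counter is full;
    # otherwise the goal-transition value `_new_subgoal` computed in store_data is
    # reused, so the low-level policy is conditioned on the same target for
    # `sub_goal_steps` consecutive environment steps.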

    @tf.function
    def get_subgoal(self, s):
        '''
        Sample a new subgoal for the current state s from the high-level actor,
        add clipped exploration noise, and clip the result to the high-level scale.
        '''
        new_subgoal = self.high_scale * self.high_actor(s)
        new_subgoal = tf.clip_by_value(new_subgoal + self.high_noise(),
                                       -self.high_scale, self.high_scale)
        return new_subgoal

    def learn(self, **kwargs):
        self.episode = kwargs['episode']
        for i in range(kwargs['step']):
            if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size:
                self.intermediate_variable_reset()
                low_data = self.get_transitions(
                    self.data_low,
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'])
                high_data = self.get_transitions(
                    self.data_high,
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'])

                # -------------------------------------- gather the arguments to pass to the train functions
                _low_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'],
                    data_dict=low_data)
                _high_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'],
                    data_dict=high_data)
                summaries = self.train_low(_low_training_data)

                self.summaries.update(summaries)
                self.update_target_net_weights(
                    self.low_actor_target.weights +
                    self.low_critic_target.weights,
                    self.low_actor.weights + self.low_critic.weights,
                    self.ployak)
                if self.counts % self.sub_goal_steps == 0:
                    self.counts = 0
                    high_summaries = self.train_high(_high_training_data)
                    self.summaries.update(high_summaries)
                    self.update_target_net_weights(
                        self.high_actor_target.weights +
                        self.high_critic_target.weights,
                        self.high_actor.weights + self.high_critic.weights,
                        self.ployak)
                self.counts += 1
                self.summaries.update(
                    dict([[
                        'LEARNING_RATE/low_actor_lr',
                        self.low_actor_lr(self.episode)
                    ],
                          [
                              'LEARNING_RATE/low_critic_lr',
                              self.low_critic_lr(self.episode)
                          ],
                          [
                              'LEARNING_RATE/high_actor_lr',
                              self.high_actor_lr(self.episode)
                          ],
                          [
                              'LEARNING_RATE/high_critic_lr',
                              self.high_critic_lr(self.episode)
                          ]]))
                self.write_training_summaries(self.global_step, self.summaries)

    @tf.function(experimental_relax_shapes=True)
    def train_low(self, memories):
        s, a, r, s_, done, g, g_ = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = tf.concat([s, g], axis=-1)
                feat_ = tf.concat([s_, g_], axis=-1)

                if self.is_continuous:
                    target_mu = self.low_actor_target(feat_)
                    action_target = tf.clip_by_value(
                        target_mu + self.low_noise(), -1, 1)
                else:
                    target_logits = self.low_actor_target(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [tf.shape(feat_)[0], self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / 1.)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                q1, q2 = self.low_critic(feat, a)
                q = tf.minimum(q1, q2)
                q_target = self.low_critic_target.get_min(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1))
                q2_loss = tf.reduce_mean(tf.square(td_error2))
                low_critic_loss = q1_loss + q2_loss
            low_critic_grads = tape.gradient(low_critic_loss,
                                             self.low_critic.weights)
            self.low_critic_optimizer.apply_gradients(
                zip(low_critic_grads, self.low_critic.weights))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.low_actor(feat)
                else:
                    logits = self.low_actor(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim,
                                                  dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q_actor = self.low_critic.Q1(feat, mu)
                low_actor_loss = -tf.reduce_mean(q_actor)
            low_actor_grads = tape.gradient(low_actor_loss,
                                            self.low_actor.trainable_variables)
            self.low_actor_optimizer.apply_gradients(
                zip(low_actor_grads, self.low_actor.trainable_variables))

            self.global_step.assign_add(1)
            return dict([['LOSS/low_actor_loss', low_actor_loss],
                         ['LOSS/low_critic_loss', low_critic_loss],
                         ['Statistics/low_q_min',
                          tf.reduce_min(q)],
                         ['Statistics/low_q_mean',
                          tf.reduce_mean(q)],
                         ['Statistics/low_q_max',
                          tf.reduce_max(q)]])

    @tf.function(experimental_relax_shapes=True)
    def train_high(self, memories):
        # s_ : [B, N]
        ss, r, aa, g, done, s_ = memories

        batchs = tf.shape(ss)[0]
        # ss, aa [B, T, *]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                s = ss[:, 0]  # [B, N]
                true_end = (s_ - s)[:, self.fn_goal_dim:]
                g_dist = tfd.Normal(loc=true_end,
                                    scale=0.5 * self.high_scale[None, :])
                ss = tf.expand_dims(ss, 0)  # [1, B, T, *]
                ss = tf.tile(ss,
                             [self.sample_g_nums, 1, 1, 1])  # [10, B, T, *]
                ss = tf.reshape(ss, [-1, tf.shape(ss)[-1]])  # [10*B*T, *]
                aa = tf.expand_dims(aa, 0)  # [1, B, T, *]
                aa = tf.tile(aa,
                             [self.sample_g_nums, 1, 1, 1])  # [10, B, T, *]
                aa = tf.reshape(aa, [-1, tf.shape(aa)[-1]])  # [10*B*T, *]
                gs = tf.concat([
                    tf.expand_dims(g, 0),
                    tf.expand_dims(true_end, 0),
                    tf.clip_by_value(g_dist.sample(self.sample_g_nums - 2),
                                     -self.high_scale, self.high_scale)
                ],
                               axis=0)  # [10, B, N]

                all_g = gs + s[:, self.fn_goal_dim:]
                all_g = tf.expand_dims(all_g, 2)  # [10, B, 1, N]
                all_g = tf.tile(
                    all_g, [1, 1, self.sub_goal_steps, 1])  # [10, B, T, N]
                all_g = tf.reshape(all_g,
                                   [-1, tf.shape(all_g)[-1]])  # [10*B*T, N]
                all_g = all_g - ss[:, self.fn_goal_dim:]  # [10*B*T, N]
                feat = tf.concat([ss, all_g], axis=-1)  # [10*B*T, *]
                _aa = self.low_actor(feat)  # [10*B*T, A]
                if not self.is_continuous:
                    _aa = tf.one_hot(tf.argmax(_aa, axis=-1),
                                     self.a_dim,
                                     dtype=tf.float32)
                diff = _aa - aa
                diff = tf.reshape(
                    diff,
                    [self.sample_g_nums, batchs, self.sub_goal_steps, -1
                     ])  # [10, B, T, A]
                diff = tf.transpose(diff, [1, 0, 2, 3])  # [B, 10, T, A]
                logps = -0.5 * tf.reduce_sum(tf.norm(diff, ord=2, axis=-1)**2,
                                             axis=-1)  # [B, 10]
                idx = tf.argmax(logps, axis=-1, output_type=tf.int32)
                idx = tf.stack([tf.range(batchs), idx], axis=1)  # [B, 2]
                g = tf.gather_nd(tf.transpose(gs, [1, 0, 2]), idx)  # [B, N]

                q1, q2 = self.high_critic(s, g)
                q = tf.minimum(q1, q2)

                target_sub_goal = self.high_actor_target(s_) * self.high_scale
                target_sub_goal = tf.clip_by_value(
                    target_sub_goal + self.high_noise(), -self.high_scale,
                    self.high_scale)
                q_target = self.high_critic_target.get_min(s_, target_sub_goal)

                dc_r = tf.stop_gradient(r + self.gamma * (1 - done) * q_target)
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1))
                q2_loss = tf.reduce_mean(tf.square(td_error2))
                high_critic_loss = q1_loss + q2_loss

            high_critic_grads = tape.gradient(high_critic_loss,
                                              self.high_critic.weights)
            self.high_critic_optimizer.apply_gradients(
                zip(high_critic_grads, self.high_critic.weights))
            with tf.GradientTape() as tape:
                mu = self.high_actor(s) * self.high_scale
                q_actor = self.high_critic.Q1(s, mu)
                high_actor_loss = -tf.reduce_mean(q_actor)
            high_actor_grads = tape.gradient(
                high_actor_loss, self.high_actor.trainable_variables)
            self.high_actor_optimizer.apply_gradients(
                zip(high_actor_grads, self.high_actor.trainable_variables))
            return dict([['LOSS/high_actor_loss', high_actor_loss],
                         ['LOSS/high_critic_loss', high_critic_loss],
                         ['Statistics/high_q_min',
                          tf.reduce_min(q)],
                         ['Statistics/high_q_mean',
                          tf.reduce_mean(q)],
                         ['Statistics/high_q_max',
                          tf.reduce_max(q)]])

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "store requires action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "store requires reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "store requires done to be an np.ndarray"
        [o.append(_s) for o, _s in zip(self._high_s, s)]
        [o.append(_a) for o, _a in zip(self._high_a, a)]
        [o.append(_r) for o, _r in zip(self._high_r, r)]
        [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)]
        [o.append(_d) for o, _d in zip(self._done, done)]
        [
            o.append(_subgoal)
            for o, _subgoal in zip(self._subgoals, self._noop_subgoal)
        ]

        ir = self.get_ir(s[:, self.fn_goal_dim:], self._noop_subgoal,
                         s_[:, self.fn_goal_dim:])
        # subgoal = s[:, self.fn_goal_dim:] + self._noop_subgoal - s_[:, self.fn_goal_dim:]
        subgoal = np.random.uniform(-self.high_scale,
                                    self.high_scale,
                                    size=(self.n_agents, self.sub_goal_dim))
        self.data_low.add(
            s,
            a,
            ir,
            s_,
            done[:, np.newaxis],  # expand dims
            self._noop_subgoal,
            subgoal)
        self._noop_subgoal = subgoal

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
        """
        assert isinstance(a, np.ndarray), "store requires action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "store requires reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "store requires done to be an np.ndarray"
        [o.append(_s) for o, _s in zip(self._high_s, s)]
        [o.append(_a) for o, _a in zip(self._high_a, a)]
        [o.append(_r) for o, _r in zip(self._high_r, r)]
        [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)]
        [o.append(_d) for o, _d in zip(self._done, done)]
        [
            o.append(_subgoal)
            for o, _subgoal in zip(self._subgoals, self._subgoal)
        ]

        ir = self.get_ir(s[:, self.fn_goal_dim:], self._subgoal,
                         s_[:, self.fn_goal_dim:])
        self._new_subgoal = np.where(
            self._c == 1,
            self.get_subgoal(s_).numpy(),
            s[:, self.fn_goal_dim:] + self._subgoal - s_[:, self.fn_goal_dim:])

        self.data_low.add(
            s,
            a,
            ir,
            s_,
            done[:, np.newaxis],  # expand dims
            self._subgoal,
            self._new_subgoal)
        self._c = np.where(
            self._c == 1,
            np.full((self.n_agents, 1), self.sub_goal_steps, np.int32),
            self._c - 1)
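
    # Between high-level decisions the subgoal follows HIRO's goal transition
    # h(s, g, s') = s + g - s' on the non-fn_goal dimensions, keeping the absolute
    # target state fixed while the agent moves. When the countdown `_c` reaches 1,
    # a new subgoal is queried from the high-level actor for the next step and the
    # counter resets to `sub_goal_steps`.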

    def get_transitions(self,
                        databuffer,
                        data_name_list=['s', 'a', 'r', 's_', 'done']):
        '''
        Sample a batch from `databuffer` and return it as a dict keyed by `data_name_list`.
        For discrete action spaces the integer actions are expanded into one-hot vectors.
        '''
        data = databuffer.sample()  # sample a batch from the replay buffer
        if not self.is_continuous and 'a' in data_name_list:
            a_idx = data_name_list.index('a')
            a = data[a_idx].astype(np.int32)
            pre_shape = a.shape
            a = a.reshape(-1)
            a = sth.int2one_hot(a, self.a_dim)
            a = a.reshape(pre_shape + (-1, ))
            data[a_idx] = a
        return dict([[
            n, d
        ] for n, d in zip(data_name_list, list(map(self.data_convert, data)))])
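

# For discrete actions, get_transitions above relies on sth.int2one_hot to expand
# integer action indices into one-hot vectors while preserving the leading
# (batch, time) shape. A minimal NumPy sketch of that behaviour (an assumption
# about sth.int2one_hot, not its actual implementation):
import numpy as np

def int_to_one_hot(indices: np.ndarray, depth: int) -> np.ndarray:
    """Map an integer array of any shape to one-hot vectors of size `depth`."""
    flat = indices.astype(np.int32).reshape(-1)
    one_hot = np.eye(depth, dtype=np.float32)[flat]
    return one_hot.reshape(indices.shape + (depth,))

# e.g. int_to_one_hot(np.array([[0, 2], [1, 1]]), depth=3) has shape (2, 2, 3)
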
Exemple #5
0
class Agent:
    def __init__(self, env_args, model_args, buffer_args, train_args):
        self.env_args = env_args
        self.model_args = model_args
        self.buffer_args = buffer_args
        self.train_args = train_args

        self.model_index = str(self.train_args.get('index'))
        self.all_learner_print = bool(
            self.train_args.get('all_learner_print', False))
        self.train_args['name'] += f'-{self.model_index}'
        if self.model_args['load'] is None:
            self.train_args['load_model_path'] = os.path.join(
                self.train_args['base_dir'], self.train_args['name'])
        else:
            if '/' in self.model_args['load'] or '\\' in self.model_args[
                    'load']:  # absolute path: every training process initializes from this model path
                self.train_args['load_model_path'] = self.model_args['load']
            elif '-' in self.model_args['load']:
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'])  # name and index given: every training process initializes from this relative model path
            else:  # only the run name is given; the process index is appended automatically
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'] + f'-{self.model_index}')

        # ENV
        self.env = make_env(self.env_args)

        # ALGORITHM CONFIG
        Model, algorithm_config, _policy_mode = get_model_info(
            self.model_args['algo'])
        self.model_args['policy_mode'] = _policy_mode
        if self.model_args['algo_config'] is not None:
            algorithm_config = UpdateConfig(algorithm_config,
                                            self.model_args['algo_config'],
                                            'algo')
        ShowConfig(algorithm_config)

        # BUFFER
        if _policy_mode == 'off-policy':
            self.buffer_args['batch_size'] = algorithm_config['batch_size']
            self.buffer_args['buffer_size'] = algorithm_config['buffer_size']
            _use_priority = algorithm_config.get('use_priority', False)
            _n_step = algorithm_config.get('n_step', False)
            if _use_priority and _n_step:
                self.buffer_args['type'] = 'NSTEP-PER'
                self.buffer_args['NSTEP-PER']['max_episode'] = self.train_args[
                    'max_episode']
                self.buffer_args['NSTEP-PER']['gamma'] = algorithm_config[
                    'gamma']
            elif _use_priority:
                self.buffer_args['type'] = 'PER'
                self.buffer_args['PER']['max_episode'] = self.train_args[
                    'max_episode']
            elif _n_step:
                self.buffer_args['type'] = 'NSTEP-ER'
                self.buffer_args['NSTEP-ER']['gamma'] = algorithm_config[
                    'gamma']
            else:
                self.buffer_args['type'] = 'ER'
        else:
            self.buffer_args['type'] = 'Pandas'

        # MODEL
        base_dir = os.path.join(
            self.train_args['base_dir'], self.train_args['name']
        )  # train_args['base_dir'] DIR/ENV_NAME/ALGORITHM_NAME
        if 'batch_size' in algorithm_config.keys() and train_args['fill_in']:
            self.train_args['no_op_steps'] = algorithm_config['batch_size']
        else:
            self.train_args['no_op_steps'] = train_args['no_op_steps']

        if self.env_args['type'] == 'gym':
            # buffer ------------------------------
            if 'NSTEP' in self.buffer_args['type']:
                self.buffer_args[self.buffer_args['type']][
                    'agents_num'] = self.env_args['env_num']
            self.buffer = get_buffer(self.buffer_args)
            # buffer ------------------------------

            # model -------------------------------
            model_params = {
                's_dim': self.env.s_dim,
                'visual_sources': self.env.visual_sources,
                'visual_resolution': self.env.visual_resolution,
                'a_dim_or_list': self.env.a_dim_or_list,
                'is_continuous': self.env.is_continuous,
                'max_episode': self.train_args['max_episode'],
                'base_dir': base_dir,
                'logger2file': self.model_args['logger2file'],
                'seed': self.model_args['seed']
            }
            self.model = Model(**model_params, **algorithm_config)
            self.model.set_buffer(self.buffer)
            self.model.init_or_restore(
                os.path.join(self.train_args['load_model_path']))
            # model -------------------------------

            self.train_args['begin_episode'] = self.model.get_init_episode()
            if not self.train_args['inference']:
                records_dict = {
                    'env': self.env_args,
                    'model': self.model_args,
                    'buffer': self.buffer_args,
                    'train': self.train_args,
                    'algo': algorithm_config
                }
                save_config(os.path.join(base_dir, 'config'), records_dict)
        else:
            # buffer -----------------------------------
            self.buffer_args_s = []
            for i in range(self.env.brain_num):
                _bargs = deepcopy(self.buffer_args)
                if 'NSTEP' in _bargs['type']:
                    _bargs[_bargs['type']][
                        'agents_num'] = self.env.brain_agents[i]
                self.buffer_args_s.append(_bargs)
            buffers = [
                get_buffer(self.buffer_args_s[i])
                for i in range(self.env.brain_num)
            ]
            # buffer -----------------------------------

            # model ------------------------------------
            self.model_args_s = []
            for i in range(self.env.brain_num):
                _margs = deepcopy(self.model_args)
                _margs['seed'] = self.model_args['seed'] + i * 10
                self.model_args_s.append(_margs)
            model_params = [
                {
                    's_dim': self.env.s_dim[i],
                    'a_dim_or_list': self.env.a_dim_or_list[i],
                    'visual_sources': self.env.visual_sources[i],
                    'visual_resolution': self.env.visual_resolutions[i],
                    'is_continuous': self.env.is_continuous[i],
                    'max_episode': self.train_args['max_episode'],
                    'base_dir': os.path.join(base_dir, b),
                    'logger2file': self.model_args_s[i]['logger2file'],
                    'seed': self.model_args_s[i]
                    ['seed'],  # 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100
                } for i, b in enumerate(self.env.brain_names)
            ]

            # multi agent training------------------------------------
            if self.model_args['algo'][:3] == 'ma_':
                self.ma = True
                assert self.env.brain_num > 1, 'if using ma* algorithms, the number of brains must be larger than 1'
                self.ma_data = ExperienceReplay(batch_size=10, capacity=1000)
                [
                    mp.update({
                        'n': self.env.brain_num,
                        'i': i
                    }) for i, mp in enumerate(model_params)
                ]
            else:
                self.ma = False
            # multi agent training------------------------------------

            self.models = [
                Model(**model_params[i], **algorithm_config)
                for i in range(self.env.brain_num)
            ]

            [
                model.set_buffer(buffer)
                for model, buffer in zip(self.models, buffers)
            ]
            [
                self.models[i].init_or_restore(
                    os.path.join(self.train_args['load_model_path'], b))
                for i, b in enumerate(self.env.brain_names)
            ]
            # model ------------------------------------

            self.train_args['begin_episode'] = self.models[0].get_init_episode(
            )
            if not self.train_args['inference']:
                for i, b in enumerate(self.env.brain_names):
                    records_dict = {
                        'env': self.env_args,
                        'model': self.model_args_s[i],
                        'buffer': self.buffer_args_s[i],
                        'train': self.train_args,
                        'algo': algorithm_config
                    }
                    save_config(os.path.join(base_dir, b, 'config'),
                                records_dict)

    def pwi(self, *args):
        if self.all_learner_print:
            print(f'| Model-{self.model_index} |', *args)
        elif int(self.model_index) == 0:
            print(f'|#ONLY#Model-{self.model_index} |', *args)

    def __call__(self):
        self.train()

    def train(self):
        if self.env_args['type'] == 'gym':
            try:
                self.gym_no_op()
                self.gym_train()
            finally:
                self.model.close()
                self.env.close()
        else:
            try:
                if self.ma:
                    self.ma_unity_no_op()
                    self.ma_unity_train()
                else:
                    self.unity_no_op()
                    self.unity_train()
            finally:
                [model.close() for model in self.models]
                self.env.close()

    def evaluate(self):
        if self.env_args['type'] == 'gym':
            self.gym_inference()
        else:
            if self.ma:
                self.ma_unity_inference()
            else:
                self.unity_inference()

    def init_variables(self):
        """
        inputs:
            env: Environment
        outputs:
            i: specify which item of state should be modified
            state: [vector_obs, visual_obs]
            newstate: [vector_obs, visual_obs]
        """
        i = 1 if self.env.obs_type == 'visual' else 0
        return i, [np.array([[]] * self.env.n),
                   np.array([[]] * self.env.n)], [
                       np.array([[]] * self.env.n),
                       np.array([[]] * self.env.n)
                   ]

    def get_visual_input(self, n, cameras, brain_obs):
        '''
        inputs:
            n: agents number
            cameras: camera number
            brain_obs: observations of specified brain, include visual and vector observation.
        output:
            [vector_information, [visual_info0, visual_info1, visual_info2, ...]]
        '''
        ss = []
        for j in range(n):
            s = []
            for k in range(cameras):
                s.append(brain_obs.visual_observations[k][j])
            ss.append(np.array(s))
        return np.array(ss)
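
    # Shape sketch (illustration only): with n agents and `cameras` visual sources,
    # get_visual_input returns an array of shape (n, cameras, H, W, C) -- one stack
    # of camera images per agent, gathered from brain_obs.visual_observations[camera][agent].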

    def gym_train(self):
        """
        Inputs:
            env:                gym environment
            gym_model:          algorithm model
            begin_episode:      initial episode
            save_frequency:     how often to save checkpoints
            max_step:           maximum number of steps in an episode
            max_episode:        maximum number of episodes in this training task
            render:             whether to render the env or not
            render_episode:     if 'render' is False, the episode from which to start rendering
            policy_mode:        'on-policy' or 'off-policy'
        """
        begin_episode = int(self.train_args['begin_episode'])
        render = bool(self.train_args['render'])
        render_episode = int(self.train_args.get('render_episode', 50000))
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        eval_while_train = int(self.train_args['eval_while_train'])
        max_eval_episode = int(self.train_args.get('max_eval_episode'))
        policy_mode = str(self.model_args['policy_mode'])

        i, state, new_state = self.init_variables()
        sma = SMA(100)
        for episode in range(begin_episode, max_episode):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            step = 0
            r = np.zeros(self.env.n)
            last_done_step = -1
            while True:
                step += 1
                r_tem = np.zeros(self.env.n)
                if render or episode > render_episode:
                    self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
                new_state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)[0]
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                r += r_tem
                self.model.store_data(s=state[0],
                                      visual_s=state[1],
                                      a=action,
                                      r=reward,
                                      s_=new_state[0],
                                      visual_s_=new_state[1],
                                      done=done)

                if policy_mode == 'off-policy':
                    self.model.learn(episode=episode, step=1)
                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

                if len(self.env.dones_index):  # check whether any vectorized sub-environment needs a partial reset
                    new_state[i][
                        self.env.dones_index] = self.env.partial_reset()
                state[i] = new_state[i]

            sma.update(r)
            if policy_mode == 'on-policy':
                self.model.learn(episode=episode, step=step)
            self.model.writer_summary(episode,
                                      reward_mean=r.mean(),
                                      reward_min=r.min(),
                                      reward_max=r.max(),
                                      step=last_done_step,
                                      **sma.rs)
            self.pwi('-' * 40)
            self.pwi(
                f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}'
            )
            if episode % save_frequency == 0:
                self.model.save_checkpoint(episode)

            if eval_while_train and self.env.reward_threshold is not None:
                if r.max() >= self.env.reward_threshold:
                    self.pwi(
                        f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                    )
                    self.gym_evaluate()

    def gym_evaluate(self):
        max_step = int(self.train_args['max_step'])
        max_eval_episode = int(self.train_args['max_eval_episode'])
        i, state, _ = self.init_variables()
        total_r = np.zeros(self.env.n)
        total_steps = np.zeros(self.env.n)
        episodes = max_eval_episode // self.env.n
        for _ in range(episodes):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            steps = np.zeros(self.env.n)
            r = np.zeros(self.env.n)
            while True:
                r_tem = np.zeros(self.env.n)
                action = self.model.choose_action(
                    s=state[0], visual_s=state[1], evaluation=True
                )  # In the future, this method can be combined with choose_action
                state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                steps[unfinished_index] += 1
                r += r_tem
                if all(dones_flag) or any(steps >= max_step):
                    break
            total_r += r
            total_steps += steps
        average_r = total_r.mean() / episodes
        average_step = int(total_steps.mean() / episodes)
        solved = average_r >= self.env.reward_threshold
        self.pwi(
            f'evaluate number: {max_eval_episode:3d} | average step: {average_step} | average reward: {average_r} | SOLVED: {solved}'
        )
        self.pwi(
            '----------------------------------------------------------------------------------------------------------------------------'
        )

    def gym_no_op(self):
        steps = self.train_args['no_op_steps']
        choose = self.train_args['no_op_choose']
        assert isinstance(steps, int) and steps >= 0, 'no_op.steps must be an int and >= 0'

        i, state, new_state = self.init_variables()

        state[i] = self.env.reset()

        steps = steps // self.env.n + 1

        for step in range(steps):
            self.pwi(f'no op step {step}')
            if choose:
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
            else:
                action = self.env.sample_actions()
            new_state[i], reward, done, info = self.env.step(action)
            self.model.no_op_store(s=state[0],
                                   visual_s=state[1],
                                   a=action,
                                   r=reward,
                                   s_=new_state[0],
                                   visual_s_=new_state[1],
                                   done=done)
            if len(self.env.dones_index):  # check whether any vectorized sub-environment needs a partial reset
                new_state[i][self.env.dones_index] = self.env.partial_reset()
            state[i] = new_state[i]

    def gym_inference(self):
        i, state, _ = self.init_variables()
        while True:
            state[i] = self.env.reset()
            while True:
                self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1],
                                                  evaluation=True)
                state[i], reward, done, info = self.env.step(action)
                if len(self.env.dones_index):  # check whether any vectorized sub-environment needs a partial reset
                    state[i][self.env.dones_index] = self.env.partial_reset()

    def unity_train(self):
        """
        Training loop. Runs until the episode count reaches its maximum or 'ctrl+c' is pressed.
        Inputs:
            env:                    Environment for interaction.
            models:                 all models for this training task.
            save_frequency:         how often to save checkpoints.
            reset_config:           reset configuration for the Unity environment.
            max_step:               maximum number of steps for an episode.
            sampler_manager:        sampler configuration parameters for 'reset_config'.
            resampling_interval:    how often to resample parameters for env reset.
        Variables:
            brain_names:    a list of brain names set in Unity.
            state:          a list of states for each brain; each item contains the states of all agents controlled by that brain.
            visual_state:   a list of visual observations for each brain.
            action:         a list of actions for each brain.
            dones_flag:     a list of 'done' flags for each brain, used to judge whether an episode has finished for every agent.
            agents_num:     records the number of agents for each brain.
            rewards:        records the rewards of the agents for each brain.
        """
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])

        brains_num = len(self.env.brain_names)
        state = [0] * brains_num
        visual_state = [0] * brains_num
        action = [0] * brains_num
        dones_flag = [0] * brains_num
        agents_num = [0] * brains_num
        rewards = [0] * brains_num
        sma = [SMA(100) for i in range(brains_num)]

        for episode in range(begin_episode, max_episode):
            obs = self.env.reset()
            for i, brain_name in enumerate(self.env.brain_names):
                agents_num[i] = len(obs[brain_name].agents)
                dones_flag[i] = np.zeros(agents_num[i])
                rewards[i] = np.zeros(agents_num[i])
            step = 0
            last_done_step = -1
            while True:
                step += 1
                for i, brain_name in enumerate(self.env.brain_names):
                    state[i] = obs[brain_name].vector_observations
                    visual_state[i] = self.get_visual_input(
                        agents_num[i], self.models[i].visual_sources,
                        obs[brain_name])
                    action[i] = self.models[i].choose_action(
                        s=state[i], visual_s=visual_state[i])
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                obs = self.env.step(vector_action=actions)

                for i, brain_name in enumerate(self.env.brain_names):
                    unfinished_index = np.where(dones_flag[i] == False)[0]
                    dones_flag[i] += obs[brain_name].local_done
                    next_state = obs[brain_name].vector_observations
                    next_visual_state = self.get_visual_input(
                        agents_num[i], self.models[i].visual_sources,
                        obs[brain_name])
                    self.models[i].store_data(
                        s=state[i],
                        visual_s=visual_state[i],
                        a=action[i],
                        r=np.asarray(obs[brain_name].rewards),
                        s_=next_state,
                        visual_s_=next_visual_state,
                        done=np.asarray(obs[brain_name].local_done))
                    rewards[i][unfinished_index] += np.asarray(
                        obs[brain_name].rewards)[unfinished_index]
                    if policy_mode == 'off-policy':
                        self.models[i].learn(episode=episode, step=1)

                if all([all(dones_flag[i]) for i in range(brains_num)]):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

            for i in range(brains_num):
                sma[i].update(rewards[i])
                if policy_mode == 'on-policy':
                    self.models[i].learn(episode=episode, step=step)
                self.models[i].writer_summary(episode,
                                              reward_mean=rewards[i].mean(),
                                              reward_min=rewards[i].min(),
                                              reward_max=rewards[i].max(),
                                              step=last_done_step,
                                              **sma[i].rs)
            self.pwi('-' * 40)
            self.pwi(
                f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
            )
            for i in range(brains_num):
                self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
            if episode % save_frequency == 0:
                for i in range(brains_num):
                    self.models[i].save_checkpoint(episode)

    def unity_no_op(self):
        '''
        Interact with the environment without training in order to prepopulate the ReplayBuffer.
        Make sure `steps` is greater than n if any n-step ReplayBuffer is used.
        '''
        steps = self.train_args['no_op_steps']
        choose = self.train_args['no_op_choose']
        assert isinstance(steps, int) and steps >= 0, 'no_op.steps must be an int and >= 0'

        brains_num = len(self.env.brain_names)
        state = [0] * brains_num
        visual_state = [0] * brains_num
        agents_num = [0] * brains_num
        action = [0] * brains_num
        obs = self.env.reset()

        for i, brain_name in enumerate(self.env.brain_names):
            # initialize actions to zeros
            agents_num[i] = len(obs[brain_name].agents)
            if self.env.brains[
                    brain_name].vector_action_space_type == 'continuous':
                action[i] = np.zeros(
                    (agents_num[i],
                     self.env.brains[brain_name].vector_action_space_size[0]),
                    dtype=np.int32)
            else:
                action[i] = np.zeros((
                    agents_num[i],
                    len(self.env.brains[brain_name].vector_action_space_size)),
                                     dtype=np.int32)

        steps = steps // min(agents_num) + 1

        for step in range(steps):
            self.pwi(f'no op step {step}')
            for i, brain_name in enumerate(self.env.brain_names):
                state[i] = obs[brain_name].vector_observations
                visual_state[i] = self.get_visual_input(
                    agents_num[i], self.models[i].visual_sources,
                    obs[brain_name])
                if choose:
                    action[i] = self.models[i].choose_action(
                        s=state[i], visual_s=visual_state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(self.env.brain_names)
            }
            obs = self.env.step(vector_action=actions)
            for i, brain_name in enumerate(self.env.brain_names):
                next_state = obs[brain_name].vector_observations
                next_visual_state = self.get_visual_input(
                    agents_num[i], self.models[i].visual_sources,
                    obs[brain_name])
                self.models[i].no_op_store(
                    s=state[i],
                    visual_s=visual_state[i],
                    a=action[i],
                    r=np.asarray(obs[brain_name].rewards),
                    s_=next_state,
                    visual_s_=next_visual_state,
                    done=np.asarray(obs[brain_name].local_done))

    def unity_inference(self):
        """
        Inference mode. The algorithm model will not be trained; it is only used to show the agents' behavior.
        """
        brains_num = len(self.env.brain_names)
        state = [0] * brains_num
        visual_state = [0] * brains_num
        action = [0] * brains_num
        agents_num = [0] * brains_num
        while True:
            obs = self.env.reset()
            for i, brain_name in enumerate(self.env.brain_names):
                agents_num[i] = len(obs[brain_name].agents)
            while True:
                for i, brain_name in enumerate(self.env.brain_names):
                    state[i] = obs[brain_name].vector_observations
                    visual_state[i] = self.get_visual_input(
                        agents_num[i], self.models[i].visual_sources,
                        obs[brain_name])
                    action[i] = self.models[i].choose_action(
                        s=state[i], visual_s=visual_state[i], evaluation=True)
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                obs = self.env.step(vector_action=actions)

    def ma_unity_no_op(self):
        steps = self.train_args['no_op_steps']
        choose = self.train_args['no_op_choose']
        assert isinstance(steps, int), 'multi-agent no_op.steps must be an int'
        if steps < self.ma_data.batch_size:
            steps = self.ma_data.batch_size
        brains_num = len(self.env.brain_names)
        agents_num = [0] * brains_num
        state = [0] * brains_num
        action = [0] * brains_num
        reward = [0] * brains_num
        next_state = [0] * brains_num
        dones = [0] * brains_num
        obs = self.env.reset(train_mode=False)

        for i, brain_name in enumerate(self.env.brain_names):
            agents_num[i] = len(obs[brain_name].agents)
            if self.env.brains[
                    brain_name].vector_action_space_type == 'continuous':
                action[i] = np.zeros(
                    (agents_num[i],
                     self.env.brains[brain_name].vector_action_space_size[0]),
                    dtype=np.int32)
            else:
                action[i] = np.zeros((
                    agents_num[i],
                    len(self.env.brains[brain_name].vector_action_space_size)),
                                     dtype=np.int32)

        a = [np.asarray(e) for e in zip(*action)]
        for step in range(steps):
            print(f'no op step {step}')
            for i, brain_name in enumerate(self.env.brain_names):
                state[i] = obs[brain_name].vector_observations
                if choose:
                    action[i] = self.models[i].choose_action(s=state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(self.env.brain_names)
            }
            obs = self.env.step(vector_action=actions)
            for i, brain_name in enumerate(self.env.brain_names):
                reward[i] = np.asarray(obs[brain_name].rewards)[:, np.newaxis]
                next_state[i] = obs[brain_name].vector_observations
                dones[i] = np.asarray(obs[brain_name].local_done)[:,
                                                                  np.newaxis]
            s = [np.asarray(e) for e in zip(*state)]
            a = [np.asarray(e) for e in zip(*action)]
            r = [np.asarray(e) for e in zip(*reward)]
            s_ = [np.asarray(e) for e in zip(*next_state)]
            done = [np.asarray(e) for e in zip(*dones)]
            self.ma_data.add(s, a, r, s_, done)

    def ma_unity_train(self):
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])
        assert policy_mode == 'off-policy', "multi-agent algorithms currently support off-policy only."
        brains_num = len(self.env.brain_names)
        batch_size = self.ma_data.batch_size
        agents_num = [0] * brains_num
        state = [0] * brains_num
        action = [0] * brains_num
        new_action = [0] * brains_num
        next_action = [0] * brains_num
        reward = [0] * brains_num
        next_state = [0] * brains_num
        dones = [0] * brains_num

        dones_flag = [0] * brains_num
        rewards = [0] * brains_num

        for episode in range(begin_episode, max_episode):
            obs = self.env.reset()
            for i, brain_name in enumerate(self.env.brain_names):
                agents_num[i] = len(obs[brain_name].agents)
                dones_flag[i] = np.zeros(agents_num[i])
                rewards[i] = np.zeros(agents_num[i])
            step = 0
            last_done_step = -1
            while True:
                step += 1
                for i, brain_name in enumerate(self.env.brain_names):
                    state[i] = obs[brain_name].vector_observations
                    action[i] = self.models[i].choose_action(s=state[i])
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                obs = self.env.step(vector_action=actions)

                for i, brain_name in enumerate(self.env.brain_names):
                    reward[i] = np.asarray(obs[brain_name].rewards)[:,
                                                                    np.newaxis]
                    next_state[i] = obs[brain_name].vector_observations
                    dones[i] = np.asarray(
                        obs[brain_name].local_done)[:, np.newaxis]
                    unfinished_index = np.where(dones_flag[i] == False)[0]
                    dones_flag[i] += obs[brain_name].local_done
                    rewards[i][unfinished_index] += np.asarray(
                        obs[brain_name].rewards)[unfinished_index]

                s = [np.asarray(e) for e in zip(*state)]
                a = [np.asarray(e) for e in zip(*action)]
                r = [np.asarray(e) for e in zip(*reward)]
                s_ = [np.asarray(e) for e in zip(*next_state)]
                done = [np.asarray(e) for e in zip(*dones)]
                self.ma_data.add(s, a, r, s_, done)
                s, a, r, s_, done = self.ma_data.sample()
                for i, brain_name in enumerate(self.env.brain_names):
                    next_action[i] = self.models[i].get_target_action(s=s_[:,
                                                                           i])
                    new_action[i] = self.models[i].choose_action(
                        s=s[:, i], evaluation=True)
                a_ = np.asarray([np.asarray(e) for e in zip(*next_action)])
                if policy_mode == 'off-policy':
                    for i in range(brains_num):
                        self.models[i].learn(
                            episode=episode,
                            ap=np.asarray([
                                np.asarray(e) for e in zip(*next_action[:i])
                            ]).reshape(batch_size, -1) if i != 0 else np.zeros(
                                (batch_size, 0)),
                            al=np.asarray([
                                np.asarray(e) for e in zip(
                                    *next_action[-(brains_num - i - 1):])
                            ]).reshape(batch_size, -1)
                            if brains_num - i != 1 else np.zeros(
                                (batch_size, 0)),
                            ss=s.reshape(batch_size, -1),
                            ss_=s_.reshape(batch_size, -1),
                            aa=a.reshape(batch_size, -1),
                            aa_=a_.reshape(batch_size, -1),
                            s=s[:, i],
                            r=r[:, i])

                if all([all(dones_flag[i]) for i in range(brains_num)]):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

            # if train_mode == 'perEpisode':
            #     for i in range(brains_num):
            #         self.models[i].learn(episode)

            for i in range(brains_num):
                self.models[i].writer_summary(episode,
                                              total_reward=rewards[i].mean(),
                                              step=last_done_step)
            self.pwi('-' * 40)
            self.pwi(
                f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
            )
            if episode % save_frequency == 0:
                for i in range(brains_num):
                    self.models[i].save_checkpoint(episode)

    def ma_unity_inference(self):
        """
        Inference mode. The model will not be trained; it is only used to show the agents' behavior.
        """
        brains_num = len(self.env.brain_names)
        state = [0] * brains_num
        action = [0] * brains_num
        while True:
            obs = self.env.reset()
            while True:
                for i, brain_name in enumerate(self.env.brain_names):
                    state[i] = obs[brain_name].vector_observations
                    action[i] = self.models[i].choose_action(s=state[i],
                                                             evaluation=True)
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                obs = self.env.step(vector_action=actions)
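
In ma_unity_train above, the per-brain lists (state, action, reward, next_state, dones) are transposed with zip(*...) into per-agent-copy stacks before they are pushed into the shared multi-agent buffer. A minimal sketch of that reshaping pattern, assuming only numpy and made-up shapes (two brains, each controlling three parallel agent copies with 4-dimensional observations):

import numpy as np

# two brains, each observing three parallel agent copies with 4-dim vectors
state = [np.random.rand(3, 4), np.random.rand(3, 4)]

# zip(*state) pairs the rows brain by brain, so s[k] stacks copy k's
# observations from every brain into one (2, 4) array
s = [np.asarray(e) for e in zip(*state)]
assert len(s) == 3 and s[0].shape == (2, 4)
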
Exemple #6
0
class Off_Policy(Policy):
    def __init__(self, s_dim, visual_sources, visual_resolution, a_dim_or_list,
                 is_continuous, **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim_or_list=a_dim_or_list,
                         is_continuous=is_continuous,
                         **kwargs)
        self.batch_size = int(kwargs.get('batch_size', 128))
        self.buffer_size = int(kwargs.get('buffer_size', 10000))
        self.use_priority = kwargs.get('use_priority', False)
        self.n_step = kwargs.get('n_step', False)
        self.init_data_memory()

    def init_data_memory(self):
        if self.use_priority:
            if self.n_step:
                print('N-Step PER')
                self.data = NStepPrioritizedExperienceReplay(
                    self.batch_size,
                    self.buffer_size,
                    max_episode=self.max_episode,
                    gamma=self.gamma,
                    alpha=er_config['nper_config']['alpha'],
                    beta=er_config['nper_config']['beta'],
                    epsilon=er_config['nper_config']['epsilon'],
                    agents_num=er_config['nper_config']['max_agents'],
                    n=er_config['nper_config']['n'],
                    global_v=er_config['nper_config']['global_v'])
            else:
                print('PER')
                self.data = PrioritizedExperienceReplay(
                    self.batch_size,
                    self.buffer_size,
                    max_episode=self.max_episode,
                    alpha=er_config['per_config']['alpha'],
                    beta=er_config['per_config']['beta'],
                    epsilon=er_config['per_config']['epsilon'],
                    global_v=er_config['nper_config']['global_v'])
        else:
            if self.n_step:
                print('N-Step ER')
                self.data = NStepExperienceReplay(
                    self.batch_size,
                    self.buffer_size,
                    gamma=self.gamma,
                    agents_num=er_config['ner_config']['max_agents'],
                    n=er_config['ner_config']['n'])
            else:
                print('ER')
                self.data = ExperienceReplay(self.batch_size, self.buffer_size)

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        For off-policy training, use this function to store <s, a, r, s_, done> transitions in the replay buffer.
        """
        assert isinstance(a,
                          np.ndarray), "store need action type is np.ndarray"
        assert isinstance(r,
                          np.ndarray), "store need reward type is np.ndarray"
        assert isinstance(done,
                          np.ndarray), "store need done type is np.ndarray"
        if not self.is_continuous:
            a = sth.action_index2one_hot(a, self.a_dim_or_list)
        self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_,
                      done[:, np.newaxis])

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "no_op_store need action type is np.ndarray"
        assert isinstance(
            r, np.ndarray), "no_op_store need reward type is np.ndarray"
        assert isinstance(
            done, np.ndarray), "no_op_store need done type is np.ndarray"
        if not self.is_continuous:
            a = sth.action_index2one_hot(a, self.a_dim_or_list)
        self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_,
                      done[:, np.newaxis])
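
Off_Policy.store_data and no_op_store convert discrete action indices to one-hot vectors via sth.action_index2one_hot before writing to the buffer. That helper is internal to the repository, so the snippet below is only a plain-numpy sketch of its assumed behaviour (the multi-branch handling is a guess based on a_dim_or_list being a list of branch sizes):

import numpy as np

def action_index2one_hot(a, a_dim_or_list):
    # assumed behaviour: turn integer indices into a flat one-hot vector
    # covering every discrete branch listed in a_dim_or_list
    a = np.asarray(a, dtype=np.int64).reshape(-1, len(a_dim_or_list))
    pieces = [np.eye(dim)[a[:, i]] for i, dim in enumerate(a_dim_or_list)]
    return np.concatenate(pieces, axis=-1)

# two agents, one discrete branch with 4 actions -> rows [0,1,0,0] and [0,0,0,1]
print(action_index2one_hot([1, 3], [4]))
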
Exemple #7
0
class Policy(Base):
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim_or_list,
                 action_type,
                 gamma,
                 max_episode,
                 base_dir,
                 policy_mode=None,
                 batch_size=1,
                 buffer_size=1,
                 use_priority=False,
                 n_step=False):
        super().__init__(a_dim_or_list, action_type, base_dir)
        self.s_dim = s_dim
        self.visual_sources = visual_sources
        self.visual_dim = [visual_sources, *visual_resolution
                           ] if visual_sources else [0]
        self.a_dim_or_list = a_dim_or_list
        self.gamma = gamma
        self.max_episode = max_episode
        self.policy_mode = policy_mode
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        '''
        The biggest difference between the policy modes (ON and OFF) is that 'OFF' mode needs to
        raise the dimension of 'r' and 'done'.
        In 'ON' mode the program calls on_store and keeps the data in a pandas DataFrame;
        in 'OFF' mode it calls off_store and keeps the data in a replay buffer.
        '''
        if self.policy_mode == 'ON':
            self.data = pd.DataFrame(columns=['s', 'a', 'r', 's_', 'done'])
        elif self.policy_mode == 'OFF':
            if use_priority:
                if n_step:
                    print('N-Step PER')
                    self.data = NStepPrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        gamma=self.gamma,
                        alpha=0.6,
                        beta=0.2,
                        epsilon=0.01,
                        agents_num=20,
                        n=4)
                else:
                    print('PER')
                    self.data = PrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        alpha=0.6,
                        beta=0.2,
                        epsilon=0.01)
            else:
                if n_step:
                    print('N-Step ER')
                    self.data = NStepExperienceReplay(self.batch_size,
                                                      self.buffer_size,
                                                      gamma=self.gamma,
                                                      agents_num=20,
                                                      n=4)
                else:
                    print('ER')
                    self.data = ExperienceReplay(self.batch_size,
                                                 self.buffer_size)
        else:
            raise Exception('Please specify a policy mode!')

    def on_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        For on-policy training, use this function to store <s, a, r, s_, done> transitions in a pandas DataFrame.
        """
        assert isinstance(
            a, np.ndarray), "on_store need action type is np.ndarray"
        assert isinstance(
            r, np.ndarray), "on_store need reward type is np.ndarray"
        assert isinstance(done,
                          np.ndarray), "on_store need done type is np.ndarray"
        self.data = self.data.append(
            {
                's': s,
                'visual_s': visual_s,
                'a': a,
                'r': r,
                's_': s_,
                'visual_s_': visual_s_,
                'done': done
            },
            ignore_index=True)

    def off_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        For off-policy training, use this function to store <s, a, r, s_, done> transitions in the replay buffer.
        """
        assert isinstance(
            a, np.ndarray), "off_store need action type is np.ndarray"
        assert isinstance(
            r, np.ndarray), "off_store need reward type is np.ndarray"
        assert isinstance(done,
                          np.ndarray), "off_store need done type is np.ndarray"
        self.data.add(s, visual_s, a, r, s_, visual_s_, done)

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "no_op_store need action type is np.ndarray"
        assert isinstance(
            r, np.ndarray), "no_op_store need reward type is np.ndarray"
        assert isinstance(
            done, np.ndarray), "no_op_store need done type is np.ndarray"
        if self.policy_mode == 'OFF':
            self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_,
                          done[:, np.newaxis])

    def clear(self):
        """
        clear the DataFrame.
        """
        self.data.drop(self.data.index, inplace=True)

    def get_max_episode(self):
        """
        get the max episode of this training model.
        """
        return self.max_episode
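
The docstring's note about raising the dimension of 'r' and 'done' simply means that the per-agent reward and done vectors become column vectors before entering the replay buffer, so they line up with the batched state arrays. A minimal numpy illustration (shapes chosen arbitrarily):

import numpy as np

r = np.array([0.1, -1.0, 0.5])          # one scalar reward per agent, shape (3,)
done = np.array([False, True, False])   # termination flags, shape (3,)

# the 'OFF' branch stores column vectors of shape (3, 1)
print(r[:, np.newaxis].shape)
print(done[:, np.newaxis].astype(np.float32))
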
Exemple #8
0
class Policy(Base):
    def __init__(self,
                 a_dim_or_list,
                 action_type,
                 base_dir,

                 s_dim,
                 visual_sources,
                 visual_resolution,
                 gamma,
                 max_episode,
                 policy_mode=None,
                 batch_size=1,
                 buffer_size=1,
                 use_priority=False,
                 n_step=False):
        super().__init__(
            a_dim_or_list=a_dim_or_list,
            action_type=action_type,
            base_dir=base_dir)
        self.s_dim = s_dim
        self.visual_sources = visual_sources
        self.visual_dim = [visual_sources, *visual_resolution] if visual_sources else [0]
        self.a_dim_or_list = a_dim_or_list
        self.gamma = gamma
        self.max_episode = max_episode
        self.policy_mode = policy_mode
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.use_priority = use_priority
        self.n_step = n_step
        self.init_data_memory()

    def init_data_memory(self):
        '''
        The biggest difference between the policy modes (ON and OFF) is that 'OFF' mode needs to
        raise the dimension of 'r' and 'done'.
        In 'ON' mode the program calls on_store and keeps the data in a pandas DataFrame;
        in 'OFF' mode it calls off_store and keeps the data in a replay buffer.
        '''
        if self.policy_mode == 'ON':
            self.data = pd.DataFrame(columns=['s', 'a', 'r', 'done'])
        elif self.policy_mode == 'OFF':
            if self.use_priority:
                if self.n_step:
                    print('N-Step PER')
                    self.data = NStepPrioritizedExperienceReplay(self.batch_size,
                                                                 self.buffer_size,
                                                                 max_episode=self.max_episode,
                                                                 gamma=self.gamma,
                                                                 alpha=er_config['nper_config']['alpha'],
                                                                 beta=er_config['nper_config']['beta'],
                                                                 epsilon=er_config['nper_config']['epsilon'],
                                                                 agents_num=er_config['nper_config']['max_agents'],
                                                                 n=er_config['nper_config']['n'],
                                                                 global_v=er_config['nper_config']['global_v'])
                else:
                    print('PER')
                    self.data = PrioritizedExperienceReplay(self.batch_size,
                                                            self.buffer_size,
                                                            max_episode=self.max_episode,
                                                            alpha=er_config['per_config']['alpha'],
                                                            beta=er_config['per_config']['beta'],
                                                            epsilon=er_config['per_config']['epsilon'],
                                                            global_v=er_config['nper_config']['global_v'])
            else:
                if self.n_step:
                    print('N-Step ER')
                    self.data = NStepExperienceReplay(self.batch_size,
                                                      self.buffer_size,
                                                      gamma=self.gamma,
                                                      agents_num=er_config['ner_config']['max_agents'],
                                                      n=er_config['ner_config']['n'])
                else:
                    print('ER')
                    self.data = ExperienceReplay(self.batch_size, self.buffer_size)
        else:
            raise Exception('Please specify a policy mode!')

    def on_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        For on-policy training, use this function to store <s, a, r, s_, done> transitions in a pandas DataFrame.
        """
        assert isinstance(a, np.ndarray), "on_store need action type is np.ndarray"
        assert isinstance(r, np.ndarray), "on_store need reward type is np.ndarray"
        assert isinstance(done, np.ndarray), "on_store need done type is np.ndarray"
        if not self.action_type == 'continuous':
            a = sth.action_index2one_hot(a, self.a_dim_or_list)
        self.data = self.data.append({
            's': s.astype(np.float32),
            'visual_s': visual_s.astype(np.float32),
            'a': a.astype(np.float32),
            'r': r.astype(np.float32),
            's_': s_.astype(np.float32),
            'visual_s_': visual_s_.astype(np.float32),
            'done': done.astype(np.float32)
        }, ignore_index=True)

    def off_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        For off-policy training, use this function to store <s, a, r, s_, done> transitions in the replay buffer.
        """
        assert isinstance(a, np.ndarray), "off_store need action type is np.ndarray"
        assert isinstance(r, np.ndarray), "off_store need reward type is np.ndarray"
        assert isinstance(done, np.ndarray), "off_store need done type is np.ndarray"
        if not self.action_type == 'continuous':
            a = sth.action_index2one_hot(a, self.a_dim_or_list)
        self.data.add(
            s.astype(np.float32),
            visual_s.astype(np.float32),
            a.astype(np.float32),
            r.astype(np.float32),
            s_.astype(np.float32),
            visual_s_.astype(np.float32),
            done.astype(np.float32)
        )

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "no_op_store need action type is np.ndarray"
        assert isinstance(r, np.ndarray), "no_op_store need reward type is np.ndarray"
        assert isinstance(done, np.ndarray), "no_op_store need done type is np.ndarray"
        if self.policy_mode == 'OFF':
            if not self.action_type == 'continuous':
                a = sth.action_index2one_hot(a, self.a_dim_or_list)
            self.data.add(
                s.astype(np.float32),
                visual_s.astype(np.float32),
                a.astype(np.float32),
                r[:, np.newaxis].astype(np.float32),
                s_.astype(np.float32),
                visual_s_.astype(np.float32),
                done[:, np.newaxis].astype(np.float32)
            )

    def clear(self):
        """
        clear the DataFrame.
        """
        self.data.drop(self.data.index, inplace=True)

    def get_max_episode(self):
        """
        get the max episode of this training model.
        """
        return self.max_episode

    def get_TensorSpecs(self, *args):
        """
        get all inputs' shape in order to fix the problem of retracting in TF2.0
        """
        return [tf.TensorSpec(shape=[None] + i, dtype=tf.float32) for i in args]

    @staticmethod
    def clip_nn_log_std(log_std, _min=-20, _max=2):
        """
        scale log_std from [-1, 1] to [_min, _max]
        """
        return _min + 0.5 * (_max - _min) * (log_std + 1)

    @staticmethod
    def gaussian_reparam_sample(mu, log_std):
        """
        Reparameterization trick: sample pi = mu + std * noise and return it with its Gaussian log-likelihood.
        """
        std = tf.exp(log_std)
        pi = mu + tf.random.normal(mu.shape) * std
        log_pi = Policy.gaussian_likelihood(pi, mu, log_std)
        return pi, log_pi

    @staticmethod
    def gaussian_likelihood(x, mu, log_std):
        pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + 1e-8))**2 + 2 * log_std + np.log(2 * np.pi))
        return tf.reduce_sum(pre_sum, axis=1, keepdims=True)

    @staticmethod
    def gaussian_entropy(log_std):
        return tf.reduce_mean(0.5 * (1 + tf.math.log(2 * np.pi * tf.exp(log_std)**2)))

    @staticmethod
    def squash_action(pi, log_pi=None):
        """
        Enforce action bounds: squash the action into [-1, 1] with tanh and apply the
        change-of-variables correction to the log-probability.
        """
        pi = tf.tanh(pi)
        if log_pi is not None:
            sub = tf.reduce_sum(tf.math.log(Policy.clip_but_pass_gradient(1 - pi**2, l=0, h=1) + 1e-6), axis=1, keepdims=True)
            log_pi -= sub
        return pi, log_pi

    @staticmethod
    def unsquash_action(mu, pi, log_std):
        """
        Unsquash the action from [-1, 1] back to (-inf, inf) and return its corrected log-probability.
        """
        _pi = tf.atanh(pi)
        log_pi = Policy.gaussian_likelihood(_pi, mu, log_std)
        sub = tf.reduce_sum(tf.math.log(Policy.clip_but_pass_gradient(1 - pi**2, l=0, h=1) + 1e-6), axis=1, keepdims=True)
        log_pi -= sub
        return log_pi

    @staticmethod
    def clip_but_pass_gradient(x, l=-1., h=1.):
        """
        Stole this function from SpinningUp
        """
        clip_up = tf.cast(x > h, tf.float32)
        clip_low = tf.cast(x < l, tf.float32)
        return x + tf.stop_gradient((h - x) * clip_up + (l - x) * clip_low)
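
The static helpers above implement a standard squashed-Gaussian policy: sample with the reparameterization trick, tanh the sample into [-1, 1], and subtract log(1 - tanh(u)^2) from the Gaussian log-likelihood. A minimal TensorFlow 2 sketch of the same math, written independently of the Policy class (shapes and values are arbitrary):

import numpy as np
import tensorflow as tf

mu = tf.zeros((1, 2))
log_std = tf.fill((1, 2), -0.5)
std = tf.exp(log_std)

u = mu + tf.random.normal(tf.shape(mu)) * std        # reparameterized sample
log_pi = tf.reduce_sum(
    -0.5 * (((u - mu) / (std + 1e-8))**2 + 2 * log_std + np.log(2 * np.pi)),
    axis=1, keepdims=True)

pi = tf.tanh(u)                                       # squash into [-1, 1]
log_pi -= tf.reduce_sum(tf.math.log(1 - pi**2 + 1e-6),  # change-of-variables correction
                        axis=1, keepdims=True)
print(pi.numpy(), log_pi.numpy())
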
Exemple #9
0
class MyPolicy(RL_Policy):
    """
    Implement your own agent policy.
    """

    def __init__(self, dim, name='wjs_policy'):
        super().__init__(dim, name)
        
        self.state_dim = dim * dim * 3
        self.gamma = 0.99
        self.lr = 0.0005
        self.data = ExperienceReplay(batch_size=100, capacity=10000)

        with self.graph.as_default():
            self.pl_s = tf.placeholder(tf.float32, [None, self.state_dim], 'state')
            self.pl_r = tf.placeholder(tf.float32, [None, 1], 'reward')
            self.pl_s_ = tf.placeholder(tf.float32, [None, self.state_dim], 'next_state')
            self.pl_done = tf.placeholder(tf.float32, [None, 1], 'done')
            
            self.v = self.v_net('v', self.pl_s)
            self.action = tf.argmax(self.v)
            self.v_ = self.v_net('v', self.pl_s_)
            self.predict = tf.stop_gradient(self.pl_r + self.gamma * self.v_ * (1 - self.pl_done))

            self.v_loss = tf.reduce_mean(tf.squared_difference(self.v, self.predict))
            self.v_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='v')
            optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_v = optimizer.minimize(self.v_loss, var_list=self.v_vars, global_step=self.global_step)

            tf.summary.scalar('LOSS/v_loss', self.v_loss)
            self.summaries = tf.summary.merge_all()

            self.sess.run(tf.global_variables_initializer())
    
    def update_offset(self, offset):
        assert type(offset) == int
        self.offset = offset

    def v_net(self, name, input_vector):
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            l1 = tf.layers.dense(input_vector, 128, tf.nn.relu, **initKernelAndBias)
            l2 = tf.layers.dense(l1, 64, tf.nn.relu, **initKernelAndBias)
            l3 = tf.layers.dense(l2, 32, tf.nn.relu, **initKernelAndBias)
            v = tf.layers.dense(l3, 1, None, **initKernelAndBias)
            return v

    def store(self, **kargs):
        self.data.add(*kargs.values())

    def choose_action(self, state):
        indexs, all_states = self.get_all_available_actions(state)
        if np.random.rand() > 0.2:
            action = self.sess.run(self.action, feed_dict={
                self.pl_s: all_states
            })[0]
        else:
            action = np.random.randint(len(indexs))
        x, y = indexs[action] % self.dim, indexs[action] // self.dim
        return x, y

    def learn(self):
        try:
            s, r, s_, done = self.data.sample()
            summaries, _ = self.sess.run([self.summaries, self.train_v], feed_dict={
                self.pl_s: np.eye(3)[s].reshape(s.shape[0],-1),
                self.pl_r: r[:, np.newaxis],
                self.pl_s_: np.eye(3)[s_].reshape(s.shape[0],-1),
                self.pl_done: done[:, np.newaxis]
            })
            self.writer.add_summary(summaries, self.sess.run(self.global_step))
        except Exception as e:
            print(e)
            return

    def get_all_available_actions(self, state):
        assert isinstance(state, np.ndarray), "state is not an np.ndarray"
        indexs = []
        for i in range(state.shape[0]):
            if state[i] == 2:
                indexs.append(i)
        all_states = []
        for i in indexs:
            a = np.zeros_like(state)
            a[i] = self.offset
            all_states.append(state - a)
        return indexs, np.array([np.eye(3)[i].reshape(-1) for i in all_states])
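
MyPolicy.choose_action above is epsilon-greedy over the candidate boards returned by get_all_available_actions: with probability 0.8 it takes the argmax of the value network, otherwise it picks a random empty cell, and the chosen flat index is mapped back to board coordinates. A minimal sketch of just the selection and index arithmetic, with made-up values on an assumed 3x3 board:

import numpy as np

dim = 3
indexs = [2, 4, 7]                  # flat indices of the currently empty cells
values = np.array([0.1, 0.9, 0.3])  # value estimate for each candidate move

if np.random.rand() > 0.2:          # exploit with probability 0.8
    action = int(np.argmax(values))
else:                               # explore: pick a random legal move
    action = np.random.randint(len(indexs))
x, y = indexs[action] % dim, indexs[action] // dim
print(x, y)
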
Exemple #10
0
class MyPolicy(RL_Policy):
    """
    Implement your own agent policy.
    """
    def __init__(self, dim, name='wjs_policy'):
        super().__init__(dim, name)

        self.state_dim = dim * dim * 3
        self.gamma = 0.99
        self.lr = 0.0005
        self.data = ExperienceReplay(batch_size=100, capacity=10000)
        self.v_net = V(vector_dim=self.state_dim,
                       name='v_net',
                       hidden_units=[128, 64, 32])
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)

    def update_offset(self, offset):
        assert isinstance(offset, int)
        self.offset = offset

    def store(self, **kargs):
        self.data.add(*kargs.values())

    @tf.function
    def _get_action(self, state):
        return tf.argmax(self.v_net(state))

    def choose_action(self, state):
        indexs, all_states = self.get_all_available_actions(state)
        if np.random.rand() > 0.2:
            action = self._get_action(all_states)[0]
        else:
            action = np.random.randint(len(indexs))
        x, y = indexs[action] % self.dim, indexs[action] // self.dim
        return x, y

    def learn(self):
        try:
            s, r, s_, done = self.data.sample()
            s = np.eye(3)[s].reshape(s.shape[0], -1)
            r = r[:, np.newaxis]
            s_ = np.eye(3)[s_].reshape(s.shape[0], -1)
            done = done[:, np.newaxis]
            summaries = self.train(s, r, s_, done)
            tf.summary.experimental.set_step(self.global_step)
            self.write_training_summaries(summaries)
            tf.summary.scalar('LEARNING_RATE/lr', self.lr)
            self.writer.flush()
        except Exception as e:
            print(e)
            return

    @tf.function
    def train(self, s, r, s_, done):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                v = self.v_net(s)
                v_ = self.v_net(s_)
                predict = tf.stop_gradient(r + self.gamma * v_ * (1 - done))
                v_loss = tf.reduce_mean((v - predict)**2)
            grads = tape.gradient(v_loss, self.v_net.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.v_net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([['LOSS/v_loss', v_loss]])

    def get_all_available_actions(self, state):
        assert isinstance(state, np.ndarray), "state is not an np.ndarray"
        indexs = []
        for i in range(state.shape[0]):
            if state[i] == 2:
                indexs.append(i)
        all_states = []
        for i in indexs:
            a = np.zeros_like(state)
            a[i] = self.offset
            all_states.append(state - a)
        return indexs, np.array([np.eye(3)[i].reshape(-1) for i in all_states])
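
MyPolicy.train in this TF2 version is a one-step TD(0) value regression: the target r + gamma * V(s') * (1 - done) is frozen with tf.stop_gradient and V(s) is pulled toward it. A standalone sketch of that update with a toy tf.keras model standing in for the repository's V network (layer sizes and data are arbitrary):

import numpy as np
import tensorflow as tf

v_net = tf.keras.Sequential([tf.keras.layers.Dense(16, activation='relu'),
                             tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)
gamma = 0.99

s = np.random.rand(8, 4).astype(np.float32)     # batch of states
s_ = np.random.rand(8, 4).astype(np.float32)    # batch of next states
r = np.random.rand(8, 1).astype(np.float32)     # rewards as column vectors
done = np.zeros((8, 1), dtype=np.float32)       # termination flags

with tf.GradientTape() as tape:
    target = tf.stop_gradient(r + gamma * v_net(s_) * (1 - done))  # TD(0) target
    v_loss = tf.reduce_mean((v_net(s) - target) ** 2)
grads = tape.gradient(v_loss, v_net.trainable_variables)
optimizer.apply_gradients(zip(grads, v_net.trainable_variables))
print(float(v_loss))
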