Example 1
    def __init__(self, dim, name='wjs_policy'):
        super().__init__(dim, name)
        
        self.state_dim = dim * dim * 3
        self.gamma = 0.99
        self.lr = 0.0005
        self.data = ExperienceReplay(batch_size = 100, capacity=10000)

        with self.graph.as_default():
            self.pl_s = tf.placeholder(tf.float32, [None, self.state_dim], 'state')
            self.pl_r = tf.placeholder(tf.float32, [None, 1], 'reward')
            self.pl_s_ = tf.placeholder(tf.float32, [None, self.state_dim], 'next_state')
            self.pl_done = tf.placeholder(tf.float32, [None, 1], 'done')
            
            self.v = self.v_net('v', self.pl_s)
            self.action = tf.argmax(self.v)
            self.v_ = self.v_net('v', self.pl_s_)
            self.predict = tf.stop_gradient(self.pl_r + self.gamma * self.v_ * (1 - self.pl_done))

            self.v_loss = tf.reduce_mean(tf.squared_difference(self.v, self.predict))
            self.v_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='v')
            optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_v = optimizer.minimize(self.v_loss, var_list=self.v_vars, global_step=self.global_step)

            tf.summary.scalar('LOSS/v_loss', self.v_loss)
            self.summaries = tf.summary.merge_all()

            self.sess.run(tf.global_variables_initializer())
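The graph in Example 1 regresses V(s) toward the one-step TD target r + gamma * V(s') * (1 - done), with tf.stop_gradient blocking gradients through the bootstrap term. A minimal NumPy sketch of that target computation (an illustration of the formula, not the project's code):

import numpy as np

def td_target(r, v_next, done, gamma=0.99):
    # one-step TD target: r + gamma * V(s') * (1 - done);
    # terminal transitions (done == 1) drop the bootstrap term
    return r + gamma * v_next * (1.0 - done)

# toy batch of rewards, next-state values and done flags
r = np.array([[1.0], [0.0], [0.5]], dtype=np.float32)
v_next = np.array([[2.0], [1.5], [0.0]], dtype=np.float32)
done = np.array([[0.0], [0.0], [1.0]], dtype=np.float32)
print(td_target(r, v_next, done))  # [[2.98], [1.485], [0.5]]
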
Example 2
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim_or_list,
                 action_type,
                 gamma,
                 max_episode,
                 base_dir,
                 policy_mode=None,
                 batch_size=1,
                 buffer_size=1,
                 use_priority=False,
                 n_step=False):
        super().__init__(a_dim_or_list, action_type, base_dir)
        self.s_dim = s_dim
        self.visual_sources = visual_sources
        self.visual_dim = [visual_sources, *visual_resolution] if visual_sources else [0]
        self.a_dim_or_list = a_dim_or_list
        self.gamma = gamma
        self.max_episode = max_episode
        self.policy_mode = policy_mode
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        '''
        The biggest difference between the policy modes (ON and OFF) is that 'OFF' mode needs to
        raise the dimension of 'r' and 'done'.
        'ON' mode means the program will call the on_store function and use a pandas DataFrame to store data.
        'OFF' mode will call the off_store function and use a replay buffer to store data.
        '''
        if self.policy_mode == 'ON':
            self.data = pd.DataFrame(columns=['s', 'a', 'r', 's_', 'done'])
        elif self.policy_mode == 'OFF':
            if use_priority:
                if n_step:
                    print('N-Step PER')
                    self.data = NStepPrioritizedExperienceReplay(self.batch_size, self.buffer_size, max_episode=self.max_episode,
                                                                 gamma=self.gamma, alpha=0.6, beta=0.2, epsilon=0.01, agents_num=20, n=4)
                else:
                    print('PER')
                    self.data = PrioritizedExperienceReplay(self.batch_size, self.buffer_size, max_episode=self.max_episode, alpha=0.6, beta=0.2, epsilon=0.01)
            else:
                if n_step:
                    print('N-Step ER')
                    self.data = NStepExperienceReplay(self.batch_size, self.buffer_size, gamma=self.gamma, agents_num=20, n=4)
                else:
                    print('ER')
                    self.data = ExperienceReplay(self.batch_size, self.buffer_size)
        else:
            raise Exception('Please specify a policy mode!')

        with self.graph.as_default():
            self.pl_s = tf.placeholder(tf.float32, [None, self.s_dim], 'vector_observation')
            self.pl_a = tf.placeholder(tf.float32, [None, self.a_counts], 'pl_action')
            self.pl_r = tf.placeholder(tf.float32, [None, 1], 'reward')
            self.pl_s_ = tf.placeholder(tf.float32, [None, self.s_dim], 'next_state')
            self.pl_done = tf.placeholder(tf.float32, [None, 1], 'done')
            self.pl_visual_s = tf.placeholder(tf.float32, [None] + self.visual_dim, 'visual_observation_')
            self.pl_visual_s_ = tf.placeholder(tf.float32, [None] + self.visual_dim, 'next_visual_observation')
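As the docstring in Example 2 notes, 'ON' mode stores transitions in a pandas DataFrame while 'OFF' mode pushes them into a replay buffer and raises the dimension of 'r' and 'done'. A rough sketch of what those two storage paths could look like; the on_store/off_store helpers below are hypothetical stand-ins, not the project's actual functions:

import numpy as np
import pandas as pd

def on_store(df, s, a, r, s_, done):
    # on-policy: append one transition as a new DataFrame row
    df.loc[len(df)] = [s, a, r, s_, done]
    return df

def off_store(buffer, s, a, r, s_, done):
    # off-policy: raise r/done to shape [1] before adding to the replay buffer
    buffer.append((s, a, np.array([r], np.float32), s_, np.array([done], np.float32)))
    return buffer

df = pd.DataFrame(columns=['s', 'a', 'r', 's_', 'done'])
df = on_store(df, s=[0.1, 0.2], a=1, r=1.0, s_=[0.3, 0.4], done=0.0)
buf = off_store([], s=[0.1, 0.2], a=1, r=1.0, s_=[0.3, 0.4], done=0.0)
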
Example 3
    def __init__(self, dim, name='wjs_policy'):
        super().__init__(dim, name)

        self.state_dim = dim * dim * 3
        self.gamma = 0.99
        self.lr = 0.0005
        self.data = ExperienceReplay(batch_size=100, capacity=10000)
        self.v_net = V(vector_dim=self.state_dim,
                       name='v_net',
                       hidden_units=[128, 64, 32])
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)
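Example 3 appears to be the eager tf.keras counterpart of Example 1: the value network and Adam optimizer are plain objects instead of graph ops. A hedged sketch of what one update step could look like, using a stand-in Sequential model in place of the project's V class:

import tensorflow as tf

state_dim, gamma = 27, 0.99  # e.g. dim * dim * 3 with dim == 3
v_net = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(state_dim,)),
    tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)

@tf.function
def train_step(s, r, s_, done):
    # TD target uses stop_gradient so only V(s) receives gradients
    target = tf.stop_gradient(r + gamma * v_net(s_) * (1.0 - done))
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(v_net(s) - target))
    grads = tape.gradient(loss, v_net.trainable_variables)
    optimizer.apply_gradients(zip(grads, v_net.trainable_variables))
    return loss

# toy batch, just to show the call signature
s = tf.random.normal([32, state_dim]); s_ = tf.random.normal([32, state_dim])
r = tf.zeros([32, 1]); done = tf.zeros([32, 1])
loss = train_step(s, r, s_, done)
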
Example 4
 def init_data_memory(self):
     '''
     The biggest difference between the policy modes (ON and OFF) is that 'OFF' mode needs to
     raise the dimension of 'r' and 'done'.
     'ON' mode means the program will call the on_store function and use a pandas DataFrame to store data.
     'OFF' mode will call the off_store function and use a replay buffer to store data.
     '''
     if self.policy_mode == 'ON':
         self.data = pd.DataFrame(columns=['s', 'a', 'r', 'done'])
     elif self.policy_mode == 'OFF':
         if self.use_priority:
             if self.n_step:
                 print('N-Step PER')
                 self.data = NStepPrioritizedExperienceReplay(
                     self.batch_size,
                     self.buffer_size,
                     max_episode=self.max_episode,
                     gamma=self.gamma,
                     alpha=er_config['nper_config']['alpha'],
                     beta=er_config['nper_config']['beta'],
                     epsilon=er_config['nper_config']['epsilon'],
                     agents_num=er_config['nper_config']['max_agents'],
                     n=er_config['nper_config']['n'],
                     global_v=er_config['nper_config']['global_v'])
             else:
                 print('PER')
                 self.data = PrioritizedExperienceReplay(
                     self.batch_size,
                     self.buffer_size,
                     max_episode=self.max_episode,
                     alpha=er_config['per_config']['alpha'],
                     beta=er_config['per_config']['beta'],
                     epsilon=er_config['per_config']['epsilon'],
                     global_v=er_config['nper_config']['global_v'])
         else:
             if self.n_step:
                 print('N-Step ER')
                 self.data = NStepExperienceReplay(
                     self.batch_size,
                     self.buffer_size,
                     gamma=self.gamma,
                     agents_num=er_config['ner_config']['max_agents'],
                     n=er_config['ner_config']['n'])
             else:
                 print('ER')
                 self.data = ExperienceReplay(self.batch_size,
                                              self.buffer_size)
     else:
         raise Exception('Please specify a policy mode!')
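The alpha/beta/epsilon values read from er_config are the usual proportional prioritized-replay hyperparameters: alpha controls how strongly TD errors skew sampling, beta scales the importance-sampling correction, and epsilon keeps priorities strictly positive. A small self-contained sketch of that sampling rule (illustrative only, not the project's PrioritizedExperienceReplay):

import numpy as np

def per_sample(td_errors, batch_size, alpha=0.6, beta=0.2, epsilon=0.01):
    # priority p_i = (|delta_i| + epsilon) ** alpha
    p = (np.abs(td_errors) + epsilon) ** alpha
    probs = p / p.sum()
    idx = np.random.choice(len(td_errors), size=batch_size, p=probs)
    # importance-sampling weights, normalized by the largest weight
    w = (len(td_errors) * probs[idx]) ** (-beta)
    return idx, w / w.max()

idx, weights = per_sample(np.array([0.5, 2.0, 0.1, 1.0]), batch_size=2)
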
Example 5
 def __init__(self,
              state_dim,
              learning_rate=5.0e-4,
              buffer_size=10000,
              batch_size=128,
              epochs=2,
              name='wjs_policy',
              cp_dir='./models'):
     super().__init__(cp_dir=cp_dir)
     self.lr = learning_rate
     self.epochs = epochs
     self.data = ExperienceReplay(batch_size=batch_size,
                                  capacity=buffer_size)
     self.net = PV(state_dim=state_dim, name='pv_net')
     self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)
Example 6
 def init_data_memory(self):
     if self.use_priority:
         if self.n_step:
             print('N-Step PER')
             self.data = NStepPrioritizedExperienceReplay(
                 self.batch_size,
                 self.buffer_size,
                 max_episode=self.max_episode,
                 gamma=self.gamma,
                 alpha=er_config['nper_config']['alpha'],
                 beta=er_config['nper_config']['beta'],
                 epsilon=er_config['nper_config']['epsilon'],
                 agents_num=er_config['nper_config']['max_agents'],
                 n=er_config['nper_config']['n'],
                 global_v=er_config['nper_config']['global_v'])
         else:
             print('PER')
             self.data = PrioritizedExperienceReplay(
                 self.batch_size,
                 self.buffer_size,
                 max_episode=self.max_episode,
                 alpha=er_config['per_config']['alpha'],
                 beta=er_config['per_config']['beta'],
                 epsilon=er_config['per_config']['epsilon'],
                 global_v=er_config['nper_config']['global_v'])
     else:
         if self.n_step:
             print('N-Step ER')
             self.data = NStepExperienceReplay(
                 self.batch_size,
                 self.buffer_size,
                 gamma=self.gamma,
                 agents_num=er_config['ner_config']['max_agents'],
                 n=er_config['ner_config']['n'])
         else:
             print('ER')
             self.data = ExperienceReplay(self.batch_size, self.buffer_size)
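The n-step buffers above take gamma, n and agents_num so they can fold n consecutive rewards into a single transition per agent. A minimal sketch of the n-step return such a buffer would effectively hand back (assumed behaviour, not the NStepExperienceReplay internals):

import numpy as np

def n_step_return(rewards, v_bootstrap, gamma=0.99, n=4):
    # R = r_t + gamma*r_{t+1} + ... + gamma**(n-1)*r_{t+n-1} + gamma**n * V(s_{t+n})
    discounts = gamma ** np.arange(n)
    return float(np.dot(discounts, rewards[:n]) + gamma ** n * v_bootstrap)

print(n_step_return([1.0, 0.0, 0.0, 1.0], v_bootstrap=2.0))
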
Example 7
    def __init__(self, env_args: Config, model_args: Config,
                 buffer_args: Config, train_args: Config):
        # print("89898989")
        self.env_args = env_args
        self.model_args = model_args
        self.buffer_args = buffer_args
        self.train_args = train_args
        self.use_GCN = False
        self.model_index = str(self.train_args.get('index'))
        self.all_learner_print = bool(
            self.train_args.get('all_learner_print', False))
        if '-' not in self.train_args['name']:
            self.train_args['name'] += f'-{self.model_index}'
        if self.model_args['load'] is None:
            self.train_args['load_model_path'] = os.path.join(
                self.train_args['base_dir'], self.train_args['name'])
        else:
            if '/' in self.model_args['load'] or '\\' in self.model_args[
                    'load']:  # every training process initializes from this model path (absolute path)
                self.train_args['load_model_path'] = self.model_args['load']
            elif '-' in self.model_args['load']:
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'])  # name and index are both given; every training process initializes from this model path (relative path)
            else:  # only the training name is given for 'load'; the process index is appended automatically
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'] + f'-{self.model_index}')

        # ENV

        self.env = make_env(self.env_args.to_dict, self.use_GCN)

        # ALGORITHM CONFIG
        Model, algorithm_config, _policy_mode = get_model_info(
            self.model_args['algo'])

        self.model_args['policy_mode'] = _policy_mode
        if self.model_args['algo_config'] is not None:
            algorithm_config = UpdateConfig(algorithm_config,
                                            self.model_args['algo_config'],
                                            'algo')
        ShowConfig(algorithm_config)

        # BUFFER
        if _policy_mode == 'off-policy':
            self.buffer_args['batch_size'] = algorithm_config['batch_size']
            self.buffer_args['buffer_size'] = algorithm_config['buffer_size']
            if self.model_args['algo'] in ['drqn', 'drdqn']:
                self.buffer_args['type'] = 'EpisodeER'
            else:
                _use_priority = algorithm_config.get('use_priority', False)
                _n_step = algorithm_config.get('n_step', False)
                if _use_priority and _n_step:
                    self.buffer_args['type'] = 'NstepPER'
                    self.buffer_args['NstepPER'][
                        'max_episode'] = self.train_args['max_episode']
                    self.buffer_args['NstepPER']['gamma'] = algorithm_config[
                        'gamma']
                    algorithm_config['gamma'] = pow(
                        algorithm_config['gamma'], self.buffer_args['NstepPER']
                        ['n'])  # update gamma for n-step training.
                elif _use_priority:
                    self.buffer_args['type'] = 'PER'
                    self.buffer_args['PER']['max_episode'] = self.train_args[
                        'max_episode']
                elif _n_step:
                    self.buffer_args['type'] = 'NstepER'
                    self.buffer_args['NstepER']['gamma'] = algorithm_config[
                        'gamma']
                    algorithm_config['gamma'] = pow(
                        algorithm_config['gamma'],
                        self.buffer_args['NstepER']['n'])
                else:
                    self.buffer_args['type'] = 'ER'
        else:
            self.buffer_args['type'] = 'Pandas'

        # MODEL
        base_dir = os.path.join(
            self.train_args['base_dir'], self.train_args['name']
        )  # train_args['base_dir'] DIR/ENV_NAME/ALGORITHM_NAME
        if 'batch_size' in algorithm_config.keys() and train_args['fill_in']:
            self.train_args['pre_fill_steps'] = algorithm_config['batch_size']

        if self.env_args['type'] == 'gym':
            self.eval_env_args = deepcopy(self.env_args)
            self.eval_env_args.env_num = 1
            self.eval_env = make_env(self.eval_env_args.to_dict)
            # buffer ------------------------------
            if 'Nstep' in self.buffer_args[
                    'type'] or 'Episode' in self.buffer_args['type']:
                self.buffer_args[self.buffer_args['type']][
                    'agents_num'] = self.env_args['env_num']
            self.buffer = get_buffer(self.buffer_args)
            # buffer ------------------------------

            # model -------------------------------
            model_params = {
                's_dim': self.env.s_dim,
                'visual_sources': self.env.visual_sources,
                'visual_resolution': self.env.visual_resolution,
                'a_dim_or_list': self.env.a_dim_or_list,
                'is_continuous': self.env.is_continuous,
                'max_episode': self.train_args.max_episode,
                'base_dir': base_dir,
                'logger2file': self.model_args.logger2file,
                'seed': self.model_args.seed
            }
            self.model = Model(**model_params, **algorithm_config)
            self.model.set_buffer(self.buffer)
            self.model.init_or_restore(self.train_args['load_model_path'])
            # model -------------------------------

            self.train_args['begin_episode'] = self.model.get_init_episode()
            if not self.train_args['inference']:
                records_dict = {
                    'env': self.env_args.to_dict,
                    'model': self.model_args.to_dict,
                    'buffer': self.buffer_args.to_dict,
                    'train': self.train_args.to_dict,
                    'algo': algorithm_config
                }
                save_config(os.path.join(base_dir, 'config'), records_dict)
        else:
            # buffer -----------------------------------
            self.buffer_args_s = []
            for i in range(self.env.brain_num):
                _bargs = deepcopy(self.buffer_args)
                if 'Nstep' in _bargs['type'] or 'Episode' in _bargs['type']:
                    _bargs[_bargs['type']][
                        'agents_num'] = self.env.brain_agents[i]
                self.buffer_args_s.append(_bargs)
            buffers = [
                get_buffer(self.buffer_args_s[i])
                for i in range(self.env.brain_num)
            ]
            # buffer -----------------------------------

            # model ------------------------------------
            self.model_args_s = []
            for i in range(self.env.brain_num):
                _margs = deepcopy(self.model_args)
                _margs['seed'] = self.model_args['seed'] + i * 10
                self.model_args_s.append(_margs)
            model_params = [
                {
                    's_dim': self.env.s_dim[i],
                    'a_dim_or_list': self.env.a_dim_or_list[i],
                    'visual_sources': self.env.visual_sources[i],
                    'visual_resolution': self.env.visual_resolutions[i],
                    'is_continuous': self.env.is_continuous[i],
                    'max_episode': self.train_args.max_episode,
                    'base_dir': os.path.join(base_dir, b),
                    'logger2file': self.model_args_s[i].logger2file,
                    'seed': self.model_args_s[i].
                    seed,  # 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100
                } for i, b in enumerate(self.env.brain_names)
            ]

            # multi agent training------------------------------------
            if self.model_args['algo'][:3] == 'ma_':
                self.ma = True
                assert self.env.brain_num > 1, 'when using ma* algorithms, the number of brains must be larger than 1'
                self.ma_data = ExperienceReplay(batch_size=10, capacity=1000)
                [
                    mp.update({
                        'n': self.env.brain_num,
                        'i': i
                    }) for i, mp in enumerate(model_params)
                ]
            else:
                self.ma = False
            # multi agent training------------------------------------

            self.models = [
                Model(**model_params[i], **algorithm_config)
                for i in range(self.env.brain_num)
            ]

            [
                model.set_buffer(buffer)
                for model, buffer in zip(self.models, buffers)
            ]
            [
                self.models[i].init_or_restore(
                    os.path.join(self.train_args['load_model_path'], b))
                for i, b in enumerate(self.env.brain_names)
            ]
            # model ------------------------------------
            self.train_args['begin_episode'] = self.models[0].get_init_episode(
            )
            if not self.train_args['inference']:
                for i, b in enumerate(self.env.brain_names):
                    records_dict = {
                        'env': self.env_args.to_dict,
                        'model': self.model_args_s[i].to_dict,
                        'buffer': self.buffer_args_s[i].to_dict,
                        'train': self.train_args.to_dict,
                        'algo': algorithm_config
                    }
                    save_config(os.path.join(base_dir, b, 'config'),
                                records_dict)
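One detail worth noting in the buffer setup of Example 7: when an n-step buffer is chosen, algorithm_config['gamma'] is overwritten with gamma ** n, because the learner then bootstraps from the state n steps ahead instead of the next state. A short illustrative check of why that adjustment works:

gamma, n = 0.99, 4
rewards = [1.0, 0.5, 0.0, 2.0]  # the n rewards the buffer folds together
v_boot = 3.0                    # V(s_{t+n}), supplied by the learner

n_step_reward = sum(gamma ** k * r for k, r in enumerate(rewards))  # buffer side
adjusted_gamma = pow(gamma, n)  # what the Agent writes back into algorithm_config
target = n_step_reward + adjusted_gamma * v_boot  # learner keeps its usual "r + gamma * V(s')" form
print(round(target, 4))
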
Example 8
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            ployak=0.995,
            high_scale=1.0,
            reward_scale=1.0,
            sample_g_nums=100,
            sub_goal_steps=10,
            fn_goal_dim=0,
            intrinsic_reward_mode='os',
            high_batch_size=256,
            high_buffer_size=100000,
            low_batch_size=8,
            low_buffer_size=10000,
            high_actor_lr=1.0e-4,
            high_critic_lr=1.0e-3,
            low_actor_lr=1.0e-4,
            low_critic_lr=1.0e-3,
            hidden_units={
                'high_actor': [64, 64],
                'high_critic': [64, 64],
                'low_actor': [64, 64],
                'low_critic': [64, 64]
            },
            **kwargs):
        assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
        self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)

        self.ployak = ployak
        self.high_scale = np.array(
            high_scale if isinstance(high_scale, list) else [high_scale] *
            self.s_dim,
            dtype=np.float32)
        self.reward_scale = reward_scale
        self.fn_goal_dim = fn_goal_dim
        self.sample_g_nums = sample_g_nums
        self.sub_goal_steps = sub_goal_steps
        self.sub_goal_dim = self.s_dim - self.fn_goal_dim

        self.high_noise = rls.ClippedNormalActionNoise(
            mu=np.zeros(self.sub_goal_dim),
            sigma=self.high_scale * np.ones(self.sub_goal_dim),
            bound=self.high_scale / 2)
        self.low_noise = rls.ClippedNormalActionNoise(mu=np.zeros(self.a_dim),
                                                      sigma=1.0 *
                                                      np.ones(self.a_dim),
                                                      bound=0.5)

        _high_actor_net = lambda: rls.actor_dpg(self.s_dim, self.sub_goal_dim,
                                                hidden_units['high_actor'])
        if self.is_continuous:
            _low_actor_net = lambda: rls.actor_dpg(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
        else:
            _low_actor_net = lambda: rls.actor_discrete(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
            self.gumbel_dist = tfd.Gumbel(0, 1)

        self.high_actor = _high_actor_net()
        self.high_actor_target = _high_actor_net()
        self.low_actor = _low_actor_net()
        self.low_actor_target = _low_actor_net()

        _high_critic_net = lambda: rls.critic_q_one(
            self.s_dim, self.sub_goal_dim, hidden_units['high_critic'])
        _low_critic_net = lambda: rls.critic_q_one(
            self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                'low_critic'])

        self.high_critic = DoubleQ(_high_critic_net)
        self.high_critic_target = DoubleQ(_high_critic_net)
        self.low_critic = DoubleQ(_low_critic_net)
        self.low_critic_target = DoubleQ(_low_critic_net)

        self.update_target_net_weights(
            self.low_actor_target.weights + self.low_critic_target.weights +
            self.high_actor_target.weights + self.high_critic_target.weights,
            self.low_actor.weights + self.low_critic.weights +
            self.high_actor.weights + self.high_critic.weights)

        self.low_actor_lr, self.low_critic_lr = map(
            self.init_lr, [low_actor_lr, low_critic_lr])
        self.high_actor_lr, self.high_critic_lr = map(
            self.init_lr, [high_actor_lr, high_critic_lr])
        self.low_actor_optimizer, self.low_critic_optimizer = map(
            self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
        self.high_actor_optimizer, self.high_critic_optimizer = map(
            self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])

        self.model_recorder(
            dict(high_actor=self.high_actor,
                 high_critic=self.high_critic,
                 low_actor=self.low_actor,
                 low_critic=self.low_critic,
                 low_actor_optimizer=self.low_actor_optimizer,
                 low_critic_optimizer=self.low_critic_optimizer,
                 high_actor_optimizer=self.high_actor_optimizer,
                 high_critic_optimizer=self.high_critic_optimizer))

        self.counts = 0
        self._high_s = [[] for _ in range(self.n_agents)]
        self._noop_subgoal = np.random.uniform(-self.high_scale,
                                               self.high_scale,
                                               size=(self.n_agents,
                                                     self.sub_goal_dim))
        self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)
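The ployak=0.995 coefficient together with update_target_net_weights is the familiar soft (Polyak) target-network update. A hedged sketch of that rule on plain weight lists, illustrating the idea rather than the base class's actual helper:

import numpy as np

def soft_update(target_weights, online_weights, ployak=0.995):
    # target <- ployak * target + (1 - ployak) * online, element-wise per weight tensor
    return [ployak * t + (1.0 - ployak) * w
            for t, w in zip(target_weights, online_weights)]

target = [np.zeros((2, 2)), np.zeros(2)]
online = [np.ones((2, 2)), np.ones(2)]
target = soft_update(target, online)  # each entry moves 0.5% toward the online value
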
Example 9
def unity_run(default_args, share_args, options, max_step, max_episode,
              save_frequency, name):
    from mlagents.envs import UnityEnvironment
    from utils.sampler import create_sampler_manager

    try:
        tf_version, (model, policy_mode,
                     _) = get_model_info(options['--algorithm'])
        algorithm_config = sth.load_config(
            f'./Algorithms/{tf_version}/config.yaml')[options['--algorithm']]
        ma = options['--algorithm'][:3] == 'ma_'
    except KeyError:
        raise NotImplementedError

    reset_config = default_args['reset_config']
    if options['--unity']:
        env = UnityEnvironment()
        env_name = 'unity'
    else:
        file_name = default_args['exe_file'] if options[
            '--env'] == 'None' else options['--env']
        if os.path.exists(file_name):
            env = UnityEnvironment(file_name=file_name,
                                   base_port=int(options['--port']),
                                   no_graphics=False if options['--inference']
                                   else not options['--graphic'])
            env_dir = os.path.split(file_name)[0]
            env_name = os.path.join(*env_dir.replace('\\', '/').replace(
                r'//', r'/').split('/')[-2:])
            sys.path.append(env_dir)
            if os.path.exists(env_dir + '/env_config.py'):
                import env_config
                reset_config = env_config.reset_config
                max_step = env_config.max_step
            if os.path.exists(env_dir + '/env_loop.py'):
                from env_loop import Loop
        else:
            raise Exception('cannot find this file.')
    sampler_manager, resampling_interval = create_sampler_manager(
        options['--sampler'], env.reset_parameters)

    if 'Loop' not in locals().keys():
        if ma:
            from ma_loop import Loop
        else:
            from loop import Loop

    if options['--config-file'] != 'None':
        algorithm_config = update_config(algorithm_config,
                                         options['--config-file'])
    _base_dir = os.path.join(share_args['base_dir'], env_name,
                             options['--algorithm'])
    base_dir = os.path.join(_base_dir, name)
    show_config(algorithm_config)

    brain_names = env.external_brain_names
    brains = env.brains
    brain_num = len(brain_names)

    visual_resolutions = {}
    for i in brain_names:
        if brains[i].number_visual_observations:
            visual_resolutions[f'{i}'] = [
                brains[i].camera_resolutions[0]['height'],
                brains[i].camera_resolutions[0]['width'],
                1 if brains[i].camera_resolutions[0]['blackAndWhite'] else 3
            ]
        else:
            visual_resolutions[f'{i}'] = []

    model_params = [{
        's_dim':
        brains[i].vector_observation_space_size *
        brains[i].num_stacked_vector_observations,
        'a_dim_or_list':
        brains[i].vector_action_space_size,
        'action_type':
        brains[i].vector_action_space_type,
        'max_episode':
        max_episode,
        'base_dir':
        os.path.join(base_dir, i),
        'logger2file':
        share_args['logger2file'],
        'out_graph':
        share_args['out_graph'],
    } for i in brain_names]

    if ma:
        assert brain_num > 1, 'when using ma* algorithms, the number of brains must be larger than 1'
        data = ExperienceReplay(share_args['ma']['batch_size'],
                                share_args['ma']['capacity'])
        extra_params = {'data': data}
        models = [
            model(n=brain_num, i=i, **model_params[i], **algorithm_config)
            for i in range(brain_num)
        ]
    else:
        extra_params = {}
        models = [
            model(visual_sources=brains[i].number_visual_observations,
                  visual_resolution=visual_resolutions[f'{i}'],
                  **model_params[index],
                  **algorithm_config) for index, i in enumerate(brain_names)
        ]

    [
        models[index].init_or_restore(
            os.path.join(
                _base_dir,
                name if options['--load'] == 'None' else options['--load'], i))
        for index, i in enumerate(brain_names)
    ]
    begin_episode = models[0].get_init_episode()

    params = {
        'env': env,
        'brain_names': brain_names,
        'models': models,
        'begin_episode': begin_episode,
        'save_frequency': save_frequency,
        'reset_config': reset_config,
        'max_step': max_step,
        'max_episode': max_episode,
        'sampler_manager': sampler_manager,
        'resampling_interval': resampling_interval,
        'policy_mode': policy_mode
    }
    if 'batch_size' in algorithm_config.keys() and options['--fill-in']:
        steps = algorithm_config['batch_size']
    else:
        steps = default_args['no_op_steps']
    no_op_params = {
        'env': env,
        'brain_names': brain_names,
        'models': models,
        'brains': brains,
        'steps': steps,
        'choose': options['--noop-choose']
    }
    params.update(extra_params)
    no_op_params.update(extra_params)

    if options['--inference']:
        Loop.inference(env,
                       brain_names,
                       models,
                       reset_config=reset_config,
                       sampler_manager=sampler_manager,
                       resampling_interval=resampling_interval)
    else:
        try:
            [
                sth.save_config(os.path.join(base_dir, i, 'config'),
                                algorithm_config) for i in brain_names
            ]
            Loop.no_op(**no_op_params)
            Loop.train(**params)
        except Exception as e:
            print(e)
        finally:
            try:
                [models[i].close() for i in range(len(models))]
            except Exception as e:
                print(e)
            finally:
                env.close()
                sys.exit()
Example 10
File: agent.py Project: yyht/RLs
    def __init__(self, env_args: Config, model_args: Config,
                 buffer_args: Config, train_args: Config):
        self.env_args = env_args
        self.model_args = model_args
        self.buffer_args = buffer_args
        self.train_args = train_args

        # training control: max_train_step > max_frame_step > max_train_episode
        if self.train_args['max_train_step'] > 0:
            self.train_args['max_frame_step'] = sys.maxsize
            self.train_args['max_train_episode'] = sys.maxsize
        elif self.train_args['max_frame_step'] > 0:
            self.train_args['max_train_episode'] = sys.maxsize
        elif self.train_args['max_train_episode'] <= 0:
            raise ValueError(
                'max_train_step/max_frame_step/max_train_episode must be specified at least one with value larger than 0.'
            )

        self.train_args['inference_episode'] = self.train_args[
            'inference_episode'] if self.train_args[
                'inference_episode'] > 0 else sys.maxsize

        self.model_index = str(self.train_args.get('index'))
        self.start_time = time.time()
        self.all_learner_print = bool(
            self.train_args.get('all_learner_print', False))
        if '-' not in self.train_args['name']:
            self.train_args['name'] += f'-{self.model_index}'
        if self.model_args['load'] is None:
            self.train_args['load_model_path'] = os.path.join(
                self.train_args['base_dir'], self.train_args['name'])
        else:
            if '/' in self.model_args['load'] or '\\' in self.model_args[
                    'load']:  # every training process initializes from this model path (absolute path)
                self.train_args['load_model_path'] = self.model_args['load']
            elif '-' in self.model_args['load']:
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'])  # name and index are both given; every training process initializes from this model path (relative path)
            else:  # only the training name is given for 'load'; the process index is appended automatically
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'] + f'-{self.model_index}')

        # ENV
        logger.info('Initialize environment begin...')
        self.env = make_env(self.env_args.to_dict)
        logger.info('Initialize environment successful.')

        # ALGORITHM CONFIG
        Model, algorithm_config, _policy_mode = get_model_info(
            self.model_args['algo'])
        self.model_args['policy_mode'] = _policy_mode
        if self.model_args['algo_config'] is not None:
            algorithm_config = UpdateConfig(algorithm_config,
                                            self.model_args['algo_config'],
                                            'algo')
        algorithm_config['use_rnn'] = self.model_args['use_rnn']
        ShowConfig(algorithm_config)

        # BUFFER
        if _policy_mode == 'off-policy':
            if algorithm_config['use_rnn']:
                self.buffer_args['type'] = 'EpisodeER'
                self.buffer_args['batch_size'] = algorithm_config.get(
                    'episode_batch_size', 0)
                self.buffer_args['buffer_size'] = algorithm_config.get(
                    'episode_buffer_size', 0)

                self.buffer_args['EpisodeER'][
                    'burn_in_time_step'] = algorithm_config.get(
                        'burn_in_time_step', 0)
                self.buffer_args['EpisodeER'][
                    'train_time_step'] = algorithm_config.get(
                        'train_time_step', 0)
            else:
                self.buffer_args['batch_size'] = algorithm_config.get(
                    'batch_size', 0)
                self.buffer_args['buffer_size'] = algorithm_config.get(
                    'buffer_size', 0)

                _use_priority = algorithm_config.get('use_priority', False)
                _n_step = algorithm_config.get('n_step', False)
                if _use_priority and _n_step:
                    self.buffer_args['type'] = 'NstepPER'
                    self.buffer_args['NstepPER'][
                        'max_train_step'] = self.train_args['max_train_step']
                    self.buffer_args['NstepPER']['gamma'] = algorithm_config[
                        'gamma']
                    algorithm_config['gamma'] = pow(
                        algorithm_config['gamma'], self.buffer_args['NstepPER']
                        ['n'])  # update gamma for n-step training.
                elif _use_priority:
                    self.buffer_args['type'] = 'PER'
                    self.buffer_args['PER'][
                        'max_train_step'] = self.train_args['max_train_step']
                elif _n_step:
                    self.buffer_args['type'] = 'NstepER'
                    self.buffer_args['NstepER']['gamma'] = algorithm_config[
                        'gamma']
                    algorithm_config['gamma'] = pow(
                        algorithm_config['gamma'],
                        self.buffer_args['NstepER']['n'])
                else:
                    self.buffer_args['type'] = 'ER'
        else:
            self.buffer_args['type'] = 'None'
            self.train_args[
                'pre_fill_steps'] = 0  # if on-policy, prefill experience replay is no longer needed.

        # MODEL
        base_dir = os.path.join(
            self.train_args['base_dir'], self.train_args['name']
        )  # train_args['base_dir'] DIR/ENV_NAME/ALGORITHM_NAME

        if self.env_args['type'] == 'gym':
            if self.train_args['use_wandb']:
                import wandb
                wandb_path = os.path.join(base_dir, 'wandb')
                if not os.path.exists(wandb_path):
                    os.makedirs(wandb_path)
                wandb.init(sync_tensorboard=True,
                           name=self.train_args['name'],
                           dir=base_dir,
                           project=self.train_args['wandb_project'])

            # buffer ------------------------------
            if 'Nstep' in self.buffer_args[
                    'type'] or 'Episode' in self.buffer_args['type']:
                self.buffer_args[self.buffer_args['type']][
                    'agents_num'] = self.env_args['env_num']
            self.buffer = get_buffer(self.buffer_args)
            # buffer ------------------------------

            # model -------------------------------
            model_params = {
                's_dim': self.env.s_dim,
                'visual_sources': self.env.visual_sources,
                'visual_resolution': self.env.visual_resolution,
                'a_dim': self.env.a_dim,
                'is_continuous': self.env.is_continuous,
                'max_train_step': self.train_args.max_train_step,
                'base_dir': base_dir,
                'logger2file': self.model_args.logger2file,
                'seed': self.model_args.seed,
                'n_agents': self.env.n
            }
            self.model = Model(**model_params, **algorithm_config)
            self.model.set_buffer(self.buffer)
            self.model.init_or_restore(self.train_args['load_model_path'])
            # model -------------------------------

            _train_info = self.model.get_init_training_info()
            self.train_args['begin_train_step'] = _train_info['train_step']
            self.train_args['begin_frame_step'] = _train_info['frame_step']
            self.train_args['begin_episode'] = _train_info['episode']
            if not self.train_args['inference']:
                records_dict = {
                    'env': self.env_args.to_dict,
                    'model': self.model_args.to_dict,
                    'buffer': self.buffer_args.to_dict,
                    'train': self.train_args.to_dict,
                    'algo': algorithm_config
                }
                save_config(os.path.join(base_dir, 'config'), records_dict)
                if self.train_args['use_wandb']:
                    wandb.config.update(records_dict)
        else:
            # buffer -----------------------------------
            self.buffer_args_s = []
            for i in range(self.env.brain_num):
                _bargs = deepcopy(self.buffer_args)
                if 'Nstep' in _bargs['type'] or 'Episode' in _bargs['type']:
                    _bargs[_bargs['type']][
                        'agents_num'] = self.env.brain_agents[i]
                self.buffer_args_s.append(_bargs)
            buffers = [
                get_buffer(self.buffer_args_s[i])
                for i in range(self.env.brain_num)
            ]
            # buffer -----------------------------------

            # model ------------------------------------
            self.model_args_s = []
            for i in range(self.env.brain_num):
                _margs = deepcopy(self.model_args)
                _margs['seed'] = self.model_args['seed'] + i * 10
                self.model_args_s.append(_margs)
            model_params = [
                {
                    's_dim': self.env.s_dim[i],
                    'a_dim': self.env.a_dim[i],
                    'visual_sources': self.env.visual_sources[i],
                    'visual_resolution': self.env.visual_resolutions[i],
                    'is_continuous': self.env.is_continuous[i],
                    'max_train_step': self.train_args.max_train_step,
                    'base_dir': os.path.join(base_dir, b),
                    'logger2file': self.model_args_s[i].logger2file,
                    'seed': self.model_args_s[i].
                    seed,  # 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100
                    'n_agents': self.env.brain_agents[i],
                } for i, b in enumerate(self.env.fixed_brain_names)
            ]

            # multi agent training------------------------------------
            if self.model_args['algo'][:3] == 'ma_':
                self.ma = True
                assert self.env.brain_num > 1, 'when using ma* algorithms, the number of brains must be larger than 1'
                self.ma_data = ExperienceReplay(batch_size=10, capacity=1000)
                [
                    mp.update({
                        'n': self.env.brain_num,
                        'i': i
                    }) for i, mp in enumerate(model_params)
                ]
            else:
                self.ma = False
            # multi agent training------------------------------------

            self.models = [
                Model(**model_params[i], **algorithm_config)
                for i in range(self.env.brain_num)
            ]

            [
                model.set_buffer(buffer)
                for model, buffer in zip(self.models, buffers)
            ]
            [
                self.models[i].init_or_restore(
                    os.path.join(self.train_args['load_model_path'], b))
                for i, b in enumerate(self.env.fixed_brain_names)
            ]
            # model ------------------------------------

            _train_info = self.models[0].get_init_training_info()
            self.train_args['begin_train_step'] = _train_info['train_step']
            self.train_args['begin_frame_step'] = _train_info['frame_step']
            self.train_args['begin_episode'] = _train_info['episode']
            if not self.train_args['inference']:
                for i, b in enumerate(self.env.fixed_brain_names):
                    records_dict = {
                        'env': self.env_args.to_dict,
                        'model': self.model_args_s[i].to_dict,
                        'buffer': self.buffer_args_s[i].to_dict,
                        'train': self.train_args.to_dict,
                        'algo': algorithm_config
                    }
                    save_config(os.path.join(base_dir, b, 'config'),
                                records_dict)
        pass
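The training-control block at the top of Example 10 keeps only the highest-priority positive limit (max_train_step > max_frame_step > max_train_episode) and disables the lower-priority ones by setting them to sys.maxsize. A compact sketch of that resolution logic, written as a standalone helper for illustration:

import sys

def resolve_limits(max_train_step, max_frame_step, max_train_episode):
    # precedence: max_train_step > max_frame_step > max_train_episode
    if max_train_step > 0:
        return max_train_step, sys.maxsize, sys.maxsize
    if max_frame_step > 0:
        return max_train_step, max_frame_step, sys.maxsize
    if max_train_episode > 0:
        return max_train_step, max_frame_step, max_train_episode
    raise ValueError('at least one of the three limits must be > 0')

print(resolve_limits(0, 100000, 0))  # (0, 100000, sys.maxsize)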