class Policy(Base):
    def __init__(self,
                 a_dim_or_list,
                 action_type,
                 base_dir,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 gamma,
                 max_episode,
                 policy_mode=None,
                 batch_size=1,
                 buffer_size=1,
                 use_priority=False,
                 n_step=False):
        super().__init__(a_dim_or_list=a_dim_or_list,
                         action_type=action_type,
                         base_dir=base_dir)
        self.s_dim = s_dim
        self.visual_sources = visual_sources
        self.visual_dim = [visual_sources, *visual_resolution] if visual_sources else [0]
        self.a_dim_or_list = a_dim_or_list
        self.gamma = gamma
        self.max_episode = max_episode
        self.policy_mode = policy_mode
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.use_priority = use_priority
        self.n_step = n_step
        self.init_data_memory()
        self.init_placeholders()

    def init_data_memory(self):
        '''
        The biggest difference between the policy modes ('ON' and 'OFF') is that 'OFF' mode
        needs to raise the dimension of 'r' and 'done'.
        'ON' mode means the program will call on_store and use a pandas DataFrame to store data.
        'OFF' mode will call off_store and use a replay buffer to store data.
        '''
        if self.policy_mode == 'ON':
            self.data = pd.DataFrame(columns=['s', 'a', 'r', 'done'])
        elif self.policy_mode == 'OFF':
            if self.use_priority:
                if self.n_step:
                    print('N-Step PER')
                    self.data = NStepPrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        gamma=self.gamma,
                        alpha=er_config['nper_config']['alpha'],
                        beta=er_config['nper_config']['beta'],
                        epsilon=er_config['nper_config']['epsilon'],
                        agents_num=er_config['nper_config']['max_agents'],
                        n=er_config['nper_config']['n'],
                        global_v=er_config['nper_config']['global_v'])
                else:
                    print('PER')
                    self.data = PrioritizedExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        max_episode=self.max_episode,
                        alpha=er_config['per_config']['alpha'],
                        beta=er_config['per_config']['beta'],
                        epsilon=er_config['per_config']['epsilon'],
                        global_v=er_config['nper_config']['global_v'])
            else:
                if self.n_step:
                    print('N-Step ER')
                    self.data = NStepExperienceReplay(
                        self.batch_size,
                        self.buffer_size,
                        gamma=self.gamma,
                        agents_num=er_config['ner_config']['max_agents'],
                        n=er_config['ner_config']['n'])
                else:
                    print('ER')
                    self.data = ExperienceReplay(self.batch_size, self.buffer_size)
        else:
            raise Exception('Please specify a policy mode!')

    def init_placeholders(self):
        with self.graph.as_default():
            self.pl_s = tf.placeholder(tf.float32, [None, self.s_dim], 'vector_observation')
            self.pl_a = tf.placeholder(tf.float32, [None, self.a_counts], 'pl_action')
            self.pl_r = tf.placeholder(tf.float32, [None, 1], 'reward')
            self.pl_s_ = tf.placeholder(tf.float32, [None, self.s_dim], 'next_state')
            self.pl_done = tf.placeholder(tf.float32, [None, 1], 'done')
            self.pl_visual_s = tf.placeholder(tf.float32, [None] + self.visual_dim, 'visual_observation_')
            self.pl_visual_s_ = tf.placeholder(tf.float32, [None] + self.visual_dim, 'next_visual_observation')

    def on_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        For on-policy training, use this function to store <s, a, r, s_, done> in a pandas DataFrame.
        """
        assert isinstance(a, np.ndarray), "on_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "on_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "on_store needs done to be an np.ndarray"
        self.data = self.data.append(
            {
                's': s,
                'visual_s': visual_s,
                'a': a,
                'r': r,
                's_': s_,
                'visual_s_': visual_s_,
                'done': done
            },
            ignore_index=True)

    def off_store(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        For off-policy training, use this function to store <s, a, r, s_, done> in the ReplayBuffer.
        """
        assert isinstance(a, np.ndarray), "off_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "off_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "off_store needs done to be an np.ndarray"
        self.data.add(s, visual_s, a, r, s_, visual_s_, done)

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "no_op_store needs the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "no_op_store needs the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "no_op_store needs done to be an np.ndarray"
        if self.policy_mode == 'OFF':
            # raise 'r' and 'done' from [B] to [B, 1] before they enter the replay buffer
            self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis])

    def clear(self):
        """
        Clear the DataFrame.
        """
        self.data.drop(self.data.index, inplace=True)

    def get_max_episode(self):
        """
        Get the maximum episode count of this training run.
        """
        return self.max_episode
class Agent: def __init__(self, env_args: Config, model_args: Config, buffer_args: Config, train_args: Config): # print("89898989") self.env_args = env_args self.model_args = model_args self.buffer_args = buffer_args self.train_args = train_args self.use_GCN = False self.model_index = str(self.train_args.get('index')) self.all_learner_print = bool( self.train_args.get('all_learner_print', False)) if '-' not in self.train_args['name']: self.train_args['name'] += f'-{self.model_index}' if self.model_args['load'] is None: self.train_args['load_model_path'] = os.path.join( self.train_args['base_dir'], self.train_args['name']) else: if '/' in self.model_args['load'] or '\\' in self.model_args[ 'load']: # 所有训练进程都以该模型路径初始化,绝对路径 self.train_args['load_model_path'] = self.model_args['load'] elif '-' in self.model_args['load']: self.train_args['load_model_path'] = os.path.join( self.train_args['base_dir'], self.model_args['load']) # 指定了名称和序号,所有训练进程都以该模型路径初始化,相对路径 else: # 只写load的训练名称,不用带进程序号,会自动补 self.train_args['load_model_path'] = os.path.join( self.train_args['base_dir'], self.model_args['load'] + f'-{self.model_index}') # ENV self.env = make_env(self.env_args.to_dict, self.use_GCN) # ALGORITHM CONFIG Model, algorithm_config, _policy_mode = get_model_info( self.model_args['algo']) self.model_args['policy_mode'] = _policy_mode if self.model_args['algo_config'] is not None: algorithm_config = UpdateConfig(algorithm_config, self.model_args['algo_config'], 'algo') ShowConfig(algorithm_config) # BUFFER if _policy_mode == 'off-policy': self.buffer_args['batch_size'] = algorithm_config['batch_size'] self.buffer_args['buffer_size'] = algorithm_config['buffer_size'] if self.model_args['algo'] in ['drqn', 'drdqn']: self.buffer_args['type'] = 'EpisodeER' else: _use_priority = algorithm_config.get('use_priority', False) _n_step = algorithm_config.get('n_step', False) if _use_priority and _n_step: self.buffer_args['type'] = 'NstepPER' self.buffer_args['NstepPER'][ 'max_episode'] = self.train_args['max_episode'] self.buffer_args['NstepPER']['gamma'] = algorithm_config[ 'gamma'] algorithm_config['gamma'] = pow( algorithm_config['gamma'], self.buffer_args['NstepPER'] ['n']) # update gamma for n-step training. 
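# Note on the gamma update above: an n-step buffer returns rewards that already
# accumulate n discounted steps, so the bootstrapped target becomes
#     R_n = r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1} + gamma^n * Q(s_{t+n}, a'),
# which is why the algorithm's discount for the bootstrap term is replaced by
# gamma ** n whenever 'NstepPER' (or 'NstepER' below) is selected.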
elif _use_priority: self.buffer_args['type'] = 'PER' self.buffer_args['PER']['max_episode'] = self.train_args[ 'max_episode'] elif _n_step: self.buffer_args['type'] = 'NstepER' self.buffer_args['NstepER']['gamma'] = algorithm_config[ 'gamma'] algorithm_config['gamma'] = pow( algorithm_config['gamma'], self.buffer_args['NstepER']['n']) else: self.buffer_args['type'] = 'ER' else: self.buffer_args['type'] = 'Pandas' # MODEL base_dir = os.path.join( self.train_args['base_dir'], self.train_args['name'] ) # train_args['base_dir'] DIR/ENV_NAME/ALGORITHM_NAME if 'batch_size' in algorithm_config.keys() and train_args['fill_in']: self.train_args['pre_fill_steps'] = algorithm_config['batch_size'] if self.env_args['type'] == 'gym': self.eval_env_args = deepcopy(self.env_args) self.eval_env_args.env_num = 1 self.eval_env = make_env(self.eval_env_args.to_dict) # buffer ------------------------------ if 'Nstep' in self.buffer_args[ 'type'] or 'Episode' in self.buffer_args['type']: self.buffer_args[self.buffer_args['type']][ 'agents_num'] = self.env_args['env_num'] self.buffer = get_buffer(self.buffer_args) # buffer ------------------------------ # model ------------------------------- model_params = { 's_dim': self.env.s_dim, 'visual_sources': self.env.visual_sources, 'visual_resolution': self.env.visual_resolution, 'a_dim_or_list': self.env.a_dim_or_list, 'is_continuous': self.env.is_continuous, 'max_episode': self.train_args.max_episode, 'base_dir': base_dir, 'logger2file': self.model_args.logger2file, 'seed': self.model_args.seed } self.model = Model(**model_params, **algorithm_config) self.model.set_buffer(self.buffer) self.model.init_or_restore(self.train_args['load_model_path']) # model ------------------------------- self.train_args['begin_episode'] = self.model.get_init_episode() if not self.train_args['inference']: records_dict = { 'env': self.env_args.to_dict, 'model': self.model_args.to_dict, 'buffer': self.buffer_args.to_dict, 'train': self.train_args.to_dict, 'algo': algorithm_config } save_config(os.path.join(base_dir, 'config'), records_dict) else: # buffer ----------------------------------- self.buffer_args_s = [] for i in range(self.env.brain_num): _bargs = deepcopy(self.buffer_args) if 'Nstep' in _bargs['type'] or 'Episode' in _bargs['type']: _bargs[_bargs['type']][ 'agents_num'] = self.env.brain_agents[i] self.buffer_args_s.append(_bargs) buffers = [ get_buffer(self.buffer_args_s[i]) for i in range(self.env.brain_num) ] # buffer ----------------------------------- # model ------------------------------------ self.model_args_s = [] for i in range(self.env.brain_num): _margs = deepcopy(self.model_args) _margs['seed'] = self.model_args['seed'] + i * 10 self.model_args_s.append(_margs) model_params = [ { 's_dim': self.env.s_dim[i], 'a_dim_or_list': self.env.a_dim_or_list[i], 'visual_sources': self.env.visual_sources[i], 'visual_resolution': self.env.visual_resolutions[i], 'is_continuous': self.env.is_continuous[i], 'max_episode': self.train_args.max_episode, 'base_dir': os.path.join(base_dir, b), 'logger2file': self.model_args_s[i].logger2file, 'seed': self.model_args_s[i]. 
seed, # 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 } for i, b in enumerate(self.env.brain_names) ] # multi agent training------------------------------------ if self.model_args['algo'][:3] == 'ma_': self.ma = True assert self.env.brain_num > 1, 'if using ma* algorithms, number of brains must larger than 1' self.ma_data = ExperienceReplay(batch_size=10, capacity=1000) [ mp.update({ 'n': self.env.brain_num, 'i': i }) for i, mp in enumerate(model_params) ] else: self.ma = False # multi agent training------------------------------------ self.models = [ Model(**model_params[i], **algorithm_config) for i in range(self.env.brain_num) ] [ model.set_buffer(buffer) for model, buffer in zip(self.models, buffers) ] [ self.models[i].init_or_restore( os.path.join(self.train_args['load_model_path'], b)) for i, b in enumerate(self.env.brain_names) ] # model ------------------------------------ self.train_args['begin_episode'] = self.models[0].get_init_episode( ) if not self.train_args['inference']: for i, b in enumerate(self.env.brain_names): records_dict = { 'env': self.env_args.to_dict, 'model': self.model_args_s[i].to_dict, 'buffer': self.buffer_args_s[i].to_dict, 'train': self.train_args.to_dict, 'algo': algorithm_config } save_config(os.path.join(base_dir, b, 'config'), records_dict) # print("21323232323") def pwi(self, *args): if self.all_learner_print: print(f'| Model-{self.model_index} |', *args) elif int(self.model_index) == 0: print(f'|#ONLY#Model-{self.model_index} |', *args) def __call__(self): self.train() def train(self): if self.env_args['type'] == 'gym': try: self.gym_no_op() self.gym_train() finally: self.model.close() self.env.close() else: try: if self.ma: self.ma_unity_no_op() self.ma_unity_train() else: self.unity_no_op() self.unity_train() finally: [model.close() for model in self.models] self.env.close() def evaluate(self): if self.env_args['type'] == 'gym': self.gym_inference() else: if self.ma: self.ma_unity_inference() else: self.unity_inference() def init_variables(self, evaluate=False): """ inputs: env: Environment outputs: i: specify which item of state should be modified state: [vector_obs, visual_obs] newstate: [vector_obs, visual_obs] """ if evaluate: env = self.eval_env else: env = self.env i = 1 if env.obs_type == 'visual' else 0 return i, [np.array([[]] * env.n), np.array([[]] * env.n) ], [np.array([[]] * env.n), np.array([[]] * env.n)] def gym_train(self): """ Inputs: env: gym environment gym_model: algorithm model begin_episode: initial episode save_frequency: how often to save checkpoints max_step: maximum number of steps in an episode max_episode: maximum number of episodes in this training task render: specify whether render the env or not render_episode: if 'render' is false, specify from which episode to render the env policy_mode: 'on-policy' or 'off-policy' """ begin_episode = int(self.train_args['begin_episode']) render = bool(self.train_args['render']) render_episode = int(self.train_args.get('render_episode', 50000)) save_frequency = int(self.train_args['save_frequency']) max_step = int(self.train_args['max_step']) max_episode = int(self.train_args['max_episode']) eval_while_train = bool(self.train_args['eval_while_train']) max_eval_episode = int(self.train_args.get('max_eval_episode')) off_policy_step_eval = bool(self.train_args['off_policy_step_eval']) off_policy_step_eval_num = int( self.train_args.get('off_policy_step_eval_num')) policy_mode = str(self.model_args['policy_mode']) moving_average_episode = int(self.train_args['moving_average_episode']) 
add_noise2buffer = bool(self.train_args['add_noise2buffer']) add_noise2buffer_episode_interval = int( self.train_args['add_noise2buffer_episode_interval']) add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps']) total_step_control = bool(self.train_args['total_step_control']) max_total_step = int(self.train_args['max_total_step']) if total_step_control: max_episode = max_total_step i, state, new_state = self.init_variables() sma = SMA(moving_average_episode) total_step = 0 for episode in range(begin_episode, max_episode): state[i] = self.env.reset() dones_flag = np.full(self.env.n, False) step = 0 r = np.zeros(self.env.n) last_done_step = -1 while True: step += 1 r_tem = np.zeros(self.env.n) if render or episode > render_episode: self.env.render() action = self.model.choose_action(s=state[0], visual_s=state[1]) new_state[i], reward, done, info = self.env.step(action) unfinished_index = np.where(dones_flag == False)[0] dones_flag += done r_tem[unfinished_index] = reward[unfinished_index] r += r_tem self.model.store_data(s=state[0], visual_s=state[1], a=action, r=reward, s_=new_state[0], visual_s_=new_state[1], done=done) if policy_mode == 'off-policy': self.model.learn(episode=episode, step=1) if off_policy_step_eval: self.gym_step_eval(total_step, self.model, off_policy_step_eval_num, max_step) total_step += 1 if total_step_control and total_step > max_total_step: return if all(dones_flag): if last_done_step == -1: last_done_step = step if policy_mode == 'off-policy': break if step >= max_step: break if len(self.env.dones_index): # 判断是否有线程中的环境需要局部reset new_state[i][ self.env.dones_index] = self.env.partial_reset() state[i] = new_state[i] sma.update(r) if policy_mode == 'on-policy': self.model.learn(episode=episode, step=step) self.model.writer_summary(episode, reward_mean=r.mean(), reward_min=r.min(), reward_max=r.max(), step=last_done_step, **sma.rs) self.pwi('-' * 40) self.pwi( f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}' ) if episode % save_frequency == 0: self.model.save_checkpoint(episode) if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0: self.gym_random_sample(steps=add_noise2buffer_steps) if eval_while_train and self.env.reward_threshold is not None: if r.max() >= self.env.reward_threshold: self.pwi( f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------' ) self.gym_evaluate() def gym_step_eval(self, idx, model, episodes_num, max_step): i, state, _ = self.init_variables(evaluate=True) ret = 0. ave_steps = 0. for _ in range(episodes_num): state[i] = self.eval_env.reset() r = 0. 
step = 0 while True: action = model.choose_action(s=state[0], visual_s=state[1], evaluation=True) state[i], reward, done, info = self.eval_env.step(action) reward = reward[0] done = done[0] r += reward step += 1 if done or step > max_step: ret += r ave_steps += step break model.writer_summary( idx, eval_return=ret / episodes_num, eval_ave_step=ave_steps // episodes_num, ) def gym_random_sample(self, steps): i, state, new_state = self.init_variables() state[i] = self.env.reset() for _ in range(steps): action = self.env.sample_actions() new_state[i], reward, done, info = self.env.step(action) self.model.no_op_store(s=state[0], visual_s=state[1], a=action, r=reward, s_=new_state[0], visual_s_=new_state[1], done=done) if len(self.env.dones_index): # 判断是否有线程中的环境需要局部reset new_state[i][self.env.dones_index] = self.env.partial_reset() state[i] = new_state[i] self.pwi('Noise added complete.') def gym_evaluate(self): max_step = int(self.train_args['max_step']) max_eval_episode = int(self.train_args['max_eval_eposide']) i, state, _ = self.init_variables() total_r = np.zeros(self.env.n) total_steps = np.zeros(self.env.n) episodes = max_eval_episode // self.env.n for _ in range(episodes): state[i] = self.env.reset() dones_flag = np.full(self.env.n, False) steps = np.zeros(self.env.n) r = np.zeros(self.env.n) while True: r_tem = np.zeros(self.env.n) action = self.model.choose_action( s=state[0], visual_s=state[1], evaluation=True ) # In the future, this method can be combined with choose_action state[i], reward, done, info = self.env.step(action) unfinished_index = np.where(dones_flag == False) dones_flag += done r_tem[unfinished_index] = reward[unfinished_index] steps[unfinished_index] += 1 r += r_tem if all(dones_flag) or any(steps >= max_step): break total_r += r total_steps += steps average_r = total_r.mean() / episodes average_step = int(total_steps.mean() / episodes) solved = True if average_r >= self.env.reward_threshold else False self.pwi( f'evaluate number: {max_eval_episode:3d} | average step: {average_step} | average reward: {average_r} | SOLVED: {solved}' ) self.pwi( '----------------------------------------------------------------------------------------------------------------------------' ) def gym_no_op(self): steps = self.train_args['pre_fill_steps'] choose = self.train_args['prefill_choose'] assert isinstance( steps, int ) and steps >= 0, 'no_op.steps must have type of int and larger than/equal 0' i, state, new_state = self.init_variables() state[i] = self.env.reset() steps = steps // self.env.n + 1 for step in range(steps): self.pwi(f'no op step {step}') if choose: action = self.model.choose_action(s=state[0], visual_s=state[1]) else: action = self.env.sample_actions() new_state[i], reward, done, info = self.env.step(action) self.model.no_op_store(s=state[0], visual_s=state[1], a=action, r=reward, s_=new_state[0], visual_s_=new_state[1], done=done) if len(self.env.dones_index): # 判断是否有线程中的环境需要局部reset new_state[i][self.env.dones_index] = self.env.partial_reset() state[i] = new_state[i] def gym_inference(self): i, state, _ = self.init_variables() while True: state[i] = self.env.reset() while True: self.env.render() action = self.model.choose_action(s=state[0], visual_s=state[1], evaluation=True) state[i], reward, done, info = self.env.step(action) if len(self.env.dones_index): # 判断是否有线程中的环境需要局部reset state[i][self.env.dones_index] = self.env.partial_reset() def unity_train(self): """ Train loop. Execute until episode reaches its maximum or press 'ctrl+c' artificially. 
Inputs: env: Environment for interaction. models: all models for this trianing task. save_frequency: how often to save checkpoints. reset_config: configuration to reset for Unity environment. max_step: maximum number of steps for an episode. sampler_manager: sampler configuration parameters for 'reset_config'. resampling_interval: how often to resample parameters for env reset. Variables: brain_names: a list of brain names set in Unity. state: store a list of states for each brain. each item contain a list of states for each agents that controlled by the same brain. visual_state: store a list of visual state information for each brain. action: store a list of actions for each brain. dones_flag: store a list of 'done' for each brain. use for judge whether an episode is finished for every agents. rewards: use to record rewards of agents for each brain. """ begin_episode = int(self.train_args['begin_episode']) save_frequency = int(self.train_args['save_frequency']) max_step = int(self.train_args['max_step']) max_episode = int(self.train_args['max_episode']) policy_mode = str(self.model_args['policy_mode']) moving_average_episode = int(self.train_args['moving_average_episode']) add_noise2buffer = bool(self.train_args['add_noise2buffer']) add_noise2buffer_episode_interval = int( self.train_args['add_noise2buffer_episode_interval']) add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps']) if self.use_GCN: adj, x, visual_state, action, dones_flag, rewards = zeros_initializer( self.env.brain_num, 6) sma = [ SMA(moving_average_episode) for i in range(self.env.brain_num) ] for episode in range(begin_episode, max_episode): ObsRewDone = self.env.reset() for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone): dones_flag[i] = np.zeros(self.env.brain_agents[i]) rewards[i] = np.zeros(self.env.brain_agents[i]) adj[i] = _adj x[i] = _x visual_state[i] = _vs step = 0 last_done_step = -1 while True: step += 1 for i in range(self.env.brain_num): action[i] = self.models[i].choose_action( adj=adj[i], x=x[i], visual_s=visual_state[i]) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone): unfinished_index = np.where(dones_flag[i] == False)[0] dones_flag[i] += _d self.models[i].store_data_gcn(adj=adj[i], x=x[i], visual_s=visual_state[i], a=action[i], r=_r, adj_=_adj, x_=_x, visual_s_=_vs, done=_d) rewards[i][unfinished_index] += _r[unfinished_index] adj[i] = _adj x[i] = _x visual_state[i] = _vs if policy_mode == 'off-policy': # print("advfdvsdfvfvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv") self.models[i].learn(episode=episode, step=1) if all([ all(dones_flag[i]) for i in range(self.env.brain_num) ]): if last_done_step == -1: last_done_step = step if policy_mode == 'off-policy': break if step >= max_step: break for i in range(self.env.brain_num): sma[i].update(rewards[i]) if policy_mode == 'on-policy': self.models[i].learn(episode=episode, step=step) self.models[i].writer_summary( episode, reward_mean=rewards[i].mean(), reward_min=rewards[i].min(), reward_max=rewards[i].max(), step=last_done_step, **sma[i].rs) self.pwi('-' * 40) self.pwi( f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}' ) for i in range(self.env.brain_num): self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}') if episode % save_frequency == 0: for i in range(self.env.brain_num): self.models[i].save_checkpoint(episode) if add_noise2buffer and episode % 
add_noise2buffer_episode_interval == 0: self.unity_random_sample(steps=add_noise2buffer_steps) else: state, visual_state, action, dones_flag, rewards = zeros_initializer( self.env.brain_num, 5) sma = [ SMA(moving_average_episode) for i in range(self.env.brain_num) ] for episode in range(begin_episode, max_episode): ObsRewDone = self.env.reset() for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): dones_flag[i] = np.zeros(self.env.brain_agents[i]) rewards[i] = np.zeros(self.env.brain_agents[i]) state[i] = _v visual_state[i] = _vs step = 0 last_done_step = -1 while True: step += 1 for i in range(self.env.brain_num): action[i] = self.models[i].choose_action( s=state[i], visual_s=visual_state[i]) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): unfinished_index = np.where(dones_flag[i] == False)[0] dones_flag[i] += _d self.models[i].store_data(s=state[i], visual_s=visual_state[i], a=action[i], r=_r, s_=_v, visual_s_=_vs, done=_d) rewards[i][unfinished_index] += _r[unfinished_index] state[i] = _v visual_state[i] = _vs if policy_mode == 'off-policy': self.models[i].learn(episode=episode, step=1) if all([ all(dones_flag[i]) for i in range(self.env.brain_num) ]): if last_done_step == -1: last_done_step = step if policy_mode == 'off-policy': break if step >= max_step: break for i in range(self.env.brain_num): sma[i].update(rewards[i]) if policy_mode == 'on-policy': self.models[i].learn(episode=episode, step=step) self.models[i].writer_summary( episode, reward_mean=rewards[i].mean(), reward_min=rewards[i].min(), reward_max=rewards[i].max(), step=last_done_step, **sma[i].rs) self.pwi('-' * 40) self.pwi( f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}' ) for i in range(self.env.brain_num): self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}') if episode % save_frequency == 0: for i in range(self.env.brain_num): self.models[i].save_checkpoint(episode) if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0: self.unity_random_sample(steps=add_noise2buffer_steps) def unity_random_sample(self, steps): if self.use_GCN: adj, x, visual_state = zeros_initializer(self.env.brain_num, 3) ObsRewDone = self.env.reset() for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone): adj[i] = _adj x[i] = _x visual_state[i] = _vs for _ in range(steps): action = self.env.random_action() actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone): self.models[i].store_data_gcn(adj=adj[i], x=x[i], visual_s=visual_state[i], a=action[i], r=_r, adj_=_adj, x_=_x, visual_s_=_vs, done=_d) adj[i] = _adj x[i] = _x visual_state[i] = _vs self.pwi('Noise added complete.') else: state, visual_state = zeros_initializer(self.env.brain_num, 2) ObsRewDone = self.env.reset() for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): state[i] = _v visual_state[i] = _vs for _ in range(steps): action = self.env.random_action() actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): self.models[i].store_data(s=state[i], visual_s=visual_state[i], a=action[i], r=_r, s_=_v, visual_s_=_vs, done=_d) state[i] = _v visual_state[i] = _vs self.pwi('Noise added complete.') def unity_no_op(self): ''' 
Interact with the environment but do not perform actions. Prepopulate the ReplayBuffer. Make sure steps is greater than n-step if using any n-step ReplayBuffer. ''' steps = self.train_args['pre_fill_steps'] choose = self.train_args['prefill_choose'] assert isinstance( steps, int ) and steps >= 0, 'no_op.steps must have type of int and larger than/equal 0' if self.use_GCN: adj, x, visual_state, action = zeros_initializer( self.env.brain_num, 4) ObsRewDone = self.env.reset() for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone): adj[i] = _adj x[i] = _x visual_state[i] = _vs steps = steps // min(self.env.brain_agents) + 1 for step in range(steps): self.pwi(f'no op step {step}') if choose: for i in range(self.env.brain_num): action[i] = self.models[i].choose_action( adj=adj[i], x=x, visual_s=visual_state[i]) else: action = self.env.random_action() actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) print( "77777777777777777777777777777777777777777777777777777777777777777" ) for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone): self.models[i].no_op_store_gcn(adj=adj[i], x=x[i], visual_s=visual_state[i], a=action[i], r=_r, adj_=_adj, x_=_x, visual_s_=_vs, done=_d) adj[i] = _adj x[i] = _x visual_state[i] = _vs else: state, visual_state, action = zeros_initializer( self.env.brain_num, 3) ObsRewDone = self.env.reset() for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): state[i] = _v visual_state[i] = _vs steps = steps // min(self.env.brain_agents) + 1 for step in range(steps): self.pwi(f'no op step {step}') if choose: for i in range(self.env.brain_num): action[i] = self.models[i].choose_action( s=state[i], visual_s=visual_state[i]) else: action = self.env.random_action() actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): self.models[i].no_op_store(s=state[i], visual_s=visual_state[i], a=action[i], r=_r, s_=_v, visual_s_=_vs, done=_d) state[i] = _v visual_state[i] = _vs def unity_inference(self): """ inference mode. 
algorithm model will not be train, only used to show agents' behavior """ if self.use_GCN: action = zeros_initializer(self.env.brain_num, 1) while True: ObsRewDone = self.env.reset() while True: for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone): action[i] = self.models[i].choose_action( adj=_adj, x=_x, visual_s=_vs, evaluation=True) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) else: action = zeros_initializer(self.env.brain_num, 1) while True: ObsRewDone = self.env.reset() while True: for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): action[i] = self.models[i].choose_action( s=_v, visual_s=_vs, evaluation=True) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) def ma_unity_no_op(self): steps = self.train_args['pre_fill_steps'] choose = self.train_args['prefill_choose'] assert isinstance(steps, int), 'multi-agent no_op.steps must have type of int' if steps < self.ma_data.batch_size: steps = self.ma_data.batch_size state, action, reward, next_state, dones = zeros_initializer( self.env.brain_num, 5) ObsRewDone = self.env.reset(train_mode=False) for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): state[i] = _v for i in range(self.env.brain_num): # initialize actions to zeros if self.env.is_continuous[i]: action[i] = np.zeros( (self.env.brain_agents[i], self.env.a_dim_or_list[i][0]), dtype=np.int32) else: action[i] = np.zeros( (self.env.brain_agents[i], len(self.env.a_dim_or_list[i])), dtype=np.int32) a = [np.asarray(e) for e in zip(*action)] for step in range(steps): self.pwi(f'no op step {step}') for i in range(self.env.brain_num): if choose: action[i] = self.models[i].choose_action(s=state[i]) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): reward[i] = _r[:, np.newaxis] next_state[i] = _vs dones[i] = _d[:, np.newaxis] def func(x): return [np.asarray(e) for e in zip(*x)] s, a, r, s_, done = map(func, [state, action, reward, next_state, dones]) self.ma_data.add(s, a, r, s_, done) for i in range(self.env.brain_num): state[i] = next_state[i] def ma_unity_train(self): begin_episode = int(self.train_args['begin_episode']) save_frequency = int(self.train_args['save_frequency']) max_step = int(self.train_args['max_step']) max_episode = int(self.train_args['max_episode']) policy_mode = str(self.model_args['policy_mode']) assert policy_mode == 'off-policy', "multi-agents algorithms now support off-policy only." 
batch_size = self.ma_data.batch_size state, action, new_action, next_action, reward, next_state, dones, dones_flag, rewards = zeros_initializer( self.env.brain_num, 9) for episode in range(begin_episode, max_episode): ObsRewDone = self.env.reset() for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): dones_flag[i] = np.zeros(self.env.brain_agents[i]) rewards[i] = np.zeros(self.env.brain_agents[i]) state[i] = _v step = 0 last_done_step = -1 while True: step += 1 for i in range(self.env.brain_num): action[i] = self.models[i].choose_action(s=state[i]) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions) for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): reward[i] = _r[:, np.newaxis] next_state = _v dones[i] = _d[:, np.newaxis] unfinished_index = np.where(dones_flag[i] == False)[0] dones_flag[i] += _d rewards[i][unfinished_index] += _r[unfinished_index] def func(x): return [np.asarray(e) for e in zip(*x)] s, a, r, s_, done = map( func, [state, action, reward, next_state, dones]) self.ma_data.add(s, a, r, s_, done) for i in range(self.env.brain_num): state[i] = next_state[i] s, a, r, s_, done = self.ma_data.sample() for i, brain_name in enumerate(self.env.brain_names): next_action[i] = self.models[i].get_target_action(s=s_[:, i]) new_action[i] = self.models[i].choose_action( s=s[:, i], evaluation=True) a_ = np.asarray([np.asarray(e) for e in zip(*next_action)]) if policy_mode == 'off-policy': for i in range(self.env.brain_num): self.models[i].learn( episode=episode, ap=np.asarray([ np.asarray(e) for e in zip(*next_action[:i]) ]).reshape(batch_size, -1) if i != 0 else np.zeros( (batch_size, 0)), al=np.asarray([ np.asarray(e) for e in zip( *next_action[-(self.env.brain_num - i - 1):]) ]).reshape(batch_size, -1) if self.env.brain_num - i != 1 else np.zeros( (batch_size, 0)), ss=s.reshape(batch_size, -1), ss_=s_.reshape(batch_size, -1), aa=a.reshape(batch_size, -1), aa_=a_.reshape(batch_size, -1), s=s[:, i], r=r[:, i]) if all([all(dones_flag[i]) for i in range(self.env.brain_num)]): if last_done_step == -1: last_done_step = step if policy_mode == 'off-policy': break if step >= max_step: break for i in range(self.env.brain_num): self.models[i].writer_summary(episode, total_reward=rewards[i].mean(), step=last_done_step) self.pwi('-' * 40) self.pwi( f'episode {episode:3d} | step {step:4d} last_done_step | {last_done_step:4d}' ) if episode % save_frequency == 0: for i in range(self.env.brain_num): self.models[i].save_checkpoint(episode) def ma_unity_inference(self): """ inference mode. algorithm model will not be train, only used to show agents' behavior """ action = zeros_initializer(self.env.brain_num, 1) while True: ObsRewDone = self.env.reset() while True: for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone): action[i] = self.models[i].choose_action(s=_v, evaluation=True) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } ObsRewDone = self.env.step(vector_action=actions)
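# --- Usage sketch (illustrative, not part of the original source) ---------------
# How this driver class is intended to be used; the Config contents are
# assumptions, only the calls themselves come from the class above.
#
#   agent = Agent(env_args, model_args, buffer_args, train_args)
#   agent()            # equivalent to agent.train(): pre-fill the buffer, then train
#   agent.evaluate()   # inference only: gym_inference / unity_inference / ma_unity_inference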
class MCTS_POLICY(RL_Policy):
    def __init__(self,
                 state_dim,
                 learning_rate=5.0e-4,
                 buffer_size=10000,
                 batch_size=128,
                 epochs=2,
                 name='wjs_policy',
                 cp_dir='./models'):
        super().__init__(cp_dir=cp_dir)
        self.lr = learning_rate
        self.epochs = epochs
        self.data = ExperienceReplay(batch_size=batch_size, capacity=buffer_size)
        self.net = PV(state_dim=state_dim, name='pv_net')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)

    @tf.function
    def _get_probs_and_v(self, state):
        with tf.device(self.device):
            state = tf.transpose(state, [0, 2, 3, 1])
            return self.net(state)

    def get_probs_and_v(self, game):
        '''
        Given the current state, return the probabilities of the available actions
        and the expected value of the current node.
        '''
        state = game.get_current_state().reshape(-1, 4, game.box_size, game.box_size)
        log_actions_prob, value = self._get_probs_and_v(state)
        actions_prob = np.exp(log_actions_prob)
        a, b = game.get_available_actions()
        available_actions_prob = zip(a, actions_prob[0][b])
        return available_actions_prob, value

    def learn(self):
        if self.data.is_lg_batch_size:
            s, p, v = self.data.sample()
            for i in range(self.epochs):
                summaries = self.train(s, p, v)
                loss = summaries['LOSS/loss']
                logging.info(f'epoch: {i}, loss: {loss}')
            tf.summary.experimental.set_step(self.global_step)
            self.write_training_summaries(summaries)
            tf.summary.scalar('LEARNING_RATE/lr', self.lr)
            self.writer.flush()

    @tf.function
    def train(self, s, p, v):
        s = tf.cast(s, tf.float32)
        p = tf.cast(p, tf.float32)
        v = tf.cast(v, tf.float32)
        with tf.device(self.device):
            s = tf.transpose(s, [0, 2, 3, 1])
            with tf.GradientTape() as tape:
                log_action_probs, predict_v = self.net(s)
                p_loss = -tf.reduce_mean(tf.reduce_sum(tf.multiply(p, log_action_probs), axis=-1))
                v_loss = tf.reduce_mean((v - predict_v)**2)
                l2_penalty = 1e-4 * tf.add_n([
                    tf.nn.l2_loss(w)  # loop variable renamed from `v` to avoid shadowing the value target
                    for w in self.net.trainable_variables
                    if 'bias' not in w.name.lower()
                ])
                loss = v_loss + p_loss + l2_penalty
            grads = tape.gradient(loss, self.net.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([
                ['LOSS/v_loss', v_loss],
                ['LOSS/p_loss', p_loss],
                ['LOSS/loss', loss],
            ])

    def store(self, data: list):
        for i in data:
            self.data.add(i)

    def store_in_file(self, data, file_name='./data/data'):
        with open(f'{file_name}.data', 'a') as f:
            for i in data:
                json_str = json.dumps([d.tolist() for d in i])  # convert one experience to a list
                f.write(json_str + '\n')  # save one experience per line

    def _restore_from_file(self, data, file_name='./data/data'):
        with open(f'{file_name}.data') as f:
            for json_str in f:  # one experience per line
                if json_str != '':
                    data = json.loads(json_str)
                    data = [np.array(d) for d in data]  # one experience
                    self.data.add(data)  # restore one experience
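# --- Illustrative sketch (assumption, not original code) ------------------------
# The quantity optimized by MCTS_POLICY.train is the AlphaZero-style combination
# of value regression, policy cross-entropy against the MCTS visit distribution
# `p`, and an L2 penalty on non-bias weights. The NumPy function below mirrors
# that computation; its name, argument names, and shapes are illustrative
# assumptions.
def _sketch_mcts_policy_loss(p, log_pi_hat, v, v_hat, weights, c=1e-4):
    import numpy as np  # local import keeps the sketch self-contained
    p_loss = -np.mean(np.sum(p * log_pi_hat, axis=-1))   # cross-entropy with MCTS probabilities
    v_loss = np.mean((v - v_hat) ** 2)                    # value regression
    l2 = c * sum(0.5 * np.sum(w ** 2) for w in weights)   # tf.nn.l2_loss(w) == sum(w**2) / 2
    return v_loss + p_loss + l2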
class HIRO(make_off_policy_class(mode='no_share')): ''' Data-Efficient Hierarchical Reinforcement Learning, http://arxiv.org/abs/1805.08296 ''' def __init__( self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, ployak=0.995, high_scale=1.0, reward_scale=1.0, sample_g_nums=100, sub_goal_steps=10, fn_goal_dim=0, intrinsic_reward_mode='os', high_batch_size=256, high_buffer_size=100000, low_batch_size=8, low_buffer_size=10000, high_actor_lr=1.0e-4, high_critic_lr=1.0e-3, low_actor_lr=1.0e-4, low_critic_lr=1.0e-3, hidden_units={ 'high_actor': [64, 64], 'high_critic': [64, 64], 'low_actor': [64, 64], 'low_critic': [64, 64] }, **kwargs): assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.' super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.data_high = ExperienceReplay(high_batch_size, high_buffer_size) self.data_low = ExperienceReplay(low_batch_size, low_buffer_size) self.ployak = ployak self.high_scale = np.array( high_scale if isinstance(high_scale, list) else [high_scale] * self.s_dim, dtype=np.float32) self.reward_scale = reward_scale self.fn_goal_dim = fn_goal_dim self.sample_g_nums = sample_g_nums self.sub_goal_steps = sub_goal_steps self.sub_goal_dim = self.s_dim - self.fn_goal_dim self.high_noise = rls.ClippedNormalActionNoise( mu=np.zeros(self.sub_goal_dim), sigma=self.high_scale * np.ones(self.sub_goal_dim), bound=self.high_scale / 2) self.low_noise = rls.ClippedNormalActionNoise(mu=np.zeros(self.a_dim), sigma=1.0 * np.ones(self.a_dim), bound=0.5) _high_actor_net = lambda: rls.actor_dpg(self.s_dim, self.sub_goal_dim, hidden_units['high_actor']) if self.is_continuous: _low_actor_net = lambda: rls.actor_dpg( self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[ 'low_actor']) else: _low_actor_net = lambda: rls.actor_discrete( self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[ 'low_actor']) self.gumbel_dist = tfd.Gumbel(0, 1) self.high_actor = _high_actor_net() self.high_actor_target = _high_actor_net() self.low_actor = _low_actor_net() self.low_actor_target = _low_actor_net() _high_critic_net = lambda: rls.critic_q_one( self.s_dim, self.sub_goal_dim, hidden_units['high_critic']) _low_critic_net = lambda: rls.critic_q_one( self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[ 'low_critic']) self.high_critic = DoubleQ(_high_critic_net) self.high_critic_target = DoubleQ(_high_critic_net) self.low_critic = DoubleQ(_low_critic_net) self.low_critic_target = DoubleQ(_low_critic_net) self.update_target_net_weights( self.low_actor_target.weights + self.low_critic_target.weights + self.high_actor_target.weights + self.high_critic_target.weights, self.low_actor.weights + self.low_critic.weights + self.high_actor.weights + self.high_critic.weights) self.low_actor_lr, self.low_critic_lr = map( self.init_lr, [low_actor_lr, low_critic_lr]) self.high_actor_lr, self.high_critic_lr = map( self.init_lr, [high_actor_lr, high_critic_lr]) self.low_actor_optimizer, self.low_critic_optimizer = map( self.init_optimizer, [self.low_actor_lr, self.low_critic_lr]) self.high_actor_optimizer, self.high_critic_optimizer = map( self.init_optimizer, [self.high_actor_lr, self.high_critic_lr]) self.model_recorder( dict(high_actor=self.high_actor, high_critic=self.high_critic, low_actor=self.low_actor, low_critic=self.low_critic, low_actor_optimizer=self.low_actor_optimizer, low_critic_optimizer=self.low_critic_optimizer, high_actor_optimizer=self.high_actor_optimizer, 
high_critic_optimizer=self.high_critic_optimizer)) self.counts = 0 self._high_s = [[] for _ in range(self.n_agents)] self._noop_subgoal = np.random.uniform(-self.high_scale, self.high_scale, size=(self.n_agents, self.sub_goal_dim)) self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode) def generate_ir_func(self, mode='os'): if mode == 'os': return lambda last_feat, subgoal, feat: -tf.norm( last_feat + subgoal - feat, ord=2, axis=-1, keepdims=True) elif mode == 'cos': return lambda last_feat, subgoal, feat: tf.expand_dims( -tf.keras.losses.cosine_similarity( tf.cast(feat - last_feat, tf.float32), tf.cast(subgoal, tf.float32), axis=-1), axis=-1) def show_logo(self): self.recorder.logger.info(''' xxxxx xxxxx xxxx xxxxxxx xxxxxx xx xx xx xxxxxxx xxx xxxx xx xx xx xx xxx xxx xxx xx xx xx xx xxx xx xxx xxxxxxx xx xxxxxx xx xxx xx xx xx xxxxxx xx xxx xx xx xx xx xxxx xx xxx xx xx xx xx xxx xxx xxx xxxxx xxxxx xxxx xxxxx xxxx xxxxxxx ''') def store_high_buffer(self, i): eps_len = len(self._high_s[i]) intervals = list(range(0, eps_len, self.sub_goal_steps)) if len(intervals) < 1: return left = intervals[:-1] right = intervals[1:] s, r, a, g, d, s_ = [], [], [], [], [], [] for _l, _r in zip(left, right): s.append(self._high_s[i][_l:_r]) r.append(sum(self._high_r[i][_l:_r]) * self.reward_scale) a.append(self._high_a[i][_l:_r]) g.append(self._subgoals[i][_l]) d.append(self._done[i][_r - 1]) s_.append(self._high_s_[i][_r - 1]) right = intervals[-1] s.append(self._high_s[i][right:eps_len] + [self._high_s[i][-1]] * (self.sub_goal_steps + right - eps_len)) r.append(sum(self._high_r[i][right:eps_len])) a.append(self._high_a[i][right:eps_len] + [self._high_a[i][-1]] * (self.sub_goal_steps + right - eps_len)) g.append(self._subgoals[i][right]) d.append(self._done[i][-1]) s_.append(self._high_s_[i][-1]) self.data_high.add(np.array(s), np.array(r)[:, np.newaxis], np.array(a), np.array(g), np.array(d)[:, np.newaxis], np.array(s_)) def reset(self): self._c = np.full((self.n_agents, 1), self.sub_goal_steps, np.int32) for i in range(self.n_agents): self.store_high_buffer(i) self._high_r = [[] for _ in range(self.n_agents)] self._high_a = [[] for _ in range(self.n_agents)] self._high_s = [[] for _ in range(self.n_agents)] self._subgoals = [[] for _ in range(self.n_agents)] self._done = [[] for _ in range(self.n_agents)] self._high_s_ = [[] for _ in range(self.n_agents)] self._new_subgoal = np.zeros((self.n_agents, self.sub_goal_dim), dtype=np.float32) def partial_reset(self, done): self._c = np.where( done[:, np.newaxis], np.full((self.n_agents, 1), self.sub_goal_steps, np.int32), self._c) idx = np.where(done)[0] for i in idx: self.store_high_buffer(i) self._high_s[i] = [] self._high_a[i] = [] self._high_s_[i] = [] self._high_r[i] = [] self._done[i] = [] self._subgoals[i] = [] @tf.function def _get_action(self, s, visual_s, subgoal): with tf.device(self.device): feat = tf.concat([s, subgoal], axis=-1) if self.is_continuous: mu = self.low_actor(feat) pi = tf.clip_by_value(mu + self.low_noise(), -1, 1) else: logits = self.low_actor(feat) mu = tf.argmax(logits, axis=1) cate_dist = tfd.Categorical(logits) pi = cate_dist.sample() return mu, pi def choose_action(self, s, visual_s, evaluation=False): self._subgoal = np.where(self._c == self.sub_goal_steps, self.get_subgoal(s).numpy(), self._new_subgoal) mu, pi = self._get_action(s, visual_s, self._subgoal) a = mu.numpy() if evaluation else pi.numpy() return a @tf.function def get_subgoal(self, s): ''' last_s 上一个隐状态 subgoal 上一个子目标 s 当前隐状态 ''' new_subgoal = 
self.high_scale * self.high_actor(s) new_subgoal = tf.clip_by_value(new_subgoal + self.high_noise(), -self.high_scale, self.high_scale) return new_subgoal def learn(self, **kwargs): self.episode = kwargs['episode'] for i in range(kwargs['step']): if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size: self.intermediate_variable_reset() low_data = self.get_transitions( self.data_low, data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_']) high_data = self.get_transitions( self.data_high, data_name_list=['s', 'r', 'a', 'g', 'done', 's_']) # --------------------------------------获取需要传给train函数的参数 _low_training_data = self.get_value_from_dict( data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'], data_dict=low_data) _high_training_data = self.get_value_from_dict( data_name_list=['s', 'r', 'a', 'g', 'done', 's_'], data_dict=high_data) summaries = self.train_low(_low_training_data) self.summaries.update(summaries) self.update_target_net_weights( self.low_actor_target.weights + self.low_critic_target.weights, self.low_actor.weights + self.low_critic.weights, self.ployak) if self.counts % self.sub_goal_steps == 0: self.counts = 0 high_summaries = self.train_high(_high_training_data) self.summaries.update(high_summaries) self.update_target_net_weights( self.high_actor_target.weights + self.high_critic_target.weights, self.high_actor.weights + self.high_critic.weights, self.ployak) self.counts += 1 self.summaries.update( dict([[ 'LEARNING_RATE/low_actor_lr', self.low_actor_lr(self.episode) ], [ 'LEARNING_RATE/low_critic_lr', self.low_critic_lr(self.episode) ], [ 'LEARNING_RATE/high_actor_lr', self.high_actor_lr(self.episode) ], [ 'LEARNING_RATE/high_critic_lr', self.high_critic_lr(self.episode) ]])) self.write_training_summaries(self.global_step, self.summaries) @tf.function(experimental_relax_shapes=True) def train_low(self, memories): s, a, r, s_, done, g, g_ = memories with tf.device(self.device): with tf.GradientTape() as tape: feat = tf.concat([s, g], axis=-1) feat_ = tf.concat([s_, g_], axis=-1) if self.is_continuous: target_mu = self.low_actor_target(feat_) action_target = tf.clip_by_value( target_mu + self.low_noise(), -1, 1) else: target_logits = self.low_actor_target(feat_) logp_all = tf.nn.log_softmax(target_logits) gumbel_noise = tf.cast(self.gumbel_dist.sample( [tf.shape(feat_)[0], self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax((logp_all + gumbel_noise) / 1.) 
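# The surrounding lines (Gumbel noise + softmax above, one-hot and stop_gradient
# below) form a straight-through Gumbel-Softmax estimator for the discrete-action
# case: `_pi` is a differentiable soft action sample, `_pi_true_one_hot` is its
# arg-max as a one-hot vector, and adding tf.stop_gradient(_pi_true_one_hot - _pi)
# makes the forward pass use the hard one-hot action while gradients still flow
# through the soft `_pi`.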
_pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) action_target = _pi_diff + _pi q1, q2 = self.low_critic(feat, a) q = tf.minimum(q1, q2) q_target = self.low_critic_target.get_min(feat_, action_target) dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done)) td_error1 = q1 - dc_r td_error2 = q2 - dc_r q1_loss = tf.reduce_mean(tf.square(td_error1)) q2_loss = tf.reduce_mean(tf.square(td_error2)) low_critic_loss = q1_loss + q2_loss low_critic_grads = tape.gradient(low_critic_loss, self.low_critic.weights) self.low_critic_optimizer.apply_gradients( zip(low_critic_grads, self.low_critic.weights)) with tf.GradientTape() as tape: if self.is_continuous: mu = self.low_actor(feat) else: logits = self.low_actor(feat) _pi = tf.nn.softmax(logits) _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) mu = _pi_diff + _pi q_actor = self.low_critic.Q1(feat, mu) low_actor_loss = -tf.reduce_mean(q_actor) low_actor_grads = tape.gradient(low_actor_loss, self.low_actor.trainable_variables) self.low_actor_optimizer.apply_gradients( zip(low_actor_grads, self.low_actor.trainable_variables)) self.global_step.assign_add(1) return dict([['LOSS/low_actor_loss', low_actor_loss], ['LOSS/low_critic_loss', low_critic_loss], ['Statistics/low_q_min', tf.reduce_min(q)], ['Statistics/low_q_mean', tf.reduce_mean(q)], ['Statistics/low_q_max', tf.reduce_max(q)]]) @tf.function(experimental_relax_shapes=True) def train_high(self, memories): # s_ : [B, N] ss, r, aa, g, done, s_ = memories batchs = tf.shape(ss)[0] # ss, aa [B, T, *] with tf.device(self.device): with tf.GradientTape() as tape: s = ss[:, 0] # [B, N] true_end = (s_ - s)[:, self.fn_goal_dim:] g_dist = tfd.Normal(loc=true_end, scale=0.5 * self.high_scale[None, :]) ss = tf.expand_dims(ss, 0) # [1, B, T, *] ss = tf.tile(ss, [self.sample_g_nums, 1, 1, 1]) # [10, B, T, *] ss = tf.reshape(ss, [-1, tf.shape(ss)[-1]]) # [10*B*T, *] aa = tf.expand_dims(aa, 0) # [1, B, T, *] aa = tf.tile(aa, [self.sample_g_nums, 1, 1, 1]) # [10, B, T, *] aa = tf.reshape(aa, [-1, tf.shape(aa)[-1]]) # [10*B*T, *] gs = tf.concat([ tf.expand_dims(g, 0), tf.expand_dims(true_end, 0), tf.clip_by_value(g_dist.sample(self.sample_g_nums - 2), -self.high_scale, self.high_scale) ], axis=0) # [10, B, N] all_g = gs + s[:, self.fn_goal_dim:] all_g = tf.expand_dims(all_g, 2) # [10, B, 1, N] all_g = tf.tile( all_g, [1, 1, self.sub_goal_steps, 1]) # [10, B, T, N] all_g = tf.reshape(all_g, [-1, tf.shape(all_g)[-1]]) # [10*B*T, N] all_g = all_g - ss[:, self.fn_goal_dim:] # [10*B*T, N] feat = tf.concat([ss, all_g], axis=-1) # [10*B*T, *] _aa = self.low_actor(feat) # [10*B*T, A] if not self.is_continuous: _aa = tf.one_hot(tf.argmax(_aa, axis=-1), self.a_dim, dtype=tf.float32) diff = _aa - aa diff = tf.reshape( diff, [self.sample_g_nums, batchs, self.sub_goal_steps, -1 ]) # [10, B, T, A] diff = tf.transpose(diff, [1, 0, 2, 3]) # [B, 10, T, A] logps = -0.5 * tf.reduce_sum(tf.norm(diff, ord=2, axis=-1)**2, axis=-1) # [B, 10] idx = tf.argmax(logps, axis=-1, output_type=tf.int32) idx = tf.stack([tf.range(batchs), idx], axis=1) # [B, 2] g = tf.gather_nd(tf.transpose(gs, [1, 0, 2]), idx) # [B, N] q1, q2 = self.high_critic(s, g) q = tf.minimum(q1, q2) target_sub_goal = self.high_actor_target(s_) * self.high_scale target_sub_goal = tf.clip_by_value( target_sub_goal + self.high_noise(), -self.high_scale, self.high_scale) q_target = self.high_critic_target.get_min(s_, 
target_sub_goal) dc_r = tf.stop_gradient(r + self.gamma * (1 - done) * q_target) td_error1 = q1 - dc_r td_error2 = q2 - dc_r q1_loss = tf.reduce_mean(tf.square(td_error1)) q2_loss = tf.reduce_mean(tf.square(td_error2)) high_critic_loss = q1_loss + q2_loss high_critic_grads = tape.gradient(high_critic_loss, self.high_critic.weights) self.high_critic_optimizer.apply_gradients( zip(high_critic_grads, self.high_critic.weights)) with tf.GradientTape() as tape: mu = self.high_actor(s) * self.high_scale q_actor = self.high_critic.Q1(s, mu) high_actor_loss = -tf.reduce_mean(q_actor) high_actor_grads = tape.gradient( high_actor_loss, self.high_actor.trainable_variables) self.high_actor_optimizer.apply_gradients( zip(high_actor_grads, self.high_actor.trainable_variables)) return dict([['LOSS/high_actor_loss', high_actor_loss], ['LOSS/high_critic_loss', high_critic_loss], ['Statistics/high_q_min', tf.reduce_min(q)], ['Statistics/high_q_mean', tf.reduce_mean(q)], ['Statistics/high_q_max', tf.reduce_max(q)]]) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" [o.append(_s) for o, _s in zip(self._high_s, s)] [o.append(_a) for o, _a in zip(self._high_a, a)] [o.append(_r) for o, _r in zip(self._high_r, r)] [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)] [o.append(_d) for o, _d in zip(self._done, done)] [ o.append(_subgoal) for o, _subgoal in zip(self._subgoals, self._noop_subgoal) ] ir = self.get_ir(s[:, self.fn_goal_dim:], self._noop_subgoal, s_[:, self.fn_goal_dim:]) # subgoal = s[:, self.fn_goal_dim:] + self._noop_subgoal - s_[:, self.fn_goal_dim:] subgoal = np.random.uniform(-self.high_scale, self.high_scale, size=(self.n_agents, self.sub_goal_dim)) self.data_low.add( s, a, ir, s_, done[:, np.newaxis], # 升维 self._noop_subgoal, subgoal) self._noop_subgoal = subgoal def store_data(self, s, visual_s, a, r, s_, visual_s_, done): """ for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer. 
""" assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" [o.append(_s) for o, _s in zip(self._high_s, s)] [o.append(_a) for o, _a in zip(self._high_a, a)] [o.append(_r) for o, _r in zip(self._high_r, r)] [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)] [o.append(_d) for o, _d in zip(self._done, done)] [ o.append(_subgoal) for o, _subgoal in zip(self._subgoals, self._subgoal) ] ir = self.get_ir(s[:, self.fn_goal_dim:], self._subgoal, s_[:, self.fn_goal_dim:]) self._new_subgoal = np.where( self._c == 1, self.get_subgoal(s_).numpy(), s[:, self.fn_goal_dim:] + self._subgoal - s_[:, self.fn_goal_dim:]) self.data_low.add( s, a, ir, s_, done[:, np.newaxis], # 升维 self._subgoal, self._new_subgoal) self._c = np.where( self._c == 1, np.full((self.n_agents, 1), self.sub_goal_steps, np.int32), self._c - 1) def get_transitions(self, databuffer, data_name_list=['s', 'a', 'r', 's_', 'done']): ''' TODO: Annotation ''' data = databuffer.sample() # 经验池取数据 if not self.is_continuous and 'a' in data_name_list: a_idx = data_name_list.index('a') a = data[a_idx].astype(np.int32) pre_shape = a.shape a = a.reshape(-1) a = sth.int2one_hot(a, self.a_dim) a = a.reshape(pre_shape + (-1, )) data[a_idx] = a return dict([[ n, d ] for n, d in zip(data_name_list, list(map(self.data_convert, data)))])
class Agent: def __init__(self, env_args, model_args, buffer_args, train_args): self.env_args = env_args self.model_args = model_args self.buffer_args = buffer_args self.train_args = train_args self.model_index = str(self.train_args.get('index')) self.all_learner_print = bool( self.train_args.get('all_learner_print', False)) self.train_args['name'] += f'-{self.model_index}' if self.model_args['load'] is None: self.train_args['load_model_path'] = os.path.join( self.train_args['base_dir'], self.train_args['name']) else: if '/' in self.model_args['load'] or '\\' in self.model_args[ 'load']: # 所有训练进程都以该模型路径初始化,绝对路径 self.train_args['load_model_path'] = self.model_args['load'] elif '-' in self.model_args['load']: self.train_args['load_model_path'] = os.path.join( self.train_args['base_dir'], self.model_args['load']) # 指定了名称和序号,所有训练进程都以该模型路径初始化,相对路径 else: # 只写load的训练名称,不用带进程序号,会自动补 self.train_args['load_model_path'] = os.path.join( self.train_args['base_dir'], self.model_args['load'] + f'-{self.model_index}') # ENV self.env = make_env(self.env_args) # ALGORITHM CONFIG Model, algorithm_config, _policy_mode = get_model_info( self.model_args['algo']) self.model_args['policy_mode'] = _policy_mode if self.model_args['algo_config'] is not None: algorithm_config = UpdateConfig(algorithm_config, self.model_args['algo_config'], 'algo') ShowConfig(algorithm_config) # BUFFER if _policy_mode == 'off-policy': self.buffer_args['batch_size'] = algorithm_config['batch_size'] self.buffer_args['buffer_size'] = algorithm_config['buffer_size'] _use_priority = algorithm_config.get('use_priority', False) _n_step = algorithm_config.get('n_step', False) if _use_priority and _n_step: self.buffer_args['type'] = 'NSTEP-PER' self.buffer_args['NSTEP-PER']['max_episode'] = self.train_args[ 'max_episode'] self.buffer_args['NSTEP-PER']['gamma'] = algorithm_config[ 'gamma'] elif _use_priority: self.buffer_args['type'] = 'PER' self.buffer_args['PER']['max_episode'] = self.train_args[ 'max_episode'] elif _n_step: self.buffer_args['type'] = 'NSTEP-ER' self.buffer_args['NSTEP-ER']['gamma'] = algorithm_config[ 'gamma'] else: self.buffer_args['type'] = 'ER' else: self.buffer_args['type'] = 'Pandas' # MODEL base_dir = os.path.join( self.train_args['base_dir'], self.train_args['name'] ) # train_args['base_dir'] DIR/ENV_NAME/ALGORITHM_NAME if 'batch_size' in algorithm_config.keys() and train_args['fill_in']: self.train_args['no_op_steps'] = algorithm_config['batch_size'] else: self.train_args['no_op_steps'] = train_args['no_op_steps'] if self.env_args['type'] == 'gym': # buffer ------------------------------ if 'NSTEP' in self.buffer_args['type']: self.buffer_args[self.buffer_args['type']][ 'agents_num'] = self.env_args['env_num'] self.buffer = get_buffer(self.buffer_args) # buffer ------------------------------ # model ------------------------------- model_params = { 's_dim': self.env.s_dim, 'visual_sources': self.env.visual_sources, 'visual_resolution': self.env.visual_resolution, 'a_dim_or_list': self.env.a_dim_or_list, 'is_continuous': self.env.is_continuous, 'max_episode': self.train_args['max_episode'], 'base_dir': base_dir, 'logger2file': self.model_args['logger2file'], 'seed': self.model_args['seed'] } self.model = Model(**model_params, **algorithm_config) self.model.set_buffer(self.buffer) self.model.init_or_restore( os.path.join(self.train_args['load_model_path'])) # model ------------------------------- self.train_args['begin_episode'] = self.model.get_init_episode() if not self.train_args['inference']: records_dict = { 'env': 
self.env_args, 'model': self.model_args, 'buffer': self.buffer_args, 'train': self.train_args, 'algo': algorithm_config } save_config(os.path.join(base_dir, 'config'), records_dict) else: # buffer ----------------------------------- self.buffer_args_s = [] for i in range(self.env.brain_num): _bargs = deepcopy(self.buffer_args) if 'NSTEP' in _bargs['type']: _bargs[_bargs['type']][ 'agents_num'] = self.env.brain_agents[i] self.buffer_args_s.append(_bargs) buffers = [ get_buffer(self.buffer_args_s[i]) for i in range(self.env.brain_num) ] # buffer ----------------------------------- # model ------------------------------------ self.model_args_s = [] for i in range(self.env.brain_num): _margs = deepcopy(self.model_args) _margs['seed'] = self.model_args['seed'] + i * 10 self.model_args_s.append(_margs) model_params = [ { 's_dim': self.env.s_dim[i], 'a_dim_or_list': self.env.a_dim_or_list[i], 'visual_sources': self.env.visual_sources[i], 'visual_resolution': self.env.visual_resolutions[i], 'is_continuous': self.env.is_continuous[i], 'max_episode': self.train_args['max_episode'], 'base_dir': os.path.join(base_dir, b), 'logger2file': self.model_args_s[i]['logger2file'], 'seed': self.model_args_s[i] ['seed'], # 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 } for i, b in enumerate(self.env.brain_names) ] # multi agent training------------------------------------ if self.model_args['algo'][:3] == 'ma_': self.ma = True assert self.env.brain_num > 1, 'if using ma* algorithms, number of brains must larger than 1' self.ma_data = ExperienceReplay(batch_size=10, capacity=1000) [ mp.update({ 'n': self.env.brain_num, 'i': i }) for i, mp in enumerate(model_params) ] else: self.ma = False # multi agent training------------------------------------ self.models = [ Model(**model_params[i], **algorithm_config) for i in range(self.env.brain_num) ] [ model.set_buffer(buffer) for model, buffer in zip(self.models, buffers) ] [ self.models[i].init_or_restore( os.path.join(self.train_args['load_model_path'], b)) for i, b in enumerate(self.env.brain_names) ] # model ------------------------------------ self.train_args['begin_episode'] = self.models[0].get_init_episode( ) if not self.train_args['inference']: for i, b in enumerate(self.env.brain_names): records_dict = { 'env': self.env_args, 'model': self.model_args_s[i], 'buffer': self.buffer_args_s[i], 'train': self.train_args, 'algo': algorithm_config } save_config(os.path.join(base_dir, b, 'config'), records_dict) pass def pwi(self, *args): if self.all_learner_print: print(f'| Model-{self.model_index} |', *args) elif int(self.model_index) == 0: print(f'|#ONLY#Model-{self.model_index} |', *args) def __call__(self): self.train() def train(self): if self.env_args['type'] == 'gym': try: self.gym_no_op() self.gym_train() finally: self.model.close() self.env.close() else: try: if self.ma: self.ma_unity_no_op() self.ma_unity_train() else: self.unity_no_op() self.unity_train() finally: [model.close() for model in self.models] self.env.close() def evaluate(self): if self.env_args['type'] == 'gym': self.gym_inference() else: if self.ma: self.ma_unity_inference() else: self.unity_inference() def init_variables(self): """ inputs: env: Environment outputs: i: specify which item of state should be modified state: [vector_obs, visual_obs] newstate: [vector_obs, visual_obs] """ i = 1 if self.env.obs_type == 'visual' else 0 return i, [np.array([[]] * self.env.n), np.array([[]] * self.env.n)], [ np.array([[]] * self.env.n), np.array([[]] * self.env.n) ] def get_visual_input(self, n, 
cameras, brain_obs): ''' inputs: n: agents number cameras: camera number brain_obs: observations of specified brain, including visual and vector observations. output: [vector_information, [visual_info0, visual_info1, visual_info2, ...]] ''' ss = [] for j in range(n): s = [] for k in range(cameras): s.append(brain_obs.visual_observations[k][j]) ss.append(np.array(s)) return np.array(ss) def gym_train(self): """ Inputs: env: gym environment gym_model: algorithm model begin_episode: initial episode save_frequency: how often to save checkpoints max_step: maximum number of steps in an episode max_episode: maximum number of episodes in this training task render: specify whether to render the env or not render_episode: if 'render' is False, specify from which episode to render the env policy_mode: 'on-policy' or 'off-policy' """ begin_episode = int(self.train_args['begin_episode']) render = bool(self.train_args['render']) render_episode = int(self.train_args.get('render_episode', 50000)) save_frequency = int(self.train_args['save_frequency']) max_step = int(self.train_args['max_step']) max_episode = int(self.train_args['max_episode']) eval_while_train = int(self.train_args['eval_while_train']) max_eval_episode = int(self.train_args.get('max_eval_episode')) policy_mode = str(self.model_args['policy_mode']) i, state, new_state = self.init_variables() sma = SMA(100) for episode in range(begin_episode, max_episode): state[i] = self.env.reset() dones_flag = np.full(self.env.n, False) step = 0 r = np.zeros(self.env.n) last_done_step = -1 while True: step += 1 r_tem = np.zeros(self.env.n) if render or episode > render_episode: self.env.render() action = self.model.choose_action(s=state[0], visual_s=state[1]) new_state[i], reward, done, info = self.env.step(action) unfinished_index = np.where(dones_flag == False)[0] dones_flag += done r_tem[unfinished_index] = reward[unfinished_index] r += r_tem self.model.store_data(s=state[0], visual_s=state[1], a=action, r=reward, s_=new_state[0], visual_s_=new_state[1], done=done) if policy_mode == 'off-policy': self.model.learn(episode=episode, step=1) if all(dones_flag): if last_done_step == -1: last_done_step = step if policy_mode == 'off-policy': break if step >= max_step: break if len(self.env.dones_index): # check whether any of the parallel environments needs a partial reset new_state[i][ self.env.dones_index] = self.env.partial_reset() state[i] = new_state[i] sma.update(r) if policy_mode == 'on-policy': self.model.learn(episode=episode, step=step) self.model.writer_summary(episode, reward_mean=r.mean(), reward_min=r.min(), reward_max=r.max(), step=last_done_step, **sma.rs) self.pwi('-' * 40) self.pwi( f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}' ) if episode % save_frequency == 0: self.model.save_checkpoint(episode) if eval_while_train and self.env.reward_threshold is not None: if r.max() >= self.env.reward_threshold: self.pwi( f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------' ) self.gym_evaluate() def gym_evaluate(self): max_step = int(self.train_args['max_step']) max_eval_episode = int(self.train_args['max_eval_episode']) i, state, _ = self.init_variables() total_r = np.zeros(self.env.n) total_steps = np.zeros(self.env.n) episodes = max_eval_episode // self.env.n for _ in range(episodes): state[i] = self.env.reset() dones_flag = np.full(self.env.n, False) steps = np.zeros(self.env.n) r = np.zeros(self.env.n) while True: r_tem = np.zeros(self.env.n) action =
self.model.choose_action( s=state[0], visual_s=state[1], evaluation=True ) # In the future, this method can be combined with choose_action state[i], reward, done, info = self.env.step(action) unfinished_index = np.where(dones_flag == False) dones_flag += done r_tem[unfinished_index] = reward[unfinished_index] steps[unfinished_index] += 1 r += r_tem if all(dones_flag) or any(steps >= max_step): break total_r += r total_steps += steps average_r = total_r.mean() / episodes average_step = int(total_steps.mean() / episodes) solved = True if average_r >= self.env.reward_threshold else False self.pwi( f'evaluate number: {max_eval_episode:3d} | average step: {average_step} | average reward: {average_r} | SOLVED: {solved}' ) self.pwi( '----------------------------------------------------------------------------------------------------------------------------' ) def gym_no_op(self): steps = self.train_args['no_op_steps'] choose = self.train_args['no_op_choose'] assert isinstance( steps, int ) and steps >= 0, 'no_op.steps must be an int and greater than or equal to 0' i, state, new_state = self.init_variables() state[i] = self.env.reset() steps = steps // self.env.n + 1 for step in range(steps): self.pwi(f'no op step {step}') if choose: action = self.model.choose_action(s=state[0], visual_s=state[1]) else: action = self.env.sample_actions() new_state[i], reward, done, info = self.env.step(action) self.model.no_op_store(s=state[0], visual_s=state[1], a=action, r=reward, s_=new_state[0], visual_s_=new_state[1], done=done) if len(self.env.dones_index): # check whether any of the parallel environments needs a partial reset new_state[i][self.env.dones_index] = self.env.partial_reset() state[i] = new_state[i] def gym_inference(self): i, state, _ = self.init_variables() while True: state[i] = self.env.reset() while True: self.env.render() action = self.model.choose_action(s=state[0], visual_s=state[1], evaluation=True) state[i], reward, done, info = self.env.step(action) if len(self.env.dones_index): # check whether any of the parallel environments needs a partial reset state[i][self.env.dones_index] = self.env.partial_reset() def unity_train(self): """ Train loop. Runs until the episode count reaches its maximum or the process is interrupted manually with 'ctrl+c'. Inputs: env: Environment for interaction. models: all models for this training task. save_frequency: how often to save checkpoints. reset_config: configuration to reset for Unity environment. max_step: maximum number of steps for an episode. sampler_manager: sampler configuration parameters for 'reset_config'. resampling_interval: how often to resample parameters for env reset. Variables: brain_names: a list of brain names set in Unity. state: store a list of states for each brain; each item contains a list of states for every agent controlled by the same brain. visual_state: store a list of visual state information for each brain. action: store a list of actions for each brain. dones_flag: store a list of 'done' for each brain, used to judge whether an episode is finished for every agent. agents_num: used to record the number of agents for each brain. rewards: used to record the rewards of agents for each brain.
""" begin_episode = int(self.train_args['begin_episode']) save_frequency = int(self.train_args['save_frequency']) max_step = int(self.train_args['max_step']) max_episode = int(self.train_args['max_episode']) policy_mode = str(self.model_args['policy_mode']) brains_num = len(self.env.brain_names) state = [0] * brains_num visual_state = [0] * brains_num action = [0] * brains_num dones_flag = [0] * brains_num agents_num = [0] * brains_num rewards = [0] * brains_num sma = [SMA(100) for i in range(brains_num)] for episode in range(begin_episode, max_episode): obs = self.env.reset() for i, brain_name in enumerate(self.env.brain_names): agents_num[i] = len(obs[brain_name].agents) dones_flag[i] = np.zeros(agents_num[i]) rewards[i] = np.zeros(agents_num[i]) step = 0 last_done_step = -1 while True: step += 1 for i, brain_name in enumerate(self.env.brain_names): state[i] = obs[brain_name].vector_observations visual_state[i] = self.get_visual_input( agents_num[i], self.models[i].visual_sources, obs[brain_name]) action[i] = self.models[i].choose_action( s=state[i], visual_s=visual_state[i]) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } obs = self.env.step(vector_action=actions) for i, brain_name in enumerate(self.env.brain_names): unfinished_index = np.where(dones_flag[i] == False)[0] dones_flag[i] += obs[brain_name].local_done next_state = obs[brain_name].vector_observations next_visual_state = self.get_visual_input( agents_num[i], self.models[i].visual_sources, obs[brain_name]) self.models[i].store_data( s=state[i], visual_s=visual_state[i], a=action[i], r=np.asarray(obs[brain_name].rewards), s_=next_state, visual_s_=next_visual_state, done=np.asarray(obs[brain_name].local_done)) rewards[i][unfinished_index] += np.asarray( obs[brain_name].rewards)[unfinished_index] if policy_mode == 'off-policy': self.models[i].learn(episode=episode, step=1) if all([all(dones_flag[i]) for i in range(brains_num)]): if last_done_step == -1: last_done_step = step if policy_mode == 'off-policy': break if step >= max_step: break for i in range(brains_num): sma[i].update(rewards[i]) if policy_mode == 'on-policy': self.models[i].learn(episode=episode, step=step) self.models[i].writer_summary(episode, reward_mean=rewards[i].mean(), reward_min=rewards[i].min(), reward_max=rewards[i].max(), step=last_done_step, **sma[i].rs) self.pwi('-' * 40) self.pwi( f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}' ) for i in range(brains_num): self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}') if episode % save_frequency == 0: for i in range(brains_num): self.models[i].save_checkpoint(episode) def unity_no_op(self): ''' Interact with the environment but do not perform actions. Prepopulate the ReplayBuffer. Make sure steps is greater than n-step if using any n-step ReplayBuffer. 
''' steps = self.train_args['no_op_steps'] choose = self.train_args['no_op_choose'] assert isinstance( steps, int ) and steps >= 0, 'no_op.steps must have type of int and larger than/equal 0' brains_num = len(self.env.brain_names) state = [0] * brains_num visual_state = [0] * brains_num agents_num = [0] * brains_num action = [0] * brains_num obs = self.env.reset() for i, brain_name in enumerate(self.env.brain_names): # initialize actions to zeros agents_num[i] = len(obs[brain_name].agents) if self.env.brains[ brain_name].vector_action_space_type == 'continuous': action[i] = np.zeros( (agents_num[i], self.env.brains[brain_name].vector_action_space_size[0]), dtype=np.int32) else: action[i] = np.zeros(( agents_num[i], len(self.env.brains[brain_name].vector_action_space_size)), dtype=np.int32) steps = steps // min(agents_num) + 1 for step in range(steps): self.pwi(f'no op step {step}') for i, brain_name in enumerate(self.env.brain_names): state[i] = obs[brain_name].vector_observations visual_state[i] = self.get_visual_input( agents_num[i], self.models[i].visual_sources, obs[brain_name]) if choose: action[i] = self.models[i].choose_action( s=state[i], visual_s=visual_state[i]) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } obs = self.env.step(vector_action=actions) for i, brain_name in enumerate(self.env.brain_names): next_state = obs[brain_name].vector_observations next_visual_state = self.get_visual_input( agents_num[i], self.models[i].visual_sources, obs[brain_name]) self.models[i].no_op_store( s=state[i], visual_s=visual_state[i], a=action[i], r=np.asarray(obs[brain_name].rewards), s_=next_state, visual_s_=next_visual_state, done=np.asarray(obs[brain_name].local_done)) def unity_inference(self): """ inference mode. 
the algorithm model will not be trained; it is only used to show the agents' behavior """ brains_num = len(self.env.brain_names) state = [0] * brains_num visual_state = [0] * brains_num action = [0] * brains_num agents_num = [0] * brains_num while True: obs = self.env.reset() for i, brain_name in enumerate(self.env.brain_names): agents_num[i] = len(obs[brain_name].agents) while True: for i, brain_name in enumerate(self.env.brain_names): state[i] = obs[brain_name].vector_observations visual_state[i] = self.get_visual_input( agents_num[i], self.models[i].visual_sources, obs[brain_name]) action[i] = self.models[i].choose_action( s=state[i], visual_s=visual_state[i], evaluation=True) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } obs = self.env.step(vector_action=actions) def ma_unity_no_op(self): steps = self.train_args['no_op_steps'] choose = self.train_args['no_op_choose'] assert isinstance(steps, int), 'multi-agent no_op.steps must be an int' if steps < self.ma_data.batch_size: steps = self.ma_data.batch_size brains_num = len(self.env.brain_names) agents_num = [0] * brains_num state = [0] * brains_num action = [0] * brains_num reward = [0] * brains_num next_state = [0] * brains_num dones = [0] * brains_num obs = self.env.reset(train_mode=False) for i, brain_name in enumerate(self.env.brain_names): agents_num[i] = len(obs[brain_name].agents) if self.env.brains[ brain_name].vector_action_space_type == 'continuous': action[i] = np.zeros( (agents_num[i], self.env.brains[brain_name].vector_action_space_size[0]), dtype=np.int32) else: action[i] = np.zeros(( agents_num[i], len(self.env.brains[brain_name].vector_action_space_size)), dtype=np.int32) a = [np.asarray(e) for e in zip(*action)] for step in range(steps): print(f'no op step {step}') for i, brain_name in enumerate(self.env.brain_names): state[i] = obs[brain_name].vector_observations if choose: action[i] = self.models[i].choose_action(s=state[i]) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } obs = self.env.step(vector_action=actions) for i, brain_name in enumerate(self.env.brain_names): reward[i] = np.asarray(obs[brain_name].rewards)[:, np.newaxis] next_state[i] = obs[brain_name].vector_observations dones[i] = np.asarray(obs[brain_name].local_done)[:, np.newaxis] s = [np.asarray(e) for e in zip(*state)] a = [np.asarray(e) for e in zip(*action)] r = [np.asarray(e) for e in zip(*reward)] s_ = [np.asarray(e) for e in zip(*next_state)] done = [np.asarray(e) for e in zip(*dones)] self.ma_data.add(s, a, r, s_, done) def ma_unity_train(self): begin_episode = int(self.train_args['begin_episode']) save_frequency = int(self.train_args['save_frequency']) max_step = int(self.train_args['max_step']) max_episode = int(self.train_args['max_episode']) policy_mode = str(self.model_args['policy_mode']) assert policy_mode == 'off-policy', "multi-agent algorithms currently support off-policy only."
brains_num = len(self.env.brain_names) batch_size = self.ma_data.batch_size agents_num = [0] * brains_num state = [0] * brains_num action = [0] * brains_num new_action = [0] * brains_num next_action = [0] * brains_num reward = [0] * brains_num next_state = [0] * brains_num dones = [0] * brains_num dones_flag = [0] * brains_num rewards = [0] * brains_num for episode in range(begin_episode, max_episode): obs = self.env.reset() for i, brain_name in enumerate(self.env.brain_names): agents_num[i] = len(obs[brain_name].agents) dones_flag[i] = np.zeros(agents_num[i]) rewards[i] = np.zeros(agents_num[i]) step = 0 last_done_step = -1 while True: step += 1 for i, brain_name in enumerate(self.env.brain_names): state[i] = obs[brain_name].vector_observations action[i] = self.models[i].choose_action(s=state[i]) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } obs = self.env.step(vector_action=actions) for i, brain_name in enumerate(self.env.brain_names): reward[i] = np.asarray(obs[brain_name].rewards)[:, np.newaxis] next_state[i] = obs[brain_name].vector_observations dones[i] = np.asarray( obs[brain_name].local_done)[:, np.newaxis] unfinished_index = np.where(dones_flag[i] == False)[0] dones_flag[i] += obs[brain_name].local_done rewards[i][unfinished_index] += np.asarray( obs[brain_name].rewards)[unfinished_index] s = [np.asarray(e) for e in zip(*state)] a = [np.asarray(e) for e in zip(*action)] r = [np.asarray(e) for e in zip(*reward)] s_ = [np.asarray(e) for e in zip(*next_state)] done = [np.asarray(e) for e in zip(*dones)] self.ma_data.add(s, a, r, s_, done) s, a, r, s_, done = self.ma_data.sample() for i, brain_name in enumerate(self.env.brain_names): next_action[i] = self.models[i].get_target_action(s=s_[:, i]) new_action[i] = self.models[i].choose_action( s=s[:, i], evaluation=True) a_ = np.asarray([np.asarray(e) for e in zip(*next_action)]) if policy_mode == 'off-policy': for i in range(brains_num): self.models[i].learn( episode=episode, ap=np.asarray([ np.asarray(e) for e in zip(*next_action[:i]) ]).reshape(batch_size, -1) if i != 0 else np.zeros( (batch_size, 0)), al=np.asarray([ np.asarray(e) for e in zip( *next_action[-(brains_num - i - 1):]) ]).reshape(batch_size, -1) if brains_num - i != 1 else np.zeros( (batch_size, 0)), ss=s.reshape(batch_size, -1), ss_=s_.reshape(batch_size, -1), aa=a.reshape(batch_size, -1), aa_=a_.reshape(batch_size, -1), s=s[:, i], r=r[:, i]) if all([all(dones_flag[i]) for i in range(brains_num)]): if last_done_step == -1: last_done_step = step if policy_mode == 'off-policy': break if step >= max_step: break # if train_mode == 'perEpisode': # for i in range(brains_num): # self.models[i].learn(episode) for i in range(brains_num): self.models[i].writer_summary(episode, total_reward=rewards[i].mean(), step=last_done_step) self.pwi('-' * 40) self.pwi( f'episode {episode:3d} | step {step:4d} last_done_step | {last_done_step:4d}' ) if episode % save_frequency == 0: for i in range(brains_num): self.models[i].save_checkpoint(episode) def ma_unity_inference(self): """ inference mode. 
the algorithm model will not be trained; it is only used to show the agents' behavior """ brains_num = len(self.env.brain_names) state = [0] * brains_num action = [0] * brains_num while True: obs = self.env.reset() while True: for i, brain_name in enumerate(self.env.brain_names): state[i] = obs[brain_name].vector_observations action[i] = self.models[i].choose_action(s=state[i], evaluation=True) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } obs = self.env.step(vector_action=actions)
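# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the per-agent reward bookkeeping used in
# gym_train / unity_train / ma_unity_train above: `dones_flag` remembers which
# agents have already finished, so reward collected after an agent's first
# 'done' is no longer added to its episode return. Everything below is
# illustrative numpy only; the reward/done values are made up.
import numpy as np

n = 4                                    # number of parallel agents
dones_flag = np.full(n, False)
r = np.zeros(n)                          # per-agent episode return
fake_steps = [
    (np.array([1., 1., 1., 1.]), np.array([False, True, False, False])),
    (np.array([1., 1., 1., 1.]), np.array([False, False, True, True])),
]
for reward, done in fake_steps:
    r_tem = np.zeros(n)
    unfinished_index = np.where(dones_flag == False)[0]
    dones_flag += done                   # once an agent is done it stays done
    r_tem[unfinished_index] = reward[unfinished_index]
    r += r_tem
print(r)                                 # [2. 1. 2. 2.]: agent 1 stopped accumulating after step 1
# ---------------------------------------------------------------------------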
class Off_Policy(Policy): def __init__(self, s_dim, visual_sources, visual_resolution, a_dim_or_list, is_continuous, **kwargs): super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim_or_list=a_dim_or_list, is_continuous=is_continuous, **kwargs) self.batch_size = int(kwargs.get('batch_size', 128)) self.buffer_size = int(kwargs.get('buffer_size', 10000)) self.use_priority = kwargs.get('use_priority', False) self.n_step = kwargs.get('n_step', False) self.init_data_memory() def init_data_memory(self): if self.use_priority: if self.n_step: print('N-Step PER') self.data = NStepPrioritizedExperienceReplay( self.batch_size, self.buffer_size, max_episode=self.max_episode, gamma=self.gamma, alpha=er_config['nper_config']['alpha'], beta=er_config['nper_config']['beta'], epsilon=er_config['nper_config']['epsilon'], agents_num=er_config['nper_config']['max_agents'], n=er_config['nper_config']['n'], global_v=er_config['nper_config']['global_v']) else: print('PER') self.data = PrioritizedExperienceReplay( self.batch_size, self.buffer_size, max_episode=self.max_episode, alpha=er_config['per_config']['alpha'], beta=er_config['per_config']['beta'], epsilon=er_config['per_config']['epsilon'], global_v=er_config['nper_config']['global_v']) else: if self.n_step: print('N-Step ER') self.data = NStepExperienceReplay( self.batch_size, self.buffer_size, gamma=self.gamma, agents_num=er_config['ner_config']['max_agents'], n=er_config['ner_config']['n']) else: print('ER') self.data = ExperienceReplay(self.batch_size, self.buffer_size) def store_data(self, s, visual_s, a, r, s_, visual_s_, done): """ for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer. """ assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" if not self.is_continuous: a = sth.action_index2one_hot(a, self.a_dim_or_list) self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis]) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance( a, np.ndarray), "no_op_store need action type is np.ndarray" assert isinstance( r, np.ndarray), "no_op_store need reward type is np.ndarray" assert isinstance( done, np.ndarray), "no_op_store need done type is np.ndarray" if not self.is_continuous: a = sth.action_index2one_hot(a, self.a_dim_or_list) self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis])
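# ---------------------------------------------------------------------------
# A minimal numpy-only sketch of the rank-raising that Off_Policy.store_data
# and no_op_store apply with `[:, np.newaxis]`: per-agent reward/done vectors
# of shape (n,) become column vectors of shape (n, 1) before being added to
# the replay buffer. The values below are made up.
import numpy as np

r = np.array([0.5, 1.0, -1.0])           # shape (3,)
done = np.array([False, True, False])    # shape (3,)
print(r[:, np.newaxis].shape)            # (3, 1)
print(done[:, np.newaxis].shape)         # (3, 1)
# ---------------------------------------------------------------------------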
class Policy(Base): def __init__(self, s_dim, visual_sources, visual_resolution, a_dim_or_list, action_type, gamma, max_episode, base_dir, policy_mode=None, batch_size=1, buffer_size=1, use_priority=False, n_step=False): super().__init__(a_dim_or_list, action_type, base_dir) self.s_dim = s_dim self.visual_sources = visual_sources self.visual_dim = [visual_sources, *visual_resolution ] if visual_sources else [0] self.a_dim_or_list = a_dim_or_list self.gamma = gamma self.max_episode = max_episode self.policy_mode = policy_mode self.batch_size = batch_size self.buffer_size = buffer_size ''' the biggest diffenernce between policy_modes(ON and OFF) is 'OFF' mode need raise the dimension of 'r' and 'done'. 'ON' mode means program will call on_store function and use pandas dataframe to store data. 'OFF' mode will call off_store function and use replay buffer to store data. ''' if self.policy_mode == 'ON': self.data = pd.DataFrame(columns=['s', 'a', 'r', 's_', 'done']) elif self.policy_mode == 'OFF': if use_priority: if n_step: print('N-Step PER') self.data = NStepPrioritizedExperienceReplay( self.batch_size, self.buffer_size, max_episode=self.max_episode, gamma=self.gamma, alpha=0.6, beta=0.2, epsilon=0.01, agents_num=20, n=4) else: print('PER') self.data = PrioritizedExperienceReplay( self.batch_size, self.buffer_size, max_episode=self.max_episode, alpha=0.6, beta=0.2, epsilon=0.01) else: if n_step: print('N-Step ER') self.data = NStepExperienceReplay(self.batch_size, self.buffer_size, gamma=self.gamma, agents_num=20, n=4) else: print('ER') self.data = ExperienceReplay(self.batch_size, self.buffer_size) else: raise Exception('Please specific a mode of policy!') def on_store(self, s, visual_s, a, r, s_, visual_s_, done): """ for on-policy training, use this function to store <s, a, r, s_, done> into DataFrame of Pandas. """ assert isinstance( a, np.ndarray), "on_store need action type is np.ndarray" assert isinstance( r, np.ndarray), "on_store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "on_store need done type is np.ndarray" self.data = self.data.append( { 's': s, 'visual_s': visual_s, 'a': a, 'r': r, 's_': s_, 'visual_s_': visual_s_, 'done': done }, ignore_index=True) def off_store(self, s, visual_s, a, r, s_, visual_s_, done): """ for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer. """ assert isinstance( a, np.ndarray), "off_store need action type is np.ndarray" assert isinstance( r, np.ndarray), "off_store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "off_store need done type is np.ndarray" self.data.add(s, visual_s, a, r, s_, visual_s_, done) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance( a, np.ndarray), "no_op_store need action type is np.ndarray" assert isinstance( r, np.ndarray), "no_op_store need reward type is np.ndarray" assert isinstance( done, np.ndarray), "no_op_store need done type is np.ndarray" if self.policy_mode == 'OFF': self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis]) def clear(self): """ clear the DataFrame. """ self.data.drop(self.data.index, inplace=True) def get_max_episode(self): """ get the max episode of this training model. """ return self.max_episode
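# ---------------------------------------------------------------------------
# A standalone sketch of the buffer selection in the 'OFF' branch above: the
# (use_priority, n_step) flags pick one of four replay-buffer variants. The
# helper name `select_buffer_label` is illustrative only; it returns the label
# the class prints before constructing the corresponding buffer.
def select_buffer_label(use_priority: bool, n_step: bool) -> str:
    if use_priority and n_step:
        return 'N-Step PER'   # NStepPrioritizedExperienceReplay
    if use_priority:
        return 'PER'          # PrioritizedExperienceReplay
    if n_step:
        return 'N-Step ER'    # NStepExperienceReplay
    return 'ER'               # ExperienceReplay

assert select_buffer_label(True, True) == 'N-Step PER'
assert select_buffer_label(False, False) == 'ER'
# ---------------------------------------------------------------------------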
class Policy(Base): def __init__(self, a_dim_or_list, action_type, base_dir, s_dim, visual_sources, visual_resolution, gamma, max_episode, policy_mode=None, batch_size=1, buffer_size=1, use_priority=False, n_step=False): super().__init__( a_dim_or_list=a_dim_or_list, action_type=action_type, base_dir=base_dir) self.s_dim = s_dim self.visual_sources = visual_sources self.visual_dim = [visual_sources, *visual_resolution] if visual_sources else [0] self.a_dim_or_list = a_dim_or_list self.gamma = gamma self.max_episode = max_episode self.policy_mode = policy_mode self.batch_size = batch_size self.buffer_size = buffer_size self.use_priority = use_priority self.n_step = n_step self.init_data_memory() def init_data_memory(self): ''' the biggest diffenernce between policy_modes(ON and OFF) is 'OFF' mode need raise the dimension of 'r' and 'done'. 'ON' mode means program will call on_store function and use pandas dataframe to store data. 'OFF' mode will call off_store function and use replay buffer to store data. ''' if self.policy_mode == 'ON': self.data = pd.DataFrame(columns=['s', 'a', 'r', 'done']) elif self.policy_mode == 'OFF': if self.use_priority: if self.n_step: print('N-Step PER') self.data = NStepPrioritizedExperienceReplay(self.batch_size, self.buffer_size, max_episode=self.max_episode, gamma=self.gamma, alpha=er_config['nper_config']['alpha'], beta=er_config['nper_config']['beta'], epsilon=er_config['nper_config']['epsilon'], agents_num=er_config['nper_config']['max_agents'], n=er_config['nper_config']['n'], global_v=er_config['nper_config']['global_v']) else: print('PER') self.data = PrioritizedExperienceReplay(self.batch_size, self.buffer_size, max_episode=self.max_episode, alpha=er_config['per_config']['alpha'], beta=er_config['per_config']['beta'], epsilon=er_config['per_config']['epsilon'], global_v=er_config['nper_config']['global_v']) else: if self.n_step: print('N-Step ER') self.data = NStepExperienceReplay(self.batch_size, self.buffer_size, gamma=self.gamma, agents_num=er_config['ner_config']['max_agents'], n=er_config['ner_config']['n']) else: print('ER') self.data = ExperienceReplay(self.batch_size, self.buffer_size) else: raise Exception('Please specific a mode of policy!') def on_store(self, s, visual_s, a, r, s_, visual_s_, done): """ for on-policy training, use this function to store <s, a, r, s_, done> into DataFrame of Pandas. """ assert isinstance(a, np.ndarray), "on_store need action type is np.ndarray" assert isinstance(r, np.ndarray), "on_store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "on_store need done type is np.ndarray" if not self.action_type == 'continuous': a = sth.action_index2one_hot(a, self.a_dim_or_list) self.data = self.data.append({ 's': s.astype(np.float32), 'visual_s': visual_s.astype(np.float32), 'a': a.astype(np.float32), 'r': r.astype(np.float32), 's_': s_.astype(np.float32), 'visual_s_': visual_s_.astype(np.float32), 'done': done.astype(np.float32) }, ignore_index=True) def off_store(self, s, visual_s, a, r, s_, visual_s_, done): """ for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer. 
""" assert isinstance(a, np.ndarray), "off_store need action type is np.ndarray" assert isinstance(r, np.ndarray), "off_store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "off_store need done type is np.ndarray" if not self.action_type == 'continuous': a = sth.action_index2one_hot(a, self.a_dim_or_list) self.data.add( s.astype(np.float32), visual_s.astype(np.float32), a.astype(np.float32), r.astype(np.float32), s_.astype(np.float32), visual_s_.astype(np.float32), done.astype(np.float32) ) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance(a, np.ndarray), "no_op_store need action type is np.ndarray" assert isinstance(r, np.ndarray), "no_op_store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "no_op_store need done type is np.ndarray" if self.policy_mode == 'OFF': if not self.action_type == 'continuous': a = sth.action_index2one_hot(a, self.a_dim_or_list) self.data.add( s.astype(np.float32), visual_s.astype(np.float32), a.astype(np.float32), r[:, np.newaxis].astype(np.float32), s_.astype(np.float32), visual_s_.astype(np.float32), done[:, np.newaxis].astype(np.float32) ) def clear(self): """ clear the DataFrame. """ self.data.drop(self.data.index, inplace=True) def get_max_episode(self): """ get the max episode of this training model. """ return self.max_episode def get_TensorSpecs(self, *args): """ get all inputs' shape in order to fix the problem of retracting in TF2.0 """ return [tf.TensorSpec(shape=[None] + i, dtype=tf.float32) for i in args] @staticmethod def clip_nn_log_std(log_std, _min=-20, _max=2): """ scale log_std from [-1, 1] to [_min, _max] """ return _min + 0.5 * (_max - _min) * (log_std + 1) @staticmethod def gaussian_reparam_sample(mu, log_std): """ reparameter """ std = tf.exp(log_std) pi = mu + tf.random.normal(mu.shape) * std log_pi = Policy.gaussian_likelihood(pi, mu, log_std) return pi, log_pi @staticmethod def gaussian_likelihood(x, mu, log_std): pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + 1e-8))**2 + 2 * log_std + np.log(2 * np.pi)) return tf.reduce_sum(pre_sum, axis=1, keepdims=True) @staticmethod def gaussian_entropy(log_std): return tf.reduce_mean(0.5 * (1 + tf.math.log(2 * np.pi * tf.exp(log_std)**2))) @staticmethod def squash_action(pi, log_pi=None): """ enforcing action bounds. squash action to range [-1, 1] and calculate the correct log probability value """ pi = tf.tanh(pi) if log_pi is not None: sub = tf.reduce_sum(tf.math.log(Policy.clip_but_pass_gradient(1 - pi**2, l=0, h=1) + 1e-6), axis=1, keepdims=True) log_pi -= sub return pi, log_pi @staticmethod def unsquash_action(mu, pi, log_std): """ desquash action from [-1, 1] to [-inf, inf] """ _pi = tf.atanh(pi) log_pi = Policy.gaussian_likelihood(_pi, mu, log_std) sub = tf.reduce_sum(tf.math.log(Policy.clip_but_pass_gradient(1 - pi**2, l=0, h=1) + 1e-6), axis=1, keepdims=True) log_pi -= sub return log_pi @staticmethod def clip_but_pass_gradient(x, l=-1., h=1.): """ Stole this function from SpinningUp """ clip_up = tf.cast(x > h, tf.float32) clip_low = tf.cast(x < l, tf.float32) return x + tf.stop_gradient((h - x) * clip_up + (l - x) * clip_low)
class MyPolicy(RL_Policy): """ Implement your own agent policy. """ def __init__(self, dim, name='wjs_policy'): super().__init__(dim, name) self.state_dim = dim * dim * 3 self.gamma = 0.99 self.lr = 0.0005 self.data = ExperienceReplay(batch_size=100, capacity=10000) with self.graph.as_default(): self.pl_s = tf.placeholder(tf.float32, [None, self.state_dim], 'state') self.pl_r = tf.placeholder(tf.float32, [None, 1], 'reward') self.pl_s_ = tf.placeholder(tf.float32, [None, self.state_dim], 'next_state') self.pl_done = tf.placeholder(tf.float32, [None, 1], 'done') self.v = self.v_net('v', self.pl_s) self.action = tf.argmax(self.v) self.v_ = self.v_net('v', self.pl_s_) self.predict = tf.stop_gradient(self.pl_r + self.gamma * self.v_ * (1 - self.pl_done)) self.v_loss = tf.reduce_mean(tf.squared_difference(self.v, self.predict)) self.v_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='v') optimizer = tf.train.AdamOptimizer(self.lr) self.train_v = optimizer.minimize(self.v_loss, var_list=self.v_vars, global_step=self.global_step) tf.summary.scalar('LOSS/v_loss', self.v_loss) self.summaries = tf.summary.merge_all() self.sess.run(tf.global_variables_initializer()) def update_offset(self, offset): assert isinstance(offset, int) self.offset = offset def v_net(self, name, input_vector): with tf.variable_scope(name, reuse=tf.AUTO_REUSE): l1 = tf.layers.dense(input_vector, 128, tf.nn.relu, **initKernelAndBias) l2 = tf.layers.dense(l1, 64, tf.nn.relu, **initKernelAndBias) l3 = tf.layers.dense(l2, 32, tf.nn.relu, **initKernelAndBias) v = tf.layers.dense(l3, 1, None, **initKernelAndBias) return v def store(self, **kargs): self.data.add(*kargs.values()) def choose_action(self, state): indexs, all_states = self.get_all_available_actions(state) if np.random.rand() > 0.2: action = self.sess.run(self.action, feed_dict={ self.pl_s: all_states })[0] else: action = np.random.randint(len(indexs)) x, y = indexs[action] % self.dim, indexs[action] // self.dim return x, y def learn(self): try: s, r, s_, done = self.data.sample() summaries, _ = self.sess.run([self.summaries, self.train_v], feed_dict={ self.pl_s: np.eye(3)[s].reshape(s.shape[0],-1), self.pl_r: r[:, np.newaxis], self.pl_s_: np.eye(3)[s_].reshape(s.shape[0],-1), self.pl_done: done[:, np.newaxis] }) self.writer.add_summary(summaries, self.sess.run(self.global_step)) except Exception as e: print(e) return def get_all_available_actions(self, state): assert isinstance(state, np.ndarray), "state must be a np.ndarray" indexs = [] for i in range(state.shape[0]): if state[i] == 2: indexs.append(i) all_states = [] for i in indexs: a = np.zeros_like(state) a[i] = self.offset all_states.append(state - a) return indexs, np.array([np.eye(3)[i].reshape(-1) for i in all_states])
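# ---------------------------------------------------------------------------
# A minimal numpy sketch of the state encoding used by MyPolicy.learn above:
# `np.eye(3)[s].reshape(s.shape[0], -1)` one-hot encodes each board cell (three
# possible cell values) and flattens it, giving vectors of length dim*dim*3 to
# match self.state_dim. The 2x2 boards below are made up for illustration.
import numpy as np

s = np.array([[0, 1, 2, 1],
              [2, 2, 0, 1]])             # shape (batch=2, dim*dim=4)
encoded = np.eye(3)[s].reshape(s.shape[0], -1)
print(encoded.shape)                     # (2, 12) == (batch, dim*dim*3)
print(encoded[0])                        # [1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0.]
# ---------------------------------------------------------------------------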
class MyPolicy(RL_Policy): """ Implement your own agent policy. """ def __init__(self, dim, name='wjs_policy'): super().__init__(dim, name) self.state_dim = dim * dim * 3 self.gamma = 0.99 self.lr = 0.0005 self.data = ExperienceReplay(batch_size=100, capacity=10000) self.v_net = V(vector_dim=self.state_dim, name='v_net', hidden_units=[128, 64, 32]) self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr) def update_offset(self, offset): assert isinstance(offset, int) self.offset = offset def store(self, **kargs): self.data.add(*kargs.values()) @tf.function def _get_action(self, state): return tf.argmax(self.v_net(state)) def choose_action(self, state): indexs, all_states = self.get_all_available_actions(state) if np.random.rand() > 0.2: action = self._get_action(all_states)[0] else: action = np.random.randint(len(indexs)) x, y = indexs[action] % self.dim, indexs[action] // self.dim return x, y def learn(self): try: s, r, s_, done = self.data.sample() s = np.eye(3)[s].reshape(s.shape[0], -1) r = r[:, np.newaxis] s_ = np.eye(3)[s_].reshape(s.shape[0], -1) done = done[:, np.newaxis] summaries = self.train(s, r, s_, done) tf.summary.experimental.set_step(self.global_step) self.write_training_summaries(summaries) tf.summary.scalar('LEARNING_RATE/lr', self.lr) self.writer.flush() except Exception as e: print(e) return @tf.function def train(self, s, r, s_, done): with tf.device(self.device): with tf.GradientTape() as tape: v = self.v_net(s) v_ = self.v_net(s_) predict = tf.stop_gradient(r + self.gamma * v_ * (1 - done)) v_loss = tf.reduce_mean((v - predict)**2) grads = tape.gradient(v_loss, self.v_net.trainable_variables) self.optimizer.apply_gradients( zip(grads, self.v_net.trainable_variables)) self.global_step.assign_add(1) return dict([['LOSS/v_loss', v_loss]]) def get_all_available_actions(self, state): assert isinstance(state, np.ndarray), "state must be a np.ndarray" indexs = [] for i in range(state.shape[0]): if state[i] == 2: indexs.append(i) all_states = [] for i in indexs: a = np.zeros_like(state) a[i] = self.offset all_states.append(state - a) return indexs, np.array([np.eye(3)[i].reshape(-1) for i in all_states])
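# ---------------------------------------------------------------------------
# A self-contained sketch of the TF2 update pattern that MyPolicy.train uses
# above: a TD(0) target built under tf.stop_gradient inside a GradientTape,
# then gradients applied with a Keras optimizer. The model, sizes and data
# below are placeholders, not the project's V network.
import numpy as np
import tensorflow as tf

v_net = tf.keras.Sequential([
    tf.keras.Input(shape=(12,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1),
])
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)
gamma = 0.99

@tf.function
def train_step(s, r, s_, done):
    with tf.GradientTape() as tape:
        v = v_net(s)
        v_ = v_net(s_)
        predict = tf.stop_gradient(r + gamma * v_ * (1 - done))   # TD(0) target
        v_loss = tf.reduce_mean((v - predict) ** 2)
    grads = tape.gradient(v_loss, v_net.trainable_variables)
    optimizer.apply_gradients(zip(grads, v_net.trainable_variables))
    return v_loss

s = np.random.rand(8, 12).astype(np.float32)
s_ = np.random.rand(8, 12).astype(np.float32)
r = np.random.rand(8, 1).astype(np.float32)
done = np.zeros((8, 1), dtype=np.float32)
print(float(train_step(s, r, s_, done)))
# ---------------------------------------------------------------------------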