# Standard/third-party imports used throughout this listing. Project-local
# names (RL_Policy, PV, V, ExperienceReplay, Config, make_env, get_model_info,
# UpdateConfig, ShowConfig, get_buffer, save_config, SMA, arrprint,
# zeros_initializer, initKernelAndBias) come from the surrounding repository.
import json
import logging
import os
from copy import deepcopy

import numpy as np
import tensorflow as tf


class MCTS_POLICY(RL_Policy):
    def __init__(self,
                 state_dim,
                 learning_rate=5.0e-4,
                 buffer_size=10000,
                 batch_size=128,
                 epochs=2,
                 name='wjs_policy',
                 cp_dir='./models'):
        super().__init__(cp_dir=cp_dir)
        self.lr = learning_rate
        self.epochs = epochs
        self.data = ExperienceReplay(batch_size=batch_size,
                                     capacity=buffer_size)
        self.net = PV(state_dim=state_dim, name='pv_net')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)

    @tf.function
    def _get_probs_and_v(self, state):
        with tf.device(self.device):
            state = tf.transpose(state, [0, 2, 3, 1])
            return self.net(state)

    def get_probs_and_v(self, game):
        '''
        Given the current game state, return the prior probability of each
        available action and the estimated value of the current node.
        '''
        state = game.get_current_state().reshape(-1, 4, game.box_size,
                                                 game.box_size)
        log_actions_prob, value = self._get_probs_and_v(state)
        actions_prob = np.exp(log_actions_prob)
        a, b = game.get_available_actions()
        available_actions_prob = zip(a, actions_prob[0][b])
        return available_actions_prob, value

    def learn(self):
        if self.data.is_lg_batch_size:
            s, p, v = self.data.sample()
            for i in range(self.epochs):
                summaries = self.train(s, p, v)
                loss = summaries['LOSS/loss']
                logging.info(f'epoch: {i}, loss: {loss}')
            tf.summary.experimental.set_step(self.global_step)
            self.write_training_summaries(summaries)
            tf.summary.scalar('LEARNING_RATE/lr', self.lr)
            self.writer.flush()

    @tf.function
    def train(self, s, p, v):
        s = tf.cast(s, tf.float32)
        p = tf.cast(p, tf.float32)
        v = tf.cast(v, tf.float32)
        with tf.device(self.device):
            s = tf.transpose(s, [0, 2, 3, 1])
            with tf.GradientTape() as tape:
                log_action_probs, predict_v = self.net(s)
                p_loss = -tf.reduce_mean(
                    tf.reduce_sum(tf.multiply(p, log_action_probs), axis=-1))
                v_loss = tf.reduce_mean((v - predict_v)**2)
                l2_penalty = 1e-4 * tf.add_n([
                    tf.nn.l2_loss(w) for w in self.net.trainable_variables
                    if 'bias' not in w.name.lower()
                ])
                loss = v_loss + p_loss + l2_penalty
            grads = tape.gradient(loss, self.net.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([
                ['LOSS/v_loss', v_loss],
                ['LOSS/p_loss', p_loss],
                ['LOSS/loss', loss],
            ])

    def store(self, data: list):
        for i in data:
            self.data.add(i)

    def store_in_file(self, data, file_name='./data/data'):
        with open(f'{file_name}.data', 'a') as f:
            for i in data:
                # serialize one experience as a list of lists
                json_str = json.dumps([d.tolist() for d in i])
                f.write(json_str + '\n')  # one experience per line

    def _restore_from_file(self, data, file_name='./data/data'):
        with open(f'{file_name}.data') as f:
            for json_str in f:  # one experience per line
                if json_str.strip():
                    exp = [np.array(d) for d in json.loads(json_str)]
                    self.data.add(exp)  # restore one experience
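
# Usage sketch (illustrative; assumes an AlphaZero-style self-play driver that
# is not part of this listing). `game`, `run_selfplay`, and `num_games` are
# hypothetical names; each stored experience is a (state, mcts_probs, value)
# triple matching the (s, p, v) layout sampled in learn().
#
#   policy = MCTS_POLICY(state_dim=4)
#   for _ in range(num_games):
#       experiences = run_selfplay(game, policy)  # MCTS calls get_probs_and_v()
#       policy.store(experiences)                 # fill the replay buffer
#       policy.learn()                            # trains once a batch is ready
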
class Agent:
    def __init__(self, env_args: Config, model_args: Config,
                 buffer_args: Config, train_args: Config):
        self.env_args = env_args
        self.model_args = model_args
        self.buffer_args = buffer_args
        self.train_args = train_args
        self.use_GCN = False

        self.model_index = str(self.train_args.get('index'))
        self.all_learner_print = bool(
            self.train_args.get('all_learner_print', False))
        if '-' not in self.train_args['name']:
            self.train_args['name'] += f'-{self.model_index}'
        if self.model_args['load'] is None:
            self.train_args['load_model_path'] = os.path.join(
                self.train_args['base_dir'], self.train_args['name'])
        else:
            if '/' in self.model_args['load'] or '\\' in self.model_args['load']:
                # absolute path: every training process initializes from this model path
                self.train_args['load_model_path'] = self.model_args['load']
            elif '-' in self.model_args['load']:
                # name and index both given: every training process initializes
                # from this relative model path
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'], self.model_args['load'])
            else:
                # only the training name is given; the process index is appended automatically
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'] + f'-{self.model_index}')

        # ENV
        self.env = make_env(self.env_args.to_dict, self.use_GCN)

        # ALGORITHM CONFIG
        Model, algorithm_config, _policy_mode = get_model_info(
            self.model_args['algo'])
        self.model_args['policy_mode'] = _policy_mode
        if self.model_args['algo_config'] is not None:
            algorithm_config = UpdateConfig(algorithm_config,
                                            self.model_args['algo_config'],
                                            'algo')
        ShowConfig(algorithm_config)

        # BUFFER
        if _policy_mode == 'off-policy':
            self.buffer_args['batch_size'] = algorithm_config['batch_size']
            self.buffer_args['buffer_size'] = algorithm_config['buffer_size']
            if self.model_args['algo'] in ['drqn', 'drdqn']:
                self.buffer_args['type'] = 'EpisodeER'
            else:
                _use_priority = algorithm_config.get('use_priority', False)
                _n_step = algorithm_config.get('n_step', False)
                if _use_priority and _n_step:
                    self.buffer_args['type'] = 'NstepPER'
                    self.buffer_args['NstepPER']['max_episode'] = self.train_args['max_episode']
                    self.buffer_args['NstepPER']['gamma'] = algorithm_config['gamma']
                    # update gamma for n-step training
                    algorithm_config['gamma'] = pow(
                        algorithm_config['gamma'],
                        self.buffer_args['NstepPER']['n'])
                elif _use_priority:
                    self.buffer_args['type'] = 'PER'
                    self.buffer_args['PER']['max_episode'] = self.train_args['max_episode']
                elif _n_step:
                    self.buffer_args['type'] = 'NstepER'
                    self.buffer_args['NstepER']['gamma'] = algorithm_config['gamma']
                    algorithm_config['gamma'] = pow(
                        algorithm_config['gamma'],
                        self.buffer_args['NstepER']['n'])
                else:
                    self.buffer_args['type'] = 'ER'
        else:
            self.buffer_args['type'] = 'Pandas'

        # MODEL
        base_dir = os.path.join(
            self.train_args['base_dir'],
            self.train_args['name'])  # train_args['base_dir'] DIR/ENV_NAME/ALGORITHM_NAME
        if 'batch_size' in algorithm_config.keys() and train_args['fill_in']:
            self.train_args['pre_fill_steps'] = algorithm_config['batch_size']

        if self.env_args['type'] == 'gym':
            self.eval_env_args = deepcopy(self.env_args)
            self.eval_env_args.env_num = 1
            self.eval_env = make_env(self.eval_env_args.to_dict)
            # buffer ------------------------------
            if 'Nstep' in self.buffer_args['type'] or 'Episode' in self.buffer_args['type']:
                self.buffer_args[self.buffer_args['type']]['agents_num'] = self.env_args['env_num']
            self.buffer = get_buffer(self.buffer_args)
            # buffer ------------------------------

            # model -------------------------------
            model_params = {
                's_dim': self.env.s_dim,
                'visual_sources': self.env.visual_sources,
                'visual_resolution': self.env.visual_resolution,
                'a_dim_or_list': self.env.a_dim_or_list,
                'is_continuous': self.env.is_continuous,
                'max_episode': self.train_args.max_episode,
                'base_dir': base_dir,
                'logger2file': self.model_args.logger2file,
                'seed': self.model_args.seed
            }
            self.model = Model(**model_params, **algorithm_config)
            self.model.set_buffer(self.buffer)
            self.model.init_or_restore(self.train_args['load_model_path'])
            # model -------------------------------

            self.train_args['begin_episode'] = self.model.get_init_episode()
            if not self.train_args['inference']:
                records_dict = {
                    'env': self.env_args.to_dict,
                    'model': self.model_args.to_dict,
                    'buffer': self.buffer_args.to_dict,
                    'train': self.train_args.to_dict,
                    'algo': algorithm_config
                }
                save_config(os.path.join(base_dir, 'config'), records_dict)
        else:
            # buffer -----------------------------------
            self.buffer_args_s = []
            for i in range(self.env.brain_num):
                _bargs = deepcopy(self.buffer_args)
                if 'Nstep' in _bargs['type'] or 'Episode' in _bargs['type']:
                    _bargs[_bargs['type']]['agents_num'] = self.env.brain_agents[i]
                self.buffer_args_s.append(_bargs)
            buffers = [
                get_buffer(self.buffer_args_s[i])
                for i in range(self.env.brain_num)
            ]
            # buffer -----------------------------------

            # model ------------------------------------
            self.model_args_s = []
            for i in range(self.env.brain_num):
                _margs = deepcopy(self.model_args)
                _margs['seed'] = self.model_args['seed'] + i * 10
                self.model_args_s.append(_margs)
            model_params = [
                {
                    's_dim': self.env.s_dim[i],
                    'a_dim_or_list': self.env.a_dim_or_list[i],
                    'visual_sources': self.env.visual_sources[i],
                    'visual_resolution': self.env.visual_resolutions[i],
                    'is_continuous': self.env.is_continuous[i],
                    'max_episode': self.train_args.max_episode,
                    'base_dir': os.path.join(base_dir, b),
                    'logger2file': self.model_args_s[i].logger2file,
                    'seed': self.model_args_s[i].seed,  # 0, 10, 20, ..., 100
                } for i, b in enumerate(self.env.brain_names)
            ]

            # multi-agent training ------------------------------------
            if self.model_args['algo'][:3] == 'ma_':
                self.ma = True
                assert self.env.brain_num > 1, \
                    'if using ma* algorithms, the number of brains must be larger than 1'
                self.ma_data = ExperienceReplay(batch_size=10, capacity=1000)
                for i, mp in enumerate(model_params):
                    mp.update({'n': self.env.brain_num, 'i': i})
            else:
                self.ma = False
            # multi-agent training ------------------------------------

            self.models = [
                Model(**model_params[i], **algorithm_config)
                for i in range(self.env.brain_num)
            ]
            for model, buffer in zip(self.models, buffers):
                model.set_buffer(buffer)
            for i, b in enumerate(self.env.brain_names):
                self.models[i].init_or_restore(
                    os.path.join(self.train_args['load_model_path'], b))
            # model ------------------------------------

            self.train_args['begin_episode'] = self.models[0].get_init_episode()
            if not self.train_args['inference']:
                for i, b in enumerate(self.env.brain_names):
                    records_dict = {
                        'env': self.env_args.to_dict,
                        'model': self.model_args_s[i].to_dict,
                        'buffer': self.buffer_args_s[i].to_dict,
                        'train': self.train_args.to_dict,
                        'algo': algorithm_config
                    }
                    save_config(os.path.join(base_dir, b, 'config'),
                                records_dict)

    def pwi(self, *args):
        if self.all_learner_print:
            print(f'| Model-{self.model_index} |', *args)
        elif int(self.model_index) == 0:
            print(f'|#ONLY#Model-{self.model_index} |', *args)

    def __call__(self):
        self.train()

    def train(self):
        if self.env_args['type'] == 'gym':
            try:
                self.gym_no_op()
                self.gym_train()
            finally:
                self.model.close()
                self.env.close()
        else:
            try:
                if self.ma:
                    self.ma_unity_no_op()
                    self.ma_unity_train()
                else:
                    self.unity_no_op()
                    self.unity_train()
            finally:
                for model in self.models:
                    model.close()
                self.env.close()

    def evaluate(self):
        if self.env_args['type'] == 'gym':
            self.gym_inference()
        else:
            if self.ma:
                self.ma_unity_inference()
            else:
                self.unity_inference()

    def init_variables(self, evaluate=False):
        """
        inputs:
            env: Environment
        outputs:
            i: specify which item of state should be modified
            state: [vector_obs, visual_obs]
            newstate: [vector_obs, visual_obs]
        """
        env = self.eval_env if evaluate else self.env
        i = 1 if env.obs_type == 'visual' else 0
        return (i,
                [np.array([[]] * env.n), np.array([[]] * env.n)],
                [np.array([[]] * env.n), np.array([[]] * env.n)])

    def gym_train(self):
        """
        Inputs:
            env: gym environment
            gym_model: algorithm model
            begin_episode: initial episode
            save_frequency: how often to save checkpoints
            max_step: maximum number of steps in an episode
            max_episode: maximum number of episodes in this training task
            render: specify whether to render the env or not
            render_episode: if 'render' is false, specify from which episode to render the env
            policy_mode: 'on-policy' or 'off-policy'
        """
        begin_episode = int(self.train_args['begin_episode'])
        render = bool(self.train_args['render'])
        render_episode = int(self.train_args.get('render_episode', 50000))
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        eval_while_train = bool(self.train_args['eval_while_train'])
        max_eval_episode = int(self.train_args.get('max_eval_episode'))
        off_policy_step_eval = bool(self.train_args['off_policy_step_eval'])
        off_policy_step_eval_num = int(
            self.train_args.get('off_policy_step_eval_num'))
        policy_mode = str(self.model_args['policy_mode'])
        moving_average_episode = int(self.train_args['moving_average_episode'])
        add_noise2buffer = bool(self.train_args['add_noise2buffer'])
        add_noise2buffer_episode_interval = int(
            self.train_args['add_noise2buffer_episode_interval'])
        add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])
        total_step_control = bool(self.train_args['total_step_control'])
        max_total_step = int(self.train_args['max_total_step'])
        if total_step_control:
            max_episode = max_total_step

        i, state, new_state = self.init_variables()
        sma = SMA(moving_average_episode)
        total_step = 0
        for episode in range(begin_episode, max_episode):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            step = 0
            r = np.zeros(self.env.n)
            last_done_step = -1
            while True:
                step += 1
                r_tem = np.zeros(self.env.n)
                if render or episode > render_episode:
                    self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
                new_state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)[0]
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                r += r_tem
                self.model.store_data(s=state[0],
                                      visual_s=state[1],
                                      a=action,
                                      r=reward,
                                      s_=new_state[0],
                                      visual_s_=new_state[1],
                                      done=done)
                if policy_mode == 'off-policy':
                    self.model.learn(episode=episode, step=1)
                    if off_policy_step_eval:
                        self.gym_step_eval(total_step, self.model,
                                           off_policy_step_eval_num, max_step)
                total_step += 1
                if total_step_control and total_step > max_total_step:
                    return
                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break
                if step >= max_step:
                    break
                if len(self.env.dones_index):
                    # partially reset any parallel envs that are done
                    new_state[i][self.env.dones_index] = self.env.partial_reset()
                state[i] = new_state[i]

            sma.update(r)
            if policy_mode == 'on-policy':
                self.model.learn(episode=episode, step=step)
            self.model.writer_summary(episode,
                                      reward_mean=r.mean(),
                                      reward_min=r.min(),
                                      reward_max=r.max(),
                                      step=last_done_step,
                                      **sma.rs)
            self.pwi('-' * 40)
            self.pwi(
                f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}'
            )
            if episode % save_frequency == 0:
                self.model.save_checkpoint(episode)

            if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                self.gym_random_sample(steps=add_noise2buffer_steps)

            if eval_while_train and self.env.reward_threshold is not None:
                if r.max() >= self.env.reward_threshold:
                    self.pwi(
                        f'------------------------------------------- Evaluate episode: {episode:3d} --------------------------------------------------'
                    )
                    self.gym_evaluate()

    def gym_step_eval(self, idx, model, episodes_num, max_step):
        i, state, _ = self.init_variables(evaluate=True)
        ret = 0.
        ave_steps = 0.
        for _ in range(episodes_num):
            state[i] = self.eval_env.reset()
            r = 0.
            step = 0
            while True:
                action = model.choose_action(s=state[0],
                                             visual_s=state[1],
                                             evaluation=True)
                state[i], reward, done, info = self.eval_env.step(action)
                reward = reward[0]
                done = done[0]
                r += reward
                step += 1
                if done or step > max_step:
                    ret += r
                    ave_steps += step
                    break

        model.writer_summary(
            idx,
            eval_return=ret / episodes_num,
            eval_ave_step=ave_steps // episodes_num,
        )

    def gym_random_sample(self, steps):
        i, state, new_state = self.init_variables()
        state[i] = self.env.reset()
        for _ in range(steps):
            action = self.env.sample_actions()
            new_state[i], reward, done, info = self.env.step(action)
            self.model.no_op_store(s=state[0],
                                   visual_s=state[1],
                                   a=action,
                                   r=reward,
                                   s_=new_state[0],
                                   visual_s_=new_state[1],
                                   done=done)
            if len(self.env.dones_index):
                # partially reset any parallel envs that are done
                new_state[i][self.env.dones_index] = self.env.partial_reset()
            state[i] = new_state[i]
        self.pwi('Noise addition complete.')

    def gym_evaluate(self):
        max_step = int(self.train_args['max_step'])
        max_eval_episode = int(self.train_args['max_eval_episode'])
        i, state, _ = self.init_variables()
        total_r = np.zeros(self.env.n)
        total_steps = np.zeros(self.env.n)
        episodes = max_eval_episode // self.env.n
        for _ in range(episodes):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            steps = np.zeros(self.env.n)
            r = np.zeros(self.env.n)
            while True:
                r_tem = np.zeros(self.env.n)
                # In the future, this method can be combined with choose_action
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1],
                                                  evaluation=True)
                state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                steps[unfinished_index] += 1
                r += r_tem
                if all(dones_flag) or any(steps >= max_step):
                    break
            total_r += r
            total_steps += steps
        average_r = total_r.mean() / episodes
        average_step = int(total_steps.mean() / episodes)
        solved = average_r >= self.env.reward_threshold
        self.pwi(
            f'evaluate number: {max_eval_episode:3d} | average step: {average_step} | average reward: {average_r} | SOLVED: {solved}'
        )
        self.pwi('-' * 124)

    def gym_no_op(self):
        steps = self.train_args['pre_fill_steps']
        choose = self.train_args['prefill_choose']
        assert isinstance(steps, int) and steps >= 0, \
            'no_op.steps must be an int and >= 0'
        i, state, new_state = self.init_variables()
        state[i] = self.env.reset()
        steps = steps // self.env.n + 1
        for step in range(steps):
            self.pwi(f'no op step {step}')
            if choose:
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
            else:
                action = self.env.sample_actions()
            new_state[i], reward, done, info = self.env.step(action)
            self.model.no_op_store(s=state[0],
                                   visual_s=state[1],
                                   a=action,
                                   r=reward,
                                   s_=new_state[0],
                                   visual_s_=new_state[1],
                                   done=done)
            if len(self.env.dones_index):
                # partially reset any parallel envs that are done
                new_state[i][self.env.dones_index] = self.env.partial_reset()
            state[i] = new_state[i]

    def gym_inference(self):
        i, state, _ = self.init_variables()
        while True:
            state[i] = self.env.reset()
            while True:
                self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1],
                                                  evaluation=True)
                state[i], reward, done, info = self.env.step(action)
                if len(self.env.dones_index):
                    # partially reset any parallel envs that are done
                    state[i][self.env.dones_index] = self.env.partial_reset()

    def unity_train(self):
        """
        Train loop. Execute until the episode reaches its maximum, or until
        'ctrl+c' is pressed.
        Inputs:
            env: Environment for interaction.
            models: all models for this training task.
            save_frequency: how often to save checkpoints.
            reset_config: configuration for resetting the Unity environment.
            max_step: maximum number of steps for an episode.
            sampler_manager: sampler configuration parameters for 'reset_config'.
            resampling_interval: how often to resample parameters for env reset.
        Variables:
            brain_names: a list of brain names set in Unity.
            state: a list of states for each brain; each item contains a list
                of states for each agent controlled by the same brain.
            visual_state: a list of visual-state information for each brain.
            action: a list of actions for each brain.
            dones_flag: a list of 'done' flags for each brain, used to judge
                whether an episode is finished for every agent.
            rewards: used to record the rewards of agents for each brain.
        """
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])
        moving_average_episode = int(self.train_args['moving_average_episode'])
        add_noise2buffer = bool(self.train_args['add_noise2buffer'])
        add_noise2buffer_episode_interval = int(
            self.train_args['add_noise2buffer_episode_interval'])
        add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])

        if self.use_GCN:
            adj, x, visual_state, action, dones_flag, rewards = zeros_initializer(
                self.env.brain_num, 6)
            sma = [SMA(moving_average_episode) for i in range(self.env.brain_num)]
            for episode in range(begin_episode, max_episode):
                ObsRewDone = self.env.reset()
                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    dones_flag[i] = np.zeros(self.env.brain_agents[i])
                    rewards[i] = np.zeros(self.env.brain_agents[i])
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs
                step = 0
                last_done_step = -1
                while True:
                    step += 1
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            adj=adj[i], x=x[i], visual_s=visual_state[i])
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)
                    for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                        unfinished_index = np.where(dones_flag[i] == False)[0]
                        dones_flag[i] += _d
                        self.models[i].store_data_gcn(adj=adj[i],
                                                      x=x[i],
                                                      visual_s=visual_state[i],
                                                      a=action[i],
                                                      r=_r,
                                                      adj_=_adj,
                                                      x_=_x,
                                                      visual_s_=_vs,
                                                      done=_d)
                        rewards[i][unfinished_index] += _r[unfinished_index]
                        adj[i] = _adj
                        x[i] = _x
                        visual_state[i] = _vs
                        if policy_mode == 'off-policy':
                            self.models[i].learn(episode=episode, step=1)
                    if all([all(dones_flag[i]) for i in range(self.env.brain_num)]):
                        if last_done_step == -1:
                            last_done_step = step
                        if policy_mode == 'off-policy':
                            break
                    if step >= max_step:
                        break

                for i in range(self.env.brain_num):
                    sma[i].update(rewards[i])
                    if policy_mode == 'on-policy':
                        self.models[i].learn(episode=episode, step=step)
                    self.models[i].writer_summary(episode,
                                                  reward_mean=rewards[i].mean(),
                                                  reward_min=rewards[i].min(),
                                                  reward_max=rewards[i].max(),
                                                  step=last_done_step,
                                                  **sma[i].rs)
                self.pwi('-' * 40)
                self.pwi(
                    f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
                )
                for i in range(self.env.brain_num):
                    self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
                if episode % save_frequency == 0:
                    for i in range(self.env.brain_num):
                        self.models[i].save_checkpoint(episode)
                if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                    self.unity_random_sample(steps=add_noise2buffer_steps)
        else:
            state, visual_state, action, dones_flag, rewards = zeros_initializer(
                self.env.brain_num, 5)
            sma = [SMA(moving_average_episode) for i in range(self.env.brain_num)]
            for episode in range(begin_episode, max_episode):
                ObsRewDone = self.env.reset()
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    dones_flag[i] = np.zeros(self.env.brain_agents[i])
                    rewards[i] = np.zeros(self.env.brain_agents[i])
                    state[i] = _v
                    visual_state[i] = _vs
                step = 0
                last_done_step = -1
                while True:
                    step += 1
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            s=state[i], visual_s=visual_state[i])
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)
                    for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                        unfinished_index = np.where(dones_flag[i] == False)[0]
                        dones_flag[i] += _d
                        self.models[i].store_data(s=state[i],
                                                  visual_s=visual_state[i],
                                                  a=action[i],
                                                  r=_r,
                                                  s_=_v,
                                                  visual_s_=_vs,
                                                  done=_d)
                        rewards[i][unfinished_index] += _r[unfinished_index]
                        state[i] = _v
                        visual_state[i] = _vs
                        if policy_mode == 'off-policy':
                            self.models[i].learn(episode=episode, step=1)
                    if all([all(dones_flag[i]) for i in range(self.env.brain_num)]):
                        if last_done_step == -1:
                            last_done_step = step
                        if policy_mode == 'off-policy':
                            break
                    if step >= max_step:
                        break

                for i in range(self.env.brain_num):
                    sma[i].update(rewards[i])
                    if policy_mode == 'on-policy':
                        self.models[i].learn(episode=episode, step=step)
                    self.models[i].writer_summary(episode,
                                                  reward_mean=rewards[i].mean(),
                                                  reward_min=rewards[i].min(),
                                                  reward_max=rewards[i].max(),
                                                  step=last_done_step,
                                                  **sma[i].rs)
                self.pwi('-' * 40)
                self.pwi(
                    f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
                )
                for i in range(self.env.brain_num):
                    self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
                if episode % save_frequency == 0:
                    for i in range(self.env.brain_num):
                        self.models[i].save_checkpoint(episode)
                if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                    self.unity_random_sample(steps=add_noise2buffer_steps)

    def unity_random_sample(self, steps):
        if self.use_GCN:
            adj, x, visual_state = zeros_initializer(self.env.brain_num, 3)
            ObsRewDone = self.env.reset()
            for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                adj[i] = _adj
                x[i] = _x
                visual_state[i] = _vs
            for _ in range(steps):
                action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].store_data_gcn(adj=adj[i],
                                                  x=x[i],
                                                  visual_s=visual_state[i],
                                                  a=action[i],
                                                  r=_r,
                                                  adj_=_adj,
                                                  x_=_x,
                                                  visual_s_=_vs,
                                                  done=_d)
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs
            self.pwi('Noise addition complete.')
        else:
            state, visual_state = zeros_initializer(self.env.brain_num, 2)
            ObsRewDone = self.env.reset()
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                state[i] = _v
                visual_state[i] = _vs
            for _ in range(steps):
                action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].store_data(s=state[i],
                                              visual_s=visual_state[i],
                                              a=action[i],
                                              r=_r,
                                              s_=_v,
                                              visual_s_=_vs,
                                              done=_d)
                    state[i] = _v
                    visual_state[i] = _vs
            self.pwi('Noise addition complete.')

    def unity_no_op(self):
        '''
        Interact with the environment but do not perform actions.
        Prepopulate the ReplayBuffer.
        Make sure steps is greater than n-step if using any n-step ReplayBuffer.
        '''
        steps = self.train_args['pre_fill_steps']
        choose = self.train_args['prefill_choose']
        assert isinstance(steps, int) and steps >= 0, \
            'no_op.steps must be an int and >= 0'
        if self.use_GCN:
            adj, x, visual_state, action = zeros_initializer(
                self.env.brain_num, 4)
            ObsRewDone = self.env.reset()
            for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                adj[i] = _adj
                x[i] = _x
                visual_state[i] = _vs
            steps = steps // min(self.env.brain_agents) + 1
            for step in range(steps):
                self.pwi(f'no op step {step}')
                if choose:
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            adj=adj[i], x=x[i], visual_s=visual_state[i])
                else:
                    action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].no_op_store_gcn(adj=adj[i],
                                                   x=x[i],
                                                   visual_s=visual_state[i],
                                                   a=action[i],
                                                   r=_r,
                                                   adj_=_adj,
                                                   x_=_x,
                                                   visual_s_=_vs,
                                                   done=_d)
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs
        else:
            state, visual_state, action = zeros_initializer(
                self.env.brain_num, 3)
            ObsRewDone = self.env.reset()
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                state[i] = _v
                visual_state[i] = _vs
            steps = steps // min(self.env.brain_agents) + 1
            for step in range(steps):
                self.pwi(f'no op step {step}')
                if choose:
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            s=state[i], visual_s=visual_state[i])
                else:
                    action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].no_op_store(s=state[i],
                                               visual_s=visual_state[i],
                                               a=action[i],
                                               r=_r,
                                               s_=_v,
                                               visual_s_=_vs,
                                               done=_d)
                    state[i] = _v
                    visual_state[i] = _vs

    def unity_inference(self):
        """
        Inference mode.
        The algorithm model will not be trained; it is only used to show the
        agents' behavior.
        """
        if self.use_GCN:
            action = zeros_initializer(self.env.brain_num, 1)
            while True:
                ObsRewDone = self.env.reset()
                while True:
                    for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                        action[i] = self.models[i].choose_action(
                            adj=_adj, x=_x, visual_s=_vs, evaluation=True)
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)
        else:
            action = zeros_initializer(self.env.brain_num, 1)
            while True:
                ObsRewDone = self.env.reset()
                while True:
                    for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                        action[i] = self.models[i].choose_action(
                            s=_v, visual_s=_vs, evaluation=True)
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)

    def ma_unity_no_op(self):
        steps = self.train_args['pre_fill_steps']
        choose = self.train_args['prefill_choose']
        assert isinstance(steps, int), \
            'multi-agent no_op.steps must be an int'
        if steps < self.ma_data.batch_size:
            steps = self.ma_data.batch_size
        state, action, reward, next_state, dones = zeros_initializer(
            self.env.brain_num, 5)
        ObsRewDone = self.env.reset(train_mode=False)
        for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
            state[i] = _v
        for i in range(self.env.brain_num):
            # initialize actions to zeros
            if self.env.is_continuous[i]:
                action[i] = np.zeros(
                    (self.env.brain_agents[i], self.env.a_dim_or_list[i][0]),
                    dtype=np.int32)
            else:
                action[i] = np.zeros(
                    (self.env.brain_agents[i], len(self.env.a_dim_or_list[i])),
                    dtype=np.int32)
        a = [np.asarray(e) for e in zip(*action)]
        for step in range(steps):
            self.pwi(f'no op step {step}')
            for i in range(self.env.brain_num):
                if choose:
                    action[i] = self.models[i].choose_action(s=state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(self.env.brain_names)
            }
            ObsRewDone = self.env.step(vector_action=actions)
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                reward[i] = _r[:, np.newaxis]
                next_state[i] = _v
                dones[i] = _d[:, np.newaxis]

            def func(x):
                return [np.asarray(e) for e in zip(*x)]

            s, a, r, s_, done = map(
                func, [state, action, reward, next_state, dones])
            self.ma_data.add(s, a, r, s_, done)
            for i in range(self.env.brain_num):
                state[i] = next_state[i]

    def ma_unity_train(self):
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])
        assert policy_mode == 'off-policy', \
            'multi-agent algorithms currently support off-policy only.'
        batch_size = self.ma_data.batch_size
        state, action, new_action, next_action, reward, next_state, dones, dones_flag, rewards = zeros_initializer(
            self.env.brain_num, 9)
        for episode in range(begin_episode, max_episode):
            ObsRewDone = self.env.reset()
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                dones_flag[i] = np.zeros(self.env.brain_agents[i])
                rewards[i] = np.zeros(self.env.brain_agents[i])
                state[i] = _v
            step = 0
            last_done_step = -1
            while True:
                step += 1
                for i in range(self.env.brain_num):
                    action[i] = self.models[i].choose_action(s=state[i])
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    reward[i] = _r[:, np.newaxis]
                    next_state[i] = _v
                    dones[i] = _d[:, np.newaxis]
                    unfinished_index = np.where(dones_flag[i] == False)[0]
                    dones_flag[i] += _d
                    rewards[i][unfinished_index] += _r[unfinished_index]

                def func(x):
                    return [np.asarray(e) for e in zip(*x)]

                s, a, r, s_, done = map(
                    func, [state, action, reward, next_state, dones])
                self.ma_data.add(s, a, r, s_, done)
                for i in range(self.env.brain_num):
                    state[i] = next_state[i]
                s, a, r, s_, done = self.ma_data.sample()
                for i, brain_name in enumerate(self.env.brain_names):
                    next_action[i] = self.models[i].get_target_action(s=s_[:, i])
                    new_action[i] = self.models[i].choose_action(
                        s=s[:, i], evaluation=True)
                a_ = np.asarray([np.asarray(e) for e in zip(*next_action)])
                if policy_mode == 'off-policy':
                    for i in range(self.env.brain_num):
                        self.models[i].learn(
                            episode=episode,
                            ap=np.asarray([
                                np.asarray(e) for e in zip(*next_action[:i])
                            ]).reshape(batch_size, -1) if i != 0 else np.zeros(
                                (batch_size, 0)),
                            al=np.asarray([
                                np.asarray(e) for e in zip(
                                    *next_action[-(self.env.brain_num - i - 1):])
                            ]).reshape(batch_size, -1)
                            if self.env.brain_num - i != 1 else np.zeros(
                                (batch_size, 0)),
                            ss=s.reshape(batch_size, -1),
                            ss_=s_.reshape(batch_size, -1),
                            aa=a.reshape(batch_size, -1),
                            aa_=a_.reshape(batch_size, -1),
                            s=s[:, i],
                            r=r[:, i])
                if all([all(dones_flag[i]) for i in range(self.env.brain_num)]):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break
                if step >= max_step:
                    break

            for i in range(self.env.brain_num):
                self.models[i].writer_summary(episode,
                                              total_reward=rewards[i].mean(),
                                              step=last_done_step)
            self.pwi('-' * 40)
            self.pwi(
                f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
            )
            if episode % save_frequency == 0:
                for i in range(self.env.brain_num):
                    self.models[i].save_checkpoint(episode)

    def ma_unity_inference(self):
        """
        Inference mode. The algorithm model will not be trained; it is only
        used to show the agents' behavior.
        """
        action = zeros_initializer(self.env.brain_num, 1)
        while True:
            ObsRewDone = self.env.reset()
            while True:
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    action[i] = self.models[i].choose_action(s=_v,
                                                             evaluation=True)
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
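
# Entry-point sketch (illustrative; the concrete Config contents are
# assumptions based on how the keys are read above, not a documented API, and
# `load_configs` is a hypothetical helper).
#
#   env_args, model_args, buffer_args, train_args = load_configs()
#   agent = Agent(env_args, model_args, buffer_args, train_args)
#   agent()            # __call__ dispatches to train()
#   # agent.evaluate() # or run inference instead
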
class MyPolicy(RL_Policy):
    """
    A custom agent policy (TensorFlow 1.x implementation).
    """
    def __init__(self, dim, name='wjs_policy'):
        super().__init__(dim, name)
        self.state_dim = dim * dim * 3
        self.gamma = 0.99
        self.lr = 0.0005
        self.data = ExperienceReplay(batch_size=100, capacity=10000)
        with self.graph.as_default():
            self.pl_s = tf.placeholder(tf.float32, [None, self.state_dim],
                                       'state')
            self.pl_r = tf.placeholder(tf.float32, [None, 1], 'reward')
            self.pl_s_ = tf.placeholder(tf.float32, [None, self.state_dim],
                                        'next_state')
            self.pl_done = tf.placeholder(tf.float32, [None, 1], 'done')
            self.v = self.v_net('v', self.pl_s)
            self.action = tf.argmax(self.v)
            self.v_ = self.v_net('v', self.pl_s_)
            self.predict = tf.stop_gradient(
                self.pl_r + self.gamma * self.v_ * (1 - self.pl_done))
            self.v_loss = tf.reduce_mean(
                tf.squared_difference(self.v, self.predict))
            self.v_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope='v')
            optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_v = optimizer.minimize(self.v_loss,
                                              var_list=self.v_vars,
                                              global_step=self.global_step)
            tf.summary.scalar('LOSS/v_loss', self.v_loss)
            self.summaries = tf.summary.merge_all()
            self.sess.run(tf.global_variables_initializer())

    def update_offset(self, offset):
        assert isinstance(offset, int)
        self.offset = offset

    def v_net(self, name, input_vector):
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            l1 = tf.layers.dense(input_vector, 128, tf.nn.relu,
                                 **initKernelAndBias)
            l2 = tf.layers.dense(l1, 64, tf.nn.relu, **initKernelAndBias)
            l3 = tf.layers.dense(l2, 32, tf.nn.relu, **initKernelAndBias)
            v = tf.layers.dense(l3, 1, None, **initKernelAndBias)
            return v

    def store(self, **kargs):
        self.data.add(*kargs.values())

    def choose_action(self, state):
        indexs, all_states = self.get_all_available_actions(state)
        if np.random.rand() > 0.2:  # epsilon-greedy with epsilon = 0.2
            action = self.sess.run(self.action,
                                   feed_dict={self.pl_s: all_states})[0]
        else:
            action = np.random.randint(len(indexs))
        x, y = indexs[action] % self.dim, indexs[action] // self.dim
        return x, y

    def learn(self):
        try:
            s, r, s_, done = self.data.sample()
            summaries, _ = self.sess.run(
                [self.summaries, self.train_v],
                feed_dict={
                    self.pl_s: np.eye(3)[s].reshape(s.shape[0], -1),
                    self.pl_r: r[:, np.newaxis],
                    self.pl_s_: np.eye(3)[s_].reshape(s_.shape[0], -1),
                    self.pl_done: done[:, np.newaxis]
                })
            self.writer.add_summary(summaries,
                                    self.sess.run(self.global_step))
        except Exception as e:
            print(e)
            return

    def get_all_available_actions(self, state):
        assert isinstance(state, np.ndarray), 'state is not a numpy array'
        indexs = []
        for i in range(state.shape[0]):
            if state[i] == 2:
                indexs.append(i)
        all_states = []
        for i in indexs:
            a = np.zeros_like(state)
            a[i] = self.offset
            all_states.append(state - a)
        return indexs, np.array(
            [np.eye(3)[i].reshape(-1) for i in all_states])
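
# Note on the state encoding used above (illustrative): board cells take values
# in {0, 1, 2} (2 marks a playable cell, per get_all_available_actions), and
# np.eye(3)[s] expands a length-(dim*dim) board into a (dim*dim, 3) one-hot
# matrix that is flattened to state_dim = dim*dim*3 features. A minimal check:
#
#   >>> import numpy as np
#   >>> np.eye(3)[np.array([0, 2, 1])].reshape(-1)
#   array([1., 0., 0., 0., 0., 1., 0., 1., 0.])
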
class Agent:
    def __init__(self, env_args, model_args, buffer_args, train_args):
        self.env_args = env_args
        self.model_args = model_args
        self.buffer_args = buffer_args
        self.train_args = train_args

        self.model_index = str(self.train_args.get('index'))
        self.all_learner_print = bool(
            self.train_args.get('all_learner_print', False))
        self.train_args['name'] += f'-{self.model_index}'
        if self.model_args['load'] is None:
            self.train_args['load_model_path'] = os.path.join(
                self.train_args['base_dir'], self.train_args['name'])
        else:
            if '/' in self.model_args['load'] or '\\' in self.model_args['load']:
                # absolute path: every training process initializes from this model path
                self.train_args['load_model_path'] = self.model_args['load']
            elif '-' in self.model_args['load']:
                # name and index both given: every training process initializes
                # from this relative model path
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'], self.model_args['load'])
            else:
                # only the training name is given; the process index is appended automatically
                self.train_args['load_model_path'] = os.path.join(
                    self.train_args['base_dir'],
                    self.model_args['load'] + f'-{self.model_index}')

        # ENV
        self.env = make_env(self.env_args)

        # ALGORITHM CONFIG
        Model, algorithm_config, _policy_mode = get_model_info(
            self.model_args['algo'])
        self.model_args['policy_mode'] = _policy_mode
        if self.model_args['algo_config'] is not None:
            algorithm_config = UpdateConfig(algorithm_config,
                                            self.model_args['algo_config'],
                                            'algo')
        ShowConfig(algorithm_config)

        # BUFFER
        if _policy_mode == 'off-policy':
            self.buffer_args['batch_size'] = algorithm_config['batch_size']
            self.buffer_args['buffer_size'] = algorithm_config['buffer_size']
            _use_priority = algorithm_config.get('use_priority', False)
            _n_step = algorithm_config.get('n_step', False)
            if _use_priority and _n_step:
                self.buffer_args['type'] = 'NSTEP-PER'
                self.buffer_args['NSTEP-PER']['max_episode'] = self.train_args['max_episode']
                self.buffer_args['NSTEP-PER']['gamma'] = algorithm_config['gamma']
            elif _use_priority:
                self.buffer_args['type'] = 'PER'
                self.buffer_args['PER']['max_episode'] = self.train_args['max_episode']
            elif _n_step:
                self.buffer_args['type'] = 'NSTEP-ER'
                self.buffer_args['NSTEP-ER']['gamma'] = algorithm_config['gamma']
            else:
                self.buffer_args['type'] = 'ER'
        else:
            self.buffer_args['type'] = 'Pandas'

        # MODEL
        base_dir = os.path.join(
            self.train_args['base_dir'],
            self.train_args['name'])  # train_args['base_dir'] DIR/ENV_NAME/ALGORITHM_NAME
        if 'batch_size' in algorithm_config.keys() and train_args['fill_in']:
            self.train_args['no_op_steps'] = algorithm_config['batch_size']
        else:
            self.train_args['no_op_steps'] = train_args['no_op_steps']

        if self.env_args['type'] == 'gym':
            # buffer ------------------------------
            if 'NSTEP' in self.buffer_args['type']:
                self.buffer_args[self.buffer_args['type']]['agents_num'] = self.env_args['env_num']
            self.buffer = get_buffer(self.buffer_args)
            # buffer ------------------------------

            # model -------------------------------
            model_params = {
                's_dim': self.env.s_dim,
                'visual_sources': self.env.visual_sources,
                'visual_resolution': self.env.visual_resolution,
                'a_dim_or_list': self.env.a_dim_or_list,
                'is_continuous': self.env.is_continuous,
                'max_episode': self.train_args['max_episode'],
                'base_dir': base_dir,
                'logger2file': self.model_args['logger2file'],
                'seed': self.model_args['seed']
            }
            self.model = Model(**model_params, **algorithm_config)
            self.model.set_buffer(self.buffer)
            self.model.init_or_restore(
                os.path.join(self.train_args['load_model_path']))
            # model -------------------------------

            self.train_args['begin_episode'] = self.model.get_init_episode()
            if not self.train_args['inference']:
                records_dict = {
                    'env': self.env_args,
                    'model': self.model_args,
                    'buffer': self.buffer_args,
                    'train': self.train_args,
                    'algo': algorithm_config
                }
                save_config(os.path.join(base_dir, 'config'), records_dict)
        else:
            # buffer -----------------------------------
            self.buffer_args_s = []
            for i in range(self.env.brain_num):
                _bargs = deepcopy(self.buffer_args)
                if 'NSTEP' in _bargs['type']:
                    _bargs[_bargs['type']]['agents_num'] = self.env.brain_agents[i]
                self.buffer_args_s.append(_bargs)
            buffers = [
                get_buffer(self.buffer_args_s[i])
                for i in range(self.env.brain_num)
            ]
            # buffer -----------------------------------

            # model ------------------------------------
            self.model_args_s = []
            for i in range(self.env.brain_num):
                _margs = deepcopy(self.model_args)
                _margs['seed'] = self.model_args['seed'] + i * 10
                self.model_args_s.append(_margs)
            model_params = [
                {
                    's_dim': self.env.s_dim[i],
                    'a_dim_or_list': self.env.a_dim_or_list[i],
                    'visual_sources': self.env.visual_sources[i],
                    'visual_resolution': self.env.visual_resolutions[i],
                    'is_continuous': self.env.is_continuous[i],
                    'max_episode': self.train_args['max_episode'],
                    'base_dir': os.path.join(base_dir, b),
                    'logger2file': self.model_args_s[i]['logger2file'],
                    'seed': self.model_args_s[i]['seed'],  # 0, 10, 20, ..., 100
                } for i, b in enumerate(self.env.brain_names)
            ]

            # multi-agent training ------------------------------------
            if self.model_args['algo'][:3] == 'ma_':
                self.ma = True
                assert self.env.brain_num > 1, \
                    'if using ma* algorithms, the number of brains must be larger than 1'
                self.ma_data = ExperienceReplay(batch_size=10, capacity=1000)
                for i, mp in enumerate(model_params):
                    mp.update({'n': self.env.brain_num, 'i': i})
            else:
                self.ma = False
            # multi-agent training ------------------------------------

            self.models = [
                Model(**model_params[i], **algorithm_config)
                for i in range(self.env.brain_num)
            ]
            for model, buffer in zip(self.models, buffers):
                model.set_buffer(buffer)
            for i, b in enumerate(self.env.brain_names):
                self.models[i].init_or_restore(
                    os.path.join(self.train_args['load_model_path'], b))
            # model ------------------------------------

            self.train_args['begin_episode'] = self.models[0].get_init_episode()
            if not self.train_args['inference']:
                for i, b in enumerate(self.env.brain_names):
                    records_dict = {
                        'env': self.env_args,
                        'model': self.model_args_s[i],
                        'buffer': self.buffer_args_s[i],
                        'train': self.train_args,
                        'algo': algorithm_config
                    }
                    save_config(os.path.join(base_dir, b, 'config'),
                                records_dict)

    def pwi(self, *args):
        if self.all_learner_print:
            print(f'| Model-{self.model_index} |', *args)
        elif int(self.model_index) == 0:
            print(f'|#ONLY#Model-{self.model_index} |', *args)

    def __call__(self):
        self.train()

    def train(self):
        if self.env_args['type'] == 'gym':
            try:
                self.gym_no_op()
                self.gym_train()
            finally:
                self.model.close()
                self.env.close()
        else:
            try:
                if self.ma:
                    self.ma_unity_no_op()
                    self.ma_unity_train()
                else:
                    self.unity_no_op()
                    self.unity_train()
            finally:
                for model in self.models:
                    model.close()
                self.env.close()

    def evaluate(self):
        if self.env_args['type'] == 'gym':
            self.gym_inference()
        else:
            if self.ma:
                self.ma_unity_inference()
            else:
                self.unity_inference()

    def init_variables(self):
        """
        inputs:
            env: Environment
        outputs:
            i: specify which item of state should be modified
            state: [vector_obs, visual_obs]
            newstate: [vector_obs, visual_obs]
        """
        i = 1 if self.env.obs_type == 'visual' else 0
        return (i,
                [np.array([[]] * self.env.n), np.array([[]] * self.env.n)],
                [np.array([[]] * self.env.n), np.array([[]] * self.env.n)])

    def get_visual_input(self, n, cameras, brain_obs):
        '''
        inputs:
            n: number of agents
            cameras: number of cameras
            brain_obs: observations of the specified brain, including visual
                and vector observations.
        output:
            [vector_information, [visual_info0, visual_info1, visual_info2, ...]]
        '''
        ss = []
        for j in range(n):
            s = []
            for k in range(cameras):
                s.append(brain_obs.visual_observations[k][j])
            ss.append(np.array(s))
        return np.array(ss)

    def gym_train(self):
        """
        Inputs:
            env: gym environment
            gym_model: algorithm model
            begin_episode: initial episode
            save_frequency: how often to save checkpoints
            max_step: maximum number of steps in an episode
            max_episode: maximum number of episodes in this training task
            render: specify whether to render the env or not
            render_episode: if 'render' is false, specify from which episode to render the env
            policy_mode: 'on-policy' or 'off-policy'
        """
        begin_episode = int(self.train_args['begin_episode'])
        render = bool(self.train_args['render'])
        render_episode = int(self.train_args.get('render_episode', 50000))
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        eval_while_train = bool(self.train_args['eval_while_train'])
        max_eval_episode = int(self.train_args.get('max_eval_episode'))
        policy_mode = str(self.model_args['policy_mode'])

        i, state, new_state = self.init_variables()
        sma = SMA(100)
        for episode in range(begin_episode, max_episode):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            step = 0
            r = np.zeros(self.env.n)
            last_done_step = -1
            while True:
                step += 1
                r_tem = np.zeros(self.env.n)
                if render or episode > render_episode:
                    self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
                new_state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)[0]
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                r += r_tem
                self.model.store_data(s=state[0],
                                      visual_s=state[1],
                                      a=action,
                                      r=reward,
                                      s_=new_state[0],
                                      visual_s_=new_state[1],
                                      done=done)
                if policy_mode == 'off-policy':
                    self.model.learn(episode=episode, step=1)
                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break
                if step >= max_step:
                    break
                if len(self.env.dones_index):
                    # partially reset any parallel envs that are done
                    new_state[i][self.env.dones_index] = self.env.partial_reset()
                state[i] = new_state[i]

            sma.update(r)
            if policy_mode == 'on-policy':
                self.model.learn(episode=episode, step=step)
            self.model.writer_summary(episode,
                                      reward_mean=r.mean(),
                                      reward_min=r.min(),
                                      reward_max=r.max(),
                                      step=last_done_step,
                                      **sma.rs)
            self.pwi('-' * 40)
            self.pwi(
                f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}'
            )
            if episode % save_frequency == 0:
                self.model.save_checkpoint(episode)
            if eval_while_train and self.env.reward_threshold is not None:
                if r.max() >= self.env.reward_threshold:
                    self.pwi(
                        f'------------------------------------------- Evaluate episode: {episode:3d} --------------------------------------------------'
                    )
                    self.gym_evaluate()

    def gym_evaluate(self):
        max_step = int(self.train_args['max_step'])
        max_eval_episode = int(self.train_args['max_eval_episode'])
        i, state, _ = self.init_variables()
        total_r = np.zeros(self.env.n)
        total_steps = np.zeros(self.env.n)
        episodes = max_eval_episode // self.env.n
        for _ in range(episodes):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            steps = np.zeros(self.env.n)
            r = np.zeros(self.env.n)
            while True:
                r_tem = np.zeros(self.env.n)
                # In the future, this method can be combined with choose_action
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1],
                                                  evaluation=True)
                state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                steps[unfinished_index] += 1
                r += r_tem
                if all(dones_flag) or any(steps >= max_step):
                    break
            total_r += r
            total_steps += steps
        average_r = total_r.mean() / episodes
        average_step = int(total_steps.mean() / episodes)
        solved = average_r >= self.env.reward_threshold
        self.pwi(
            f'evaluate number: {max_eval_episode:3d} | average step: {average_step} | average reward: {average_r} | SOLVED: {solved}'
        )
        self.pwi('-' * 124)

    def gym_no_op(self):
        steps = self.train_args['no_op_steps']
        choose = self.train_args['no_op_choose']
        assert isinstance(steps, int) and steps >= 0, \
            'no_op.steps must be an int and >= 0'
        i, state, new_state = self.init_variables()
        state[i] = self.env.reset()
        steps = steps // self.env.n + 1
        for step in range(steps):
            self.pwi(f'no op step {step}')
            if choose:
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
            else:
                action = self.env.sample_actions()
            new_state[i], reward, done, info = self.env.step(action)
            self.model.no_op_store(s=state[0],
                                   visual_s=state[1],
                                   a=action,
                                   r=reward,
                                   s_=new_state[0],
                                   visual_s_=new_state[1],
                                   done=done)
            if len(self.env.dones_index):
                # partially reset any parallel envs that are done
                new_state[i][self.env.dones_index] = self.env.partial_reset()
            state[i] = new_state[i]

    def gym_inference(self):
        i, state, _ = self.init_variables()
        while True:
            state[i] = self.env.reset()
            while True:
                self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1],
                                                  evaluation=True)
                state[i], reward, done, info = self.env.step(action)
                if len(self.env.dones_index):
                    # partially reset any parallel envs that are done
                    state[i][self.env.dones_index] = self.env.partial_reset()

    def unity_train(self):
        """
        Train loop. Execute until the episode reaches its maximum, or until
        'ctrl+c' is pressed.
        Inputs:
            env: Environment for interaction.
            models: all models for this training task.
            save_frequency: how often to save checkpoints.
            reset_config: configuration for resetting the Unity environment.
            max_step: maximum number of steps for an episode.
            sampler_manager: sampler configuration parameters for 'reset_config'.
            resampling_interval: how often to resample parameters for env reset.
        Variables:
            brain_names: a list of brain names set in Unity.
            state: a list of states for each brain; each item contains a list
                of states for each agent controlled by the same brain.
            visual_state: a list of visual-state information for each brain.
            action: a list of actions for each brain.
            dones_flag: a list of 'done' flags for each brain, used to judge
                whether an episode is finished for every agent.
            agents_num: used to record the number of agents for each brain.
            rewards: used to record the rewards of agents for each brain.
""" begin_episode = int(self.train_args['begin_episode']) save_frequency = int(self.train_args['save_frequency']) max_step = int(self.train_args['max_step']) max_episode = int(self.train_args['max_episode']) policy_mode = str(self.model_args['policy_mode']) brains_num = len(self.env.brain_names) state = [0] * brains_num visual_state = [0] * brains_num action = [0] * brains_num dones_flag = [0] * brains_num agents_num = [0] * brains_num rewards = [0] * brains_num sma = [SMA(100) for i in range(brains_num)] for episode in range(begin_episode, max_episode): obs = self.env.reset() for i, brain_name in enumerate(self.env.brain_names): agents_num[i] = len(obs[brain_name].agents) dones_flag[i] = np.zeros(agents_num[i]) rewards[i] = np.zeros(agents_num[i]) step = 0 last_done_step = -1 while True: step += 1 for i, brain_name in enumerate(self.env.brain_names): state[i] = obs[brain_name].vector_observations visual_state[i] = self.get_visual_input( agents_num[i], self.models[i].visual_sources, obs[brain_name]) action[i] = self.models[i].choose_action( s=state[i], visual_s=visual_state[i]) actions = { f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names) } obs = self.env.step(vector_action=actions) for i, brain_name in enumerate(self.env.brain_names): unfinished_index = np.where(dones_flag[i] == False)[0] dones_flag[i] += obs[brain_name].local_done next_state = obs[brain_name].vector_observations next_visual_state = self.get_visual_input( agents_num[i], self.models[i].visual_sources, obs[brain_name]) self.models[i].store_data( s=state[i], visual_s=visual_state[i], a=action[i], r=np.asarray(obs[brain_name].rewards), s_=next_state, visual_s_=next_visual_state, done=np.asarray(obs[brain_name].local_done)) rewards[i][unfinished_index] += np.asarray( obs[brain_name].rewards)[unfinished_index] if policy_mode == 'off-policy': self.models[i].learn(episode=episode, step=1) if all([all(dones_flag[i]) for i in range(brains_num)]): if last_done_step == -1: last_done_step = step if policy_mode == 'off-policy': break if step >= max_step: break for i in range(brains_num): sma[i].update(rewards[i]) if policy_mode == 'on-policy': self.models[i].learn(episode=episode, step=step) self.models[i].writer_summary(episode, reward_mean=rewards[i].mean(), reward_min=rewards[i].min(), reward_max=rewards[i].max(), step=last_done_step, **sma[i].rs) self.pwi('-' * 40) self.pwi( f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}' ) for i in range(brains_num): self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}') if episode % save_frequency == 0: for i in range(brains_num): self.models[i].save_checkpoint(episode) def unity_no_op(self): ''' Interact with the environment but do not perform actions. Prepopulate the ReplayBuffer. Make sure steps is greater than n-step if using any n-step ReplayBuffer. 
        '''
        steps = self.train_args['no_op_steps']
        choose = self.train_args['no_op_choose']
        assert isinstance(steps, int) and steps >= 0, \
            'no_op.steps must be an int and >= 0'
        brains_num = len(self.env.brain_names)
        state = [0] * brains_num
        visual_state = [0] * brains_num
        agents_num = [0] * brains_num
        action = [0] * brains_num

        obs = self.env.reset()
        for i, brain_name in enumerate(self.env.brain_names):
            # initialize actions to zeros
            agents_num[i] = len(obs[brain_name].agents)
            if self.env.brains[brain_name].vector_action_space_type == 'continuous':
                action[i] = np.zeros(
                    (agents_num[i],
                     self.env.brains[brain_name].vector_action_space_size[0]),
                    dtype=np.int32)
            else:
                action[i] = np.zeros(
                    (agents_num[i],
                     len(self.env.brains[brain_name].vector_action_space_size)),
                    dtype=np.int32)
        steps = steps // min(agents_num) + 1
        for step in range(steps):
            self.pwi(f'no op step {step}')
            for i, brain_name in enumerate(self.env.brain_names):
                state[i] = obs[brain_name].vector_observations
                visual_state[i] = self.get_visual_input(
                    agents_num[i], self.models[i].visual_sources,
                    obs[brain_name])
                if choose:
                    action[i] = self.models[i].choose_action(
                        s=state[i], visual_s=visual_state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(self.env.brain_names)
            }
            obs = self.env.step(vector_action=actions)
            for i, brain_name in enumerate(self.env.brain_names):
                next_state = obs[brain_name].vector_observations
                next_visual_state = self.get_visual_input(
                    agents_num[i], self.models[i].visual_sources,
                    obs[brain_name])
                self.models[i].no_op_store(
                    s=state[i],
                    visual_s=visual_state[i],
                    a=action[i],
                    r=np.asarray(obs[brain_name].rewards),
                    s_=next_state,
                    visual_s_=next_visual_state,
                    done=np.asarray(obs[brain_name].local_done))

    def unity_inference(self):
        """
        Inference mode.
        The algorithm model will not be trained; it is only used to show the
        agents' behavior.
        """
        brains_num = len(self.env.brain_names)
        state = [0] * brains_num
        visual_state = [0] * brains_num
        action = [0] * brains_num
        agents_num = [0] * brains_num
        while True:
            obs = self.env.reset()
            for i, brain_name in enumerate(self.env.brain_names):
                agents_num[i] = len(obs[brain_name].agents)
            while True:
                for i, brain_name in enumerate(self.env.brain_names):
                    state[i] = obs[brain_name].vector_observations
                    visual_state[i] = self.get_visual_input(
                        agents_num[i], self.models[i].visual_sources,
                        obs[brain_name])
                    action[i] = self.models[i].choose_action(
                        s=state[i], visual_s=visual_state[i], evaluation=True)
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                obs = self.env.step(vector_action=actions)

    def ma_unity_no_op(self):
        steps = self.train_args['no_op_steps']
        choose = self.train_args['no_op_choose']
        assert isinstance(steps, int), \
            'multi-agent no_op.steps must be an int'
        if steps < self.ma_data.batch_size:
            steps = self.ma_data.batch_size
        brains_num = len(self.env.brain_names)
        agents_num = [0] * brains_num
        state = [0] * brains_num
        action = [0] * brains_num
        reward = [0] * brains_num
        next_state = [0] * brains_num
        dones = [0] * brains_num

        obs = self.env.reset(train_mode=False)
        for i, brain_name in enumerate(self.env.brain_names):
            # initialize actions to zeros
            agents_num[i] = len(obs[brain_name].agents)
            if self.env.brains[brain_name].vector_action_space_type == 'continuous':
                action[i] = np.zeros(
                    (agents_num[i],
                     self.env.brains[brain_name].vector_action_space_size[0]),
                    dtype=np.int32)
            else:
                action[i] = np.zeros(
                    (agents_num[i],
                     len(self.env.brains[brain_name].vector_action_space_size)),
                    dtype=np.int32)
        a = [np.asarray(e) for e in zip(*action)]
        for step in range(steps):
            self.pwi(f'no op step {step}')
            for i, brain_name in enumerate(self.env.brain_names):
                state[i] = obs[brain_name].vector_observations
                if choose:
                    action[i] = self.models[i].choose_action(s=state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(self.env.brain_names)
            }
            obs = self.env.step(vector_action=actions)
            for i, brain_name in enumerate(self.env.brain_names):
                reward[i] = np.asarray(obs[brain_name].rewards)[:, np.newaxis]
                next_state[i] = obs[brain_name].vector_observations
                dones[i] = np.asarray(obs[brain_name].local_done)[:, np.newaxis]
            s = [np.asarray(e) for e in zip(*state)]
            a = [np.asarray(e) for e in zip(*action)]
            r = [np.asarray(e) for e in zip(*reward)]
            s_ = [np.asarray(e) for e in zip(*next_state)]
            done = [np.asarray(e) for e in zip(*dones)]
            self.ma_data.add(s, a, r, s_, done)

    def ma_unity_train(self):
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])
        assert policy_mode == 'off-policy', \
            'multi-agent algorithms currently support off-policy only.'
        brains_num = len(self.env.brain_names)
        batch_size = self.ma_data.batch_size
        agents_num = [0] * brains_num
        state = [0] * brains_num
        action = [0] * brains_num
        new_action = [0] * brains_num
        next_action = [0] * brains_num
        reward = [0] * brains_num
        next_state = [0] * brains_num
        dones = [0] * brains_num
        dones_flag = [0] * brains_num
        rewards = [0] * brains_num

        for episode in range(begin_episode, max_episode):
            obs = self.env.reset()
            for i, brain_name in enumerate(self.env.brain_names):
                agents_num[i] = len(obs[brain_name].agents)
                dones_flag[i] = np.zeros(agents_num[i])
                rewards[i] = np.zeros(agents_num[i])
            step = 0
            last_done_step = -1
            while True:
                step += 1
                for i, brain_name in enumerate(self.env.brain_names):
                    state[i] = obs[brain_name].vector_observations
                    action[i] = self.models[i].choose_action(s=state[i])
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                obs = self.env.step(vector_action=actions)
                for i, brain_name in enumerate(self.env.brain_names):
                    reward[i] = np.asarray(obs[brain_name].rewards)[:, np.newaxis]
                    next_state[i] = obs[brain_name].vector_observations
                    dones[i] = np.asarray(obs[brain_name].local_done)[:, np.newaxis]
                    unfinished_index = np.where(dones_flag[i] == False)[0]
                    dones_flag[i] += obs[brain_name].local_done
                    rewards[i][unfinished_index] += np.asarray(
                        obs[brain_name].rewards)[unfinished_index]
                s = [np.asarray(e) for e in zip(*state)]
                a = [np.asarray(e) for e in zip(*action)]
                r = [np.asarray(e) for e in zip(*reward)]
                s_ = [np.asarray(e) for e in zip(*next_state)]
                done = [np.asarray(e) for e in zip(*dones)]
                self.ma_data.add(s, a, r, s_, done)
                s, a, r, s_, done = self.ma_data.sample()
                for i, brain_name in enumerate(self.env.brain_names):
                    next_action[i] = self.models[i].get_target_action(s=s_[:, i])
                    new_action[i] = self.models[i].choose_action(
                        s=s[:, i], evaluation=True)
                a_ = np.asarray([np.asarray(e) for e in zip(*next_action)])
                if policy_mode == 'off-policy':
                    for i in range(brains_num):
                        self.models[i].learn(
                            episode=episode,
                            ap=np.asarray([
                                np.asarray(e) for e in zip(*next_action[:i])
                            ]).reshape(batch_size, -1) if i != 0 else np.zeros(
                                (batch_size, 0)),
                            al=np.asarray([
                                np.asarray(e)
                                for e in zip(*next_action[-(brains_num - i - 1):])
                            ]).reshape(batch_size, -1)
                            if brains_num - i != 1 else np.zeros(
                                (batch_size, 0)),
                            ss=s.reshape(batch_size, -1),
                            ss_=s_.reshape(batch_size, -1),
                            aa=a.reshape(batch_size, -1),
                            aa_=a_.reshape(batch_size, -1),
                            s=s[:, i],
                            r=r[:, i])
                if all([all(dones_flag[i]) for i in range(brains_num)]):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break
                if step >= max_step:
                    break

            for i in range(brains_num):
                self.models[i].writer_summary(episode,
                                              total_reward=rewards[i].mean(),
                                              step=last_done_step)
            self.pwi('-' * 40)
            self.pwi(
                f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
            )
            if episode % save_frequency == 0:
                for i in range(brains_num):
                    self.models[i].save_checkpoint(episode)

    def ma_unity_inference(self):
        """
        Inference mode.
        The algorithm model will not be trained; it is only used to show the
        agents' behavior.
        """
        brains_num = len(self.env.brain_names)
        state = [0] * brains_num
        action = [0] * brains_num
        while True:
            obs = self.env.reset()
            while True:
                for i, brain_name in enumerate(self.env.brain_names):
                    state[i] = obs[brain_name].vector_observations
                    action[i] = self.models[i].choose_action(s=state[i],
                                                             evaluation=True)
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                obs = self.env.step(vector_action=actions)
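
# Note on the `[np.asarray(e) for e in zip(*x)]` idiom used in the ma_unity_*
# methods above (illustrative): it transposes a brain-major list (one array of
# per-agent rows per brain) into an agent-slot-major list, so that row j of
# every brain is grouped into one joint sample before being added to the
# multi-agent buffer.
#
#   >>> import numpy as np
#   >>> state = [np.array([[1, 1], [2, 2]]),   # brain 0, agent slots 0 and 1
#   ...          np.array([[3, 3], [4, 4]])]   # brain 1, agent slots 0 and 1
#   >>> [np.asarray(e) for e in zip(*state)][0]
#   array([[1, 1],
#          [3, 3]])
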
class MyPolicy(RL_Policy):
    """
    A custom agent policy (TensorFlow 2.x implementation).
    """
    def __init__(self, dim, name='wjs_policy'):
        super().__init__(dim, name)
        self.state_dim = dim * dim * 3
        self.gamma = 0.99
        self.lr = 0.0005
        self.data = ExperienceReplay(batch_size=100, capacity=10000)
        self.v_net = V(vector_dim=self.state_dim,
                       name='v_net',
                       hidden_units=[128, 64, 32])
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)

    def update_offset(self, offset):
        assert isinstance(offset, int)
        self.offset = offset

    def store(self, **kargs):
        self.data.add(*kargs.values())

    @tf.function
    def _get_action(self, state):
        return tf.argmax(self.v_net(state))

    def choose_action(self, state):
        indexs, all_states = self.get_all_available_actions(state)
        if np.random.rand() > 0.2:  # epsilon-greedy with epsilon = 0.2
            action = self._get_action(all_states)[0]
        else:
            action = np.random.randint(len(indexs))
        x, y = indexs[action] % self.dim, indexs[action] // self.dim
        return x, y

    def learn(self):
        try:
            s, r, s_, done = self.data.sample()
            s = np.eye(3)[s].reshape(s.shape[0], -1)
            r = r[:, np.newaxis]
            s_ = np.eye(3)[s_].reshape(s_.shape[0], -1)
            done = done[:, np.newaxis]
            summaries = self.train(s, r, s_, done)
            tf.summary.experimental.set_step(self.global_step)
            self.write_training_summaries(summaries)
            tf.summary.scalar('LEARNING_RATE/lr', self.lr)
            self.writer.flush()
        except Exception as e:
            print(e)
            return

    @tf.function
    def train(self, s, r, s_, done):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                v = self.v_net(s)
                v_ = self.v_net(s_)
                predict = tf.stop_gradient(r + self.gamma * v_ * (1 - done))
                v_loss = tf.reduce_mean((v - predict)**2)
            grads = tape.gradient(v_loss, self.v_net.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.v_net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([['LOSS/v_loss', v_loss]])

    def get_all_available_actions(self, state):
        assert isinstance(state, np.ndarray), 'state is not a numpy array'
        indexs = []
        for i in range(state.shape[0]):
            if state[i] == 2:
                indexs.append(i)
        all_states = []
        for i in indexs:
            a = np.zeros_like(state)
            a[i] = self.offset
            all_states.append(state - a)
        return indexs, np.array(
            [np.eye(3)[i].reshape(-1) for i in all_states])
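
# Note on the TD(0) target computed in train() above: the value network
# regresses toward r + gamma * V(s') * (1 - done), with the target held fixed
# by tf.stop_gradient. Worked example with gamma = 0.99:
#
#   r = 1.0, V(s') = 0.5, done = 0  ->  target = 1.0 + 0.99 * 0.5 = 1.495
#   r = 1.0, V(s') = 0.5, done = 1  ->  target = 1.0  (terminal: no bootstrap)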