def run_episodes(sess, env, n_eps, n_steps, render, obs_in, pi_out, pi_logits_out,
                 rnn_state_in, rnn_state_out, predict_ac_op, f, seed):
    logger = logging.getLogger(__name__)
    ep_length = []
    ep_return = []
    logger.info('---------------- Episode results -----------------------')
    for i in range(0, n_eps):
        # TODO parallelize this here! Problem: guarantee the same sequence of random numbers in each
        # parallel process. --> Solution: index-based RNG instead of sequential seed-based RNG.
        obs = env.reset()
        obs = normalize_obs(obs)
        done = False
        if rnn_state_in is not None:
            if len(rnn_state_in) > 1:
                rnn_s_in = (np.zeros(rnn_state_in[0].shape),
                            np.zeros(rnn_state_in[1].shape))  # init lstm cell vector
            else:
                rnn_s_in = np.zeros(len(rnn_state_in))  # init gru cell vector
        total_return = 0
        total_length = -1
        reward = 0
        i_sample = 0
        if f is not None:
            rew_traj = []
        while not done and (i_sample < n_steps):
            i_sample += 1
            total_length += 1
            total_return += reward  # add reward of previous step, s.t. the termination reward is not added anymore
            if rnn_state_in is not None:
                pi, pi_log, act, rnn_s_out = sess.run(
                    [pi_out, pi_logits_out, predict_ac_op, rnn_state_out],
                    feed_dict={obs_in[0]: [obs], rnn_state_in: rnn_s_in})
            else:
                pi, pi_log, act = sess.run(
                    [pi_out, pi_logits_out, predict_ac_op],
                    feed_dict={obs_in[0]: [obs]})
            ac = np.argmax(pi_log)
            obs, reward, done, _ = env.step(ac)
            obs = normalize_obs(obs)
            if f is not None:
                rew_traj.append(reward)
            if render:
                env.render()
            if rnn_state_in is not None:
                rnn_s_in = rnn_s_out
        logger.info('Episode %s: %s, %s' % (i, total_return, total_length))
        ep_length.append(total_length)
        ep_return.append(total_return)
        if f is not None:
            with open(f, "a") as csvfile:
                writer = csv.writer(csvfile)
                rew_traj[0:0] = [seed, i, np.mean(rew_traj)]
                writer.writerow(rew_traj)
    return ep_return
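# normalize_obs() is used throughout this file but is not defined in it. Two variants appear:
# a one-argument form (FlappyBird code above) and a three-argument form
# normalize_obs(obs, obs_mean, obs_std) (multi-agent evaluation code further below).
# The sketch below illustrates only the *assumed* three-argument behaviour (standardisation
# with running statistics, as suggested by the ob_rms argument there); it is not the
# project's actual implementation and all names are illustrative.
import numpy as np

def _normalize_obs_sketch(obs, obs_mean=None, obs_std=None, eps=1e-8):
    """Illustrative standardisation of an observation vector with running mean/std."""
    obs = np.asarray(obs, dtype=np.float32)
    if obs_mean is None or obs_std is None:
        # No statistics available: pass the observation through unchanged.
        return obs
    return (obs - np.asarray(obs_mean)) / (np.asarray(obs_std) + eps)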
def test_run(env, n_eps, n_pipes):
    # Note: relies on `self`, `eval_model` and `sess` from the enclosing scope
    # (this function is presumably defined inside a model/agent class).
    self.logger.info('Evaluating current agent')
    ep_return = []
    ep_length = []
    for i in range(0, n_eps):
        obs = env.reset()
        obs = normalize_obs(obs)
        done = False
        if eval_model.initial_state is not None:
            if len(eval_model.initial_state) > 1:
                rnn_s_in = (np.zeros(eval_model.initial_state[0].shape),
                            np.zeros(eval_model.initial_state[1].shape))  # init lstm cell vector
            else:
                rnn_s_in = np.zeros(eval_model.initial_state.shape)  # init gru cell vector
        total_return = 0
        total_length = 0
        while not done and (total_return < n_pipes):
            # self.logger.info(total_return)
            if eval_model.initial_state is not None:
                pi, pi_log, act, rnn_s_out = sess.run(
                    [eval_model.pi, eval_model.pi_logit, eval_model.ac, eval_model.rnn_state_out],
                    feed_dict={eval_model.X: [obs], eval_model.rnn_state_in: rnn_s_in})
            else:
                pi, pi_log, act = sess.run(
                    [eval_model.pi, eval_model.pi_logit, eval_model.ac],
                    feed_dict={eval_model.X: [obs]})
            ac = np.argmax(pi_log)
            obs, reward, done, _ = env.step(ac)
            obs = normalize_obs(obs)
            total_length += 1
            total_return += reward
            if eval_model.initial_state is not None:
                rnn_s_in = rnn_s_out
        self.logger.info('Episode %s: %s, %s' % (i, total_return, total_length))
        ep_length.append(total_length)
        ep_return.append(total_return)
    return ep_return
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_rawrewards = [], [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        actions, pi, values, states, _ = self.model.step(self.obs, self.states)  # , self.dones) ?
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        obs = normalize_obs(obs)
        self.logger.debug('Observations: %s' % obs)

        # render only every i-th episode
        if self.show_interval != 0:
            if (self.ep_idx[0] % self.show_interval) == 0:
                self.env.render()

        self.eplength = [self.eplength[i] + 1 for i in range(self.nenv)]  # TODO use already implemented functions in run_ple_utils!
        self.epreturn = [self.epreturn[i] + rewards[i] for i in range(self.nenv)]
        [self.reward_window[i].append(rewards[i]) for i in range(self.nenv)]

        # Check for terminal states in every env
        for i, done in enumerate(dones):  # i -> environment ID
            if done:
                self.ep_idx[i] += 1
                self.obs[i] = self.obs[i] * 0

                # update tensorboard summary
                if self.summary_writer is not None:
                    summary = tf.Summary()
                    summary.value.add(tag='envs/environment%s/episode_length' % i,
                                      simple_value=self.eplength[i])
                    summary.value.add(tag='envs/environment%s/episode_reward' % i,
                                      simple_value=self.epreturn[i])
                    self.summary_writer.add_summary(summary, self.ep_idx[i])  # self.global_step.eval())
                    self.summary_writer.flush()

                # self.retbuffer.append(self.epreturn[i])
                if self.epreturn[i] > self.return_threshold:
                    self.return_threshold = self.epreturn[i]
                    self.logger.info('Save model at max reward %s' % self.return_threshold)
                    self.model.save('inter_model')
                self.eplength[i] = 0
                self.epreturn[i] = 0

        # # Is not necessary, as the environment is continuous now!
        # # Reset RNN state vector to 0 if previous sample is a terminating one,
        # # as no history should be used in rnn training then.
        # if states:
        #     env_was_done = False
        #     for i, done in enumerate(self.dones):
        #         if done and not env_was_done:
        #             env_was_done = True
        #             c_new = states[0]
        #             h_new = states[1]
        #             c_new[i] = np.zeros_like(c_new[i])
        #             h_new[i] = np.zeros_like(h_new[i])
        #         elif done:
        #             c_new[i] = np.zeros_like(c_new[i])
        #             h_new[i] = np.zeros_like(h_new[i])
        #     if env_was_done:
        #         states = tf.contrib.rnn.LSTMStateTuple(c_new, h_new)
        #     # print(states)

        self.states = states
        self.dones = dones
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)

    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    # mb_masks = mb_dones[:, :-1] ?
    mb_rawrewards = np.copy(mb_rewards)
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states).tolist()

    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        self.logger.debug('Discounted rewards: %s' % rewards)
        mb_rewards[n] = rewards

    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    self.logger.debug('Actions: %s' % mb_actions)
    self.logger.debug('Q values: %s' % mb_values)
    self.logger.debug('Observations: %s' % mb_obs)
    return mb_obs, mb_states, mb_rewards, mb_actions, mb_values, self.reward_window, mb_rawrewards  # self.avg_return_n_episodes
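# discount_with_dones() is not defined in this file. It is assumed to behave like the
# OpenAI-baselines helper of the same name: walk the reward sequence backwards and reset the
# running return at terminal steps, so that the bootstrap value appended above (when
# dones[-1] == 0) propagates only into the non-terminated tail of the rollout. This is a
# sketch of the assumed behaviour, not the project's implementation.
def _discount_with_dones_sketch(rewards, dones, gamma):
    """Backward recursion: R_t = r_t + gamma * R_{t+1} * (1 - done_t)."""
    discounted = []
    r = 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1.0 - done)
        discounted.append(r)
    return discounted[::-1]

# Example of the assumed behaviour: gamma = 0.5, the terminal step in the middle resets the return.
# _discount_with_dones_sketch([1, 1, 1], [0, 1, 0], 0.5) -> [1.5, 1.0, 1.0]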
def run_episodes(sess, env, n_eps, n_steps, render, obs_in, rnn_state_in, rnn_state_out,
                 predQ_out, f, seed):
    logger = logging.getLogger(__name__)
    ep_length = []
    ep_return = []
    logger.info('---------------- Episode results -----------------------')
    for i in range(0, n_eps):  # TODO parallelize this here!
        obs = env.reset()
        obs = normalize_obs(obs)
        done = False
        if rnn_state_in is not None:
            if len(rnn_state_in) > 1:
                rnn_s_in = (np.zeros(rnn_state_in[0].shape),
                            np.zeros(rnn_state_in[1].shape))  # init lstm cell vector
            else:
                rnn_s_in = np.zeros(len(rnn_state_in))  # init gru cell vector
        total_return = 0
        total_length = -1
        reward = 0
        i_sample = 0
        if f is not None:
            rew_traj = []
        while not done and (i_sample < n_steps):
            i_sample += 1
            total_length += 1
            total_return += reward  # add reward of previous step, s.t. the termination reward is not added anymore
            if rnn_state_in is not None:
                pQ, rnn_s_out = sess.run([predQ_out, rnn_state_out],
                                         feed_dict={obs_in[0]: [obs], rnn_state_in: rnn_s_in})
            else:
                pQ = sess.run([predQ_out], feed_dict={obs_in[0]: [obs]})
            best_ac = np.argmax(pQ)  # greedy policy, not epsilon-greedy policy
            obs, reward, done, _ = env.step(best_ac)
            # obs, reward, done, _ = env.step(act[0][0])
            obs = normalize_obs(obs)
            if f is not None:
                rew_traj.append(reward)
            if render:
                env.render()
            if rnn_state_in is not None:
                rnn_s_in = rnn_s_out
        logger.info('Episode %s: %s, %s' % (i, total_return, total_length))
        ep_length.append(total_length)
        ep_return.append(total_return)
        if f is not None:
            with open(f, "a") as csvfile:
                writer = csv.writer(csvfile)
                rew_traj[0:0] = [seed, i, np.mean(rew_traj)]
                writer.writerow(rew_traj)
    return ep_return
def q_learning(q_network, env, test_env, seed, total_timesteps, log_interval, test_interval,
               show_interval, logdir, lr, max_grad_norm, units_per_hlayer, activ_fcn,
               gamma=0.95, epsilon=0.4, epsilon_decay=.95, buffer_size=4000, batch_size=128,
               trace_length=32, tau=0.99, update_interval=30, early_stop=False, keep_model=2,
               save_model=True, restore_model=False, save_traj=False):
    # """
    # Q-Learning algorithm for off-policy TD control using Function Approximation.
    # Finds the optimal greedy policy while following an epsilon-greedy policy.
    # Implements the options of online learning or using experience replay and also
    # target calculation by target networks, depending on the flags. You can reuse
    # your Q-learning implementation of the last exercise.
    #
    # Args:
    #     env: PLE game
    #     approx: Action-Value function estimator
    #     num_episodes: Number of episodes to run for.
    #     max_time_per_episode: maximum number of time steps before episode is terminated
    #     discount_factor: gamma, discount factor of future rewards.
    #     epsilon: Chance to sample a random action. Float between 0 and 1.
    #     epsilon_decay: decay rate of epsilon parameter
    #     use_experience_replay: Indicator if experience replay should be used.
    #     batch_size: Number of samples per batch.
    #     target: Slowly updated target network to calculate the targets. Ignored if None.
    #
    # Returns:
    #     An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    # """
    logger = logging.getLogger(__name__)
    # logger.info(datetime.time)
    tf.reset_default_graph()
    set_global_seeds(seed)

    # Params
    ob_space = env.observation_space
    ac_space = env.action_space
    nd, = ob_space.shape
    n_ac = ac_space.n

    # Create learning agent and the replay buffer
    agent = DQNAgent(q_network=q_network, ob_space=ob_space, ac_space=ac_space, lr=lr,
                     max_grad_norm=max_grad_norm, units_per_hlayer=units_per_hlayer,
                     activ_fcn=activ_fcn, log_interval=log_interval, logdir=logdir,
                     batch_size=batch_size, trace_length=trace_length,
                     update_interval=update_interval, tau=tau, keep_model=keep_model)
    summary_writer = agent.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None
    replay_buffer = ReplayBuffer(buffer_size)

    # Keeps track of useful statistics
    stats = EpisodeStats

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load pre-trained model and set network parameters
                logger.info('load %s' % os.path.join(logdir, el[:-5]))
                agent.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                agent.sess.run(agent.global_step.assign(0))

    # ------------------ TRAINING --------------------------------------------
    logger.info("Start Training")
    early_stopped = False
    i_episode, i_sample, i_train = 0, 0, 0
    ep_len, rew = 0, 0  # per-episode length and return accumulators
    horizon = 100
    reward_window = deque(maxlen=horizon)
    avg_rm = deque(maxlen=30)
    nbatch = batch_size * trace_length
    return_threshold = -0.05  # 40

    # Reset env
    obs = env.reset()
    obs = normalize_obs(obs)
    done = False

    rnn_state0 = agent.step_initial_state
    if rnn_state0 is None:
        # If we use a normal feed-forward architecture, we sample a batch of single samples,
        # not a batch of sequences.
        trace_length = 1

    # Set the target network to be equal to the primary network
    agent.update_target(agent.target_ops)

    while i_sample < total_timesteps:
        if np.random.rand(1) < epsilon:
            _, next_rnn_state = agent.step([obs], rnn_state0)  # epsilon greedy action
            action = np.random.randint(0, n_ac)
        else:
            AP, next_rnn_state = agent.step([obs], rnn_state0)  # epsilon greedy action
            action = AP[0]
        next_obs, reward, done, _ = env.step(action)
        next_obs = normalize_obs(next_obs)
        i_sample += 1

        # render only every i-th episode
        if show_interval != 0:
            if i_episode % show_interval == 0:
                env.render()

        ep_len += 1
        rew += reward
        reward_window.append(reward)

        # When episode is done, add episode information to tensorboard summary and stats
        if done:  # env.game_over():
            next_obs = list(np.zeros_like(next_obs, dtype=np.float32))
            stats['episode_lengths'].append(ep_len)
            stats['episode_rewards'].append(rew)
            if summary_writer is not None:
                summary = tf.Summary()
                summary.value.add(tag='envs/ep_return',
                                  simple_value=stats['episode_rewards'][i_episode])
                summary.value.add(tag="envs/ep_length",
                                  simple_value=stats['episode_lengths'][i_episode])
                summary_writer.add_summary(summary, i_episode)
                summary_writer.flush()
            if save_model and rew > return_threshold:
                return_threshold = rew
                logger.info('Save model at max reward %s' % return_threshold)
                agent.save('inter_model')
            i_episode += 1
            ep_len, rew = 0, 0

        # Update replay buffer
        replay_buffer.add_transition(obs, action, next_obs, reward, done)
        if save_traj:
            rew_traj.append(reward)

        # Update model parameters every #update_interval steps. Use real experience and replayed experience.
        if replay_buffer.size() > nbatch and (i_sample % update_interval == 0):
            if env.spec._env_name == 'ContFlappyBird':
                rm = sum(reward_window) / horizon
                if summary_writer is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(tag='envs/isample_return', simple_value=rm)
                    summary_writer.add_summary(s_summary, i_sample)
                    summary_writer.flush()
                if save_model and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' % return_threshold)
                    agent.save('inter_model')
                avg_rm.append(rm)

            if early_stop:
                if (i_sample > 60000) and (i_sample <= (60000 + update_interval)):
                    if (sum(avg_rm) / 30) <= -0.88:
                        print('breaked')
                        early_stopped = True
                        break

            agent.update_target(agent.target_ops)

            # reset rnn state (history knowledge) before every training step
            rnn_state_train = agent.train_initial_state

            # Sample training mini-batch from replay buffer
            if rnn_state_train is not None:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                    replay_buffer.recent_and_next_batch_of_seq(batch_size, trace_length)
            else:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                    replay_buffer.recent_and_next_batch(batch_size)

            # Calculate TD targets for the batch. Use the "old" fixed parameters of the target
            # network if it is available, else the "old" parameters of the value function estimate.
            # mb_next_obs = np.reshape(mb_next_obs, (-1, nd))
            mb_next_q_values, _ = agent.target_model.predict(mb_next_obs, rnn_state_train)
            mb_best_next_action = np.argmax(mb_next_q_values, axis=1)
            mb_td_target = [mb_rewards[j] + gamma * mb_next_q_values[j][mb_best_next_action[j]]
                            for j in range(nbatch)]

            # Update Q value estimator parameters by optimizing between Q network and Q-learning targets
            loss = agent.train(mb_obs, mb_actions, mb_td_target, rnn_state_train)
            i_train += 1

            # If test_interval > 0, the learned model is evaluated every "test_interval" gradient updates
            if test_interval > 0 and i_train > 0 and (i_train % test_interval == 0):
                ep_return = agent.test_run(test_env, n_eps=10, n_pipes=2000)
                with open(result_path, "a") as csvfile:
                    writer = csv.writer(csvfile)
                    ep_return[0:0] = [i_sample, i_train]
                    writer.writerow(ep_return)

        if done:
            # Reset the environment
            next_obs = env.reset()
            next_obs = normalize_obs(next_obs)
            epsilon *= epsilon_decay

        obs = next_obs
        rnn_state0 = next_rnn_state

    # Save final model when training is finished.
    if save_model:
        agent.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        logger.info('Save reward trajectory to %s' % rew_results_path)
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' % i_sample)
    logger.info('Total number of parameter updates during training: %s' % i_train)
    logger.info('*******************************************************\n')
    return early_stopped, i_sample
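# Minimal numeric check of the TD-target construction used in q_learning() above, assuming
# agent.target_model.predict() returns Q-values of shape (nbatch, n_actions). It mirrors
# td_target_j = r_j + gamma * max_a Q_target(s'_j, a) without any project-specific classes;
# all numbers are made up. Note that, unlike the discounting in the actor-critic runner, no
# (1 - done) mask is applied here; terminal transitions rely on the zeroed next observation.
import numpy as np

gamma_example = 0.95
example_rewards = np.array([0.0, 1.0, -1.0])
example_next_q = np.array([[0.2, 0.5],
                           [1.0, 0.1],
                           [0.0, 0.3]])
example_best_next_action = np.argmax(example_next_q, axis=1)  # -> [1, 0, 1]
example_td_target = example_rewards + gamma_example * example_next_q[np.arange(3), example_best_next_action]
# example_td_target == [0.475, 1.95, -0.715]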
def evaluate(args, seed, policies_list, ob_rms=None, render=False, env=None,
             master=None, render_attn=True):
    """
    RL evaluation: supports eval through training code as well as independently.
    policies_list should be a list of policies of all the agents; len(policies_list) = num agents
    """
    if env is None or master is None:  # if any one of them is None, generate both of them
        master, env = setup_master(args, return_env=True)

    if seed is None:  # ensure env eval seed is different from training seed
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)
    env.seed(seed)

    if ob_rms is not None:
        obs_mean, obs_std = ob_rms
    else:
        obs_mean = None
        obs_std = None

    master.load_models(policies_list)
    master.set_eval_mode()

    num_eval_episodes = args.num_eval_episodes
    all_episode_rewards = np.full((num_eval_episodes, env.n), 0.0)
    per_step_rewards = np.full((num_eval_episodes, env.n), 0.0)

    # TODO: provide support for recurrent policies and mask
    recurrent_hidden_states = None
    mask = None

    # world.dists at the end of episode for simple_spread
    final_min_dists = []
    num_success = 0
    episode_length = 0

    for t in range(num_eval_episodes):
        obs = env.reset()
        obs = normalize_obs(obs, obs_mean, obs_std)
        done = [False] * env.n
        episode_rewards = np.full(env.n, 0.0)
        episode_steps = 0

        if render:
            attn = None if not render_attn else master.team_attn
            if attn is not None and len(attn.shape) == 3:
                attn = attn.max(0)
            env.render(attn=attn)

        while not np.all(done):
            actions = []
            with torch.no_grad():
                actions = master.eval_act(obs, recurrent_hidden_states, mask)
            episode_steps += 1
            obs, reward, done, info = env.step(actions)
            obs = normalize_obs(obs, obs_mean, obs_std)
            episode_rewards += np.array(reward)
            if render:
                attn = None if not render_attn else master.team_attn
                if attn is not None and len(attn.shape) == 3:
                    attn = attn.max(0)
                env.render(attn=attn)
                if args.record_video:
                    time.sleep(0.08)

        per_step_rewards[t] = episode_rewards / episode_steps
        num_success += info['n'][0]['is_success']
        episode_length = (episode_length * t + info['n'][0]['world_steps']) / (t + 1)

        # for simple spread env only
        if args.env_name == 'simple_spread':
            final_min_dists.append(env.world.min_dists)
        elif args.env_name == 'simple_formation' or args.env_name == 'simple_line':
            final_min_dists.append(env.world.dists)

        if render:
            print("Ep {} | Success: {} \n Av per-step reward: {:.2f} | Ep Length {}".format(
                t, info['n'][0]['is_success'], per_step_rewards[t][0], info['n'][0]['world_steps']))

        all_episode_rewards[t, :] = episode_rewards  # all_episode_rewards shape: num_eval_episodes x num agents

        if args.record_video:
            # print(attn)
            input('Press enter to continue: ')

    return all_episode_rewards, per_step_rewards, final_min_dists, num_success, episode_length
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
    mb_states = self.states
    for _ in range(self.nsteps):
        actions, pi, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_neglogpacs.append(neglogpacs)
        mb_dones.append(self.dones)
        obs, rewards, self.dones, _ = self.env.step(actions)
        self.obs[:] = normalize_obs(obs)
        mb_rewards.append(rewards)
        self.logger.debug('Observations: %s' % self.obs)

        # render only every i-th episode
        if self.show_interval != 0:
            if (self.ep_idx[0] % self.show_interval) == 0:
                self.env.render()

        self.eplength = [self.eplength[i] + 1 for i in range(self.nenv)]  # TODO use already implemented functions in run_ple_utils!
        self.epreturn = [self.epreturn[i] + rewards[i] for i in range(self.nenv)]
        [self.reward_window[i].append(rewards[i]) for i in range(self.nenv)]

        # Check for terminal states in every env - this is only used in the terminating version of FlappyBird
        for i, done in enumerate(self.dones):  # i -> environment ID
            if done:
                self.ep_idx[i] += 1
                self.obs[i] = self.obs[i] * 0

                # update tensorboard summary
                if self.summary_writer is not None:
                    summary = tf.Summary()
                    summary.value.add(tag='envs/environment%s/episode_length' % i,
                                      simple_value=self.eplength[i])
                    summary.value.add(tag='envs/environment%s/episode_reward' % i,
                                      simple_value=self.epreturn[i])
                    self.summary_writer.add_summary(summary, self.ep_idx[i])  # self.global_step.eval())
                    self.summary_writer.flush()

                # self.retbuffer.append(self.epreturn[i])
                if self.epreturn[i] > self.return_threshold:
                    self.return_threshold = self.epreturn[i]
                    self.logger.info('Save model at max reward %s' % self.return_threshold)
                    self.model.save('inter_model')
                self.eplength[i] = 0
                self.epreturn[i] = 0

    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
    mb_actions = np.asarray(mb_actions)
    mb_values = np.asarray(mb_values, dtype=np.float32)
    mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)  # TODO this is an array of tensors, output values instead!
    mb_dones = np.asarray(mb_dones, dtype=np.bool)
    last_values = self.model.value(self.obs, self.states, self.dones)

    # discount/bootstrap off value fn (Generalized Advantage Estimation)
    mb_returns = np.zeros_like(mb_rewards)
    mb_advs = np.zeros_like(mb_rewards)
    lastgaelam = 0
    for t in reversed(range(self.nsteps)):
        if t == self.nsteps - 1:
            nextnonterminal = 1.0 - self.dones
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - mb_dones[t + 1]
            nextvalues = mb_values[t + 1]
        delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]  # 1-step TD error
        mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
    mb_returns = mb_advs + mb_values

    self.logger.debug('Actions: %s' % mb_actions)
    self.logger.debug('Q values: %s' % mb_values)
    # self.logger.debug('Done mask: %s' % mb_masks)  # ?
    self.logger.debug('Observations: %s' % mb_obs)
    return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
            mb_states, self.reward_window, mb_rewards)
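# Minimal standalone sketch of the GAE(lambda) recursion implemented in the backward loop
# above, written for a single environment with 1-D arrays. Here step_dones[t] is the done
# flag returned by env.step() at step t, which corresponds to mb_dones[t + 1] in the runner's
# indexing (and, for the final step, to self.dones). All names are illustrative only.
import numpy as np

def _gae_sketch(rewards, values, step_dones, last_value, gamma, lam):
    nsteps = len(rewards)
    advs = np.zeros(nsteps, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        nextnonterminal = 1.0 - step_dones[t]
        nextvalues = last_value if t == nsteps - 1 else values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]  # 1-step TD error
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    returns = advs + np.asarray(values, dtype=np.float32)  # regression targets for the value function
    return advs, returns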
def evaluate(args, seed, policies_list, ob_rms=None, render=False, env=None,
             master=None, render_attn=True):
    """
    RL evaluation: can be used from the training code or standalone.
    policies_list is a list of the policies of all agents; len(policies_list) = number of agents
    """
    import numpy as np
    import torch
    from arguments import get_args
    from utils import normalize_obs
    from learner import setup_master
    import time

    if env is None or master is None:  # if either of them is None, create both
        master, env = setup_master(args, return_env=True)

    if seed is None:
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)
    env.seed(seed)

    if ob_rms is not None:
        obs_mean, obs_std = ob_rms
    else:
        obs_mean = None
        obs_std = None

    master.load_models(policies_list)
    master.set_eval_mode()

    num_eval_episodes = args.num_eval_episodes
    all_episode_rewards = np.full((num_eval_episodes, env.n), 0.0)
    per_step_rewards = np.full((num_eval_episodes, env.n), 0.0)

    recurrent_hidden_states = None
    mask = None

    # world.dists at the end of the episode for simple_spread
    final_min_dists = []
    num_success = 0
    episode_length = 0

    for t in range(num_eval_episodes):
        obs = env.reset()
        obs = normalize_obs(obs, obs_mean, obs_std)
        done = [False] * env.n
        episode_rewards = np.full(env.n, 0.0)
        episode_steps = 0

        if render:
            attn = None if not render_attn else master.team_attn
            if attn is not None and len(attn.shape) == 3:
                attn = attn.max(0)
            env.render(attn=attn)

        while not np.all(done):
            actions = []
            with torch.no_grad():
                actions = master.eval_act(obs, recurrent_hidden_states, mask)
            episode_steps += 1
            obs, reward, done, info = env.step(actions)
            obs = normalize_obs(obs, obs_mean, obs_std)
            episode_rewards += np.array(reward)
            if render:
                attn = None if not render_attn else master.team_attn
                if attn is not None and len(attn.shape) == 3:
                    attn = attn.max(0)
                env.render(attn=attn)
                if args.record_video:
                    time.sleep(0.08)

        per_step_rewards[t] = episode_rewards / episode_steps
        num_success += info['n'][0]['is_success']
        episode_length = (episode_length * t + info['n'][0]['world_steps']) / (t + 1)

        # for the simple spread family of envs only
        if args.env_name == 'simple_spread':
            final_min_dists.append(env.world.min_dists)
        elif args.env_name == 'simple_formation' or args.env_name == 'simple_line':
            final_min_dists.append(env.world.dists)

        if render:
            print("Ep {} | Success: {} \n Av per-step reward: {:.2f} | Ep Length {}".format(
                t, info['n'][0]['is_success'], per_step_rewards[t][0], info['n'][0]['world_steps']))

        all_episode_rewards[t, :] = episode_rewards  # shape: num_eval_episodes x num agents

        if args.record_video:
            # print(attn)
            input('Press enter to continue: ')

    return all_episode_rewards, per_step_rewards, final_min_dists, num_success, episode_length
pi_logits_out = tf.get_collection('pi_logit')
predict_vf_op = tf.get_collection('val')
predict_ac_op = tf.get_collection('step')
rnn_state_in, rnn_state_out = None, None
env = ple_env
pi_out = probs_out

logger = logging.getLogger(__name__)
ep_length = []
ep_return = []
logger.info('---------------- Episode results -----------------------')
for i in range(0, 2):
    # TODO parallelize this here! Problem: guarantee the same sequence of random numbers in each
    # parallel process. --> Solution: index-based RNG instead of sequential seed-based RNG.
    obs = env.reset()
    obs = normalize_obs(obs)
    done = False
    i_sample = 0
    while not done and (i_sample < 5000):
        i_sample += 1
        pi, pi_log, act = sess.run([pi_out, pi_logits_out, predict_ac_op],
                                   feed_dict={obs_in[0]: [obs]})
        ac = np.argmax(pi_log)
        obs, reward, done, _ = env.step(ac)
        # obs, reward, done, _ = env.step(act[0][0])
        obs = normalize_obs(obs)
        env.render()
        time.sleep(0.01)
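# The snippet above assumes a TF1 graph has already been restored into `sess` and that the
# relevant tensors/ops were stored in named collections when the model was saved. Below is a
# minimal sketch of how such a graph is typically restored; the file paths and the 'inputs'
# and 'probs' collection names are assumptions for illustration, not taken from this file.
import tensorflow as tf

def _restore_graph_sketch(sess, meta_path, checkpoint_path):
    """Import the saved meta graph and restore its variables into `sess`."""
    saver = tf.train.import_meta_graph(meta_path)  # e.g. a '<model>.meta' file
    saver.restore(sess, checkpoint_path)           # e.g. the matching checkpoint prefix
    obs_in = tf.get_collection('inputs')           # assumed collection name for the obs placeholder
    probs_out = tf.get_collection('probs')         # assumed collection name for the policy output
    return obs_in, probs_out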