class PolicyAPI:
    """
    This class wraps an agent into a callable that returns an action given a
    raw observation (or a batch of raw observations) from the environment. It
    maintains a frame stacker so that the user can use it safely. A reset
    function is provided so the user can refresh the frame stacker when an
    episode ends.

    Note that if you have implemented another arbitrary custom agent, you are
    welcome to implement a function-like API yourself. You can write another
    API function or class and replace this one in evaluation or even training.
    Your custom agent may have a different network structure and different
    preprocessing techniques. Remember that the API takes raw observations
    with shape (num_envs, 1, 42, 42) as input and returns a single integer or
    a batch of integers as actions in [0, 1, 2]. A custom agent is worth
    plenty of extra credit!
    """

    def __init__(self, num_envs=1, log_dir="", suffix=""):
        self.resized_dim = 42
        env = make_envs(num_envs=1, resized_dim=self.resized_dim)
        self.obs_shape = env.observation_space.shape
        self.agent = PPOTrainer(env, ppo_config)
        if log_dir:  # log_dir is None only in testing
            self.agent.load_w(log_dir, suffix)
        self.num_envs = num_envs
        self.frame_stack = FrameStackTensor(
            self.num_envs, self.obs_shape, 4, self.agent.device)

    def reset(self):
        # A potential bug is that the frame stack is not properly reset in a
        # vectorized environment. We assume this does not impact performance
        # significantly.
        self.frame_stack.reset()

    def __call__(self, obs):
        self.frame_stack.update(obs)
        action = self.agent.compute_action(self.frame_stack.get(), True)[1]
        if self.num_envs == 1:
            action = action.item()
        else:
            action = action.cpu().numpy()
        return action
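# Minimal usage sketch for PolicyAPI (not part of the original training code).
# Assumptions: a checkpoint saved by PPOTrainer.save_w() exists under the
# hypothetical directory "work_dirs/ppo" with suffix "final", and make_envs
# with num_envs=1 returns a vectorized env whose observations have shape
# (1, 1, 42, 42) and whose step() follows the old 4-tuple gym API.
def _policy_api_example(log_dir="work_dirs/ppo", suffix="final"):
    policy = PolicyAPI(num_envs=1, log_dir=log_dir, suffix=suffix)
    env = make_envs(num_envs=1, resized_dim=42)
    obs = env.reset()
    policy.reset()  # refresh the frame stacker at the start of each episode
    done = False
    episode_reward = 0.0
    while not done:
        action = policy(obs)  # a single int in [0, 1, 2] when num_envs == 1
        obs, reward, done, info = env.step([action])
        episode_reward += float(reward[0])
        done = bool(done[0])
    return episode_reward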
def train(args):
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must be in [PPO]")
    config.num_envs = args.num_envs

    if args.envopt is not None:
        with open(args.envopt) as f:
            env_options = json.load(f)
    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        options=env_options,
    )

    if env_id == "Walker2d-v3":
        healthy_z_range = (0.8, 2.0)
    elif env_id == 'Humanoid-v3':
        healthy_z_range = (1.0, 2.0)
    if 'healthy_z_range' in env_options:
        healthy_z_range = env_options['healthy_z_range']
    eval_env = gym.make(env_id, healthy_z_range=healthy_z_range,
                        healthy_reward=0)
    if env_id == "Walker2d-v3":
        eval_env = Walker2d_wrapper(eval_env, env_options)

    obs_dim = envs.observation_space.shape[0]
    act_dim = envs.action_space.shape[0]
    real_obs_dim = obs_dim
    real_act_dim = act_dim
    if 'real_obs_dim' in trainer_options:
        real_obs_dim = trainer_options['real_obs_dim']
    if 'real_act_dim' in trainer_options:
        real_act_dim = trainer_options['real_act_dim']
    dim_dict = dict(obs_dim=obs_dim, act_dim=act_dim,
                    real_obs_dim=real_obs_dim, real_act_dim=real_act_dim)

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, trainer_options)
    else:
        raise NotImplementedError

    # Create a placeholder tensor to help stack frames in the 2nd dimension,
    # i.e. turn the observation from shape [num_envs, 1, 84, 84] into
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(
        num_envs, envs.observation_space.shape, config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=float)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(
        reduce_shape(frame_stack_tensor.get(), real_obs_dim))

    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                trainer.model.eval()
                values, actions, action_log_prob = trainer.model.step(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim))
                cpu_actions = actions.cpu().numpy()
                cpu_actions = enlarge_shape(cpu_actions, act_dim)

                obs, reward, done, info, masks, total_episodes, \
                    total_steps, episode_rewards = step_envs(
                        cpu_actions, envs, episode_rewards,
                        frame_stack_tensor, reward_recorder,
                        episode_length_recorder, total_steps,
                        total_episodes, config.device)

                rewards = torch.from_numpy(reward.astype(np.float32)).view(
                    -1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim),
                    actions, action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)
            trainer.model.train()

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, total_loss = trainer.update(
                trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(policy_loss=policy_loss,
                                    value_loss=value_loss,
                                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)
            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(
                    algo, iteration): stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
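# The helpers reduce_shape / enlarge_shape used above are not defined in this
# file. The sketch below shows the assumed behavior (keep only the first `dim`
# features of an observation batch; zero-pad a reduced action batch back to
# the full action dimension). It is an illustration under those assumptions,
# not the repository's actual implementation.
import numpy as np

def _reduce_shape_sketch(obs_batch, dim):
    # obs_batch: tensor or array of shape [num_envs, obs_dim];
    # keep only the first `dim` features.
    return obs_batch[..., :dim]

def _enlarge_shape_sketch(actions, act_dim):
    # actions: np.ndarray of shape [num_envs, reduced_act_dim];
    # pad with zeros up to act_dim so the env receives full-sized actions.
    num_envs, reduced = actions.shape
    padded = np.zeros((num_envs, act_dim), dtype=actions.dtype)
    padded[:, :reduced] = actions
    return padded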
def train(args):
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must be in [PPO]")
    config.num_envs = args.num_envs
    config.activation = nn.ReLU

    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    main_envs = make_envs(
        env_id='Humanoid-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )
    aux_envs = make_envs(
        env_id='Walker2d-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )
    envs = [main_envs, aux_envs]

    # eval_env is the main env
    healthy_z_range = (1.0, 2.0)
    eval_env = gym.make(env_id, healthy_z_range=healthy_z_range,
                        healthy_reward=0)

    main_obs_dim = 376
    main_act_dim = 17
    main_reduce_obs_dim = 46
    main_reduce_act_dim = 11
    aux_obs_dim = 17
    aux_act_dim = 6
    obs_dims = [main_reduce_obs_dim, aux_obs_dim]
    act_dims = [main_act_dim, aux_act_dim]
    dim_dict = dict(obs_a=main_reduce_obs_dim,
                    act_a=main_reduce_act_dim,
                    obs_b=aux_obs_dim,
                    act_b=aux_act_dim,
                    coeff_a=0.4,
                    coeff_b=1)
    dim_dict['act_dim'] = 17
    dim_dict['real_obs_dim'] = 46

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainerMTMT(config, dim_dict)
    else:
        raise NotImplementedError

    frame_stack_tensors = [
        FrameStackTensor(num_envs, main_envs.observation_space.shape,
                         config.device),
        FrameStackTensor(num_envs, aux_envs.observation_space.shape,
                         config.device)
    ]

    # Setup some stats helpers
    episode_rewards = [
        np.zeros([num_envs, 1], dtype=float),
        np.zeros([num_envs, 1], dtype=float)
    ]
    total_episodes = total_steps = iteration = 0
    reward_recorders = [deque(maxlen=100), deque(maxlen=100)]
    episode_length_recorders = [deque(maxlen=100), deque(maxlen=100)]
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = [envs[i].reset() for i in range(2)]
    _ = [frame_stack_tensors[i].update(obs[i]) for i in range(2)]

    # First update
    for i in range(2):
        trainer.rollouts[i].observations[0].copy_(
            reduce_shape(frame_stack_tensors[i].get(), obs_dims[i]))

    branch_names = ['a', 'b']

    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            # Roll out both branches: the main task (a) and the auxiliary
            # task (b).
            for ind in range(2):
                for index in range(config.num_steps):
                    trainer.model.eval()
                    values, actions, action_log_prob = trainer.model.step(
                        reduce_shape(frame_stack_tensors[ind].get(),
                                     obs_dims[ind]),
                        deterministic=False,
                        branch=branch_names[ind])
                    cpu_actions = actions.cpu().numpy()
                    cpu_actions = enlarge_shape(cpu_actions, act_dims[ind])

                    # obs, done, info are not needed here; masks and
                    # observations are kept in frame_stack_tensors.
                    _, reward, _, _, masks, new_total_episodes, \
                        new_total_steps, episode_rewards[ind] = step_envs(
                            cpu_actions, envs[ind], episode_rewards[ind],
                            frame_stack_tensors[ind], reward_recorders[ind],
                            episode_length_recorders[ind], total_steps,
                            total_episodes, config.device)

                    # Only the main task drives the step/episode counters.
                    if ind == 0:
                        total_episodes = new_total_episodes
                        total_steps = new_total_steps

                    rewards = torch.from_numpy(
                        reward.astype(np.float32)).view(-1, 1).to(
                            config.device)

                    trainer.rollouts[ind].insert(
                        reduce_shape(frame_stack_tensors[ind].get(),
                                     obs_dims[ind]),
                        actions, action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                for i in range(2):
                    next_value = trainer.compute_values(
                        trainer.rollouts[i].observations[-1],
                        branch_names[i])
                    trainer.rollouts[i].compute_returns(
                        next_value, config.GAMMA)
            trainer.model.train()

        # ===== Update Policy =====
        with update_timer:
            losses = trainer.update(trainer.rollouts[0], trainer.rollouts[1])
            policy_loss, value_loss, total_loss = list(zip(*losses))
            trainer.rollouts[0].after_update()
            trainer.rollouts[1].after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            # By default the model evaluates the main task (branch a).
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward_a=summary(reward_recorders[0],
                                                  "episode_reward"),
                training_episode_length_a=summary(
                    episode_length_recorders[0], "episode_length"),
                training_episode_reward_b=summary(reward_recorders[1],
                                                  "episode_reward"),
                training_episode_length_b=summary(
                    episode_length_recorders[1], "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats_a=dict(policy_loss=policy_loss[0],
                                      value_loss=value_loss[0],
                                      total_loss=total_loss[0]),
                learning_stats_b=dict(policy_loss=policy_loss[1],
                                      value_loss=value_loss[1],
                                      total_loss=total_loss[1]),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)
            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(
                    algo, iteration): stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    for env in envs:
        env.close()
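# The train() functions above consume an `args` namespace. A minimal sketch of
# a matching argparse entry point is given below; the flag names mirror the
# attributes used above (algo, env_id, num_envs, seed, log_dir, max_steps,
# envopt, trainopt, opt), while the default values are hypothetical and not
# taken from this repository.
import argparse

def _parse_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument("--algo", default="PPO", type=str)
    parser.add_argument("--env_id", default="Humanoid-v3", type=str)
    parser.add_argument("--num_envs", default=8, type=int)
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument("--log_dir", default="ppo", type=str)
    parser.add_argument("--max_steps", default=int(1e7), type=int)
    parser.add_argument("--envopt", default=None, type=str)
    parser.add_argument("--trainopt", default=None, type=str)
    parser.add_argument("--opt", default=None, type=str)
    return parser.parse_args()

# Usage sketch: train(_parse_args_sketch())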
def train(args):
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must be in [PPO, A2C]")
    config.num_envs = args.num_envs

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    # Create vectorized environments
    num_envs = args.num_envs
    env_name = args.env_name

    # Prepare tensorboard file
    args.save_log = 'Pairtrding-{}'.format(time.strftime("%Y%m%d-%H%M%S"))
    generate_date = str(datetime.now().date())
    writer = SummaryWriter(args.log_dir + '/runs/' + generate_date + '/' +
                           args.save_log)

    # Download stock price data from Yahoo Finance
    stocklist = [
        '0700.hk', '2318.hk', '3988.hk', '0998.hk', '1398.hk', '3968.hk',
        '0981.hk', '0005.hk'
    ]  # Tencent, Ping An, BOC, CITIC Bank, ICBC, CMB, SMIC, HSBC
    stocktickers = ' '.join(stocklist)

    data = yf.download(tickers=stocktickers, start="2010-01-01",
                       end="2019-12-31")
    data = data['Close']

    # Daily price changes for each stock
    columnchange = []
    for stock in data.columns:
        name = stock + 'change'
        columnchange.append(name)
        data[name] = data[stock] - data[stock].shift(1)

    # Pick the pair of stocks whose daily changes are most correlated
    CorrDict = {}
    for i in columnchange:
        for j in columnchange:
            if i != j and (i, j) not in CorrDict:
                CorrDict[(i, j)] = data[i].corr(data[j])
    pair = list(max(CorrDict, key=CorrDict.get))
    pair.append(pair[0][:7])
    pair.append(pair[1][:7])
    dataremain = data[pair]

    # Fit the hedge ratio (beta) on the training period and compute the
    # z-score of the spread.
    from sklearn import linear_model
    model = linear_model.LinearRegression()
    model.fit(dataremain[pair[0]][1:-250].to_numpy().reshape(-1, 1),
              y=dataremain[pair[1]][1:-250])
    beta = model.coef_[0]
    dataremain['Spread'] = beta * data[pair[0]] - data[pair[1]]
    Spreadmean = dataremain['Spread'].mean()
    Spreadstd = dataremain['Spread'].std()
    dataremain['Z-score'] = (dataremain['Spread'] - Spreadmean) / Spreadstd

    # Train on all but the last 250 trading days; evaluate on the last 250.
    envs = PairtradingEnv(stock1=dataremain[pair[2]][:-250],
                          stock2=dataremain[pair[3]][:-250])
    eval_envs = PairtradingEnv(stock1=dataremain[pair[2]][-250:],
                               stock2=dataremain[pair[3]][-250:])

    baseline_config = baselineConfig(mean=Spreadmean, std=Spreadstd, beta=beta)
    baseline_trainer = baseline(env=envs, config=baseline_config)
    baseline_eval = baseline(env=eval_envs, config=baseline_config)

    test = env_name == "CartPole-v0"
    frame_stack = args.input_length if not test else 1

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, frame_stack, _test=test)
    else:
        trainer = A2CTrainer(envs, config, frame_stack, _test=test)

    # Create a placeholder tensor to help stack frames in the 2nd dimension,
    # i.e. turn the observation from shape [num_envs, 1, 42, 42] into
    # [num_envs, 4, 42, 42]. (envs.observation_space.shape is (1, 42, 42).)
    frame_stack_tensor = FrameStackTensor(
        num_envs, envs.observation_space.shape, frame_stack, config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=float)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    episode_values = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    while True:  # Break when iteration exceeds maximum value
        # ===== Sample Data =====
        episode_rewards = np.zeros([num_envs, 1], dtype=float)
        for env_id in range(num_envs):
            obs = envs.reset()  # obs.shape: [15, 1, 42, 42]
            frame_stack_tensor.update(obs, env_id)
            # trainer.rollouts.observations.shape:
            # torch.Size([201, 15, 4, 42, 42])
            trainer.rollouts.observations[0, env_id].copy_(
                frame_stack_tensor.get(env_id))

            with sample_timer:
                for index in range(config.num_steps):
                    # Get the action
                    # Hint:
                    #   1. Remember to disable gradient computation
                    #   2. trainer.rollouts is a storage containing all data
                    #   3. What observation is needed for
                    #      trainer.compute_action?
                    with torch.no_grad():
                        values, actions_cash, action_log_prob_cash, \
                            actions_beta, action_log_prob_beta = \
                            trainer.compute_action(
                                trainer.rollouts.observations[index, env_id])
                    act = baseline_trainer.compute_action(
                        actions_cash.view(-1), actions_beta.view(-1))
                    cpu_actions = act

                    # Step the environment
                    # (Check step_envs function, you need to implement it)
                    obs, reward, done, masks, total_episodes, \
                        total_steps, episode_rewards, episode_values = \
                        step_envs(
                            cpu_actions, envs, env_id, episode_rewards,
                            episode_values, frame_stack_tensor,
                            reward_recorder, episode_length_recorder,
                            total_steps, total_episodes, config.device, test)

                    rewards = torch.from_numpy(
                        np.array(reward).astype(np.float32)).view(-1).to(
                            config.device)

                    # Store samples
                    trainer.rollouts.insert(
                        frame_stack_tensor.get(env_id),
                        actions_cash.view(-1),
                        action_log_prob_cash.view(-1),
                        actions_beta.view(-1),
                        action_log_prob_beta.view(-1),
                        values.view(-1), rewards, masks.view(-1), env_id)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            trainer.rollouts.after_update()

        # Add training statistics to the tensorboard log file
        writer.add_scalar('train_policy_loss', policy_loss, iteration)
        writer.add_scalar('train_value_loss', value_loss, iteration)
        writer.add_scalar('train_dist_entropy', dist_entropy, iteration)
        writer.add_scalar('train_total_loss', total_loss, iteration)
        writer.add_scalar('train_episode_rewards', np.mean(episode_rewards),
                          iteration)
        writer.add_scalar('train_episode_values',
                          np.array(episode_values).mean(), iteration)

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths, evaluate_values = evaluate(
                trainer, eval_envs, baseline_eval, frame_stack, 5)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            evaluate_stat.update(
                dict(win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) /
                    len(evaluate_rewards)),
                     evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration,
                     evaluate_values=float(np.array(evaluate_values).mean())))

            # Add evaluation statistics to the tensorboard log file
            writer.add_scalar('eval_episode_rewards',
                              np.array(evaluate_rewards).mean(),
                              iteration // config.eval_freq)
            writer.add_scalar('eval_episode_values',
                              np.array(evaluate_values).mean(),
                              iteration // config.eval_freq)

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_values=summary(episode_values,
                                                "episode_value"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(policy_loss=policy_loss,
                                    entropy=dist_entropy,
                                    value_loss=value_loss,
                                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)
            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(
                    algo, iteration): stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when iteration is greater than args.max_steps
        if iteration >= args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
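# The `baseline` object above turns the agent's cash/beta outputs into trades
# using the fitted spread statistics (mean, std, beta); its exact rule is not
# shown in this file. The function below is only an illustrative sketch of a
# classic z-score pair-trading rule under that assumption, not the
# repository's implementation; all names and thresholds are hypothetical.
def _zscore_signal_sketch(price1, price2, beta, mean, std,
                          entry=2.0, exit=0.5):
    """Return 'short', 'long', 'close', or 'hold' for the spread position."""
    z = (beta * price1 - price2 - mean) / std
    if z > entry:
        return 'short'   # spread unusually high: short stock1, long stock2
    if z < -entry:
        return 'long'    # spread unusually low: long stock1, short stock2
    if abs(z) < exit:
        return 'close'   # spread back near its mean: flatten the position
    return 'hold'        # otherwise keep the current position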
def train(args):
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must be in [PPO, A2C]")
    config.num_envs = args.num_envs
    assert args.env_id in ["cPong-v0", "CartPole-v0", "cPongTournament-v0"]

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        resized_dim=config.resized_dim
    )
    eval_envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=False,
        resized_dim=config.resized_dim
    )
    test = env_id == "CartPole-v0"
    tournament = env_id == "cPongTournament-v0"
    frame_stack = 4 if not test else 1
    if tournament:
        assert algo == "PPO", "Using PPO in tournament is a good idea, " \
                              "because of its efficiency compared to A2C."

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, frame_stack, _test=test)
    else:
        trainer = A2CTrainer(envs, config, frame_stack, _test=test)

    # Create a placeholder tensor to help stack frames in the 2nd dimension,
    # i.e. turn the observation from shape [num_envs, 1, 84, 84] into
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(
        num_envs, envs.observation_space.shape, frame_stack, config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=float)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(frame_stack_tensor.get())

    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                # Get the action
                # Hint:
                #   1. Remember to disable gradient computation
                #   2. trainer.rollouts is a storage containing all data
                #   3. What observation is needed for trainer.compute_action?
                with torch.no_grad():
                    values, actions, action_log_prob = trainer.compute_action(
                        trainer.rollouts.observations[index])
                cpu_actions = actions.view(-1).cpu().numpy()

                # Step the environment
                # (Check step_envs function, you need to implement it)
                obs, reward, done, info, masks, total_episodes, \
                    total_steps, episode_rewards = step_envs(
                        cpu_actions, envs, episode_rewards,
                        frame_stack_tensor, reward_recorder,
                        episode_length_recorder, total_steps,
                        total_episodes, config.device, test)

                rewards = torch.from_numpy(
                    reward.astype(np.float32)).view(-1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    frame_stack_tensor.get(), actions.view(-1, 1),
                    action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Reset opponent if in tournament mode =====
        if tournament and iteration % config.num_steps == 0:
            # Randomly choose one agent in each iteration
            envs.reset_opponent()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths = evaluate(
                trainer, eval_envs, frame_stack, 20)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            evaluate_stat.update(dict(
                win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) /
                    len(evaluate_rewards)),
                evaluate_time=eval_timer.now,
                evaluate_iteration=iteration
            ))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(
                    policy_loss=policy_loss,
                    entropy=dist_entropy,
                    value_loss=value_loss,
                    total_loss=total_loss
                ),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(
                    sample_time=sample_timer.avg,
                    process_time=process_timer.avg,
                    update_time=update_timer.avg,
                    total_time=total_timer.now,
                    episode_time=sample_timer.avg + process_timer.avg +
                    update_timer.avg
                ),
                iteration=iteration
            )
            if tournament:
                stats["opponent"] = envs.current_agent_name
            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(
                    algo, iteration): stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
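# FrameStackTensor is used throughout but not defined in this file. The sketch
# below shows the assumed behavior for the variant used directly above (a
# rolling stack of the last `frame_stack` frames per environment, cleared on
# reset). It is an illustration under those assumptions, not the repository's
# actual class; note that other files above use a per-env update(obs, env_id)
# signature instead.
import numpy as np
import torch

class _FrameStackTensorSketch:
    def __init__(self, num_envs, obs_shape, frame_stack, device):
        # obs_shape is assumed to be (1, H, W); frames are stacked along the
        # channel dimension.
        c, h, w = obs_shape
        self.stack = torch.zeros(num_envs, frame_stack * c, h, w,
                                 device=device)
        self.device = device

    def reset(self):
        # Clear all stacked frames, e.g. at the start of a new episode.
        self.stack.zero_()

    def update(self, obs):
        # obs: np.ndarray of shape [num_envs, 1, H, W]; drop the oldest frame
        # and append the newest one.
        obs = torch.from_numpy(
            np.asarray(obs, dtype=np.float32)).to(self.device)
        self.stack = torch.cat([self.stack[:, obs.shape[1]:], obs], dim=1)

    def get(self):
        # Return the current stacked observation [num_envs, frame_stack, H, W].
        return self.stack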