def train(args):
    """Train a PPO agent on vectorized environments with periodic evaluation.

    Attributes read from ``args``: ``algo``, ``num_envs``, ``envopt``,
    ``trainopt``, ``opt``, ``seed``, ``log_dir``, ``env_id`` and
    ``max_steps``.

    Side effects:
        * Rebinds the module-level ``env_options`` / ``trainer_options`` when
          option files are supplied.
        * Sets ``config.num_envs`` on the shared ``ppo_config`` object.
        * Saves trainer weights / progress under the verified log directory.

    Raises:
        ValueError: if ``args.algo`` is not ``"PPO"``, or if no
            ``healthy_z_range`` can be resolved for ``args.env_id``.
    """
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must in [PPO]")
    config.num_envs = args.num_envs

    # Option files are read through context managers so the handles are
    # always closed (the previous bare open() calls leaked them).
    if args.envopt is not None:
        with open(args.envopt) as f:
            env_options = json.load(f)
    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        # A combined option file overrides both of the above.
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    num_envs = args.num_envs
    env_id = args.env_id

    # Resolve the evaluation env's healthy_z_range *before* spawning any
    # environment. Previously an env id other than the two below, without an
    # explicit override, crashed later with an UnboundLocalError.
    default_z_ranges = {
        "Walker2d-v3": (0.8, 2.0),
        'Humanoid-v3': (1.0, 2.0),
    }
    if 'healthy_z_range' in env_options:
        healthy_z_range = env_options['healthy_z_range']
    elif env_id in default_z_ranges:
        healthy_z_range = default_z_ranges[env_id]
    else:
        raise ValueError(
            "No default healthy_z_range for env_id {!r}; provide "
            "'healthy_z_range' in the env options.".format(env_id))

    # Create vectorized environments
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        options=env_options,
    )
    try:
        eval_env = gym.make(env_id,
                            healthy_z_range=healthy_z_range,
                            healthy_reward=0)
        if env_id == "Walker2d-v3":
            eval_env = Walker2d_wrapper(eval_env, env_options)

        obs_dim = envs.observation_space.shape[0]
        act_dim = envs.action_space.shape[0]
        # The trainer may work on reduced observation/action vectors.
        real_obs_dim = obs_dim
        real_act_dim = act_dim
        if 'real_obs_dim' in trainer_options:
            real_obs_dim = trainer_options['real_obs_dim']
        if 'real_act_dim' in trainer_options:
            real_act_dim = trainer_options['real_act_dim']
        dim_dict = dict(obs_dim=obs_dim,
                        act_dim=act_dim,
                        real_obs_dim=real_obs_dim,
                        real_act_dim=real_act_dim)

        # Setup trainer
        if algo == "PPO":
            trainer = PPOTrainer(envs, config, trainer_options)
        else:
            raise NotImplementedError

        # Create a placeholder tensor to help stack frames in 2nd dimension
        # That is turn the observation from shape [num_envs, 1, 84, 84] to
        # [num_envs, 4, 84, 84].
        frame_stack_tensor = FrameStackTensor(
            num_envs, envs.observation_space.shape, config.device)

        # Setup some stats helpers
        # np.float was removed in NumPy 1.24; it was an alias of the builtin
        # float, i.e. float64.
        episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
        total_episodes = total_steps = iteration = 0
        reward_recorder = deque(maxlen=100)
        episode_length_recorder = deque(maxlen=100)
        sample_timer = Timer()
        process_timer = Timer()
        update_timer = Timer()
        total_timer = Timer()
        progress = []
        evaluate_stat = {}

        # Start training
        print("Start training!")
        obs = envs.reset()
        frame_stack_tensor.update(obs)
        trainer.rollouts.observations[0].copy_(
            reduce_shape(frame_stack_tensor.get(), real_obs_dim))

        while True:  # Break when total_steps exceeds maximum value
            # ===== Sample Data =====
            with sample_timer:
                for _ in range(config.num_steps):
                    trainer.model.eval()
                    values, actions, action_log_prob = trainer.model.step(
                        reduce_shape(frame_stack_tensor.get(), real_obs_dim))
                    cpu_actions = actions.cpu().numpy()
                    # Pad the policy's action back to the env's action size.
                    cpu_actions = enlarge_shape(cpu_actions, act_dim)

                    # step_envs also updates frame_stack_tensor and the
                    # recorders passed to it; obs/done/info are not needed
                    # here.
                    _obs, reward, _done, _info, masks, total_episodes, \
                        total_steps, episode_rewards = step_envs(
                            cpu_actions, envs, episode_rewards,
                            frame_stack_tensor, reward_recorder,
                            episode_length_recorder, total_steps,
                            total_episodes, config.device)

                    rewards = torch.from_numpy(
                        reward.astype(np.float32)).view(-1, 1).to(
                            config.device)

                    # Store samples
                    trainer.rollouts.insert(
                        reduce_shape(frame_stack_tensor.get(), real_obs_dim),
                        actions, action_log_prob, values, rewards, masks)

            # ===== Process Samples =====
            with process_timer:
                with torch.no_grad():
                    next_value = trainer.compute_values(
                        trainer.rollouts.observations[-1])
                trainer.rollouts.compute_returns(next_value, config.GAMMA)

            trainer.model.train()

            # ===== Update Policy =====
            with update_timer:
                policy_loss, value_loss, total_loss = trainer.update(
                    trainer.rollouts)
                trainer.rollouts.after_update()

            # ===== Evaluate Current Policy =====
            if iteration % config.eval_freq == 0:
                eval_timer = Timer()
                eval_rewards, eval_lengths = evaluate(
                    trainer, eval_env, 1, dim_dict=dim_dict)
                evaluate_stat = summary(eval_rewards, "episode_reward")
                evaluate_stat.update(summary(eval_lengths, "episode_length"))
                evaluate_stat.update(
                    dict(evaluate_time=eval_timer.now,
                         evaluate_iteration=iteration))

            # ===== Log information =====
            if iteration % config.log_freq == 0:
                stats = dict(
                    log_dir=log_dir,
                    frame_per_second=int(total_steps / total_timer.now),
                    training_episode_reward=summary(reward_recorder,
                                                    "episode_reward"),
                    training_episode_length=summary(episode_length_recorder,
                                                    "episode_length"),
                    evaluate_stats=evaluate_stat,
                    learning_stats=dict(policy_loss=policy_loss,
                                        value_loss=value_loss,
                                        total_loss=total_loss),
                    total_steps=total_steps,
                    total_episodes=total_episodes,
                    time_stats=dict(sample_time=sample_timer.avg,
                                    process_time=process_timer.avg,
                                    update_time=update_timer.avg,
                                    total_time=total_timer.now,
                                    episode_time=sample_timer.avg +
                                    process_timer.avg + update_timer.avg),
                    iteration=iteration)

                progress.append(stats)
                pretty_print({
                    "===== {} Training Iteration {} =====".format(
                        algo, iteration): stats
                })

            if iteration % config.save_freq == 0:
                trainer_path = trainer.save_w(log_dir,
                                              "iter{}".format(iteration))
                progress_path = save_progress(log_dir, progress)
                print(
                    "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                        trainer_path, progress_path))

            # Stop training when total_steps is greater than args.max_steps
            if total_steps > args.max_steps:
                break

            iteration += 1

        trainer.save_w(log_dir, "final")
    finally:
        # Always shut the vectorized envs down, even if training raised.
        envs.close()
def train(args):
    """Train a two-branch PPO model on Humanoid-v3 (main) and Walker2d-v3 (aux).

    Branch ``'a'`` is rolled out in the Humanoid environments and branch
    ``'b'`` in the Walker2d environments; both rollouts are passed to a single
    ``trainer.update`` call per iteration. Only the main branch advances the
    global ``total_steps`` / ``total_episodes`` counters.

    Attributes read from ``args``: ``algo``, ``num_envs``, ``trainopt``,
    ``opt``, ``seed``, ``log_dir``, ``env_id`` and ``max_steps``.

    Side effects:
        * Rebinds the module-level ``env_options`` / ``trainer_options`` when
          option files are supplied.
        * Sets ``num_envs`` and ``activation`` on the shared ``ppo_config``.
        * Saves trainer weights / progress under the verified log directory.

    Raises:
        ValueError: if ``args.algo`` is not ``"PPO"``.
    """
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must in [PPO]")
    config.num_envs = args.num_envs
    config.activation = nn.ReLU

    # Read option files through context managers so handles are closed.
    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    main_envs = make_envs(
        env_id='Humanoid-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )
    aux_envs = make_envs(
        env_id='Walker2d-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )
    envs = [main_envs, aux_envs]
    num_branches = len(envs)

    try:
        # eval_env is main_env
        # NOTE(review): the training env is hard-coded to 'Humanoid-v3' while
        # evaluation uses args.env_id -- confirm callers always pass
        # 'Humanoid-v3'.
        healthy_z_range = (1.0, 2.0)
        eval_env = gym.make(env_id,
                            healthy_z_range=healthy_z_range,
                            healthy_reward=0)

        main_obs_dim = 376  # kept for reference; not used below
        main_act_dim = 17
        main_reduce_obs_dim = 46
        main_reduce_act_dim = 11
        aux_obs_dim = 17
        aux_act_dim = 6
        # Per-branch sizes: observations are reduced to obs_dims[i] before
        # entering the model, actions are enlarged to act_dims[i] before
        # entering the env.
        obs_dims = [main_reduce_obs_dim, aux_obs_dim]
        act_dims = [main_act_dim, aux_act_dim]

        dim_dict = dict(obs_a=main_reduce_obs_dim,
                        act_a=main_reduce_act_dim,
                        obs_b=aux_obs_dim,
                        act_b=aux_act_dim,
                        coeff_a=0.4,
                        coeff_b=1)
        dim_dict['act_dim'] = 17
        dim_dict['real_obs_dim'] = 46

        # Setup trainer
        if algo == "PPO":
            trainer = PPOTrainerMTMT(config, dim_dict)
        else:
            raise NotImplementedError

        frame_stack_tensors = [
            FrameStackTensor(num_envs, main_envs.observation_space.shape,
                             config.device),
            FrameStackTensor(num_envs, aux_envs.observation_space.shape,
                             config.device)
        ]

        # Setup some stats helpers
        # np.float was removed in NumPy 1.24; it aliased float (float64).
        episode_rewards = [
            np.zeros([num_envs, 1], dtype=np.float64),
            np.zeros([num_envs, 1], dtype=np.float64)
        ]
        total_episodes = total_steps = iteration = 0
        reward_recorders = [deque(maxlen=100), deque(maxlen=100)]
        episode_length_recorders = [deque(maxlen=100), deque(maxlen=100)]
        sample_timer = Timer()
        process_timer = Timer()
        update_timer = Timer()
        total_timer = Timer()
        progress = []
        evaluate_stat = {}

        # Start training
        print("Start training!")
        # Reset every branch and seed its rollout storage with the first
        # (reduced) observation.
        for i in range(num_branches):
            first_obs = envs[i].reset()
            frame_stack_tensors[i].update(first_obs)
        for i in range(num_branches):
            trainer.rollouts[i].observations[0].copy_(
                reduce_shape(frame_stack_tensors[i].get(), obs_dims[i]))

        branch_names = ['a', 'b']

        while True:  # Break when total_steps exceeds maximum value
            # ===== Sample Data (branch a, then branch b) =====
            with sample_timer:
                for ind in range(num_branches):
                    for _ in range(config.num_steps):
                        trainer.model.eval()
                        values, actions, action_log_prob = trainer.model.step(
                            reduce_shape(frame_stack_tensors[ind].get(),
                                         obs_dims[ind]),
                            deterministic=False,
                            branch=branch_names[ind])
                        cpu_actions = actions.cpu().numpy()
                        cpu_actions = enlarge_shape(cpu_actions,
                                                    act_dims[ind])

                        # obs, done, info not needed, we have masks & obs in
                        # frame_stack_tensors
                        _, reward, _, _, masks, new_total_episodes, \
                            new_total_steps, episode_rewards[ind] = step_envs(
                                cpu_actions, envs[ind], episode_rewards[ind],
                                frame_stack_tensors[ind],
                                reward_recorders[ind],
                                episode_length_recorders[ind], total_steps,
                                total_episodes, config.device)

                        # Only the main branch advances the global counters.
                        if ind == 0:
                            total_episodes = new_total_episodes
                            total_steps = new_total_steps

                        rewards = torch.from_numpy(
                            reward.astype(np.float32)).view(-1, 1).to(
                                config.device)

                        trainer.rollouts[ind].insert(
                            reduce_shape(frame_stack_tensors[ind].get(),
                                         obs_dims[ind]),
                            actions, action_log_prob, values, rewards, masks)

            # ===== Process Samples =====
            with process_timer:
                with torch.no_grad():
                    for i in range(num_branches):
                        next_value = trainer.compute_values(
                            trainer.rollouts[i].observations[-1],
                            branch_names[i])
                        trainer.rollouts[i].compute_returns(
                            next_value, config.GAMMA)

            trainer.model.train()

            # ===== Update Policy =====
            with update_timer:
                losses = trainer.update(trainer.rollouts[0],
                                        trainer.rollouts[1])
                # Transpose per-branch loss tuples into per-loss tuples,
                # indexed by branch below.
                policy_loss, value_loss, total_loss = list(zip(*losses))
                trainer.rollouts[0].after_update()
                trainer.rollouts[1].after_update()

            # ===== Evaluate Current Policy =====
            if iteration % config.eval_freq == 0:
                eval_timer = Timer()
                # seems ok, by default model is dealing with task1
                eval_rewards, eval_lengths = evaluate(
                    trainer, eval_env, 1, dim_dict=dim_dict)
                evaluate_stat = summary(eval_rewards, "episode_reward")
                evaluate_stat.update(summary(eval_lengths, "episode_length"))
                evaluate_stat.update(
                    dict(evaluate_time=eval_timer.now,
                         evaluate_iteration=iteration))

            # ===== Log information =====
            if iteration % config.log_freq == 0:
                stats = dict(
                    log_dir=log_dir,
                    frame_per_second=int(total_steps / total_timer.now),
                    training_episode_reward_a=summary(reward_recorders[0],
                                                      "episode_reward"),
                    training_episode_length_a=summary(
                        episode_length_recorders[0], "episode_length"),
                    training_episode_reward_b=summary(reward_recorders[1],
                                                      "episode_reward"),
                    training_episode_length_b=summary(
                        episode_length_recorders[1], "episode_length"),
                    evaluate_stats=evaluate_stat,
                    learning_stats_a=dict(policy_loss=policy_loss[0],
                                          value_loss=value_loss[0],
                                          total_loss=total_loss[0]),
                    learning_stats_b=dict(policy_loss=policy_loss[1],
                                          value_loss=value_loss[1],
                                          total_loss=total_loss[1]),
                    total_steps=total_steps,
                    total_episodes=total_episodes,
                    time_stats=dict(sample_time=sample_timer.avg,
                                    process_time=process_timer.avg,
                                    update_time=update_timer.avg,
                                    total_time=total_timer.now,
                                    episode_time=sample_timer.avg +
                                    process_timer.avg + update_timer.avg),
                    iteration=iteration)

                progress.append(stats)
                pretty_print({
                    "===== {} Training Iteration {} =====".format(
                        algo, iteration): stats
                })

            if iteration % config.save_freq == 0:
                trainer_path = trainer.save_w(log_dir,
                                              "iter{}".format(iteration))
                progress_path = save_progress(log_dir, progress)
                print(
                    "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                        trainer_path, progress_path))

            # Stop training when total_steps is greater than args.max_steps
            if total_steps > args.max_steps:
                break

            iteration += 1

        trainer.save_w(log_dir, "final")
    finally:
        # `envs` is a plain list, so each vectorized env must be closed
        # individually (the old `envs.close()` raised AttributeError).
        for env in envs:
            env.close()
def train(args):
    """Train a PPO or A2C agent on cPong / CartPole with periodic evaluation.

    Attributes read from ``args``: ``algo``, ``num_envs``, ``env_id``,
    ``seed``, ``log_dir`` and ``max_steps``.

    In tournament mode (``env_id == "cPongTournament-v0"``) the opponent is
    re-sampled via ``envs.reset_opponent()`` and its name is added to the
    logged stats.

    Side effects: sets ``config.num_envs`` on the selected shared config and
    saves trainer weights / progress under the verified log directory.

    Raises:
        ValueError: if ``args.algo`` is neither ``"PPO"`` nor ``"A2C"``.
        AssertionError: if ``args.env_id`` is unsupported, or tournament mode
            is requested with an algorithm other than PPO.
    """
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must in [PPO, A2C]")
    config.num_envs = args.num_envs
    assert args.env_id in ["cPong-v0", "CartPole-v0", "cPongTournament-v0"]

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    num_envs = args.num_envs
    env_id = args.env_id

    test = env_id == "CartPole-v0"
    tournament = env_id == "cPongTournament-v0"
    # CartPole observations are not stacked.
    frame_stack = 4 if not test else 1
    if tournament:
        assert algo == "PPO", "Using PPO in tournament is a good idea, " \
                              "because of its efficiency compared to A2C."

    # Create vectorized environments
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        resized_dim=config.resized_dim
    )
    eval_envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=False,
        resized_dim=config.resized_dim
    )

    try:
        # Setup trainer
        if algo == "PPO":
            trainer = PPOTrainer(envs, config, frame_stack, _test=test)
        else:
            trainer = A2CTrainer(envs, config, frame_stack, _test=test)

        # Create a placeholder tensor to help stack frames in 2nd dimension
        # That is turn the observation from shape [num_envs, 1, 84, 84] to
        # [num_envs, 4, 84, 84].
        frame_stack_tensor = FrameStackTensor(
            num_envs, envs.observation_space.shape, frame_stack,
            config.device)

        # Setup some stats helpers
        # np.float was removed in NumPy 1.24; it aliased float (float64).
        episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
        total_episodes = total_steps = iteration = 0
        reward_recorder = deque(maxlen=100)
        episode_length_recorder = deque(maxlen=100)
        sample_timer = Timer()
        process_timer = Timer()
        update_timer = Timer()
        total_timer = Timer()
        progress = []
        evaluate_stat = {}

        # Start training
        print("Start training!")
        obs = envs.reset()
        frame_stack_tensor.update(obs)
        trainer.rollouts.observations[0].copy_(frame_stack_tensor.get())

        while True:  # Break when total_steps exceeds maximum value
            # ===== Sample Data =====
            with sample_timer:
                for index in range(config.num_steps):
                    # Act on the observation stored for this step; no
                    # gradients are needed while sampling.
                    with torch.no_grad():
                        values, actions, action_log_prob = \
                            trainer.compute_action(
                                trainer.rollouts.observations[index])
                    cpu_actions = actions.view(-1).cpu().numpy()

                    # Step the environment; step_envs also updates
                    # frame_stack_tensor and the recorders passed to it.
                    _obs, reward, _done, _info, masks, total_episodes, \
                        total_steps, episode_rewards = step_envs(
                            cpu_actions, envs, episode_rewards,
                            frame_stack_tensor, reward_recorder,
                            episode_length_recorder, total_steps,
                            total_episodes, config.device, test)

                    rewards = torch.from_numpy(
                        reward.astype(np.float32)).view(-1, 1).to(
                            config.device)

                    # Store samples
                    trainer.rollouts.insert(
                        frame_stack_tensor.get(), actions.view(-1, 1),
                        action_log_prob, values, rewards, masks)

            # ===== Process Samples =====
            with process_timer:
                with torch.no_grad():
                    next_value = trainer.compute_values(
                        trainer.rollouts.observations[-1])
                trainer.rollouts.compute_returns(next_value, config.GAMMA)

            # ===== Update Policy =====
            with update_timer:
                policy_loss, value_loss, dist_entropy, total_loss = \
                    trainer.update(trainer.rollouts)
                trainer.rollouts.after_update()

            # ===== Reset opponent if in tournament mode =====
            # NOTE(review): the period is config.num_steps *iterations*,
            # although the original comment says "in each iteration" --
            # confirm which is intended.
            if tournament and iteration % config.num_steps == 0:
                # Randomly choose one agent in each iteration
                envs.reset_opponent()

            # ===== Evaluate Current Policy =====
            if iteration % config.eval_freq == 0:
                eval_timer = Timer()
                evaluate_rewards, evaluate_lengths = evaluate(
                    trainer, eval_envs, frame_stack, 20)
                evaluate_stat = summary(evaluate_rewards, "episode_reward")
                if evaluate_lengths:
                    evaluate_stat.update(
                        summary(evaluate_lengths, "episode_length"))
                evaluate_stat.update(dict(
                    # Fraction of evaluation episodes with reward >= 0.
                    win_rate=float(
                        sum(np.array(evaluate_rewards) >= 0) / len(
                            evaluate_rewards)),
                    evaluate_time=eval_timer.now,
                    evaluate_iteration=iteration
                ))

            # ===== Log information =====
            if iteration % config.log_freq == 0:
                stats = dict(
                    log_dir=log_dir,
                    frame_per_second=int(total_steps / total_timer.now),
                    training_episode_reward=summary(reward_recorder,
                                                    "episode_reward"),
                    training_episode_length=summary(episode_length_recorder,
                                                    "episode_length"),
                    evaluate_stats=evaluate_stat,
                    learning_stats=dict(
                        policy_loss=policy_loss,
                        entropy=dist_entropy,
                        value_loss=value_loss,
                        total_loss=total_loss
                    ),
                    total_steps=total_steps,
                    total_episodes=total_episodes,
                    time_stats=dict(
                        sample_time=sample_timer.avg,
                        process_time=process_timer.avg,
                        update_time=update_timer.avg,
                        total_time=total_timer.now,
                        episode_time=sample_timer.avg + process_timer.avg +
                        update_timer.avg
                    ),
                    iteration=iteration
                )

                if tournament:
                    stats["opponent"] = envs.current_agent_name

                progress.append(stats)
                pretty_print({
                    "===== {} Training Iteration {} =====".format(
                        algo, iteration): stats
                })

            if iteration % config.save_freq == 0:
                trainer_path = trainer.save_w(log_dir,
                                              "iter{}".format(iteration))
                progress_path = save_progress(log_dir, progress)
                print("Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path
                ))

            # Stop training when total_steps is greater than args.max_steps
            if total_steps > args.max_steps:
                break

            iteration += 1

        trainer.save_w(log_dir, "final")
    finally:
        # Close both env sets even if training raised; eval_envs was
        # previously never closed at all.
        envs.close()
        eval_envs.close()
def train(args):
    """Train a PPO or A2C agent on a pair-trading environment.

    Downloads daily closing prices for a fixed list of Hong Kong tickers,
    picks the two stocks whose daily price changes are most correlated,
    fits a linear regression between their changes on all but the last 250
    rows, and builds a training environment (all but the last 250 rows) and
    an evaluation environment (last 250 rows) from the selected pair.

    Attributes read from ``args``: ``algo``, ``num_envs``, ``seed``,
    ``log_dir``, ``env_name``, ``input_length`` and ``max_steps``.
    ``args.save_log`` is overwritten with a timestamped run name.

    Side effects: sets ``config.num_envs``, performs a network download via
    ``yf.download``, writes TensorBoard scalars, and saves trainer weights /
    progress under the verified log directory.

    Raises:
        ValueError: if ``args.algo`` is neither ``"PPO"`` nor ``"A2C"``, or
            if no finite pairwise correlation could be computed.
    """
    # Function-local imports are kept at the very top of the body: `np` is a
    # local name for the whole function, so it must be bound before any use.
    from sklearn import linear_model
    import numpy as np

    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must in [PPO, A2C]")
    config.num_envs = args.num_envs

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    num_envs = args.num_envs
    env_name = args.env_name

    # Prepare tensorboard file
    args.save_log = 'Pairtrding-{}'.format(time.strftime("%Y%m%d-%H%M%S"))
    generate_date = str(datetime.now().date())
    writer = SummaryWriter(args.log_dir + '/runs/' + generate_date + '/' +
                           args.save_log)

    # download stock price data from yahoo finance
    stocklist = [
        '0700.hk', '2318.hk', '3988.hk', '0998.hk', '1398.hk', '3968.hk',
        '0981.hk', '0005.hk'
    ]  # Tencent, Ping An, Bank of China, CITIC, ICBC, China Merchants, SMIC, HSBC
    stocktickers = ' '.join(stocklist)

    data = yf.download(tickers=stocktickers,
                       start="2010-01-01",
                       end="2019-12-31")
    # Copy so that adding columns below does not write into a slice of the
    # downloaded frame.
    data = data['Close'].copy()

    # Add a "<ticker>change" column (day-over-day difference) per stock and
    # remember which price column each change column came from.
    change_to_price = {}
    for stock in list(data.columns):
        name = stock + 'change'
        change_to_price[name] = stock
        data[name] = data[stock] - data[stock].shift(1)
    columnchange = list(change_to_price)

    # Correlation of every unordered pair of change columns. Correlation is
    # symmetric, so each pair is computed once; NaN results are skipped
    # because they would make max() unreliable.
    CorrDict = {}
    for pos, i in enumerate(columnchange):
        for j in columnchange[pos + 1:]:
            corr = data[i].corr(data[j])
            if not np.isnan(corr):
                CorrDict[(i, j)] = corr
    if not CorrDict:
        raise ValueError(
            "Could not compute any pairwise correlation from the "
            "downloaded price data.")

    # Select the pair with the HIGHEST correlation. The previous
    # `max(CorrDict)` compared the key tuples themselves and therefore
    # always returned the lexicographically largest pair of column names.
    pair = list(max(CorrDict, key=CorrDict.get))
    # pair = [change_col_1, change_col_2, price_col_1, price_col_2]
    pair.append(change_to_price[pair[0]])
    pair.append(change_to_price[pair[1]])

    dataremain = data[pair].copy()

    # Regress stock 2's daily change on stock 1's, skipping the first row
    # (NaN from shift) and holding out the last 250 rows for evaluation.
    model = linear_model.LinearRegression()
    model.fit(dataremain[pair[0]][1:-250].to_numpy().reshape(-1, 1),
              y=dataremain[pair[1]][1:-250])
    beta = model.coef_[0]

    # NOTE(review): the spread is built from the *change* columns and its
    # mean/std are taken over all rows, including the 250 evaluation rows --
    # confirm this is intended.
    dataremain['Spread'] = beta * data[pair[0]] - data[pair[1]]
    Spreadmean = dataremain['Spread'].mean()
    Spreadstd = dataremain['Spread'].std()
    dataremain['Z-score'] = (dataremain['Spread'] - Spreadmean) / Spreadstd

    envs = PairtradingEnv(stock1=dataremain[pair[2]][:-250],
                          stock2=dataremain[pair[3]][:-250])
    eval_envs = PairtradingEnv(stock1=dataremain[pair[2]][-250:],
                               stock2=dataremain[pair[3]][-250:])

    try:
        baseline_config = baselineConfig(mean=Spreadmean,
                                         std=Spreadstd,
                                         beta=beta)
        baseline_trainer = baseline(env=envs, config=baseline_config)
        baseline_eval = baseline(env=eval_envs, config=baseline_config)

        test = env_name == "CartPole-v0"
        frame_stack = args.input_length if not test else 1

        # Setup trainer
        if algo == "PPO":
            trainer = PPOTrainer(envs, config, frame_stack, _test=test)
        else:
            trainer = A2CTrainer(envs, config, frame_stack, _test=test)

        # Create a placeholder tensor to help stack frames in 2nd dimension
        # That is turn the observation from shape [num_envs, 1, 84, 84] to
        # [num_envs, 4, 84, 84].
        frame_stack_tensor = FrameStackTensor(
            num_envs, envs.observation_space.shape, frame_stack,
            config.device)  # envs.observation_space.shape: 1,42,42

        # Setup some stats helpers
        total_episodes = total_steps = iteration = 0
        reward_recorder = deque(maxlen=100)
        episode_length_recorder = deque(maxlen=100)
        episode_values = deque(maxlen=100)
        sample_timer = Timer()
        process_timer = Timer()
        update_timer = Timer()
        total_timer = Timer()
        progress = []
        evaluate_stat = {}

        # Start training
        print("Start training!")
        while True:  # Break when iteration reaches args.max_steps
            # ===== Sample Data =====
            # np.float was removed in NumPy 1.24; it aliased float
            # (float64). Episode rewards are reset every iteration.
            episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)

            # NOTE(review): the single PairtradingEnv is rolled out once per
            # env slot, with sampling nested inside this loop. Nesting was
            # reconstructed from how `env_id` is used below -- confirm.
            for env_id in range(num_envs):
                obs = envs.reset()  # obs.shape: 15,1,42,42
                frame_stack_tensor.update(obs, env_id)
                trainer.rollouts.observations[0, env_id].copy_(
                    frame_stack_tensor.get(env_id)
                )  # trainer.rollouts.observations.shape: torch.Size([201, 15, 4, 42, 42])

                with sample_timer:
                    for index in range(config.num_steps):
                        # No gradients are needed while sampling.
                        with torch.no_grad():
                            values, actions_cash, action_log_prob_cash, \
                                actions_beta, action_log_prob_beta = \
                                trainer.compute_action(
                                    trainer.rollouts.observations[index,
                                                                  env_id])

                        # The baseline maps the two policy outputs to the
                        # action the environment receives.
                        cpu_actions = baseline_trainer.compute_action(
                            actions_cash.view(-1), actions_beta.view(-1))

                        # Step the environment
                        _obs, reward, _done, masks, total_episodes, \
                            total_steps, episode_rewards, episode_values = \
                            step_envs(
                                cpu_actions, envs, env_id, episode_rewards,
                                episode_values, frame_stack_tensor,
                                reward_recorder, episode_length_recorder,
                                total_steps, total_episodes, config.device,
                                test)

                        rewards = torch.from_numpy(
                            np.array(reward).astype(np.float32)).view(-1).to(
                                config.device)

                        # Store samples
                        trainer.rollouts.insert(
                            frame_stack_tensor.get(env_id),
                            actions_cash.view(-1),
                            action_log_prob_cash.view(-1),
                            actions_beta.view(-1),
                            action_log_prob_beta.view(-1),
                            values.view(-1), rewards, masks.view(-1), env_id)

            # ===== Process Samples =====
            with process_timer:
                with torch.no_grad():
                    next_value = trainer.compute_values(
                        trainer.rollouts.observations[-1])
                trainer.rollouts.compute_returns(next_value, config.GAMMA)

            # ===== Update Policy =====
            with update_timer:
                policy_loss, value_loss, dist_entropy, total_loss = \
                    trainer.update(trainer.rollouts)
                trainer.rollouts.after_update()

            # Add training statistics to tensorboard log file
            writer.add_scalar('train_policy_loss', policy_loss, iteration)
            writer.add_scalar('train_value_loss', value_loss, iteration)
            writer.add_scalar('train_dist_entropy', dist_entropy, iteration)
            writer.add_scalar('train_total_loss', total_loss, iteration)
            writer.add_scalar('train_episode_rewards',
                              np.mean(episode_rewards), iteration)
            writer.add_scalar('train_episode_values',
                              np.array(episode_values).mean(), iteration)

            # ===== Evaluate Current Policy =====
            if iteration % config.eval_freq == 0:
                eval_timer = Timer()
                evaluate_rewards, evaluate_lengths, evaluate_values = \
                    evaluate(trainer, eval_envs, baseline_eval, frame_stack,
                             5)
                evaluate_stat = summary(evaluate_rewards, "episode_reward")
                if evaluate_lengths:
                    evaluate_stat.update(
                        summary(evaluate_lengths, "episode_length"))
                evaluate_stat.update(
                    dict(win_rate=float(
                        sum(np.array(evaluate_rewards) >= 0) /
                        len(evaluate_rewards)),
                         evaluate_time=eval_timer.now,
                         evaluate_iteration=iteration,
                         evaluate_values=float(
                             np.array(evaluate_values).mean())))

                # Add evaluation statistics to tensorboard log file
                writer.add_scalar('eval_episode_rewards',
                                  np.array(evaluate_rewards).mean(),
                                  iteration // config.eval_freq)
                writer.add_scalar('eval_episode_values',
                                  np.array(evaluate_values).mean(),
                                  iteration // config.eval_freq)

            # ===== Log information =====
            if iteration % config.log_freq == 0:
                stats = dict(
                    log_dir=log_dir,
                    frame_per_second=int(total_steps / total_timer.now),
                    training_episode_reward=summary(reward_recorder,
                                                    "episode_reward"),
                    training_episode_values=summary(episode_values,
                                                    "episode_value"),
                    training_episode_length=summary(episode_length_recorder,
                                                    "episode_length"),
                    evaluate_stats=evaluate_stat,
                    learning_stats=dict(policy_loss=policy_loss,
                                        entropy=dist_entropy,
                                        value_loss=value_loss,
                                        total_loss=total_loss),
                    total_steps=total_steps,
                    total_episodes=total_episodes,
                    time_stats=dict(sample_time=sample_timer.avg,
                                    process_time=process_timer.avg,
                                    update_time=update_timer.avg,
                                    total_time=total_timer.now,
                                    episode_time=sample_timer.avg +
                                    process_timer.avg + update_timer.avg),
                    iteration=iteration)

                progress.append(stats)
                pretty_print({
                    "===== {} Training Iteration {} =====".format(
                        algo, iteration): stats
                })

            if iteration % config.save_freq == 0:
                trainer_path = trainer.save_w(log_dir,
                                              "iter{}".format(iteration))
                progress_path = save_progress(log_dir, progress)
                print(
                    "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                        trainer_path, progress_path))

            # Here args.max_steps bounds the number of iterations, not the
            # number of environment steps.
            if iteration >= args.max_steps:
                break

            iteration += 1

        trainer.save_w(log_dir, "final")
    finally:
        # Flush/close the TensorBoard writer and both environments even if
        # training raised; the writer and eval env were previously leaked.
        writer.close()
        envs.close()
        eval_envs.close()
def _train(trainer, envs, eval_envs, config, num_envs, algo, log_dir,
           tournament, test, max_steps=None):
    """Run the sample / process / update training loop for ``trainer``.

    Args:
        trainer: Trainer exposing ``model``, ``rollouts``, ``discrete``,
            ``compute_action``, ``compute_values``, ``process_obs``,
            ``update`` and ``save_w`` as used below.
        envs: Training environments passed to ``step_envs``.
        eval_envs: Evaluation environments, or ``None`` to skip evaluation.
        config: Provides ``num_steps``, ``device``, ``gamma``, ``eval_freq``,
            ``log_freq`` and ``save_freq``.
        num_envs: Number of parallel training environments.
        algo: Algorithm name, used only in the log header.
        log_dir: Directory for checkpoints and the progress file.
        tournament: When true, periodically calls ``envs.reset_opponent()``
            and logs ``envs.current_agent_name``.
        test: Unused here; kept for interface compatibility.
        max_steps: Stop once ``total_steps`` exceeds this value. When
            ``None`` (the default) the module-level ``args.max_steps`` is
            used, which is what this function previously relied on
            implicitly.

    Side effects: saves the progress file on every logged iteration and the
    trainer weights every ``config.save_freq`` iterations.
    """
    # `args` is not a parameter of this function; resolve the step budget up
    # front so a missing global fails immediately instead of after the first
    # full iteration.
    if max_steps is None:
        max_steps = args.max_steps
    max_steps = int(max_steps)

    # Clearing notebook output is best-effort: IPython is optional.
    try:
        from IPython.display import clear_output
    except ImportError:
        clear_output = None

    # Setup some stats helpers
    # np.float was removed in NumPy 1.24; it aliased float (float64).
    episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                # Get action
                # NOTE(review): nesting reconstructed from the statement
                # order -- the state reset happens on every step and
                # update_hidden runs under no_grad; confirm against the
                # original layout.
                if hasattr(trainer.model, 'reset_state'):
                    trainer.model.reset_state()
                with torch.no_grad():
                    values, actions, action_log_prob = trainer.compute_action(
                        trainer.rollouts.processed_observations[index])
                    trainer.model.update_hidden(actions)

                if trainer.discrete:
                    cpu_actions = actions.view(-1).cpu().numpy()
                else:
                    cpu_actions = actions.cpu().numpy()

                # Step the environment
                obs, reward, _done, _info, masks, total_episodes, \
                    total_steps, episode_rewards = step_envs(
                        cpu_actions, envs, episode_rewards, reward_recorder,
                        episode_length_recorder, total_steps, total_episodes,
                        config.device)

                rewards = torch.from_numpy(reward.astype(np.float32)).view(
                    -1, 1).to(config.device)

                # Store samples
                if trainer.discrete:
                    actions = actions.view(-1, 1)

                # Store the world model's encoding of the new observation
                # alongside the raw one.
                with torch.no_grad():
                    raw_obs = trainer.process_obs(obs)
                    processed_obs = trainer.model.world_model(
                        raw_obs).detach()
                trainer.rollouts.insert(obs, actions, action_log_prob,
                                        values, rewards, masks, processed_obs)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.processed_observations[-1])
            trainer.rollouts.compute_returns(next_value, config.gamma)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            # Guarded like the per-step reset above so models without
            # recurrent state do not crash here.
            if hasattr(trainer.model, 'reset_state'):
                trainer.model.reset_state()
            trainer.rollouts.after_update()

        # ===== Reset opponent if in tournament mode =====
        if tournament and iteration % config.num_steps == 0:
            # Randomly choose one agent in each iteration
            envs.reset_opponent()

        # ===== Evaluate Current Policy =====
        if eval_envs is not None and iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths = evaluate(
                trainer, eval_envs, 20)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            evaluate_stat.update(
                dict(win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) /
                    len(evaluate_rewards)),
                     evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(
                    policy_loss=policy_loss,
                    entropy=dist_entropy,
                    value_loss=value_loss,
                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            if tournament:
                stats["opponent"] = envs.current_agent_name

            progress.append(stats)
            if clear_output is not None:
                clear_output()
            pretty_print({
                "===== {} Training Iteration {} =====".format(
                    algo, iteration): stats
            })
            progress_path = save_progress(log_dir, progress)

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir,
                                          "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        if total_steps > max_steps:
            break

        iteration += 1