def main():
    args, lr_args, log_dir, preprocess_wrapper = parse_args()

    easy_tf_log.set_dir(log_dir)
    utils_tensorflow.set_random_seeds(args.seed)
    sess = tf.Session()

    envs = make_envs(args.env_id, preprocess_wrapper, args.max_n_noops, args.n_workers,
                     args.seed, args.debug, log_dir)

    step_counter = utils.TensorFlowCounter(sess)
    update_counter = utils.TensorFlowCounter(sess)
    lr = make_lr(lr_args, step_counter.value)
    optimizer = make_optimizer(lr)

    networks = make_networks(n_workers=args.n_workers,
                             obs_shape=envs[0].observation_space.shape,
                             n_actions=envs[0].action_space.n,
                             value_loss_coef=args.value_loss_coef,
                             entropy_bonus=args.entropy_bonus,
                             max_grad_norm=args.max_grad_norm,
                             optimizer=optimizer,
                             detailed_logs=args.detailed_logs,
                             debug=args.debug)

    global_vars = tf.trainable_variables('global')

    # Why save_relative_paths=True?
    # So that the plain-text 'checkpoint' file written uses relative paths, so that we can restore
    # from checkpoints created on another machine.
    saver = tf.train.Saver(global_vars, max_to_keep=1, save_relative_paths=True)

    if args.load_ckpt:
        print("Restoring from checkpoint '{}'...".format(args.load_ckpt), end='', flush=True)
        saver.restore(sess, args.load_ckpt)
        print("done!")
    else:
        sess.run(tf.global_variables_initializer())

    workers = make_workers(sess, envs, networks, args.n_workers, log_dir)

    worker_threads = start_worker_threads(workers, args.n_steps, args.steps_per_update,
                                          step_counter, update_counter)

    run_manager(worker_threads, sess, lr, step_counter, update_counter, log_dir, saver,
                args.manager_wake_interval_seconds, args.ckpt_interval_seconds)

    for env in envs:
        env.close()
def main():
    args, lr_args, log_dir, preprocess_wrapper = parse_args()  # parse_args() is imported from params

    easy_tf_log.set_dir(log_dir)  # set the log directory used by easy_tf_log
    utils_tensorflow.set_random_seeds(args.seed)  # seed the random number generators

    # A class for running TensorFlow operations. A Session object encapsulates the environment in
    # which Operation objects are executed and Tensor objects are evaluated.
    sess = tf.Session()

    envs = make_envs(args.env_id, preprocess_wrapper, args.max_n_noops, args.n_workers,
                     args.seed, args.debug, log_dir)

    step_counter = utils.TensorFlowCounter(sess)
    update_counter = utils.TensorFlowCounter(sess)
    lr = make_lr(lr_args, step_counter.value)
    optimizer = make_optimizer(lr)

    # Create the set of networks, one per worker thread
    networks = make_networks(n_workers=args.n_workers,
                             obs_shape=envs[0].observation_space.shape,
                             n_actions=envs[0].action_space.n,
                             value_loss_coef=args.value_loss_coef,
                             entropy_bonus=args.entropy_bonus,
                             max_grad_norm=args.max_grad_norm,
                             optimizer=optimizer,
                             detailed_logs=args.detailed_logs,
                             debug=args.debug)

    # Returns all variables created with trainable=True.
    # scope: (Optional.) A string. If supplied, the resulting list is filtered to include only
    # items whose name attribute matches scope using re.match.
    global_vars = tf.trainable_variables('global')

    # Why save_relative_paths=True?
    # So that the plain-text 'checkpoint' file uses relative paths,
    # so that we can restore from checkpoints created on another machine.
    saver = tf.train.Saver(global_vars, max_to_keep=1, save_relative_paths=True)

    # If there is a checkpoint to load, restore it and continue from where training stopped;
    # otherwise initialise everything from scratch.
    if args.load_ckpt:
        print("Restoring from checkpoint '{}'...".format(args.load_ckpt), end='', flush=True)
        saver.restore(sess, args.load_ckpt)  # restore (load) the session from the specified checkpoint
        print("done!")
    else:
        sess.run(tf.global_variables_initializer())

    # Create the workers
    workers = make_workers(sess, envs, networks, args.n_workers, log_dir)

    # Start one thread for each worker created
    worker_threads = start_worker_threads(workers, args.n_steps, args.steps_per_update,
                                          step_counter, update_counter)

    # Execution manager for the worker threads
    run_manager(worker_threads, sess, lr, step_counter, update_counter, log_dir, saver,
                args.manager_wake_interval_seconds, args.ckpt_interval_seconds)

    for env in envs:
        env.close()
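# make_lr() above builds a learning-rate tensor from lr_args and the global step counter; its real
# definition lives elsewhere in this repository and is not shown here. The function below is only a
# hypothetical sketch of one plausible schedule (linear decay to zero over a fixed number of steps);
# the name, signature, and schedule are assumptions for illustration, not the actual implementation.
def make_lr_linear_decay_sketch(initial_lr, total_steps, step_tensor):
    # Fraction of training completed, derived from the shared step counter tensor.
    frac_done = tf.cast(step_tensor, tf.float32) / float(total_steps)
    # Anneal linearly from initial_lr down to 0, never going negative.
    return tf.maximum(initial_lr * (1.0 - frac_done), 0.0)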
def train(args):
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must be in [PPO]")
    config.num_envs = args.num_envs

    if args.envopt is not None:
        f = open(args.envopt)
        env_options = json.load(f)
    if args.trainopt is not None:
        f = open(args.trainopt)
        trainer_options = json.load(f)
    if args.opt is not None:
        opt = json.load(open(args.opt))
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        options=env_options,
    )

    if env_id == "Walker2d-v3":
        healthy_z_range = (0.8, 2.0)
    elif env_id == 'Humanoid-v3':
        healthy_z_range = (1.0, 2.0)
    if 'healthy_z_range' in env_options:
        healthy_z_range = env_options['healthy_z_range']
    eval_env = gym.make(env_id, healthy_z_range=healthy_z_range, healthy_reward=0)
    if env_id == "Walker2d-v3":
        eval_env = Walker2d_wrapper(eval_env, env_options)

    obs_dim = envs.observation_space.shape[0]
    act_dim = envs.action_space.shape[0]
    real_obs_dim = obs_dim
    real_act_dim = act_dim
    if 'real_obs_dim' in trainer_options:
        real_obs_dim = trainer_options['real_obs_dim']
    if 'real_act_dim' in trainer_options:
        real_act_dim = trainer_options['real_act_dim']
    dim_dict = dict(obs_dim=obs_dim,
                    act_dim=act_dim,
                    real_obs_dim=real_obs_dim,
                    real_act_dim=real_act_dim)

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, trainer_options)
    else:
        raise NotImplementedError

    # Create a placeholder tensor to help stack frames in the 2nd dimension,
    # that is, turn the observation from shape [num_envs, 1, 84, 84] to
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(num_envs, envs.observation_space.shape, config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=np.float)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(
        reduce_shape(frame_stack_tensor.get(), real_obs_dim))

    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                trainer.model.eval()
                values, actions, action_log_prob = trainer.model.step(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim))
                cpu_actions = actions.cpu().numpy()
                cpu_actions = enlarge_shape(cpu_actions, act_dim)

                obs, reward, done, info, masks, total_episodes, \
                    total_steps, episode_rewards = step_envs(
                        cpu_actions, envs, episode_rewards, frame_stack_tensor,
                        reward_recorder, episode_length_recorder, total_steps,
                        total_episodes, config.device)

                rewards = torch.from_numpy(reward.astype(np.float32)).view(
                    -1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim),
                    actions, action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)
            trainer.model.train()

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, total_loss = trainer.update(
                trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder, "episode_reward"),
                training_episode_length=summary(episode_length_recorder, "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(policy_loss=policy_loss,
                                    value_loss=value_loss,
                                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg + process_timer.avg + update_timer.avg),
                iteration=iteration)
            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
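# reduce_shape() and enlarge_shape() are used above to convert between the environment's full
# observation/action sizes and the reduced sizes the policy works with; their real definitions are
# not shown in this file. The functions below are only a hypothetical sketch of one way they could
# behave (truncate observations, zero-pad actions), included purely to illustrate the data flow.
import numpy as np


def reduce_shape_sketch(obs_tensor, real_obs_dim):
    # Keep only the first real_obs_dim features of each observation (assumption).
    return obs_tensor[..., :real_obs_dim]


def enlarge_shape_sketch(cpu_actions, act_dim):
    # Zero-pad each action vector up to the full action dimension (assumption).
    padded = np.zeros((cpu_actions.shape[0], act_dim), dtype=cpu_actions.dtype)
    padded[:, :cpu_actions.shape[1]] = cpu_actions
    return padded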
def train(args):
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must be in [PPO]")
    config.num_envs = args.num_envs
    config.activation = nn.ReLU

    if args.trainopt is not None:
        f = open(args.trainopt)
        trainer_options = json.load(f)
    if args.opt is not None:
        opt = json.load(open(args.opt))
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    main_envs = make_envs(
        env_id='Humanoid-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )
    aux_envs = make_envs(
        env_id='Walker2d-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )
    envs = [main_envs, aux_envs]

    # eval_env is the main env
    healthy_z_range = (1.0, 2.0)
    eval_env = gym.make(env_id, healthy_z_range=healthy_z_range, healthy_reward=0)

    main_obs_dim = 376
    main_act_dim = 17
    main_reduce_obs_dim = 46
    main_reduce_act_dim = 11
    aux_obs_dim = 17
    aux_act_dim = 6

    obs_dims = [main_reduce_obs_dim, aux_obs_dim]
    act_dims = [main_act_dim, aux_act_dim]

    dim_dict = dict(obs_a=main_reduce_obs_dim,
                    act_a=main_reduce_act_dim,
                    obs_b=aux_obs_dim,
                    act_b=aux_act_dim,
                    coeff_a=0.4,
                    coeff_b=1)
    dim_dict['act_dim'] = 17
    dim_dict['real_obs_dim'] = 46

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainerMTMT(config, dim_dict)
    else:
        raise NotImplementedError

    frame_stack_tensors = [
        FrameStackTensor(num_envs, main_envs.observation_space.shape, config.device),
        FrameStackTensor(num_envs, aux_envs.observation_space.shape, config.device)
    ]

    # Setup some stats helpers
    episode_rewards = [
        np.zeros([num_envs, 1], dtype=np.float),
        np.zeros([num_envs, 1], dtype=np.float)
    ]
    total_episodes = total_steps = iteration = 0
    reward_recorders = [deque(maxlen=100), deque(maxlen=100)]
    episode_length_recorders = [deque(maxlen=100), deque(maxlen=100)]
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = [envs[i].reset() for i in range(2)]
    _ = [frame_stack_tensors[i].update(obs[i]) for i in range(2)]

    # First update
    for i in range(2):
        trainer.rollouts[i].observations[0].copy_(
            reduce_shape(frame_stack_tensors[i].get(), obs_dims[i]))

    branch_names = ['a', 'b']

    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            # Collect a rollout for each branch in turn
            for ind in range(2):
                for index in range(config.num_steps):
                    trainer.model.eval()
                    values, actions, action_log_prob = trainer.model.step(
                        reduce_shape(frame_stack_tensors[ind].get(), obs_dims[ind]),
                        deterministic=False,
                        branch=branch_names[ind])
                    cpu_actions = actions.cpu().numpy()
                    cpu_actions = enlarge_shape(cpu_actions, act_dims[ind])

                    # obs, done, info not needed; masks & obs live in frame_stack_tensors
                    _, reward, _, _, masks, new_total_episodes, new_total_steps, episode_rewards[ind] = \
                        step_envs(cpu_actions, envs[ind], episode_rewards[ind],
                                  frame_stack_tensors[ind], reward_recorders[ind],
                                  episode_length_recorders[ind], total_steps,
                                  total_episodes, config.device)

                    if ind == 0:
                        total_episodes = new_total_episodes
                        total_steps = new_total_steps

                    rewards = torch.from_numpy(reward.astype(np.float32)).view(
                        -1, 1).to(config.device)

                    trainer.rollouts[ind].insert(
                        reduce_shape(frame_stack_tensors[ind].get(), obs_dims[ind]),
                        actions, action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                for i in range(2):
                    next_value = trainer.compute_values(
                        trainer.rollouts[i].observations[-1], branch_names[i])
                    trainer.rollouts[i].compute_returns(next_value, config.GAMMA)
            trainer.model.train()

        # ===== Update Policy =====
        with update_timer:
            losses = trainer.update(trainer.rollouts[0], trainer.rollouts[1])
            policy_loss, value_loss, total_loss = list(zip(*losses))
            trainer.rollouts[0].after_update()
            trainer.rollouts[1].after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            # By default the model evaluates the main task (branch 'a')
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward_a=summary(reward_recorders[0], "episode_reward"),
                training_episode_length_a=summary(episode_length_recorders[0], "episode_length"),
                training_episode_reward_b=summary(reward_recorders[1], "episode_reward"),
                training_episode_length_b=summary(episode_length_recorders[1], "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats_a=dict(policy_loss=policy_loss[0],
                                      value_loss=value_loss[0],
                                      total_loss=total_loss[0]),
                learning_stats_b=dict(policy_loss=policy_loss[1],
                                      value_loss=value_loss[1],
                                      total_loss=total_loss[1]),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg + process_timer.avg + update_timer.avg),
                iteration=iteration)
            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    # envs is a list of two vectorized environments, so close each one individually
    for env in envs:
        env.close()
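# dim_dict above carries coeff_a=0.4 and coeff_b=1, which suggests the two branch losses are mixed
# with fixed weights somewhere inside PPOTrainerMTMT.update(). That class is not shown in this file;
# the function below is only a hypothetical sketch of such a weighted combination under that
# assumption, not the trainer's actual update logic.
def combine_branch_losses_sketch(loss_a, loss_b, coeff_a=0.4, coeff_b=1.0):
    # Weighted sum of the per-branch losses before a shared backward pass (assumption).
    return coeff_a * loss_a + coeff_b * loss_b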
def train(cfg):
    print('Start to train!\n')
    envs = make_envs(num_envs=16, env_name="CartPole-v0")
    state_dim = envs.observation_space.shape[0]
    action_dim = envs.action_space.n
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = A2C(state_dim, action_dim, hidden_dim=256)
    # moving_average_rewards = []
    # ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)

    state = envs.reset()
    for i_episode in range(1, cfg.train_eps + 1):
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0
        for i_step in range(1, cfg.train_steps + 1):
            state = torch.FloatTensor(state).to(device)
            dist, value = agent.model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            state = next_state
            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        if i_episode % 20 == 0:
            print("reward", test_env(agent, device='cpu'))

        # Bootstrap from the value of the last state, then compute returns
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = agent.model(next_state)
        returns = agent.compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)
        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        agent.optimizer.zero_grad()
        loss.backward()
        agent.optimizer.step()

    for _ in range(100):
        print("test_reward", test_env(agent, device='cpu'))

    # print('Episode:', i_episode, ' Reward: %i' %
    #       int(ep_reward[0]), 'n_steps:', i_step)
    # ep_steps.append(i_step)
    # rewards.append(ep_reward)
    # if i_episode == 1:
    #     moving_average_rewards.append(ep_reward[0])
    # else:
    #     moving_average_rewards.append(
    #         0.9 * moving_average_rewards[-1] + 0.1 * ep_reward[0])
    # writer.add_scalars('rewards', {'raw': rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
    # writer.add_scalar('steps_of_each_episode',
    #                   ep_steps[-1], i_episode)

    writer.close()
    print('Complete training!')
    '''Save the model'''
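# agent.compute_returns() is not shown above. For A2C it is conventionally the bootstrapped
# discounted return R_t = r_t + gamma * mask_t * R_{t+1}, computed backwards from the value of the
# last state. The function below is a minimal sketch under that assumption; the name and the gamma
# parameter are illustrative, not the agent's actual implementation.
def compute_returns_sketch(next_value, rewards, masks, gamma=0.99):
    returns = []
    R = next_value
    # Walk the rollout backwards, zeroing the bootstrap term wherever an episode ended (mask == 0).
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * masks[step] * R
        returns.insert(0, R)
    return returns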