def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    experiment_name = args.env_name + '-' + args.algo + '-' + \
        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")
    log_dir, eval_log_dir, save_dir = setup_dirs(
        experiment_name, args.log_dir, args.save_dir)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, log_dir, args.add_timestep, device, False,
                         frame_skip=args.frame_skip)

    if args.load_path:
        actor_critic, _ob_rms = torch.load(args.load_path)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.train()
            vec_norm.ob_rms = _ob_rms
        actor_critic.train()
    else:
        actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                              beta=args.beta_dist,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               lr_schedule=args.lr_schedule, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr,
                         lr_schedule=args.lr_schedule, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         beta=args.sil_beta,
                         value_loss_coef=args.sil_value_loss_coef,
                         entropy_coef=args.sil_entropy_coef)
        replay = ReplayStorage(
            10000,
            num_processes=args.num_processes,
            gamma=args.gamma,
            prio_alpha=args.sil_alpha,
            obs_shape=envs.observation_space.shape,
            action_space=envs.action_space,
            recurrent_hidden_state_size=actor_critic.recurrent_hidden_state_size,
            device=device)
    else:
        replay = None

    action_high = torch.from_numpy(envs.action_space.high).to(device)
    action_low = torch.from_numpy(envs.action_space.low).to(device)
    action_mid = 0.5 * (action_high + action_low)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    benchmark_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            if args.clip_action and isinstance(envs.action_space, gym.spaces.Box):
                clipped_action = action.clone()
                if args.shift_action:
                    # FIXME experimenting with this, so far resulting in
                    # faster learning when clipping gaussian continuous
                    # output (vs leaving centred at 0 and unscaled)
                    clipped_action = 0.5 * clipped_action + action_mid
                clipped_action = torch.max(
                    torch.min(clipped_action, action_high), action_low)
            else:
                clipped_action = action

            # act in environment and observe
            obs, reward, done, infos = envs.step(clipped_action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    if 'rb' in info['episode']:
                        benchmark_rewards.append(info['episode']['rb'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step],
                              action, reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts, j, replay)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        train_eprew = np.mean(episode_rewards)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n"
                " Last {} episodes: mean/med {:.1f}/{:.1f}, min/max reward {:.2f}/{:.2f}, "
                "entropy {:.5f}, value/action loss {:.5f}/{:.5f}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), train_eprew,
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), dist_entropy, value_loss,
                    action_loss),
                end='')
            if len(benchmark_rewards):
                print(", benchmark {:.1f}/{:.1f}, {:.1f}/{:.1f}".format(
                    np.mean(benchmark_rewards), np.median(benchmark_rewards),
                    np.min(benchmark_rewards), np.max(benchmark_rewards)),
                    end='')
            print()

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                clipped_action = action
                if args.clip_action and isinstance(envs.action_space, gym.spaces.Box):
                    if args.shift_action:
                        clipped_action = 0.5 * clipped_action + action_mid
                    clipped_action = torch.max(
                        torch.min(clipped_action, action_high), action_low)

                obs, reward, done, infos = eval_envs.step(clipped_action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            eval_eprew = np.mean(eval_episode_rewards)
            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), eval_eprew))

        if len(episode_rewards) and j % args.save_interval == 0 and save_dir != "":
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]
            ep_rewstr = ("%d" % train_eprew).replace("-", "n")
            save_filename = os.path.join(
                save_dir, 'checkpoint-%d-%s.pt' % (j, ep_rewstr))
            torch.save(save_model, save_filename)

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, args.env_name, args.algo,
                                  args.num_frames)
            except IOError:
                pass
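
# Illustration only: the shift-and-clip transform applied to continuous actions
# in the rollout and evaluation loops above, pulled out into a standalone
# helper. The name `shift_and_clip` is hypothetical (not part of this
# codebase); it simply mirrors the inline math of halving the raw policy
# sample, re-centring it on the midpoint of the Box action range, and clamping
# element-wise to the bounds.
import torch

def shift_and_clip(action, action_low, action_high):
    # Midpoint of the action range, e.g. 0.0 for a symmetric [-1, 1] Box.
    action_mid = 0.5 * (action_low + action_high)
    # Halve and re-centre the sample, then clamp to [low, high].
    shifted = 0.5 * action + action_mid
    return torch.max(torch.min(shifted, action_high), action_low)

# Example: a raw sample of 3.0 against a [-1, 1] Box maps to
# 0.5 * 3.0 + 0.0 = 1.5, which then clips to 1.0.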
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    train_envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                               args.gamma, args.no_norm, args.num_stack,
                               args.log_dir, args.add_timestep, device,
                               allow_early_resets=False)

    if args.eval_interval:
        eval_seed = args.seed if args.seed is None else args.seed + args.num_processes
        eval_envs = make_vec_envs(args.env_name, eval_seed,
                                  args.num_processes // 4, args.gamma,
                                  args.no_norm, args.num_stack, eval_log_dir,
                                  args.add_timestep, device=device,
                                  allow_early_resets=True, eval=True,
                                  rank_offset=args.num_processes)
        if eval_envs.venv.__class__.__name__ == "VecNormalize":
            eval_envs.venv.ob_rms = train_envs.venv.ob_rms
    else:
        eval_envs = None

    print(train_envs.observation_space.shape)

    noisy_net = True

    actor_critic = create_policy(
        train_envs.observation_space,
        train_envs.action_space,
        name='basic',
        nn_kwargs={
            #'batch_norm': False if args.algo == 'acktr' else True,
            'recurrent': 'lstm' if args.recurrent_policy else '',
            'hidden_size': 512,
        },
        noisy_net=noisy_net,
        train=True)

    if args.resume and os.path.isfile(args.resume):
        print('Resuming from checkpoint (%s)' % args.resume)
        state_dict, ob_rms = torch.load(args.resume, map_location='cpu')
        actor_critic.load_state_dict(state_dict)

    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               lr_schedule=lr_update_schedule, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr,
                         lr_schedule=lr_update_schedule, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         value_loss_coef=args.sil_value_loss_coef or args.value_loss_coef,
                         entropy_coef=args.sil_entropy_coef or args.entropy_coef)
        replay = ReplayStorage(1e5, args.num_processes, args.gamma, 0.1,
                               train_envs.observation_space.shape,
                               train_envs.action_space,
                               actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              train_envs.observation_space.shape,
                              train_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = train_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        if noisy_net:
            actor_critic.reset_noise()

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = train_envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.tensor([[0.0] if done_ else [1.0]
                                  for done_ in done], device=device)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step],
                              action, reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy, other_metrics = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model.state_dict(),
                hasattr(train_envs.venv, 'ob_rms') and train_envs.venv.ob_rms or None
            ]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * update_factor

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {}, last {} mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value/action loss {:.5f}/{:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end=', ' if other_metrics else '\n')
            if 'sil_value_loss' in other_metrics:
                print("SIL value/action loss {:.1f}/{:.1f}.".format(
                    other_metrics['sil_value_loss'],
                    other_metrics['sil_action_loss']))

        if args.eval_interval and len(episode_rewards) > 1 and j > 0 and \
                j % args.eval_interval == 0:
            actor_critic.eval()

            eval_episode_rewards = []
            num_eval_processes = args.num_processes // 4

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                2, num_eval_processes,
                actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(num_eval_processes, 1, device=device)

            while len(eval_episode_rewards) < 50:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done], device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

            actor_critic.train()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
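
# Sketch (not part of this repo): how a reset_noise() hook like the one called
# at the top of each update above is commonly wired up for NoisyNet-style
# exploration. The NoisyLinear class below is a deliberately minimal,
# hypothetical stand-in that resamples a Gaussian weight-noise buffer once per
# update; the real layers behind create_policy(..., noisy_net=True) may differ.
import torch
import torch.nn as nn

class NoisyLinear(nn.Linear):
    def __init__(self, in_features, out_features, sigma0=0.5):
        super().__init__(in_features, out_features)
        # Learnable noise scale plus a non-learnable noise sample buffer.
        self.sigma_weight = nn.Parameter(
            torch.full((out_features, in_features), sigma0 / in_features ** 0.5))
        self.register_buffer('eps_weight', torch.zeros(out_features, in_features))
        self.reset_noise()

    def reset_noise(self):
        # Resample the perturbation used for this update's forward passes.
        self.eps_weight.normal_()

    def forward(self, x):
        return nn.functional.linear(
            x, self.weight + self.sigma_weight * self.eps_weight, self.bias)

def reset_noise(model):
    # Walk the policy and resample noise in every noisy layer, mirroring what
    # actor_critic.reset_noise() is expected to do in the loop above.
    for module in model.modules():
        if isinstance(module, NoisyLinear):
            module.reset_noise()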
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from torch.utils.tensorboard import SummaryWriter
        save_path = os.path.join(args.save_dir, args.algo)
        try:
            os.makedirs(save_path)
        except OSError:
            pass
        viz = SummaryWriter(
            os.path.join(
                save_path,
                args.env_name + time.strftime("_%d_%b_%H_%M", time.localtime())))

    train_envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                               args.gamma, args.no_norm, args.num_stack,
                               args.log_dir, args.add_timestep, device,
                               allow_early_resets=False)

    if args.eval_interval:
        eval_envs = make_vec_envs(args.env_name,
                                  args.seed + args.num_processes,
                                  args.num_processes, args.gamma, args.no_norm,
                                  args.num_stack, eval_log_dir,
                                  args.add_timestep, device,
                                  allow_early_resets=True, eval=True)
        if eval_envs.venv.__class__.__name__ == "VecNormalize":
            eval_envs.venv.ob_rms = train_envs.venv.ob_rms
    else:
        eval_envs = None

    # FIXME this is very specific to Pommerman env right now
    actor_critic = create_policy(
        train_envs.observation_space,
        train_envs.action_space,
        name='pomm',
        nn_kwargs={
            'batch_norm': False if args.algo == 'acktr' else True,
            'recurrent': args.recurrent_policy,
            'hidden_size': 512,
        },
        train=True)

    if args.load_path != '':
        # Checkpoints are saved below as [state_dict, ob_rms], so load the
        # state dict back out before restoring the policy weights.
        state_dict, ob_rms = torch.load(args.load_path, map_location='cpu')
        actor_critic.load_state_dict(state_dict)

    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               lr_schedule=lr_update_schedule, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr,
                         lr_schedule=lr_update_schedule, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         value_loss_coef=args.sil_value_loss_coef or args.value_loss_coef,
                         entropy_coef=args.sil_entropy_coef or args.entropy_coef)
        replay = ReplayStorage(5e5, args.num_processes, args.gamma, 0.1,
                               train_envs.observation_space.shape,
                               train_envs.action_space,
                               actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              train_envs.observation_space.shape,
                              train_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = train_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = train_envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.tensor([[0.0] if done_ else [1.0]
                                  for done_ in done], device=device)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step],
                              action, reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy, other_metrics = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model.state_dict(),
                hasattr(train_envs.venv, 'ob_rms') and train_envs.venv.ob_rms or None
            ]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * update_factor

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {}, last {} mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value/action loss {:.5f}/{:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end=', ' if other_metrics else '\n')
            if 'sil_value_loss' in other_metrics:
                print("SIL value/action loss {:.1f}/{:.1f}.".format(
                    other_metrics['sil_value_loss'],
                    other_metrics['sil_action_loss']))

            if args.vis:
                viz.add_scalar('episode_rewards/mean',
                               np.mean(episode_rewards), total_num_steps)
                viz.add_scalar('episode_rewards/median',
                               np.median(episode_rewards), total_num_steps)
                viz.add_scalar('episode_rewards/min',
                               np.min(episode_rewards), total_num_steps)
                viz.add_scalar('episode_rewards/max',
                               np.max(episode_rewards), total_num_steps)
                viz.add_scalar('train/value_loss', value_loss, total_num_steps)
                viz.add_scalar('train/action_loss', action_loss, total_num_steps)
                viz.add_scalar('train/dist_entropy', dist_entropy, total_num_steps)

        if args.eval_interval and len(episode_rewards) > 1 and j > 0 and \
                j % args.eval_interval == 0:
            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 50:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done], device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
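
# Usage note (sketch): the SummaryWriter above groups scalars under
# 'episode_rewards/*' and 'train/*', so a single TensorBoard run shows reward
# statistics and loss curves side by side. A minimal standalone example of the
# same pattern, assuming a hypothetical event directory 'runs/example':
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/example')
for step, (mean_reward, value_loss) in enumerate([(1.0, 0.5), (2.0, 0.4)]):
    writer.add_scalar('episode_rewards/mean', mean_reward, step)
    writer.add_scalar('train/value_loss', value_loss, step)
writer.close()
# View with: tensorboard --logdir runs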
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    # The network
    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         beta=args.sil_beta,
                         value_loss_coef=args.sil_value_loss_coef,
                         entropy_coef=args.sil_entropy_coef)
        replay = ReplayStorage(
            10000,
            num_processes=args.num_processes,
            gamma=args.gamma,
            prio_alpha=args.sil_alpha,
            obs_shape=envs.observation_space.shape,
            action_space=envs.action_space,
            recurrent_hidden_state_size=actor_critic.recurrent_hidden_state_size,
            device=device)
    else:
        replay = None

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step],
                              action, reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts, j, replay)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n"
                " Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, "
                "value/action loss {:.5f}/{:.5f}\n".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), dist_entropy, value_loss,
                    action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
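
# Sketch of the mask convention used by every loop above: a mask of 0.0 marks a
# step where the episode ended, so bootstrapped returns (and recurrent hidden
# states) are cut at episode boundaries. Below is a minimal standalone version
# of the non-GAE return recursion, assuming rewards and masks are aligned so
# masks[t] is 0.0 when the episode finished at step t, with shapes
# [num_steps, num_processes, 1] as in RolloutStorage; the real
# compute_returns() also supports GAE.
import torch

def discounted_returns(rewards, masks, next_value, gamma):
    # returns has one extra time slot to hold the bootstrap value.
    returns = torch.zeros(rewards.size(0) + 1, *rewards.shape[1:])
    returns[-1] = next_value
    for t in reversed(range(rewards.size(0))):
        # A zero mask severs the bootstrap across an episode boundary.
        returns[t] = rewards[t] + gamma * returns[t + 1] * masks[t]
    return returns[:-1]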