def main(): ARGUMENTS.update(vars(args)) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_lr_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n". 
format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) ALL_UPDATES.append(j) ALL_TIMESTEPS.append(total_num_steps) ALL_FPS.append(int(total_num_steps / (end - start))) ALL_MEAN_REWARDS.append(np.mean(episode_rewards)) ALL_MEDIAN_REWARDS.append(np.median(episode_rewards)) ALL_MIN_REWARDS.append(np.min(episode_rewards)) ALL_MAX_REWARDS.append(np.max(episode_rewards)) ALL_DIST_ENTROPY.append(dist_entropy) ALL_VALUE_LOSS.append(value_loss) ALL_ACTION_LOSS.append(action_loss) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs( args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros(args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n". format(len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_env_steps) except IOError: pass # Save the results name = ARGUMENTS['env_name'] + '-' + ARGUMENTS['algo'] + '-' + ARGUMENTS['experiment'] + '-grad_noise' + str(ARGUMENTS['gradient_noise']) experiment = ro.Experiment(name, directory='results') data = { 'updates': ALL_UPDATES, 'timesteps': ALL_TIMESTEPS, 'fps': ALL_FPS, 'mean_rewards': ALL_MEAN_REWARDS, 'median_rewards': ALL_MEDIAN_REWARDS, 'min_rewards': ALL_MIN_REWARDS, 'max_rewards': ALL_MAX_REWARDS, 'dist_entropy': ALL_DIST_ENTROPY, 'value_loss': ALL_VALUE_LOSS, 'action_loss': ALL_ACTION_LOSS, } data.update(ARGUMENTS) result = data['mean_rewards'][-1] experiment.add_result(result, data)
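
# ---------------------------------------------------------------------------
# The update loop above anneals the learning rate once per update through
# update_linear_schedule(optimizer, j, num_updates, initial_lr). As a point of
# reference, the sketch below shows the linear decay that helper is assumed to
# implement in this family of scripts; the project's actual utility may differ
# in details.
import torch


def linear_lr_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    """Decay the optimizer's learning rate linearly from initial_lr towards 0."""
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr


# Usage (sketch):
# opt = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)
# for j in range(num_updates):
#     linear_lr_schedule(opt, j, num_updates, args.lr)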
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    if args.load_policy is not None:
        # Resume from a saved (policy, observation-normalization) pair.
        actor_critic, ob_rms = torch.load(args.load_policy)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(
        maxlen=(args.num_processes if args.num_processes > 10 else 10))

    start = time.time()
    snapshot_counter = 0
    last_delete = -1
    try:
        os.makedirs(os.path.join(args.save_dir, args.algo))
    except OSError:
        pass
    log_out_file = open(
        os.path.join(args.save_dir, args.algo, 'log_info.txt'), 'w')

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(save_path,
                             args.env_name + "epoch_{:07d}.pt".format(j)))
            snapshot_counter += 1
            last_delete += 1
            if snapshot_counter > 100:
                # Remove the oldest ".pt" snapshot so at most 100 are kept on
                # disk. Note the epoch index only matches the saved filenames
                # when save_interval == 1.
                try:
                    os.remove(
                        os.path.join(
                            save_path,
                            args.env_name + 'epoch_{:07d}.pt'.format(last_delete)))
                except OSError:
                    pass
                snapshot_counter -= 1

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            log_info = "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".format(
                j, total_num_steps,
                int(total_num_steps / (end - start)),
                len(episode_rewards), np.mean(episode_rewards),
                np.median(episode_rewards), np.min(episode_rewards),
                np.max(episode_rewards), dist_entropy, value_loss, action_loss)
            print(log_info)
            sys.stdout.flush()
            log_out_file.write(log_info)
            log_out_file.flush()

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            log_out_file.write(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            log_out_file.flush()
            sys.stdout.flush()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
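
# ---------------------------------------------------------------------------
# The save block above keeps a bounded number of "epoch_XXXXXXX.pt" snapshots
# by tracking counters and deleting by index, which only lines up with the
# files on disk when save_interval == 1. Below is a small, hypothetical helper
# (not part of the original script) that rotates snapshots by remembering the
# paths it actually wrote.
import os
from collections import deque

import torch


class SnapshotRotator:
    """Save checkpoints and keep only the newest `keep` files on disk."""

    def __init__(self, keep=100):
        self.keep = keep
        self.paths = deque()

    def save(self, obj, path):
        torch.save(obj, path)
        self.paths.append(path)
        while len(self.paths) > self.keep:
            oldest = self.paths.popleft()
            try:
                os.remove(oldest)
            except OSError:
                pass  # file already gone; nothing to do


# Usage (sketch):
# rotator = SnapshotRotator(keep=100)
# rotator.save([save_model, ob_rms],
#              os.path.join(save_path, "epoch_{:07d}.pt".format(j)))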
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args_iko.cuda else "cpu") if args_iko.vis: from visdom import Visdom viz = Visdom(port=args_iko.port) win = None envs = make_vec_envs(args_iko.env_name, args_iko.seed, args_iko.num_processes, args_iko.gamma, args_iko.log_dir, args_iko.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args_iko.recurrent_policy}) actor_critic.to(device) action_shape = 3 reward_model = RewardModel(11 * 11 * 6, 1, 64, 64) reward_model.to(device) if args_iko.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args_iko.value_loss_coef, args_iko.entropy_coef, lr=args_iko.lr, eps=args_iko.eps, alpha=args_iko.alpha, max_grad_norm=args_iko.max_grad_norm) elif args_iko.algo == 'ppo': agent = algo.PPO(actor_critic, args_iko.clip_param, args_iko.ppo_epoch, args_iko.num_mini_batch, args_iko.value_loss_coef, args_iko.entropy_coef, args_iko.use_singh, reward_model, lr=args_iko.lr, eps=args_iko.eps, max_grad_norm=args_iko.max_grad_norm) elif args_iko.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args_iko.value_loss_coef, args_iko.entropy_coef, acktr=True) rollouts = RolloutStorage(args_iko.num_steps, args_iko.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): if args_iko.use_linear_lr_decay: # decrease learning rate linearly if args_iko.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args_iko.lr) if args_iko.algo == 'ppo' and args_iko.use_linear_clip_decay: agent.clip_param = args_iko.clip_param * (1 - j / float(num_updates)) reward_train = [] reward_block_penalty = [] reward_bel_gt = [] reward_bel_gt_nonlog = [] reward_infogain = [] reward_bel_ent = [] reward_hit = [] reward_dist = [] reward_inv_dist = [] for step in range(args_iko.num_steps): # Sample actions # print(step, args_iko.num_steps) with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) reward_train.append(reward) # print("infos is ", infos) # reward_b.append(infos[0]['auxillary_reward']) # print("infos is ",infos[0]['auxillary_reward']) reward_block_penalty.append(infos[0]['reward_block_penalty']) reward_bel_gt.append(infos[0]['reward_bel_gt']) reward_bel_gt_nonlog.append(infos[0]['reward_bel_gt_nonlog']) reward_infogain.append(infos[0]['reward_infogain']) reward_bel_ent.append(infos[0]['reward_bel_ent']) reward_hit.append(infos[0]['reward_hit']) reward_dist.append(infos[0]['reward_dist']) reward_inv_dist.append(infos[0]['reward_inv_dist']) # print(reward) reward.to(device) reward_model.to(device) if args_iko.use_singh: # print("using learning IR") my_reward = reward_model(obs.clone().to(device), action.clone().float()).detach() my_reward.to(device) reward = reward + args_iko.singh_coef * my_reward.type( torch.FloatTensor) # for info in infos: # if 'episode' in info.keys(): # episode_rewards.append(info['episode']['r']) # print("infos is ",infos[0]['auxillary_reward']) # print("info is",info['episode']['r'] ) # If done then clean the history of 
observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) # print("mean reward_a", np.mean(reward_train)) # print("mean reward_block_penalty", np.mean(reward_block_penalty)) # print("mean reward_bel_gt", np.mean(reward_bel_gt)) # print("mean reward_bel_gt_nonlog", np.mean(reward_bel_gt_nonlog)) # print("mean reward_infogain", np.mean(reward_infogain)) # print("mean reward_bel_ent", np.mean(reward_bel_ent)) # print("mean reward_hit", np.mean(reward_hit)) # print("mean reward_dist", np.mean(reward_dist)) # print("mean reward_inv_dist", np.mean(reward_inv_dist)) total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps writer.add_scalar('mean_reward_train', np.mean(reward_train), total_num_steps) writer.add_scalar('mean_reward_block_penalty', np.mean(reward_block_penalty), total_num_steps) writer.add_scalar('mean_reward_bel_gt', np.mean(reward_bel_gt), total_num_steps) writer.add_scalar('mean_reward_bel_gt_nonlog', np.mean(reward_bel_gt_nonlog), total_num_steps) writer.add_scalar('mean_reward_infogain', np.mean(reward_infogain), total_num_steps) writer.add_scalar('mean_reward_bel_ent', np.mean(reward_bel_ent), total_num_steps) writer.add_scalar('mean_reward_hit', np.mean(reward_hit), total_num_steps) writer.add_scalar('mean_reward_dist', np.mean(reward_dist), total_num_steps) writer.add_scalar('mean_reward_inv_dist', np.mean(reward_inv_dist), total_num_steps) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args_iko.use_gae, args_iko.gamma, args_iko.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args_iko.save_interval == 0 or j == num_updates - 1) and args_iko.save_dir != "": save_path = os.path.join(args_iko.save_dir, args_iko.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args_iko.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save( save_model, os.path.join( save_path, 'ugl' + str(args_iko.use_gt_likelihood) + 'block-pen-' + str(args_iko.penalty_for_block) + '_' + 'explore-' + str(args_iko.rew_explore) + '_' + 'bel-new-' + str(args_iko.rew_bel_new) + '_' + 'bel-ent-' + str(args_iko.rew_bel_ent) + '_' + 'infogain-' + str(args_iko.rew_infogain) + '_' + 'bel-gt-nolog-' + str(args_iko.rew_bel_gt_nonlog) + '_' + 'bel-gt-' + str(args_iko.rew_bel_gt) + '_' + 'dist-' + str(args_iko.rew_dist) + '_' + 'hit-' + str(args_iko.rew_hit) + '_' + 'inv-dist-' + str(args_iko.rew_inv_dist) + args_iko.algo + ".pt")) total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps if j % args_iko.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("mean reward_a", np.mean(reward_a)) print("mean_reward_b", np.mean(reward_b)) # print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n". 
# format(j, total_num_steps, # int(total_num_steps / (end - start)), # len(episode_rewards), # np.mean(episode_rewards), # np.median(episode_rewards), # np.min(episode_rewards), # np.max(episode_rewards), dist_entropy, # value_loss, action_loss)) # writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps) # writer.add_scalar('min_reward', np.min(episode_rewards), total_num_steps) # writer.add_scalar('max_reward', np.max(episode_rewards), total_num_steps) # writer.add_scalar('success_rate', np.mean(episode_successes), total_num_steps) if (args_iko.eval_interval is not None and len(episode_rewards) > 1 and j % args_iko.eval_interval == 0): eval_envs = make_vec_envs(args_iko.env_name, args_iko.seed + args_iko.num_processes, args_iko.num_processes, args_iko.gamma, eval_log_dir, args_iko.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args_iko.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args_iko.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args_iko.vis and j % args_iko.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args_iko.log_dir, args_iko.env_name, args_iko.algo, args_iko.num_env_steps) except IOError: pass writer.close()
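
# ---------------------------------------------------------------------------
# In the loop above, an intrinsic reward predicted by reward_model is scaled by
# args_iko.singh_coef and added to the environment reward. The sketch below
# illustrates that mixing with a stand-in MLP; the real
# RewardModel(11 * 11 * 6, 1, 64, 64) architecture and its exact forward
# signature are not shown in this file, so every name and shape here is an
# assumption made for illustration.
import torch
import torch.nn as nn


class TinyRewardModel(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))

    def forward(self, obs, action):
        return self.net(torch.cat([obs, action], dim=-1))


def mix_rewards(env_reward, obs, action, reward_model, coef):
    """Return env_reward plus a detached, scaled intrinsic reward."""
    with torch.no_grad():
        intrinsic = reward_model(obs, action.float())
    return env_reward + coef * intrinsic


# Usage (sketch):
# rm = TinyRewardModel(obs_dim=11 * 11 * 6, act_dim=3)
# reward = mix_rewards(reward, obs.view(obs.size(0), -1), action, rm, coef=0.5)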
def main(args): try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) eval_log_dir = args.log_dir + "_eval" try: os.makedirs(eval_log_dir) except OSError: files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv')) for f in files: os.remove(f) assert args.algo in ['a2c', 'ppo', 'acktr'] if args.recurrent_policy: assert args.algo in ['a2c', 'ppo'], \ 'Recurrent policy is not implemented for ACKTR' if args.eval_render: render_env = make_vec_envs(args.env_name, args.seed, 1, None, None, args.add_timestep, device='cpu', allow_early_resets=False) torch.set_num_threads(1) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # Uses gpu/cuda by default device = torch.device("cuda:0" if args.cuda else "cpu") # Only if running visdoom if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) # Set up actor_critic actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) # Set algorithm with actor critic and use to learn if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save( save_model, os.path.join( save_path, args.env_name + "-AvgRwrd" + str(int(np.mean(episode_rewards))) + ".pt")) print("Saving Model") total_num_steps = (j + 1) * args.num_processes * args.num_steps # Logs every log_interval steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) if args.eval_render: show_model(render_env, actor_critic) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_env_steps) except IOError: pass
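
# ---------------------------------------------------------------------------
# rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
# turns the stored rewards, value predictions and masks into return targets.
# The standalone sketch below mirrors the GAE(lambda) recursion that
# RolloutStorage typically implements in this family of scripts (args.tau
# playing the role of lambda); it is illustrative, not the project's actual
# method.
import torch


def gae_returns(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    """Compute GAE return targets.

    rewards, values, masks: (T, N, 1) tensors; next_value: (N, 1).
    masks[t] is 0.0 where the episode ended at step t, else 1.0.
    """
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)  # (T+1, N, 1)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for step in reversed(range(T)):
        delta = (rewards[step] + gamma * values[step + 1] * masks[step]
                 - values[step])
        gae = delta + gamma * tau * masks[step] * gae
        returns[step] = gae + values[step]
    return returns


# Example with random data:
# r = torch.zeros(5, 2, 1); v = torch.rand(5, 2, 1); m = torch.ones(5, 2, 1)
# ret = gae_returns(r, v, m, next_value=torch.rand(2, 1))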
def main(env, scene_path): try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) save_path = os.path.join(args.save_dir, args.algo) eval_x = [] eval_y = [] torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") initial_policies = torch.load(os.path.join(args.load_dir, args.algo, args.initial_policy + ".pt")) \ if args.initial_policy else None if args.reuse_residual: residual, ob_rms, initial_policies = initial_policies else: residual = None ob_rms = None pose_estimator = torch.load(os.path.join(args.load_dir, "pe", args.pose_estimator + ".pt")) \ if args.pose_estimator else None envs = make_vec_envs(env, scene_path, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, initial_policies, pose_estimator=pose_estimator, init_control=not args.dense_ip) if args.reuse_residual: vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms base_kwargs = {'recurrent': args.recurrent_policy} base = residual.base if args.reuse_residual else None dist = residual.dist if args.reuse_residual else None actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs=base_kwargs, zero_last_layer=True, base=base, dist=dist) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, burn_in=initial_policies is not None and not args.reuse_residual) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=64) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes total_num_steps = 0 j = 0 max_succ = -1 max_mean_rew = -math.inf mean_ep_rew = -math.inf evals_without_improv = 0 start = time.time() start_update = start while (not use_metric and j < num_updates) or (use_metric and max_succ < args.trg_succ_rate): if args.eval_interval is not None and j % args.eval_interval == 0: print("Evaluating current policy...") i = 0 total_successes = 0 max_trials = 50 eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while i + args.num_processes <= max_trials: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) obs, _, dones, infos = envs.step(action) if np.all(dones): # Rigid - assumes episodes are fixed length rews = [] for info in infos: rews.append(info['rew_success']) i += args.num_processes rew = sum([int(rew > 0) for rew in rews]) total_successes += rew p_succ = (100 * total_successes / i) eval_x += [total_num_steps] eval_y += [p_succ] end = time.time() print( f"Evaluation: {total_successes} successful out of {i} episodes - " f"{p_succ:.2f}% successful. 
Eval length: {end - start_update}") torch.save([eval_x, eval_y], os.path.join(args.save_as + "_eval.pt")) start_update = end if p_succ > max_succ: max_succ = p_succ max_mean_rew = mean_ep_rew evals_without_improv = 0 elif mean_ep_rew > max_mean_rew: print("Unimproved success rate, higher reward") max_mean_rew = mean_ep_rew evals_without_improv = 0 else: evals_without_improv += 1 if evals_without_improv == 10 or max_succ >= args.trg_succ_rate: save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None), initial_policies ] extra = "_final" if evals_without_improv == 5 else "" torch.save( save_model, os.path.join(save_path, args.save_as + f"{extra}.pt")) break # save for every interval-th episode or for the last epoch if ((not use_metric and (j % args.save_interval == 0 or j == num_updates - 1)) or (use_metric and evals_without_improv == 0)) and args.save_dir != "": os.makedirs(save_path, exist_ok=True) save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() if pose_estimator is not None: save_model = [save_model, pose_estimator, initial_policies] else: save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None), initial_policies ] torch.save(save_model, os.path.join(save_path, args.save_as + ".pt")) # torch.save(save_model, os.path.join(save_path, args.save_as + f"{j * args.num_processes * args.num_steps}.pt")) if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: mean_ep_rew = np.mean(episode_rewards) if mean_ep_rew > max_mean_rew: print("Improved max mean reward") max_mean_rew = mean_ep_rew evals_without_improv = 0 end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), mean_ep_rew, np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) print("Update length: ", end - start_update) start_update = end if args.vis and (j % args.vis_interval == 0 or (not use_metric and j == num_updates - 1)): try: # Sometimes monitor doesn't properly flush the outputs visdom_plot(args.log_dir, args.save_as, args.algo, total_num_steps) except IOError: pass j += 1 if use_metric: if max_succ >= args.trg_succ_rate: print( f"Achieved greater than {args.trg_succ_rate}% success, advancing curriculum." ) else: print( f"Policy converged with max success rate < {args.trg_succ_rate}%" ) # Copy logs to permanent location so new graphs can be drawn. copy_tree(args.log_dir, os.path.join('logs', args.save_as)) envs.close() return total_num_steps
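
# ---------------------------------------------------------------------------
# The loop above advances a curriculum stage once the evaluated success rate
# reaches args.trg_succ_rate and stops early after several evaluations without
# improvement. Below is a condensed, hypothetical version of that stopping
# rule (function and argument names invented for illustration; the original
# also tracks mean episode reward, which is omitted here).
def should_stop(success_history, target_rate, patience=10):
    """Stop when the best success rate hits the target or has stalled."""
    if not success_history:
        return False
    best = max(success_history)
    if best >= target_rate:
        return True
    # Number of evaluations since the best result was recorded.
    since_best = len(success_history) - 1 - success_history.index(best)
    return since_best >= patience


# Usage (sketch):
# if should_stop(eval_y, args.trg_succ_rate):
#     break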
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") print(device) print(save_folder) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, args.reward_type) actor_critic = Policy(envs.observation_space.shape, envs.action_space) actor_critic.to(device) curiosity = None if use_curiosity: curiosity = ICM(envs.observation_space.shape[0], envs.action_space.n) curiosity.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, curiosity, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, use_curiosity=use_curiosity) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() cum_rew = [0] * args.num_processes rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=args.num_processes * 2) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = agent.actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) envs.render() cur_reward = reward to_write = reward.cpu().numpy() for i in range(args.num_processes): cum_rew[i] += to_write[i][0] if use_curiosity: action_one_hot = (torch.eye(14)[action]).view(-1, 14).cuda() _, pred_phi, actual_phi = curiosity( (rollouts.obs[step], obs, action_one_hot)) cur_reward += 0.2 * ((pred_phi - actual_phi).pow(2)).sum( -1, keepdim=True).cpu() / 2 for i, finished in enumerate(done): if finished: percentile = infos[i]['x_pos'] / norm_pos episode_rewards.append(percentile) print(cum_rew[i]) with open(train_file[:-4] + str(i) + train_file[-4:], 'a', newline='') as sfile: writer = csv.writer(sfile) writer.writerows([[cum_rew[i], percentile]]) cum_rew[i] = 0 # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, cur_reward.detach(), masks) with torch.no_grad(): next_value = agent.actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = agent.actor_critic if args.cuda: save_model = copy.deepcopy(agent.actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_folder, '/' + args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len( episode_rewards) > args.num_processes: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, cumulative reward {:.3f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.mean(cum_rew))) #Evaluation time : if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): num_proc = 1 eval_envs = make_vec_envs(args.env_name, args.seed + num_proc, num_proc, args.gamma, args.log_dir, args.add_timestep, device, True, args.reward_type) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] test_rew = 0 finish_this = False obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( num_proc, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(num_proc, 1, device=device) positions = deque(maxlen=400) while not finish_this: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = agent.actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_envs.render() eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]).cuda() # for i, finished in enumerate(done): # if finished: # percentile = infos[i]['x_pos']/norm_pos # eval_episode_rewards.append(percentile) # with open(eval_file, 'a', newline='') as sfile: # writer = csv.writer(sfile) # writer.writerows([[percentile]]) test_rew += reward.cpu().numpy()[0, 0] for i, finished in enumerate(done): if finished: print('he died') percentile = infos[i]['x_pos'] / norm_pos eval_episode_rewards.append(percentile) with open(eval_file, 'a', newline='') as sfile: writer = csv.writer(sfile) writer.writerows([[test_rew, percentile]]) finish_this = True #to prevent the agent from getting stuck positions.append(infos[0]['x_pos']) pos_ar = np.array(positions) if (len(positions) >= 200) and (pos_ar < pos_ar[-1] + 20).all( ) and (pos_ar > pos_ar[-1] - 20).all(): print("he's stuck") percentile = infos[0]['x_pos'] / norm_pos eval_episode_rewards.append(percentile) with open(eval_file, 'a', newline='') as sfile: 
writer = csv.writer(sfile) writer.writerows([[test_rew, percentile]]) finish_this = True eval_envs.close() positions.clear() print( " Evaluation using {} episodes: reward {:.3f}, distance {:.3f}\n" .format(len(eval_episode_rewards), test_rew, np.mean(eval_episode_rewards))) test_rew = 0 finish_this = False if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_env_steps) except IOError: pass
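
# ---------------------------------------------------------------------------
# The evaluation above terminates an episode early when the agent's x position
# has stopped changing. The same idea as a small standalone check, assuming a
# rolling window of recent x positions:
from collections import deque

import numpy as np


def is_stuck(positions, min_history=200, tolerance=20):
    """True if every recorded position lies within `tolerance` of the latest."""
    if len(positions) < min_history:
        return False
    pos = np.asarray(positions)
    return bool((np.abs(pos - pos[-1]) < tolerance).all())


# Usage (sketch):
# positions = deque(maxlen=400)
# positions.append(infos[0]['x_pos'])
# if is_stuck(positions):
#     ...  # end the evaluation episode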
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, 1, args.gamma, args.log_dir, args.add_timestep, device, False) # Determine the observation and action lengths for the robot and human, respectively obs = envs.reset() action = torch.tensor([envs.action_space.sample()]) _, _, _, info = envs.step(action) obs_robot_len = info[0]['obs_robot_len'] obs_human_len = info[0]['obs_human_len'] action_robot_len = info[0]['action_robot_len'] action_human_len = info[0]['action_human_len'] obs_robot = obs[:, :obs_robot_len] obs_human = obs[:, obs_robot_len:] if len(obs_robot[0]) != obs_robot_len or len( obs_human[0]) != obs_human_len: print('robot obs shape:', obs_robot.shape, 'obs space robot shape:', (obs_robot_len, )) print('human obs shape:', obs_human.shape, 'obs space human shape:', (obs_human_len, )) exit() envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) # Reset environment obs = envs.reset() obs_robot = obs[:, :obs_robot_len] obs_human = obs[:, obs_robot_len:] action_space_robot = spaces.Box(low=np.array([-1.0] * action_robot_len), high=np.array([1.0] * action_robot_len), dtype=np.float32) action_space_human = spaces.Box(low=np.array([-1.0] * action_human_len), high=np.array([1.0] * action_human_len), dtype=np.float32) if args.load_policy is not None: actor_critic_robot, actor_critic_human, ob_rms = torch.load( args.load_policy) vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms else: actor_critic_robot = Policy( [obs_robot_len], action_space_robot, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic_human = Policy( [obs_human_len], action_space_human, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic_robot.to(device) actor_critic_human.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent_robot = algo.PPO(actor_critic_robot, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) agent_human = algo.PPO(actor_critic_human, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts_robot = RolloutStorage( args.num_steps, args.num_processes, [obs_robot_len], action_space_robot, actor_critic_robot.recurrent_hidden_state_size) rollouts_human = RolloutStorage( args.num_steps, args.num_processes, [obs_human_len], action_space_human, actor_critic_human.recurrent_hidden_state_size) rollouts_robot.obs[0].copy_(obs_robot) rollouts_robot.to(device) rollouts_human.obs[0].copy_(obs_human) rollouts_human.to(device) episode_rewards = deque( maxlen=(args.num_processes if args.num_processes > 10 else 10)) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: 
update_linear_schedule(agent_robot.optimizer, j, num_updates, args.lr) update_linear_schedule(agent_human.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent_robot.clip_param = args.clip_param * (1 - j / float(num_updates)) agent_human.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value_robot, action_robot, action_log_prob_robot, recurrent_hidden_states_robot = actor_critic_robot.act( rollouts_robot.obs[step], rollouts_robot.recurrent_hidden_states[step], rollouts_robot.masks[step]) value_human, action_human, action_log_prob_human, recurrent_hidden_states_human = actor_critic_human.act( rollouts_human.obs[step], rollouts_human.recurrent_hidden_states[step], rollouts_human.masks[step]) # Obser reward and next obs action = torch.cat((action_robot, action_human), dim=-1) obs, reward, done, infos = envs.step(action) obs_robot = obs[:, :obs_robot_len] obs_human = obs[:, obs_robot_len:] for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts_robot.insert(obs_robot, recurrent_hidden_states_robot, action_robot, action_log_prob_robot, value_robot, reward, masks) rollouts_human.insert(obs_human, recurrent_hidden_states_human, action_human, action_log_prob_human, value_human, reward, masks) with torch.no_grad(): next_value_robot = actor_critic_robot.get_value( rollouts_robot.obs[-1], rollouts_robot.recurrent_hidden_states[-1], rollouts_robot.masks[-1]).detach() next_value_human = actor_critic_human.get_value( rollouts_human.obs[-1], rollouts_human.recurrent_hidden_states[-1], rollouts_human.masks[-1]).detach() rollouts_robot.compute_returns(next_value_robot, args.use_gae, args.gamma, args.tau) rollouts_human.compute_returns(next_value_human, args.use_gae, args.gamma, args.tau) value_loss_robot, action_loss_robot, dist_entropy_robot = agent_robot.update( rollouts_robot) value_loss_human, action_loss_human, dist_entropy_human = agent_human.update( rollouts_human) rollouts_robot.after_update() rollouts_human.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model_robot = actor_critic_robot save_model_human = actor_critic_human if args.cuda: save_model_robot = copy.deepcopy(actor_critic_robot).cpu() save_model_human = copy.deepcopy(actor_critic_human).cpu() save_model = [ save_model_robot, save_model_human, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Robot/Human updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy_robot, value_loss_robot, action_loss_robot)) sys.stdout.flush() if (args.eval_interval is not None and len(episode_rewards) > 1 and j % 
args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() obs_robot = obs[:, :obs_robot_len] obs_human = obs[:, obs_robot_len:] eval_recurrent_hidden_states_robot = torch.zeros( args.num_processes, actor_critic_robot.recurrent_hidden_state_size, device=device) eval_recurrent_hidden_states_human = torch.zeros( args.num_processes, actor_critic_human.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action_robot, _, eval_recurrent_hidden_states_robot = actor_critic_robot.act( obs_robot, eval_recurrent_hidden_states_robot, eval_masks, deterministic=True) _, action_human, _, eval_recurrent_hidden_states_human = actor_critic_human.act( obs_human, eval_recurrent_hidden_states_human, eval_masks, deterministic=True) # Obser reward and next obs action = torch.cat((action_robot, action_human), dim=-1) obs, reward, done, infos = eval_envs.step(action) obs_robot = obs[:, :obs_robot_len] obs_human = obs[:, obs_robot_len:] eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) sys.stdout.flush() if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_env_steps) except IOError: pass
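
# ---------------------------------------------------------------------------
# The two-agent loop above slices one joint observation into a robot part and
# a human part, and concatenates the two policies' actions before stepping the
# shared environment. A minimal sketch of that split/merge, with the robot
# observation length passed in explicitly:
import torch


def split_obs(obs, obs_robot_len):
    """obs: (N, obs_robot_len + obs_human_len) -> (robot obs, human obs)."""
    return obs[:, :obs_robot_len], obs[:, obs_robot_len:]


def merge_actions(action_robot, action_human):
    """Concatenate per-agent actions along the last dimension."""
    return torch.cat((action_robot, action_human), dim=-1)


# Usage (sketch):
# obs_robot, obs_human = split_obs(obs, obs_robot_len)
# action = merge_actions(action_robot, action_human)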
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs, is_minigrid = make_vec_envs(args.env_name, args.seed, args.num_processes,
                                      args.gamma, args.log_dir, args.add_timestep,
                                      device, False, args.num_frame_stack)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy,
                                       'est_beta_value': args.est_beta_value},
                          is_minigrid=is_minigrid,
                          use_rew=args.use_reward)
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               lr_beta=args.lr_beta, reg_beta=args.reg_beta,
                               delib_center=args.delib_center,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    obs = obs + torch.randn_like(obs) * args.noise_obs
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    all_rewards_local = deque()
    all_frame_local = deque()
    all_beta_mean_local = deque()
    all_beta_std_local = deque()

    # Filename tag shared by the checkpoint and the locally pickled logs.
    run_suffix = ("_seed_" + str(args.seed)
                  + "_est_beta_" + str(args.est_beta_value)
                  + "_lr_beta_" + str(args.lr_beta)
                  + "_beta_reg_" + str(args.reg_beta)
                  + "_entropy_" + str(args.entropy_coef)
                  + "_steps_" + str(args.num_steps)
                  + "_noise_obs_" + str(args.noise_obs))

    start = time.time()
    prev_value = 0
    eval_prev_value = 0
    prev_rew = torch.zeros(args.num_processes).to(device)
    eval_prev_rew = torch.zeros(args.num_processes).to(device)

    for j in range(num_updates):
        beta_value_list = []

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                (value, action, action_log_prob, recurrent_hidden_states,
                 prev_value, beta_value) = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    prev_value=prev_value,
                    prev_rew=prev_rew)

            # Observe reward and next obs, then perturb the observation with Gaussian noise.
            obs, reward, done, infos = envs.step(action)
            obs = obs + torch.randn_like(obs) * args.noise_obs

            if not args.cuda:
                beta_value_list.append(beta_value.numpy())
            else:
                beta_value_list.append(beta_value.cpu().numpy())

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            prev_rew = reward * masks
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy, eval_prev_value, beta_loss, eval_prev_rew = \
            agent.update(rollouts, eval_prev_value, eval_prev_rew)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + run_suffix + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n"
                  " Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}\n".format(
                      j, total_num_steps,
                      int(total_num_steps / (end - start)),
                      len(episode_rewards),
                      np.mean(episode_rewards), np.median(episode_rewards),
                      np.min(episode_rewards), np.max(episode_rewards),
                      dist_entropy, value_loss, action_loss))

            experiment.log_metrics({"mean reward": np.mean(episode_rewards),
                                    "Value loss": value_loss,
                                    "Action Loss": action_loss,
                                    "Beta loss": beta_loss,
                                    "Beta mean": np.array(beta_value_list).mean(),
                                    "Beta std": np.array(beta_value_list).std()},
                                   step=j * args.num_steps * args.num_processes)

            all_rewards_local.append(np.mean(episode_rewards))
            all_frame_local.append(total_num_steps)
            all_beta_mean_local.append(np.array(beta_value_list).mean())
            all_beta_std_local.append(np.array(beta_value_list).std())

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                      args.num_processes, args.gamma, eval_log_dir,
                                      args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          dtype=torch.float32, device=device)

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass

    # Dump the locally accumulated learning curves once training finishes.
    with open(args.save_local_dir + "Rewards_" + str(args.env_name) + run_suffix + ".pkl", 'wb') as f:
        pickle.dump(all_rewards_local, f)
    with open(args.save_local_dir + "Frames_" + str(args.env_name) + run_suffix + ".pkl", 'wb') as f:
        pickle.dump(all_frame_local, f)
    with open(args.save_local_dir + "beta_mean_" + str(args.env_name) + run_suffix + ".pkl", 'wb') as f:
        pickle.dump(all_beta_mean_local, f)
    with open(args.save_local_dir + "beta_std_" + str(args.env_name) + run_suffix + ".pkl", 'wb') as f:
        pickle.dump(all_beta_std_local, f)
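# ---------------------------------------------------------------------------
# The main() above references several module-level names that are never
# defined in this excerpt: args, num_updates, eval_log_dir, and experiment
# (as well as the usual project imports such as algo, Policy, make_vec_envs,
# RolloutStorage, get_vec_normalize, and update_linear_schedule). The lines
# below are a minimal sketch of how that setup typically looks in the
# reference pytorch-a2c-ppo-acktr training scripts; the get_args helper and
# the comet_ml Experiment tracker are assumptions, not the authors' verbatim
# code.
#
# from arguments import get_args        # assumed argparse helper
# from comet_ml import Experiment       # assumed tracker backing log_metrics()
#
# args = get_args()
#
# # One update consumes num_processes * num_steps environment frames.
# num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
#
# eval_log_dir = args.log_dir + "_eval"              # mirrors the reference repo
# experiment = Experiment(project_name=args.env_name)  # hypothetical tracker setup
# ---------------------------------------------------------------------------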
def main():
    setup_logger(args.verbose, args.model_name)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.visdom_port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma,
                         config.log_directory, args.add_timestep, device,
                         allow_early_resets=True, num_frame_stack=None,
                         ip=args.ip, start_port=args.port,
                         wait_action=args.wait_action, reset_step=args.reset_step)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_layer_size
                          })

    # load model
    if args.load_path is not None:
        logger.info("loading model: {}".format(args.load_path))
        actor_critic = torch.load(args.load_path)

    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         use_clipped_value_loss=True)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=20)
    episode_distance = deque(maxlen=20)

    if args.use_wandb:
        wandb.init(project='LumbarSpine', config=args, group=args.model_name,
                   resume=args.resume_wandb)
        # wandb.watch(actor_critic)
        if wandb.run.resumed:
            logger.info('Wandb resumed!')

    # --------------------- train ----------------------------
    start = time.time()
    for iter in range(num_updates):
        logger.info('Training {}/{} updates'.format(iter, num_updates))
        if args.test:
            break

        # todo: maybe this is what is making things confusing?!
        envs.reset()  # HACKY way of reconnecting back the main env to avoid packet drop
        # for i in range(args.num_processes):
        #     envs.venv.envs[i].env.net.connect(args.ip, args.port)

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                lr = update_linear_schedule(agent.optimizer, iter, num_updates,
                                            agent.optimizer.lr)
            else:
                lr = update_linear_schedule(agent.optimizer, iter, num_updates,
                                            args.lr)
        else:
            lr = args.lr

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - iter / float(num_updates))

        distances = []
        vels = []
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_distance.append(info['episode_']['distance'])
                if 'distance' in info.keys():
                    distances.append(info['distance'])
                    vels.append(info['vel'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if iter % args.save_interval == 0 or iter == num_updates - 1:
            save_path = os.path.join(config.trained_directory,
                                     args.algo + "-" + args.env_name + ".pt")
            logger.info("Saving model: {}".format(save_path))
            torch.save(actor_critic, save_path)

        total_num_steps = (iter + 1) * args.num_processes * args.num_steps

        log_info = {
            'average_vel': np.mean(vels),
            'average_distance': np.mean(distances),
            'value_loss': value_loss,
            'action_loss': action_loss,
            'dist_entropy': dist_entropy,
            'lr': lr,
            'agent_clip_param': agent.clip_param,
        }
        if len(episode_rewards) > 1:
            log_info.update({
                'mean_episode_reward': np.mean(episode_rewards),
                'median_episode_reward': np.median(episode_rewards),
                'min_episode_reward': np.min(episode_rewards),
                'max_episode_reward': np.max(episode_rewards),
                'mean_episode_distance': np.mean(episode_distance)
            })

        # todo: switch to episodic and cover other locations. This log is only for episodic
        if iter % args.episode_log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            logger.info(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median "
                "reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".format(
                    iter, total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), dist_entropy, value_loss,
                    action_loss))

        # --------------------- evaluate ----------------------------
        # Evaluate on a single environment
        if args.eval_interval is not None and iter % args.eval_interval == 0:
            logger.info('Evaluate')
            # todo: what is allow_early_resets? (False for main, True for eval)
            eval_envs = make_vec_envs(
                args.env_name, args.seed, 1,  # args.num_processes,
                args.gamma, config.log_directory, args.add_timestep, device,
                allow_early_resets=False, num_frame_stack=None, ip=args.ip,
                start_port=args.port, wait_action=args.wait_action,
                eval_mode=True)

            eval_episode_rewards = []
            rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)
            eval_distances = []

            # while len(eval_episode_rewards) < 10:
            for eval_step in range(args.num_steps_eval):
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          dtype=torch.float32, device=device)

                logger.log(msg='eval step reward: {}'.format(reward), level=18)
                logger.log(msg='eval step obs: {}'.format(obs), level=18)

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])
                    if 'distance' in info.keys():
                        eval_distances.append(info['distance'])

                rewards.extend(reward)

            eval_envs.close()

            if args.episodic:
                logger.info(
                    "Evaluation using {} episodes: mean reward {:.5f}\n".format(
                        len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            else:
                logger.info(
                    "Evaluation using {} steps: mean reward {:.5f}\n".format(
                        args.num_steps_eval, np.mean(rewards)))

            # update info
            log_info.update({
                'mean_eval_reward': np.mean(rewards),
                'eval_average_distance': np.mean(eval_distances)
            })

        if args.vis and iter % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                logger.info("Visdom log update")
                win = visdom_plot(viz, win, config.visdom_log_directory,
                                  args.env_name, args.algo, args.num_env_steps)
            except IOError:
                pass

        if iter % args.log_interval == 0:
            logger.info('{}:{} {}'.format(iter, num_updates, log_info))
            if args.use_wandb:
                wandb.log(log_info)

    # -------------------------------------- testing -------------------------------------
    if args.test:
        logger.info('Evaluate')
        # todo: what is allow_early_resets? (False for main, True for eval)
        eval_envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                                  args.gamma, config.log_directory,
                                  args.add_timestep, device,
                                  allow_early_resets=False, num_frame_stack=None,
                                  ip=args.ip, start_port=args.port,
                                  wait_action=args.wait_action, eval_mode=True)

        eval_episode_rewards = []
        rewards = []

        obs = eval_envs.reset()
        eval_recurrent_hidden_states = torch.zeros(
            args.num_processes, actor_critic.recurrent_hidden_state_size,
            device=device)
        eval_masks = torch.zeros(args.num_processes, 1, device=device)

        # while len(eval_episode_rewards) < 10:
        for eval_step in range(args.num_steps_eval):
            with torch.no_grad():
                _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                    obs, eval_recurrent_hidden_states, eval_masks,
                    deterministic=True)

            # Observe reward and next obs
            obs, reward, done, infos = eval_envs.step(action)
            eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                       for done_ in done],
                                      dtype=torch.float32, device=device)

            logger.info('eval step reward: {}'.format(reward))
            logger.log(msg='eval step obs: {}'.format(obs), level=18)

            if args.episodic:
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])
            else:
                rewards.append(reward)

        eval_envs.close()

        if args.episodic:
            logger.info(
                "Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))
        else:
            logger.info(
                "Evaluation using {} steps: mean reward {:.5f}\n".format(
                    args.num_steps, np.mean(rewards)))
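# ---------------------------------------------------------------------------
# As with the first variant, this main() relies on module-level globals
# (args, config, logger, num_updates, wandb) created during script start-up,
# and neither variant is invoked in this excerpt. A minimal, assumed entry
# point for either script is the standard guard below; the original files are
# expected to end the same way.
if __name__ == "__main__":
    main()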