def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         args.custom_gym)

    base = SEVN
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'ppo':
        agent = PPO(actor_critic, args.clip_param, args.ppo_epoch,
                    args.num_mini_batch, args.value_loss_coef,
                    args.entropy_coef, lr=args.lr, eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode_length = deque(maxlen=10)
    episode_success_rate = deque(maxlen=100)
    episode_total = 0

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    episode_success_rate.append(info['was_successful_trajectory'])
                    episode_total += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # `writer` is a TensorBoard SummaryWriter assumed to be created
            # elsewhere in this script.
            writer.add_scalars('Train/Episode Reward', {
                "Reward Mean": np.mean(episode_rewards),
                "Reward Min": np.min(episode_rewards),
                "Reward Max": np.max(episode_rewards)
            }, global_step=total_num_steps)
            writer.add_scalars('Train/Episode Length', {
                "Episode Length Mean": np.mean(episode_length),
                "Episode Length Min": np.min(episode_length),
                "Episode Length Max": np.max(episode_length)
            }, global_step=total_num_steps)
            writer.add_scalar("Train/Episode Reward Mean",
                              np.mean(episode_rewards),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Length Mean",
                              np.mean(episode_length),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Success Rate",
                              np.mean(episode_success_rate),
                              global_step=total_num_steps)
            print("Updates {}, num timesteps {}, FPS {} \n"
                  " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                  " min/max reward {:.1f}/{:.1f}, entropy {:.2f},"
                  " value loss {:.4f}, policy loss {:.4f}\n".format(
                      j, total_num_steps, int(total_num_steps / (end - start)),
                      len(episode_rewards), np.mean(episode_rewards),
                      np.median(episode_rewards), np.min(episode_rewards),
                      np.max(episode_rewards), dist_entropy, value_loss,
                      action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
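# The training loops in this file anneal the learning rate through an
# `update_linear_schedule` helper. A minimal sketch of what such a helper
# typically does (an assumption; each repo's own utils may differ in details):
def update_linear_schedule_sketch(optimizer, epoch, total_num_epochs, initial_lr):
    """Decay the learning rate linearly from initial_lr down to 0 over training."""
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr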
def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf,
        S_buffer_size, S_play_time_len, S_end_delay, S_decision_flag,
        S_buffer_flag, S_cdn_flag, end_of_video, cdn_newest_id, download_id,
        cdn_has_frame, IntialVars):
    # NOTE: the `time` parameter shadows the `time` module, so the
    # `time.time()` call below would fail as written in the original.
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # The online env in AItrans; it should have the observation space, action
    # space and so on. We should step into the depth of envs.py in the GitHub
    # doc and extract the format of the observation and action spaces.
    envs = ...  # left unfinished in the original

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # choose the algorithm, now we only have a2c
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    # the initial observation
    obs = ...  # left unfinished in the original (presumably the env's reset observation)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_reward = deque(maxlen=10)
    start = time.time()

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        if args.algo == 'ppo' and args.use_linear_lr_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
def main(args, idx):
    # Create summary writer
    writer_path = os.path.join(args.log_dir, args.task_id,
                               args.run_id + '-' + str(idx))
    writer = SummaryWriter(log_dir=writer_path)

    # Create training envs
    envs = make_vec_envs(args.task_id, args.seed, args.num_processes,
                         args.gamma, args.monitor_dir, args.device)
    obs_size = envs.observation_space.shape[0]
    act_size = envs.action_space.shape[0]

    # Create NN
    actor_critic = Policy(obs_size, act_size,
                          action_range=[envs.action_space.low[0],
                                        envs.action_space.high[0]])
    actor_critic.to(args.device)

    # Create ppo agent
    agent = PPO(
        actor_critic=actor_critic,
        device=args.device,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
        clip_param=args.clip_param,
        ppo_epoch=args.ppo_epoch,
        num_mini_batch=args.num_mini_batch,
        value_loss_coef=args.value_loss_coef,
        entropy_coef=args.entropy_coef,
    )

    # Create replay buffer
    buffer = ReplayBuffer(args.num_steps, args.num_processes, obs_size, act_size)
    buffer.to(args.device)

    # Reset envs
    obs = envs.reset()
    buffer.obs[0].copy_(obs)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in tqdm(range(num_updates)):
        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        # Collect trajectories and compute returns
        with torch.no_grad():
            for step in range(args.num_steps):
                # Sample actions
                action = actor_critic(buffer.obs[step])

                # Get trajectories from envs
                obs, reward, done, infos = envs.step(action)
                mask = torch.tensor(
                    [[0.0] if done_ else [1.0] for done_ in done],
                    dtype=torch.float, device=args.device)

                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # Store trajectories
                buffer.insert(obs, action, reward, mask)

            # Compute returns
            batch_obs = buffer.obs.view(-1, obs_size)
            value = actor_critic.get_value(batch_obs).view(
                args.num_steps + 1, args.num_processes, 1)
            batch_obs = buffer.obs[:-1].view(-1, obs_size)
            batch_action = buffer.actions.view(-1, act_size)
            action_log_prob = actor_critic.get_act_log_prob(
                batch_obs, batch_action).view(args.num_steps,
                                              args.num_processes, 1)
            buffer.update_value_log_prob(value, action_log_prob)
            buffer.compute_returns(args.gamma, args.gae_lambda)

        # Update policy
        agent_output = agent.update(buffer)
        buffer.after_update()

        # Log stuff
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            speed = int(total_num_steps / (end - start))
            print("Updates {}, num timesteps {}, FPS {} \n "
                  "Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}\n".format(
                      j, total_num_steps, speed, len(episode_rewards),
                      np.mean(episode_rewards), np.median(episode_rewards),
                      np.min(episode_rewards), np.max(episode_rewards)))
            writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps)
            writer.add_scalar('speed', speed, total_num_steps)
            for key in agent_output.keys():
                writer.add_scalar(key, agent_output[key], total_num_steps)

        if args.task_id == 'Pendulum-v0' and np.mean(episode_rewards) > -250:
            break

    envs.close()
    writer.close()
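# agent.update(buffer) above optimizes PPO's clipped surrogate objective over
# minibatches. A self-contained sketch of just the policy-loss term (an
# assumption; the real update also adds the value loss, entropy bonus,
# minibatching, and gradient clipping):
import torch

def ppo_clipped_policy_loss_sketch(new_log_probs, old_log_probs, advantages,
                                   clip_param=0.2):
    """Clipped PPO surrogate: -E[min(r * A, clip(r, 1-eps, 1+eps) * A)]."""
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -torch.min(surr1, surr2).mean()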
def main():
    device = 'cpu'
    acc_steps = []
    acc_scores = []
    torch.set_num_threads(1)

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    # get cloned policy and recovered reward function
    policy_reward_dir = args.rewards_dir
    policy_dir = args.policies_dir

    policy_reward = Policy(envs.observation_space.shape, envs.action_space)
    policy_reward_file_name = policy_reward_dir + '/reward_' + args.expe + '.pth'
    policy_reward_sd = torch.load(policy_reward_file_name)
    policy_reward.load_state_dict(policy_reward_sd)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    policy_file_name = policy_dir + '/last_policy_' + args.expe + '.pth'
    policy_sd = torch.load(policy_file_name)
    actor_critic.load_state_dict(policy_sd)
    actor_critic.to(device)

    agent = PPO(actor_critic, args.clip_param, args.ppo_epoch,
                args.num_mini_batch, args.value_loss_coef, args.entropy_coef,
                lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = collections.deque(maxlen=10)

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            obs, _, done, infos = envs.step(action)
            if step > 1 and step % 1000 == 0:
                # force an episode cut every 1000 steps (one flag per process)
                done = [True for _ in range(args.num_processes)]

            # use inferred reward:
            with torch.no_grad():
                # _, reward = shapes(rollouts.obs[step], 0)
                _, action_log_probs, _, _ = policy_reward.evaluate_actions(
                    rollouts.obs[step], None, None, action)
                reward = action_log_probs

            for info in infos:
                # if 'episode' in info.keys():
                #     episode_rewards.append(info['episode']['r'])
                r = 0
                for key, val in info.items():
                    if 'reward' in key:
                        r += val
                episode_rewards.append(r)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir:
            save_path = os.path.join(args.save_dir, 'ppo')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]
            torch.save(save_model, os.path.join(save_path, args.env_name + '.pt'))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print('Updates', j, 'num timesteps', total_num_steps,
                  '\n Last', len(episode_rewards),
                  'training episodes: mean/median reward',
                  '{:.1f}'.format(np.mean(episode_rewards)),
                  '/{:.1f}'.format(np.median(episode_rewards)),
                  'min/max reward', '{:.1f}'.format(np.min(episode_rewards)),
                  '/{:.1f}'.format(np.max(episode_rewards)),
                  'dist entropy', dist_entropy, 'value loss', value_loss,
                  'action loss', action_loss)

        if len(episode_rewards) > 1:
            acc_steps.append(total_num_steps)
            acc_scores.append(np.mean(episode_rewards))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _ = actor_critic.act(obs, eval_masks,
                                                    deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print('Evaluation using', len(eval_episode_rewards),
                  'episodes: mean reward',
                  '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

    scores_file_name = args.scores_dir + '/learner_scores_' + args.expe + '.npy'
    steps_file_name = args.scores_dir + '/learner_steps_' + args.expe + '.npy'
    np.save(scores_file_name, np.array(acc_scores))
    np.save(steps_file_name, np.array(acc_steps))
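# rollouts.compute_returns(next_value, gamma, tau) above presumably computes
# discounted returns with generalized advantage estimation (GAE), tau being the
# GAE lambda. A self-contained sketch under that assumption (rewards/values of
# shape [T, N, 1], masks of shape [T + 1, N, 1], next_value of shape [N, 1]):
import torch

def compute_gae_returns_sketch(rewards, values, masks, next_value, gamma, tau):
    returns = torch.zeros_like(values)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    gae = 0.0
    for step in reversed(range(rewards.size(0))):
        # one-step TD error, zeroed across episode boundaries by the mask
        delta = (rewards[step] + gamma * values[step + 1] * masks[step + 1]
                 - values[step])
        gae = delta + gamma * tau * masks[step + 1] * gae
        returns[step] = gae + values[step]
    return returns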
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args_iko.cuda else "cpu")

    if args_iko.vis:
        from visdom import Visdom
        viz = Visdom(port=args_iko.port)
        win = None

    envs = make_vec_envs(args_iko.env_name, args_iko.seed,
                         args_iko.num_processes, args_iko.gamma,
                         args_iko.log_dir, args_iko.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args_iko.recurrent_policy})
    actor_critic.to(device)

    action_shape = 3
    reward_model = RewardModel(11 * 11 * 6, 1, 64, 64)
    reward_model.to(device)

    if args_iko.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args_iko.value_loss_coef,
                               args_iko.entropy_coef, lr=args_iko.lr,
                               eps=args_iko.eps, alpha=args_iko.alpha,
                               max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'ppo':
        agent = algo.PPO(actor_critic, args_iko.clip_param, args_iko.ppo_epoch,
                         args_iko.num_mini_batch, args_iko.value_loss_coef,
                         args_iko.entropy_coef, args_iko.use_singh,
                         reward_model, lr=args_iko.lr, eps=args_iko.eps,
                         max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args_iko.value_loss_coef,
                               args_iko.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args_iko.num_steps, args_iko.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        if args_iko.use_linear_lr_decay:
            # decrease learning rate linearly
            if args_iko.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args_iko.lr)

        if args_iko.algo == 'ppo' and args_iko.use_linear_clip_decay:
            agent.clip_param = args_iko.clip_param * (1 - j / float(num_updates))

        reward_train = []
        reward_block_penalty = []
        reward_bel_gt = []
        reward_bel_gt_nonlog = []
        reward_infogain = []
        reward_bel_ent = []
        reward_hit = []
        reward_dist = []
        reward_inv_dist = []

        for step in range(args_iko.num_steps):
            # Sample actions
            # print(step, args_iko.num_steps)
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            reward_train.append(reward)

            # print("infos is ", infos)
            # reward_b.append(infos[0]['auxillary_reward'])
            # print("infos is ", infos[0]['auxillary_reward'])
            reward_block_penalty.append(infos[0]['reward_block_penalty'])
            reward_bel_gt.append(infos[0]['reward_bel_gt'])
            reward_bel_gt_nonlog.append(infos[0]['reward_bel_gt_nonlog'])
            reward_infogain.append(infos[0]['reward_infogain'])
            reward_bel_ent.append(infos[0]['reward_bel_ent'])
            reward_hit.append(infos[0]['reward_hit'])
            reward_dist.append(infos[0]['reward_dist'])
            reward_inv_dist.append(infos[0]['reward_inv_dist'])
            # print(reward)

            reward.to(device)
            reward_model.to(device)
            if args_iko.use_singh:
                # print("using learning IR")
                my_reward = reward_model(obs.clone().to(device),
                                         action.clone().float()).detach()
                my_reward.to(device)
                reward = reward + args_iko.singh_coef * my_reward.type(
                    torch.FloatTensor)

            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])
            #         print("infos is ", infos[0]['auxillary_reward'])
            #         print("info is", info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # print("mean reward_a", np.mean(reward_train))
        # print("mean reward_block_penalty", np.mean(reward_block_penalty))
        # print("mean reward_bel_gt", np.mean(reward_bel_gt))
        # print("mean reward_bel_gt_nonlog", np.mean(reward_bel_gt_nonlog))
        # print("mean reward_infogain", np.mean(reward_infogain))
        # print("mean reward_bel_ent", np.mean(reward_bel_ent))
        # print("mean reward_hit", np.mean(reward_hit))
        # print("mean reward_dist", np.mean(reward_dist))
        # print("mean reward_inv_dist", np.mean(reward_inv_dist))

        total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps
        writer.add_scalar('mean_reward_train', np.mean(reward_train), total_num_steps)
        writer.add_scalar('mean_reward_block_penalty', np.mean(reward_block_penalty), total_num_steps)
        writer.add_scalar('mean_reward_bel_gt', np.mean(reward_bel_gt), total_num_steps)
        writer.add_scalar('mean_reward_bel_gt_nonlog', np.mean(reward_bel_gt_nonlog), total_num_steps)
        writer.add_scalar('mean_reward_infogain', np.mean(reward_infogain), total_num_steps)
        writer.add_scalar('mean_reward_bel_ent', np.mean(reward_bel_ent), total_num_steps)
        writer.add_scalar('mean_reward_hit', np.mean(reward_hit), total_num_steps)
        writer.add_scalar('mean_reward_dist', np.mean(reward_dist), total_num_steps)
        writer.add_scalar('mean_reward_inv_dist', np.mean(reward_inv_dist), total_num_steps)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args_iko.use_gae, args_iko.gamma,
                                 args_iko.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args_iko.save_interval == 0
                or j == num_updates - 1) and args_iko.save_dir != "":
            save_path = os.path.join(args_iko.save_dir, args_iko.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args_iko.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]
            torch.save(
                save_model,
                os.path.join(
                    save_path,
                    'ugl' + str(args_iko.use_gt_likelihood) +
                    'block-pen-' + str(args_iko.penalty_for_block) + '_' +
                    'explore-' + str(args_iko.rew_explore) + '_' +
                    'bel-new-' + str(args_iko.rew_bel_new) + '_' +
                    'bel-ent-' + str(args_iko.rew_bel_ent) + '_' +
                    'infogain-' + str(args_iko.rew_infogain) + '_' +
                    'bel-gt-nolog-' + str(args_iko.rew_bel_gt_nonlog) + '_' +
                    'bel-gt-' + str(args_iko.rew_bel_gt) + '_' +
                    'dist-' + str(args_iko.rew_dist) + '_' +
                    'hit-' + str(args_iko.rew_hit) + '_' +
                    'inv-dist-' + str(args_iko.rew_inv_dist) +
                    args_iko.algo + ".pt"))

        total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps

        if j % args_iko.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            # print("mean reward_a", np.mean(reward_a))   # reward_a is not defined in this script
            # print("mean_reward_b", np.mean(reward_b))   # reward_b is not defined in this script
            # print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
            #       format(j, total_num_steps,
            #              int(total_num_steps / (end - start)),
            #              len(episode_rewards),
            #              np.mean(episode_rewards),
            #              np.median(episode_rewards),
            #              np.min(episode_rewards),
            #              np.max(episode_rewards), dist_entropy,
            #              value_loss, action_loss))
            # writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps)
            # writer.add_scalar('min_reward', np.min(episode_rewards), total_num_steps)
            # writer.add_scalar('max_reward', np.max(episode_rewards), total_num_steps)
            # writer.add_scalar('success_rate', np.mean(episode_successes), total_num_steps)
            pass

        if (args_iko.eval_interval is not None and len(episode_rewards) > 1
                and j % args_iko.eval_interval == 0):
            eval_envs = make_vec_envs(args_iko.env_name,
                                      args_iko.seed + args_iko.num_processes,
                                      args_iko.num_processes, args_iko.gamma,
                                      eval_log_dir, args_iko.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args_iko.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args_iko.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args_iko.vis and j % args_iko.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args_iko.log_dir, args_iko.env_name,
                                  args_iko.algo, args_iko.num_env_steps)
            except IOError:
                pass

    writer.close()
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        if args.algo == 'ppo' and args.use_linear_lr_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n"
                  " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                  " min/max reward {:.1f}/{:.1f}, entropy {:.2f},"
                  " value loss {:.4f}, policy loss {:.4f}\n".format(
                      j, total_num_steps, int(total_num_steps / (end - start)),
                      len(episode_rewards), np.mean(episode_rewards),
                      np.median(episode_rewards), np.min(episode_rewards),
                      np.max(episode_rewards), dist_entropy, value_loss,
                      action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
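# The masks built in these loops (0.0 where an episode just ended, 1.0
# otherwise) are what lets a recurrent policy reset its hidden state between
# episodes. A sketch of that pattern (an assumption; the repos' recurrent bases
# also handle sequence unrolling and batching):
import torch
import torch.nn as nn

class TinyRecurrentBaseSketch(nn.Module):
    def __init__(self, num_inputs, hidden_size=64):
        super().__init__()
        self.gru = nn.GRUCell(num_inputs, hidden_size)

    def forward(self, x, hxs, masks):
        # Zero the hidden state wherever the previous step ended an episode,
        # then advance the recurrence by one step.
        hxs = self.gru(x, hxs * masks)
        return hxs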
agent.rollouts.obs[0].copy_(obs[1])
agent.rollouts.to(device)

# start training
agent.train()
start = time.time()
num_updates = int(args.num_env_steps // args.num_processes // args.num_steps)

for update in range(num_updates):
    # decrease learning rate linearly
    if args.use_linear_lr_decay:
        if args.share_optim:
            utils.update_linear_schedule(optimizer=agent.optimizer,
                                         update=update,
                                         total_num_updates=num_updates,
                                         initial_lr=args.pi_lr)
        else:
            utils.update_linear_schedule(optimizer=agent.policy_optimizer,
                                         update=update,
                                         total_num_updates=num_updates,
                                         initial_lr=args.pi_lr)
            utils.update_linear_schedule(optimizer=agent.value_fn_optimizer,
                                         update=update,
                                         total_num_updates=num_updates,
                                         initial_lr=args.v_lr)

    extrinsic_rewards = []
    episode_length = []
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") run_id = "alpha{}".format(args.gcn_alpha) if args.use_logger: from utils import Logger folder = "{}/{}".format(args.folder, run_id) logger = Logger(algo_name=args.algo, environment_name=args.env_name, folder=folder, seed=args.seed) logger.save_args(args) print("---------------------------------------") print('Saving to', logger.save_folder) print("---------------------------------------") else: print("---------------------------------------") print('NOTE : NOT SAVING RESULTS') print("---------------------------------------") all_rewards = [] envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, args.env_name, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, actor_critic.base.output_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) ############################ # GCN Model and optimizer from pygcn.train import update_graph from pygcn.models import GCN, GAT, SAGE assert args.gnn in ['gcn', 'gat', 'sage'] if args.gnn == 'gat': gcn_model = GAT(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden) elif args.gnn == 'sage': gcn_model = SAGE(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden) elif args.gnn == 'gcn': gcn_model = GCN(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden) gcn_model.to(device) gcn_optimizer = optim.Adam(gcn_model.parameters(), lr=args.gcn_lr, weight_decay=args.gcn_weight_decay) gcn_loss = nn.NLLLoss() gcn_states = [[] for _ in range(args.num_processes)] Gs = [nx.Graph() for _ in range(args.num_processes)] node_ptrs = [0 for _ in range(args.num_processes)] rew_states = [[] for _ in range(args.num_processes)] ############################ episode_rewards = deque(maxlen=100) avg_fwdloss = deque(maxlen=100) rew_rms = RunningMeanStd(shape=()) delay_rew = torch.zeros([args.num_processes, 1]) delay_step = torch.zeros([args.num_processes]) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob,\ recurrent_hidden_states, hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) delay_rew += reward delay_step += 1 for idx, (info, hid, eps_done) in enumerate(zip(infos, hidden_states, done)): if eps_done or delay_step[idx] == args.reward_freq: reward[idx] = delay_rew[idx] delay_rew[idx] = delay_step[idx] = 0 else: reward[idx] = 0 if 
'episode' in info.keys(): episode_rewards.append(info['episode']['r']) if args.gcn_alpha < 1.0: gcn_states[idx].append(hid) node_ptrs[idx] += 1 if not eps_done: Gs[idx].add_edge(node_ptrs[idx] - 1, node_ptrs[idx]) if reward[idx] != 0. or eps_done: rew_states[idx].append( [node_ptrs[idx] - 1, reward[idx]]) if eps_done: adj = nx.adjacency_matrix(Gs[idx]) if len(Gs[idx].nodes)\ else sp.csr_matrix(np.eye(1,dtype='int64')) update_graph(gcn_model, gcn_optimizer, torch.stack(gcn_states[idx]), adj, rew_states[idx], gcn_loss, args, envs) gcn_states[idx] = [] Gs[idx] = nx.Graph() node_ptrs[idx] = 0 rew_states[idx] = [] # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, hidden_states) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau, gcn_model, args.gcn_alpha) agent.update(rollouts) rollouts.after_update() ####################### Saving and book-keeping ####################### if (j % int(num_updates / 5.) == 0 or j == num_updates - 1) and args.save_dir != "": print('Saving model') print() save_dir = "{}/{}/{}".format(args.save_dir, args.folder, run_id) save_path = os.path.join(save_dir, args.algo, 'seed' + str(args.seed)) + '_iter' + str(j) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_gcn = gcn_model if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_gcn = copy.deepcopy(gcn_model).cpu() save_model = [ save_gcn, save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + "ac.pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {}\ training episodes: mean/median reward {:.2f}/{:.2f},\ min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n".format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards), )) all_rewards.append(np.mean(episode_rewards)) if args.use_logger: logger.save_task_results(all_rewards) ####################### Saving and book-keeping ####################### envs.close()
def main(): device = 'cpu' acc_steps = [] acc_scores = [] torch.set_num_threads(1) print('here') if args.env_name == 'Reacher-v2': rbf1 = build_features_reacher2(.2, 5, 2) len_rbf = rbf1._K len_features = len_rbf + 1 if args.env_name == 'Hopper-v2': len_features = 3 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space) actor_critic.to(device) agent = PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, len_features) print('here2') obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = collections.deque(maxlen=10) num_updates = 20 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule(agent.optimizer, j, num_updates, args.lr) agent.clip_param = args.clip_param * (1 - j / float(num_updates)) # Prepare demos demo_actions = np.zeros( (1, args.num_processes, envs.action_space.shape[0])) demo_states = np.zeros( (1, args.num_processes, envs.observation_space.shape[0])) demo_features = np.zeros((1, args.num_processes, len_features)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) # obs, reward and next obs demo_actions = np.concatenate( [demo_actions, action.reshape(1, args.num_processes, -1)], 0) demo_states = np.concatenate([ demo_states, rollouts.obs[step].reshape( 1, args.num_processes, -1) ], 0) feat_rewards = np.zeros((args.num_processes, len_features)) if args.env_name == 'Hopper-v2': if args.num_processes > 1: pos_before = envs.get_sim_data() obs, reward, done, infos = envs.step(action) if args.env_name == 'Hopper-v2': if args.num_processes > 1: pos_after = envs.get_sim_data() for num_p in range(args.num_processes): feat_1 = pos_after[num_p] - pos_before[num_p] feat_2 = 0 if not done[num_p]: feat_2 = 1 # feat_2 = np.array([1 for _ in range(args.num_processes)]) feat_3 = np.array( [np.linalg.norm(action[num_p], ord=2)**2]).flatten() feat_rewards[num_p] = np.array( [feat_1, feat_2, feat_3]) if args.env_name == 'Reacher-v2': if args.num_processes > 1: body_data = envs.get_body_data() for num_p in range(args.num_processes): rbf1_ = rbf1(body_data[num_p][:-1]) rbf4_ = np.array( [np.linalg.norm(action[num_p], ord=2)**2]) feat_rewards[num_p] = np.concatenate( (rbf1_.reshape(-1), rbf4_)) else: rbf1_ = rbf1( (envs.envs[0].env.env.get_body_com("fingertip") - envs.envs[0].env.env.get_body_com("target"))[:-1]) rbf4_ = np.array([-np.square(action[0]).sum()]) feat_rewards[0] = np.concatenate( (rbf1_.reshape(-1), rbf4_)) demo_features = np.concatenate([ demo_features, feat_rewards.reshape(1, args.num_processes, -1) ], 0) if step > 1 and step % 1000 == 0: done = [True for _ in range(args.num_processes)] for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, \ value, reward, masks, feat_rewards) # Save demos: action_file_name = demos_expe_dir + '/actions_step_' + str(j) + '.npy' state_file_name = demos_expe_dir + '/states_step_' + str(j) + '.npy' rew_feat_file_name = demos_expe_dir + '/rew_feat_step_' + str( j) + '.npy' policy_file_name = demos_expe_dir + '/policy_step_' + str(j) + '.pth' np.save(action_file_name, demo_actions) np.save(state_file_name, demo_states) np.save(rew_feat_file_name, demo_features) torch.save(actor_critic.state_dict(), policy_file_name) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir: save_path = os.path.join(args.save_dir, 'ppo') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + '.pt')) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: print('Updates', j, 'num timesteps', len(episode_rewards), '\n Last training episodes: mean/median reward', '{:.1f}'.format(np.mean(episode_rewards)), '/{:.1f}'.format(np.median(episode_rewards)), 'min/max reward', '{:.1f}'.format(np.min(episode_rewards)), '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy', dist_entropy, 'value loss', value_loss, 'action loss', action_loss) if len(episode_rewards) > 1: acc_steps.append(total_num_steps) acc_scores.append(np.mean(episode_rewards)) #print(acc_scores) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _ = actor_critic.act(obs, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print('Evaluation using', len(eval_episode_rewards), 'episodes: mean reward', '{:.5f}\n'.format(np.mean(eval_episode_rewards))) scores_file_name = args.scores_dir + '/learner_scores_' + args.env_name + '_' + args.expe + '.npy' steps_file_name = args.scores_dir + '/learner_steps_' + args.env_name + '_' + args.expe + '.npy' np.save(scores_file_name, np.array(acc_scores)) np.save(steps_file_name, np.array(acc_steps))
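# Several of these scripts checkpoint the policy together with the env's
# `ob_rms` observation-normalization statistics. A sketch of restoring such a
# [model, ob_rms] checkpoint for evaluation (assumed layout, matching the
# torch.save([...]) calls above; get_vec_normalize is the helper already used
# in these scripts):
import torch

def load_policy_with_ob_rms_sketch(checkpoint_path, eval_envs):
    actor_critic, ob_rms = torch.load(checkpoint_path, map_location='cpu')
    vec_norm = get_vec_normalize(eval_envs)
    if vec_norm is not None and ob_rms is not None:
        vec_norm.eval()            # freeze running statistics during evaluation
        vec_norm.ob_rms = ob_rms   # reuse the training-time normalization
    return actor_critic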
def main():
    '''
    Train PPO policies on each of the training environments.
    '''
    args = get_args()

    try:
        os.makedirs(args.log_dir)
    except OSError:
        pass

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args, device)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                     args.num_mini_batch, args.value_loss_coef,
                     args.entropy_coef, lr=args.lr, eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ep_reward = np.zeros(args.num_processes)
    episode_rewards = deque(maxlen=100)
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        # decrease learning rate linearly
        utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            if 'spaceship' in args.env_name:
                # spaceship, swimmer
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(reward[i].item())
            # elif 'swimmer' in args.env_name:
            else:
                for i in range(len(done)):
                    ep_reward[i] += reward[i].numpy().item()
                    if done[i]:
                        episode_rewards.append(ep_reward[i])
                        ep_reward[i] = 0
            # if 'ant' in args.env_name:
            #     for info in infos:
            #         if 'episode' in info.keys():
            #             episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, True, args.gamma, args.gae_lambda,
                                 True)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(args.save_dir)
            except OSError:
                pass

            torch.save(
                actor_critic.state_dict(),
                os.path.join(args.save_dir, "ppo.{}.env{}.seed{}.pt".format(
                    args.env_name, args.default_ind, args.seed)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("\nUpdates {}, num timesteps {}, Last {} training episodes:"
                  "\n mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}"
                  .format(j, total_num_steps, len(episode_rewards),
                          np.mean(episode_rewards), np.median(episode_rewards),
                          np.min(episode_rewards), np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, device)

    envs.close()
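# The evaluate(...) calls above delegate to an evaluation helper. A minimal
# sketch of such a helper (an assumed shape; the real one also builds eval envs
# from env_name/seed and wires in the ob_rms normalization shown above):
import numpy as np
import torch

def evaluate_sketch(actor_critic, eval_envs, device, num_episodes=10):
    eval_episode_rewards = []
    obs = eval_envs.reset()
    num_processes = obs.shape[0]
    eval_hxs = torch.zeros(num_processes,
                           actor_critic.recurrent_hidden_state_size,
                           device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    while len(eval_episode_rewards) < num_episodes:
        with torch.no_grad():
            _, action, _, eval_hxs = actor_critic.act(
                obs, eval_hxs, eval_masks, deterministic=True)

        obs, _, done, infos = eval_envs.step(action)
        eval_masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                  dtype=torch.float32, device=device)
        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()
    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards)))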
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(model, args.clip_param, args.value_loss_coef,
                    args.entropy_coef, initial_lr=args.lr, eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])  # why use obs from rollouts??? this looks wrong

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print("Updates {}, num timesteps {},\n"
                  " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                  " min/max reward {:.1f}/{:.1f}, entropy {:.2f},"
                  " value loss {:.4f}, policy loss {:.4f}\n".format(
                      j, total_num_steps, len(episode_rewards),
                      np.mean(episode_rewards), np.median(episode_rewards),
                      np.min(episode_rewards), np.max(episode_rewards),
                      dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name, args.seed,
                                        device)
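# bad_masks above flags time-limit truncations ('bad_transition'). A sketch of
# how return computation can use it so that a time-limit cutoff bootstraps from
# the value estimate instead of being treated as a true terminal state (assumed
# to be what the "proper time limits" option used elsewhere in these loops does):
import torch

def returns_with_time_limits_sketch(rewards, value_preds, masks, bad_masks,
                                    next_value, gamma):
    # rewards/value_preds: [T, N, 1]; masks/bad_masks: [T + 1, N, 1]; next_value: [N, 1]
    T = rewards.size(0)
    returns = torch.zeros(T + 1, *rewards.shape[1:])
    returns[-1] = next_value
    for step in reversed(range(T)):
        ret = returns[step + 1] * gamma * masks[step + 1] + rewards[step]
        returns[step] = (ret * bad_masks[step + 1]
                         + (1 - bad_masks[step + 1]) * value_preds[step])
    return returns[:-1]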
print("start training") obs = agent.world.reset() print("initial", obs) rollouts.obs[0].copy_(torch.from_numpy(obs)) rollouts.to(device) start = time.time() num_updates = int(total_supposed_steps) // num_steps // num_processes all_return = [] all_length = [] for j in range(num_updates): print("runs", j + 1) if use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule(agent.optimizer, j, num_updates, initial_lr) cumul_return = [] episo_length = [] episode_rewards = [] episode_lengths = 0.0 for step in range(num_steps): # Sample actions with torch.no_grad(): agent.model.eval() value, action, action_log_prob = agent.sample_actions( rollouts.obs[step], rollouts.masks[step], device) # Obser reward and next obs action_world = action.cpu().numpy().reshape(-1) obs, reward, done, success = agent.world.step(action_world) episode_lengths += 1.0
def main(algorithm, opt, loss, ppo, normalization, alpha, seed, num_processes, num_steps, num_test_steps, num_stack, log_interval, test_log_interval, num_frames, reset_encoder_in_test, freeze_in_test, environment, tasks, test_tasks, architecture, num_env_restarts, warmup_period_frames, final_period_frames, load_id, testing_frames, option_init, num_simultaneous_restarts, save_dir, cuda, add_timestep, _run): import os if not os.path.exists(save_dir): os.makedirs(save_dir) # ACKTR currently broken assert algorithm in ['a2c', 'ppo'] # If all tasks are ints, convert them to actual ints try: tasks = list(map(int, tasks)) test_tasks = list(map(int, test_tasks)) except: pass num_tasks = len(tasks) num_processes_per_task = num_processes // num_tasks # num_frames = num_frames PER TASK num_updates = int(num_frames) * num_tasks // num_steps // num_processes print('Num updates:{}\n'.format(num_updates)) assert num_updates > 0, 'num_updates is 0, increase number of frames' # There will be `num_env_restarts` within the time between warmup_updates:(num_updates - # final_updates) # This leaves some warmup period and final training period to inspect the fully trained options warmup_updates = int(warmup_period_frames) * \ num_tasks // num_steps // num_processes final_updates = int(final_period_frames) * \ num_tasks // num_steps // num_processes testing_updates = int(testing_frames) * \ num_tasks // num_test_steps // num_processes restart_interval = (num_updates - warmup_updates - final_updates) // (num_env_restarts + 1) print('Num tasks:{}\nNum processes per task:{}\n'.format( num_tasks, num_processes_per_task)) torch.manual_seed(seed) if cuda: torch.cuda.manual_seed(seed) print("#######") print( "WARNING: All rewards are clipped or normalized, but we are plotting the average return after clipping. 
Sacred plots will be inaccurate if per-timestep rewards are out of the range [-1, 1]" ) print("#######") torch.set_num_threads(1) envs = [ make_env(environment, seed, i, add_timestep) for i in range(num_tasks * num_processes_per_task) ] testing_envs = [ make_env(environment, seed, i, add_timestep) for i in range(num_tasks * num_processes_per_task) ] constraint = [] test_constraint = [] task_seed = [] for task in tasks: constraint += [task] * num_processes_per_task task_seed += [np.random.randint(LONG_NUMBER)] * num_processes_per_task for task in test_tasks: test_constraint += [task] * num_processes_per_task if num_processes > 1: envs = MTSubprocVecEnv(envs) testing_envs = MTSubprocVecEnv(testing_envs) else: envs = DummyVecEnv(envs) testing_envs = DummyVecEnv(testing_envs) if len(envs.observation_space.shape) == 1: envs = MTVecNormalize(envs, ob=normalization['ob'], ret=normalization['ret'], gamma=loss['gamma']) testing_envs = MTVecNormalize(testing_envs, ob=normalization['ob'], ret=False, gamma=loss['gamma']) returned_task_seed = envs.draw_and_set_task(constraint=constraint, seed=task_seed) testing_envs.draw_and_set_task(constraint=constraint, seed=returned_task_seed) print("Task seeds: {}".format(returned_task_seed)) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) hierarchical_actor_critic = HierarchicalPolicy(num_tasks, num_processes_per_task, alpha, obs_shape, envs.action_space, loss, architecture, option_init=option_init) if load_id is not None: docs = get_docs(db_uri, db_name, 'runs') doc = docs.find_one({'_id': load_id}) name = "model_after_training" # config = doc['config'] # config.update({'num_processes': len(config['tasks']), 'cuda': False}) file_id = get_file_id(doc=doc, file_name=name) save_file_from_db(file_id=file_id, destination='model_tmp_{}.pyt'.format(_run._id), db_uri=db_uri, db_name=db_name) state_dict = torch.load("model_tmp_{}.pyt".format(_run._id), map_location=lambda storage, loc: storage) hierarchical_actor_critic.load_state_dict(state_dict) os.remove('model_tmp_{}.pyt'.format(_run._id)) print("Loading model parameters complete.") if isinstance(envs, MTVecNormalize) and envs.ob_rms is not None: print("Loading ob_rms normalization") ob_name = name + ".npy" file_id = get_file_id(doc=doc, file_name=ob_name) save_file_from_db(file_id=file_id, destination='ob_rms_tmp.npy', db_uri=db_uri, db_name=db_name) rms_dict = np.load("ob_rms_tmp.npy")[()] print(rms_dict) envs.ob_rms.mean = rms_dict['mean'] envs.ob_rms.var = rms_dict['var'] envs.ob_rms.count = rms_dict['count'] testing_envs.ob_rms.mean = rms_dict['mean'] testing_envs.ob_rms.var = rms_dict['var'] testing_envs.ob_rms.count = rms_dict['count'] os.remove("ob_rms_tmp.npy") num_parameters = 0 for p in hierarchical_actor_critic.parameters(): num_parameters += p.nelement() num_params_master = 0 for p in hierarchical_actor_critic.masters[0].parameters(): num_params_master += p.nelement() num_params_option = 0 for p in hierarchical_actor_critic.options[0].parameters(): num_params_option += p.nelement() print(hierarchical_actor_critic) print("Total Number parameters: {}".format(num_parameters)) print("Number parameters master: {}".format(num_params_master)) print("Number parameters option: {}".format(num_params_option)) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if cuda: hierarchical_actor_critic.cuda() if algorithm == 'a2c': agent = algo.A2C(hierarchical_actor_critic, loss=loss, opt=opt) elif algorithm 
== 'ppo': agent = algo.PPO(hierarchical_actor_critic, loss, opt, ppo) elif algorithm == 'acktr': raise NotImplementedError("ACKTR not implemented with HRL") # agent = algo.A2C_ACKTR(hierarchical_actor_critic, value_loss_coef, # entropy_coef, acktr=True) def reset_envs(storage_length): rollouts = RolloutStorage(num_tasks, storage_length, num_processes_per_task, obs_shape, envs.action_space, loss) current_obs = torch.zeros(num_tasks, num_processes_per_task, *obs_shape) obs = envs.reset() update_current_obs(obs, current_obs, obs_shape, num_stack, num_tasks, num_processes_per_task) for task in range(num_tasks): rollouts.obs[task, 0].copy_(current_obs[task]) if cuda: current_obs = current_obs.cuda() rollouts.cuda() # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([num_tasks, num_processes_per_task, 1]) final_rewards = torch.zeros([num_tasks, num_processes_per_task, 1]) episode_length = torch.zeros([num_tasks, num_processes_per_task, 1]) final_length = torch.zeros([num_tasks, num_processes_per_task, 1]) episode_terminations = torch.zeros( [num_tasks, num_processes_per_task, 1]) final_terminations = torch.zeros( [num_tasks, num_processes_per_task, 1]) master_terminations = torch.zeros( [num_tasks, num_processes_per_task, 1]) final_master_terminations = torch.zeros( [num_tasks, num_processes_per_task, 1]) return (rollouts, current_obs, episode_rewards, final_rewards, episode_length, final_length, episode_terminations, final_terminations, master_terminations, final_master_terminations) rollouts, current_obs, episode_rewards, final_rewards, episode_length, final_length, \ episode_terminations, final_terminations, master_terminations, final_master_terminations = reset_envs( storage_length=num_steps) start = time.time() hierarchical_actor_critic.train() rollout_length = num_steps assert num_tasks >= num_simultaneous_restarts randomSampler = data.sampler.BatchSampler( data.sampler.RandomSampler(range(num_tasks)), batch_size=num_simultaneous_restarts, drop_last=True) rndSampler_iter = iter(randomSampler) iterator = iter(range(num_updates + testing_updates)) for j in iterator: # Load old model if load_id is given if load_id is not None and j == 0: # Skip to j == num_updates - 1 next(islice(iterator, num_updates - 2, num_updates - 2), None) j = next(iterator) ppo['use_linear_clip_decay'] = False opt['use_lr_decay'] = False # Updated Learning rate j_mod = j % num_updates lr_schedule_length = num_updates if j <= num_updates else testing_updates if opt['use_lr_decay']: update_linear_schedule(agent.optimizer, j_mod, lr_schedule_length, opt['lr']) # Update clip param if algorithm == 'ppo' and ppo['use_linear_clip_decay']: agent.clip_param = ppo['clip_param'] * \ (1 - j_mod / float(lr_schedule_length)) # Update c_kl_b if loss['c_kl_b_1'] is not None: per = np.clip((j - warmup_updates) / (num_updates - final_updates), 0, 1) cur_val = (1 - per) * loss['c_kl_b_orig'] + per * loss['c_kl_b_1'] rollouts.loss['c_kl_b'] = cur_val if not loss['fixed_a']: rollouts.loss['c_kl_a'] = cur_val # Update c_kl_a if loss['c_kl_a_1'] is not None: per = np.clip((j - warmup_updates) / (num_updates - final_updates), 0, 1) cur_val = (1 - per) * loss['c_kl_a_orig'] + per * loss['c_kl_a_1'] rollouts.loss['c_kl_a'] = cur_val # if not loss['fixed_b']: # rollouts.loss['c_kl_a'] = cur_val # Update entropy_coef train_progress = j / (num_updates - final_updates) if not agent.hierarchical_actor_critic.training: # Testing elc = loss['entropy_loss_coef_test'] elif loss['entropy_loss_coef_1'] 
        for step in range(rollout_length):
            # Sample actions
            """
            Note regarding z: z_t is treated the same way as s_t with regards
            to saving, because at t=0 we need access to s_{-1} and z_{-1}.
            HOWEVER, that means that the code is off by one compared to the
            equations:
                In equations: z_t depends on s_t     and z_{t-1}
                Here:         z_t depends on s_{t-1} and z_{t-1}
            """
            with torch.no_grad():
                b, b_log_prob, _ = hierarchical_actor_critic.executePolicy(
                    obs=rollouts.obs[:, step],
                    z=rollouts.z[:, step],
                    policy_type="termination",
                    masks=rollouts.masks[:, step])
                z, z_log_prob, _ = hierarchical_actor_critic.executePolicy(
                    obs=rollouts.obs[:, step],
                    z=rollouts.z[:, step],
                    policy_type="master",
                    b=b)
                action, action_log_prob, _ = hierarchical_actor_critic.executePolicy(
                    obs=rollouts.obs[:, step], z=z, policy_type="option")

                # Evaluate log probs for the regularized reward
                b_prior_log_prob = hierarchical_actor_critic.evaluatePrior(
                    obs=rollouts.obs[:, step],
                    z=rollouts.z[:, step],
                    action=b,
                    policy_type="termination",
                    masks=rollouts.masks[:, step])
                action_prior_log_prob = hierarchical_actor_critic.evaluatePrior(
                    obs=rollouts.obs[:, step],
                    z=z,
                    action=action,
                    policy_type="option")
                value_pred = hierarchical_actor_critic.get_U(
                    obs=rollouts.obs[:, step], previous_z=z)

            # Flatten actions:
            _, _, *action_shape = action.size()
            flat_action = action.view(num_tasks * num_processes_per_task,
                                      *action_shape)
            cpu_actions = flat_action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            single_obs_shape = obs.shape[1:]
            obs = np.reshape(np.stack(obs),
                             (num_tasks, num_processes_per_task) +
                             single_obs_shape)
            reward = np.reshape(np.stack(reward),
                                (num_tasks, num_processes_per_task))
            done = np.reshape(np.stack(done),
                              (num_tasks, num_processes_per_task))

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     2)).float()
            episode_rewards += reward
            episode_length += 1
            episode_terminations += b.cpu().float()
            delta_b = 1 - (z == rollouts.z[:, step]).int()
            master_terminations += delta_b.cpu().float()
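
            # The bookkeeping below follows the usual latch pattern: the
            # running episode_* counters accumulate every step, and whenever
            # an environment reports done, its running value is copied into
            # the matching final_* buffer via the (1 - masks) term and the
            # running counter is zeroed. Illustrative example: if
            # episode_rewards[task, proc] == 3.0 when done[task][proc] is
            # True, final_rewards[task, proc] becomes 3.0 and the running
            # counter is reset to 0.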
            # If done then clean the history of observations.
            masks = torch.ones((num_tasks, num_processes_per_task, 1),
                               dtype=torch.float32)
            for task in range(num_tasks):
                for process in range(num_processes_per_task):
                    masks[task, process] = 0.0 if done[task][process] else 1.0

            # Mask rewards
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            final_length *= masks
            final_length += (1 - masks) * episode_length
            episode_length *= masks

            final_terminations *= masks
            # It starts off with a termination
            final_terminations += (1 - masks) * (episode_terminations - 1)
            episode_terminations *= masks

            final_master_terminations *= masks
            # It starts off with a termination
            final_master_terminations += (1 - masks) * \
                (master_terminations - 1)
            master_terminations *= masks

            # Mask observations
            if cuda:
                masks = masks.cuda()
            if current_obs.dim() == 5:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, num_stack,
                               num_tasks, num_processes_per_task)
            rollouts.insert(current_obs=current_obs,
                            z=z,
                            b=b,
                            action=action,
                            value_pred=value_pred,
                            action_log_prob=action_log_prob,
                            action_prior_log_prob=action_prior_log_prob,
                            z_log_prob=z_log_prob,
                            b_log_prob=b_log_prob,
                            b_prior_log_prob=b_prior_log_prob,
                            reward=reward,
                            mask=masks)

        with torch.no_grad():
            # obs[-1] is s_{t+1} in the equations and z[-1] is z_t, i.e. the
            # last values we know.
            next_value_u = hierarchical_actor_critic.get_U(
                obs=rollouts.obs[:, -1], previous_z=rollouts.z[:, -1])
        rollouts.store_next_value(next_value_u)
        rollouts.compute_returns()

        losses = agent.update(rollouts)
        rollouts.after_update()

        # While still in training and in between warmup_updates and final_updates
        if warmup_updates < j < num_updates and j < (
                num_updates - final_updates) and (
                    j - warmup_updates) % restart_interval == 0:
            # Get tasks to reset
            try:
                next_restart_tasks = next(rndSampler_iter)
            except StopIteration:
                rndSampler_iter = iter(randomSampler)
                next_restart_tasks = next(rndSampler_iter)
            returned_task_seed = reset_task(next_restart_tasks,
                                            hierarchical_actor_critic,
                                            constraint, agent,
                                            returned_task_seed, envs,
                                            testing_envs)
            # load_master=train_load_master_params)
            # Unfortunately there isn't a simple way to restart only the
            # environment that was reset.
            (rollouts, current_obs, episode_rewards, final_rewards,
             episode_length, final_length, episode_terminations,
             final_terminations, master_terminations,
             final_master_terminations) = reset_envs(storage_length=num_steps)

        # When we reach the end of the training phase, reset all tasks.
        if j == num_updates - 1:
            save_model(hierarchical_actor_critic, "model_after_training", envs)
            print("Reset all tasks, stop updating prior, start testing")
            last_training_task_seed = returned_task_seed.copy()
            hierarchical_actor_critic.eval()
            returned_task_seed = reset_task(
                restart_tasks=range(num_tasks),
                hierarchical_actor_critic=hierarchical_actor_critic,
                constraint=test_constraint,
                agent=agent,
                returned_task_seed=returned_task_seed,
                envs=envs,
                testing_envs=testing_envs)

            print("Freezing and resetting for test")
            hierarchical_actor_critic.frozen['prior'] = freeze_in_test['prior']
            hierarchical_actor_critic.frozen['option'] = freeze_in_test[
                'option']
            if architecture['shared_encoder']:
                # This will create a new encoder!
                hierarchical_actor_critic.split_encoder()
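
            # At the train -> test boundary the prior and option policies can
            # be frozen and the encoders optionally re-initialised (below)
            # before the optimizer is rebuilt; the apparent intent is that
            # only the components allowed to adapt at test time keep learning.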
            if reset_encoder_in_test['option']:
                hierarchical_actor_critic.reset_encoder('option')
            if reset_encoder_in_test['master']:
                hierarchical_actor_critic.reset_encoder('master')
            agent.init_optimizer(hierarchical_actor_critic)

            # Unfortunately there isn't a simple way to restart only the
            # environment that was reset.
            (rollouts, current_obs, episode_rewards, final_rewards,
             episode_length, final_length, episode_terminations,
             final_terminations, master_terminations,
             final_master_terminations) = reset_envs(
                 storage_length=num_test_steps)
            rollout_length = num_test_steps

        if (j < num_updates and j % log_interval == 0) or (
                j >= num_updates and j % test_log_interval == 0):
            test_performance = test_policy(testing_envs,
                                           hierarchical_actor_critic)
            end = time.time()
            if j % (log_interval * 10) == 0:
                printHeader()

            if j < num_updates:
                total_num_steps = (j + 1) * num_processes * num_steps
            else:
                total_num_steps = (num_updates * num_steps +
                                   (j + 1 - num_updates) * num_test_steps
                                   ) * num_processes
            # FPS per task (because num_frames is also counted per task!)
            fps = int(total_num_steps / num_tasks / (end - start))

            logging.info('Updt: {:5} |{:5} {:5}|{:5}|{:5}|{:5}'.format(
                str(j / num_updates)[:5],
                str(fps),
                str(final_rewards.mean().item())[:5],
                str(final_rewards.median().item())[:5],
                str(final_rewards.min().item())[:5],
                str(final_rewards.max().item())[:5],
            ))

            for task in range(num_tasks):
                _run.log_scalar('return.avg.{}'.format(task),
                                float(final_rewards[task].mean()),
                                total_num_steps // num_tasks)
                _run.log_scalar('return.test.avg.{}'.format(task),
                                float(test_performance[task].mean()),
                                total_num_steps // num_tasks)
            _run.log_scalar('return.avg', final_rewards.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('return.test.avg', test_performance.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('episode.length', final_length.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('episode.terminations',
                            final_terminations.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('episode.master_terminations',
                            final_master_terminations.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('fps', fps, total_num_steps // num_tasks)
            _run.log_scalar('loss.value', losses['value_loss'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.action_a', losses['action_loss_a'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.action_z', losses['action_loss_z'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.action_b', losses['action_loss_b'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.action_prior', losses['action_prior_loss'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.b_prior', losses['b_prior_loss'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.entropy_a', losses['entropy_a'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.entropy_b', losses['entropy_b'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.entropy_z', losses['entropy_z'],
                            total_num_steps // num_tasks)

    _run.info["seeds_final"] = returned_task_seed
    # _run.info["last_training_task_seed"] = last_training_task_seed
    _run.info["constraints_final"] = constraint
    _run.info['test_constraints_final'] = test_constraint
    save_model(hierarchical_actor_critic, "final_model", envs)