def evaluate(actor_critic, ob_rms, env_name, seed, num_processes,
             eval_log_dir, device, custom_gym, save_path):
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              None, eval_log_dir, device, True, custom_gym)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []
    eval_episode_length = []
    eval_episode_success_rate = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    while len(eval_episode_rewards) < num_processes * 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs,
                eval_recurrent_hidden_states,
                eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, _, done, infos = eval_envs.step(action)

        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32,
            device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])
                eval_episode_length.append(info['episode']['l'])
                eval_episode_success_rate.append(
                    info['was_successful_trajectory'])

    eval_envs.close()

    print(
        " Evaluation using {} episodes: mean reward {:.5f}, mean length {:.2f}, mean success {:.2f} \n"
        .format(len(eval_episode_rewards), np.mean(eval_episode_rewards),
                np.mean(eval_episode_length),
                np.mean(eval_episode_success_rate)))

    if actor_critic.max_eval_success_rate <= np.mean(eval_episode_success_rate):
        actor_critic.max_eval_success_rate = np.mean(eval_episode_success_rate)
        torch.save([
            actor_critic,
            getattr(utils.get_vec_normalize(eval_envs), 'ob_rms', None)
        ], os.path.join(save_path, str(seed) + "_best_test.pt"))
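# A minimal sketch (not part of the original module) of how the
# "<seed>_best_test.pt" checkpoint written above could be reloaded for
# inference. It assumes the same make_vec_envs / utils.get_vec_normalize
# helpers already used in this file; the function name and the use of a
# single eval process are illustrative choices, not the project's API.
import os

import torch


def load_best_checkpoint(save_path, seed, env_name, device, custom_gym):
    # evaluate() above saves a [actor_critic, ob_rms] pair with torch.save.
    actor_critic, ob_rms = torch.load(
        os.path.join(save_path, str(seed) + "_best_test.pt"),
        map_location=device)

    envs = make_vec_envs(env_name, seed, 1, None, None, device, True,
                         custom_gym)
    vec_norm = utils.get_vec_normalize(envs)
    if vec_norm is not None:
        # Freeze the normalization statistics and reuse the training-time ones.
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms
    return actor_critic, envs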
def __init__(self, args, actor_critic, device): eval_args = args #eval_args.render = True self.device = device #if args.model == 'fractal': # for i in range(-1, args.n_recs): # eval_log_dir = args.log_dir + "_eval_col_{}".format(i) # try: # os.makedirs(eval_log_dir) # except OSError: # files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv')) # for f in files: # os.remove(f) # setattr(self, 'eval_log_dir_col_{}'.format(i), eval_log_dir) self.eval_log_dir = args.log_dir + "_eval" try: os.makedirs(self.eval_log_dir) except OSError: files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv')) for f in files: os.remove(f) self.num_eval_processes = 2 self.eval_envs = make_vec_envs( eval_args.env_name, eval_args.seed + self.num_eval_processes, self.num_eval_processes, eval_args.gamma, self.eval_log_dir, eval_args.add_timestep, self.device, True, args=eval_args) self.vec_norm = get_vec_normalize(self.eval_envs) if self.vec_norm is not None: self.vec_norm.eval() self.vec_norm.ob_rms = get_vec_normalize(self.eval_envs).ob_rms self.actor_critic = actor_critic self.tstart = time.time() fieldnames = ['r', 'l', 't'] if args.model == 'fractal': n_cols = actor_critic.base.n_cols for i in range(-1, n_cols): log_file_col = open('{}/col_{}_eval.csv'.format(self.eval_log_dir, i), mode='w') setattr(self, 'log_file_col_{}'.format(i), log_file_col) writer_col = csv.DictWriter(log_file_col, fieldnames=fieldnames) setattr(self, 'writer_col_{}'.format(i), writer_col) writer_col.writeheader() log_file_col.flush() else: self.log_file = open('{}/col_evals.csv'.format(self.eval_log_dir), mode='w') self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames) self.writer.writeheader() self.log_file.flush() self.args = eval_args
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes,
             eval_log_dir, device):
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              None, eval_log_dir, device, True)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs,
                eval_recurrent_hidden_states,
                eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, _, done, infos = eval_envs.step(action)

        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32,
            device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()

    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards)))
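# utils.get_vec_normalize, used throughout these scripts, is assumed to walk
# the wrapper chain of a vectorized env and return the normalization wrapper,
# if any. A minimal duck-typed sketch of that idea is shown below; the
# project's own helper may differ in how it identifies the wrapper.
def get_vec_normalize(venv):
    # The normalization wrapper is recognized by its running obs statistics.
    if hasattr(venv, 'ob_rms'):
        return venv
    if hasattr(venv, 'venv'):
        # Recurse through nested VecEnv wrappers.
        return get_vec_normalize(venv.venv)
    return None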
def main(): config = None args = get_args() config, checkpoint = get_config_and_checkpoint(args) set_random_seeds(args, config) eval_log_dir = args.save_dir + "_eval" try: os.makedirs(args.save_dir) os.makedirs(eval_log_dir) except OSError: pass now = datetime.datetime.now() experiment_name = args.experiment_name + '_' + now.strftime( "%Y-%m-%d_%H-%M-%S") # Create checkpoint file save_dir_model = os.path.join(args.save_dir, 'model', experiment_name) save_dir_config = os.path.join(args.save_dir, 'config', experiment_name) try: os.makedirs(save_dir_model) os.makedirs(save_dir_config) except OSError as e: logger.error(e) exit() if args.config: shutil.copy2(args.config, save_dir_config) # Tensorboard Logging writer = SummaryWriter( os.path.join(args.save_dir, 'tensorboard', experiment_name)) # Logger that writes to STDOUT and a file in the save_dir logger = setup_carla_logger(args.save_dir, experiment_name) device = torch.device("cuda:0" if args.cuda else "cpu") norm_reward = not config.no_reward_norm norm_obs = not config.no_obs_norm assert not (config.num_virtual_goals > 0) or ( config.reward_class == 'SparseReward'), 'Cant use HER with dense reward' obs_converter = CarlaObservationConverter( h=84, w=84, rel_coord_system=config.rel_coord_system) action_converter = CarlaActionsConverter(config.action_type) envs = make_vec_envs(obs_converter, action_converter, args.starting_port, config.seed, config.num_processes, config.gamma, device, config.reward_class, num_frame_stack=1, subset=config.experiments_subset, norm_reward=norm_reward, norm_obs=norm_obs, apply_her=config.num_virtual_goals > 0, video_every=args.video_interval, video_dir=os.path.join(args.save_dir, 'video', experiment_name)) if config.agent == 'forward': agent = agents.ForwardCarla() if config.agent == 'a2c': agent = agents.A2CCarla(obs_converter, action_converter, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, alpha=config.alpha, max_grad_norm=config.max_grad_norm) elif config.agent == 'acktr': agent = agents.A2CCarla(obs_converter, action_converter, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, alpha=config.alpha, max_grad_norm=config.max_grad_norm, acktr=True) elif config.agent == 'ppo': agent = agents.PPOCarla(obs_converter, action_converter, config.clip_param, config.ppo_epoch, config.num_mini_batch, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, max_grad_norm=config.max_grad_norm) if checkpoint is not None: load_modules(agent.optimizer, agent.model, checkpoint) rollouts = RolloutStorage(config.num_steps, config.num_processes, envs.observation_space, envs.action_space, 20, config.num_virtual_goals, config.rel_coord_system, obs_converter) obs = envs.reset() # Save the first observation obs = obs_to_dict(obs) rollouts.obs = obs_to_dict(rollouts.obs) for k in rollouts.obs: rollouts.obs[k][rollouts.step + 1].copy_(obs[k]) rollouts.obs = dict_to_obs(rollouts.obs) rollouts.to(device) start = time.time() total_steps = 0 total_episodes = 0 total_reward = 0 episode_reward = torch.zeros(config.num_processes) for j in range(config.num_updates): for step in range(config.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = agent.act( rollouts.get_obs(step), rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Observe reward and next obs obs, reward, done, info = envs.step(action) # For logging purposes carla_rewards = torch.tensor([i['carla-reward'] for i in info], dtype=torch.float) 
episode_reward += carla_rewards total_reward += carla_rewards.sum().item() total_steps += config.num_processes if done.any(): total_episodes += done.sum() torch_done = torch.tensor(done.astype(int)).byte() mean_episode_reward = episode_reward[torch_done].mean().item() logger.info('{} episode(s) finished with reward {}'.format( done.sum(), mean_episode_reward)) writer.add_scalar('train/mean_ep_reward_vs_steps', mean_episode_reward, total_steps) writer.add_scalar('train/mean_ep_reward_vs_episodes', mean_episode_reward, total_episodes) episode_reward[torch_done] = 0 # If done then clean the history of observations. masks = torch.FloatTensor(1 - done) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks.unsqueeze(-1)) if config.num_virtual_goals > 0: rollouts.apply_her(config.num_virtual_goals, device, beta=config.beta) with torch.no_grad(): next_value = agent.get_value( rollouts.get_obs(-1), # Get last observation rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, config.use_gae, config.gamma, config.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "" and config.agent != 'forward': save_path = os.path.join(save_dir_model, str(j) + '.pth.tar') save_modules(agent.optimizer, agent.model, args, config, save_path) total_num_steps = (j + 1) * config.num_processes * config.num_steps if j % args.log_interval == 0: # Logging to the stdout/our logs end = time.time() logger.info('------------------------------------') logger.info('Episodes {}, Updates {}, num timesteps {}, FPS {}'\ .format(total_episodes, j + 1, total_num_steps, total_num_steps / (end - start))) logger.info('------------------------------------') # Logging to tensorboard writer.add_scalar('train/cum_reward_vs_steps', total_reward, total_steps) writer.add_scalar('train/cum_reward_vs_updates', total_reward, j + 1) if config.agent in ['a2c', 'acktr', 'ppo']: writer.add_scalar('debug/value_loss_vs_steps', value_loss, total_steps) writer.add_scalar('debug/value_loss_vs_updates', value_loss, j + 1) writer.add_scalar('debug/action_loss_vs_steps', action_loss, total_steps) writer.add_scalar('debug/action_loss_vs_updates', action_loss, j + 1) writer.add_scalar('debug/dist_entropy_vs_steps', dist_entropy, total_steps) writer.add_scalar('debug/dist_entropy_vs_updates', dist_entropy, j + 1) # Sample the last reward writer.add_scalar('debug/sampled_normalized_reward_vs_steps', reward.mean(), total_steps) writer.add_scalar('debug/sampled_normalized_reward_vs_updates', reward.mean(), j + 1) writer.add_scalar('debug/sampled_carla_reward_vs_steps', carla_rewards.mean(), total_steps) writer.add_scalar('debug/sampled_carla_reward_vs_updates', carla_rewards.mean(), j + 1) if (args.eval_interval is not None and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.starting_port, obs_converter, args.x + config.num_processes, config.num_processes, config.gamma, eval_log_dir, config.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros(config.num_processes, 20, device=device) eval_masks = torch.zeros(config.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = agent.act( obs, 
eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs carla_obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() logger.info( " Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards)))
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         args.custom_gym)

    base = SEVN
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'ppo':
        agent = PPO(actor_critic,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode_length = deque(maxlen=10)
    episode_success_rate = deque(maxlen=100)
    episode_total = 0

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    episode_success_rate.append(
                        info['was_successful_trajectory'])
                    episode_total += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # `writer` is assumed to be a TensorBoard SummaryWriter created
            # elsewhere in the original module.
            writer.add_scalars('Train/Episode Reward', {
                "Reward Mean": np.mean(episode_rewards),
                "Reward Min": np.min(episode_rewards),
                "Reward Max": np.max(episode_rewards)
            }, global_step=total_num_steps)
            writer.add_scalars('Train/Episode Length', {
                "Episode Length Mean": np.mean(episode_length),
                "Episode Length Min": np.min(episode_length),
                "Episode Length Max": np.max(episode_length)
            }, global_step=total_num_steps)
            writer.add_scalar("Train/Episode Reward Mean",
                              np.mean(episode_rewards),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Length Mean",
                              np.mean(episode_length),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Success Rate",
                              np.mean(episode_success_rate),
                              global_step=total_num_steps)
            print(
                "Updates {}, num timesteps {}, FPS {} \n"
                " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " min/max reward {:.1f}/{:.1f}\n"
                " dist_entropy {:.1f}, value/action loss {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
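# The loop above builds `bad_masks` from the 'bad_transition' flag and passes
# args.use_proper_time_limits into rollouts.compute_returns. A sketch of how
# that flag is typically consumed, modeled on the upstream
# pytorch-a2c-ppo-acktr RolloutStorage (the actual implementation here may
# differ): at a time-limit truncation the return bootstraps from the critic's
# value estimate instead of treating the state as terminal.
import torch


def compute_returns_with_time_limits(rewards, value_preds, masks, bad_masks,
                                     next_value, gamma):
    # rewards: [num_steps, ...]; masks/bad_masks/value_preds: [num_steps+1, ...]
    returns = torch.zeros(rewards.size(0) + 1, *rewards.size()[1:])
    returns[-1] = next_value
    for step in reversed(range(rewards.size(0))):
        # Standard discounted bootstrap, zeroed at true episode ends (masks)...
        bootstrapped = returns[step + 1] * gamma * masks[step + 1] + rewards[step]
        # ...but when the episode ended only because of a time limit
        # (bad_masks == 0), fall back to the critic's value prediction.
        returns[step] = (bootstrapped * bad_masks[step + 1] +
                         (1 - bad_masks[step + 1]) * value_preds[step])
    return returns[:-1]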
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) average_actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) average_actor_critic.load_state_dict(actor_critic.state_dict()) actor_critic.to(device) average_actor_critic.to(device) agent = algo.ACER_AGENT(actor_critic, average_actor_critic, args.value_loss_coef, args.entropy_coef, args.gamma, args.clip, args.no_trust_region, args.alpha, args.delta, lr=args.lr, eps=args.eps, rms_alpha=args.rms_alpha, max_grad_norm=args.max_grad_norm) buffer = Buffer(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, args.buffer_size) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) off_rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) off_rollouts.to(device) episode_rewards = deque(maxlen=10) acer = algo.ACER(actor_critic, rollouts, off_rollouts, buffer, episode_rewards, agent, envs) start = time.time() for j in range(num_updates): # On-policy ACER value_loss, action_loss, dist_entropy = acer.call(on_policy=True) if args.replay_ratio > 0 and buffer.has_atleast(args.replay_start): # Off-policy ACER n = np.random.poisson(args.replay_ratio) for _ in range(n): acer.call(on_policy=False) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \nLast {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\ndist_entropy {:.1f}, value/action loss {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) eval_episode_rewards = [] obs = eval_envs.reset().to(device) eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, _, _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and 
next obs obs, _, done, infos = eval_envs.step(action) obs = obs.to(device) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]).to(device) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards)))
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args_iko.cuda else "cpu") if args_iko.vis: from visdom import Visdom viz = Visdom(port=args_iko.port) win = None envs = make_vec_envs(args_iko.env_name, args_iko.seed, args_iko.num_processes, args_iko.gamma, args_iko.log_dir, args_iko.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args_iko.recurrent_policy}) actor_critic.to(device) action_shape = 3 reward_model = RewardModel(11 * 11 * 6, 1, 64, 64) reward_model.to(device) if args_iko.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args_iko.value_loss_coef, args_iko.entropy_coef, lr=args_iko.lr, eps=args_iko.eps, alpha=args_iko.alpha, max_grad_norm=args_iko.max_grad_norm) elif args_iko.algo == 'ppo': agent = algo.PPO(actor_critic, args_iko.clip_param, args_iko.ppo_epoch, args_iko.num_mini_batch, args_iko.value_loss_coef, args_iko.entropy_coef, args_iko.use_singh, reward_model, lr=args_iko.lr, eps=args_iko.eps, max_grad_norm=args_iko.max_grad_norm) elif args_iko.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args_iko.value_loss_coef, args_iko.entropy_coef, acktr=True) rollouts = RolloutStorage(args_iko.num_steps, args_iko.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): if args_iko.use_linear_lr_decay: # decrease learning rate linearly if args_iko.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args_iko.lr) if args_iko.algo == 'ppo' and args_iko.use_linear_clip_decay: agent.clip_param = args_iko.clip_param * (1 - j / float(num_updates)) reward_train = [] reward_block_penalty = [] reward_bel_gt = [] reward_bel_gt_nonlog = [] reward_infogain = [] reward_bel_ent = [] reward_hit = [] reward_dist = [] reward_inv_dist = [] for step in range(args_iko.num_steps): # Sample actions # print(step, args_iko.num_steps) with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) reward_train.append(reward) # print("infos is ", infos) # reward_b.append(infos[0]['auxillary_reward']) # print("infos is ",infos[0]['auxillary_reward']) reward_block_penalty.append(infos[0]['reward_block_penalty']) reward_bel_gt.append(infos[0]['reward_bel_gt']) reward_bel_gt_nonlog.append(infos[0]['reward_bel_gt_nonlog']) reward_infogain.append(infos[0]['reward_infogain']) reward_bel_ent.append(infos[0]['reward_bel_ent']) reward_hit.append(infos[0]['reward_hit']) reward_dist.append(infos[0]['reward_dist']) reward_inv_dist.append(infos[0]['reward_inv_dist']) # print(reward) reward.to(device) reward_model.to(device) if args_iko.use_singh: # print("using learning IR") my_reward = reward_model(obs.clone().to(device), action.clone().float()).detach() my_reward.to(device) reward = reward + args_iko.singh_coef * my_reward.type( torch.FloatTensor) # for info in infos: # if 'episode' in info.keys(): # episode_rewards.append(info['episode']['r']) # print("infos is ",infos[0]['auxillary_reward']) # print("info is",info['episode']['r'] ) # If done then clean the history of 
observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) # print("mean reward_a", np.mean(reward_train)) # print("mean reward_block_penalty", np.mean(reward_block_penalty)) # print("mean reward_bel_gt", np.mean(reward_bel_gt)) # print("mean reward_bel_gt_nonlog", np.mean(reward_bel_gt_nonlog)) # print("mean reward_infogain", np.mean(reward_infogain)) # print("mean reward_bel_ent", np.mean(reward_bel_ent)) # print("mean reward_hit", np.mean(reward_hit)) # print("mean reward_dist", np.mean(reward_dist)) # print("mean reward_inv_dist", np.mean(reward_inv_dist)) total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps writer.add_scalar('mean_reward_train', np.mean(reward_train), total_num_steps) writer.add_scalar('mean_reward_block_penalty', np.mean(reward_block_penalty), total_num_steps) writer.add_scalar('mean_reward_bel_gt', np.mean(reward_bel_gt), total_num_steps) writer.add_scalar('mean_reward_bel_gt_nonlog', np.mean(reward_bel_gt_nonlog), total_num_steps) writer.add_scalar('mean_reward_infogain', np.mean(reward_infogain), total_num_steps) writer.add_scalar('mean_reward_bel_ent', np.mean(reward_bel_ent), total_num_steps) writer.add_scalar('mean_reward_hit', np.mean(reward_hit), total_num_steps) writer.add_scalar('mean_reward_dist', np.mean(reward_dist), total_num_steps) writer.add_scalar('mean_reward_inv_dist', np.mean(reward_inv_dist), total_num_steps) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args_iko.use_gae, args_iko.gamma, args_iko.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args_iko.save_interval == 0 or j == num_updates - 1) and args_iko.save_dir != "": save_path = os.path.join(args_iko.save_dir, args_iko.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args_iko.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save( save_model, os.path.join( save_path, 'ugl' + str(args_iko.use_gt_likelihood) + 'block-pen-' + str(args_iko.penalty_for_block) + '_' + 'explore-' + str(args_iko.rew_explore) + '_' + 'bel-new-' + str(args_iko.rew_bel_new) + '_' + 'bel-ent-' + str(args_iko.rew_bel_ent) + '_' + 'infogain-' + str(args_iko.rew_infogain) + '_' + 'bel-gt-nolog-' + str(args_iko.rew_bel_gt_nonlog) + '_' + 'bel-gt-' + str(args_iko.rew_bel_gt) + '_' + 'dist-' + str(args_iko.rew_dist) + '_' + 'hit-' + str(args_iko.rew_hit) + '_' + 'inv-dist-' + str(args_iko.rew_inv_dist) + args_iko.algo + ".pt")) total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps if j % args_iko.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("mean reward_a", np.mean(reward_a)) print("mean_reward_b", np.mean(reward_b)) # print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n". 
# format(j, total_num_steps, # int(total_num_steps / (end - start)), # len(episode_rewards), # np.mean(episode_rewards), # np.median(episode_rewards), # np.min(episode_rewards), # np.max(episode_rewards), dist_entropy, # value_loss, action_loss)) # writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps) # writer.add_scalar('min_reward', np.min(episode_rewards), total_num_steps) # writer.add_scalar('max_reward', np.max(episode_rewards), total_num_steps) # writer.add_scalar('success_rate', np.mean(episode_successes), total_num_steps) if (args_iko.eval_interval is not None and len(episode_rewards) > 1 and j % args_iko.eval_interval == 0): eval_envs = make_vec_envs(args_iko.env_name, args_iko.seed + args_iko.num_processes, args_iko.num_processes, args_iko.gamma, eval_log_dir, args_iko.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args_iko.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args_iko.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args_iko.vis and j % args_iko.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args_iko.log_dir, args_iko.env_name, args_iko.algo, args_iko.num_env_steps) except IOError: pass writer.close()
def main(): device = 'cpu' acc_steps = [] acc_scores = [] torch.set_num_threads(1) envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) # get cloned policy and recovered reward function policy_reward_dir = args.rewards_dir policy_dir = args.policies_dir policy_reward = Policy(envs.observation_space.shape, envs.action_space) policy_reward_file_name = policy_reward_dir + '/reward_' + args.expe + '.pth' policy_reward_sd = torch.load(policy_reward_file_name) policy_reward.load_state_dict(policy_reward_sd) actor_critic = Policy(envs.observation_space.shape, envs.action_space) policy_file_name = policy_dir + '/last_policy_' + args.expe + '.pth' policy_sd = torch.load(policy_file_name) actor_critic.load_state_dict(policy_sd) actor_critic.to(device) agent = PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = collections.deque(maxlen=10) for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule(agent.optimizer, j, num_updates, args.lr) agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) obs, _, done, infos = envs.step(action) if step > 1 and step % 1000 == 0: done = True # use infered reward: with torch.no_grad(): # _, reward = shapes(rollouts.obs[step], 0) _, action_log_probs, _, _ = policy_reward.evaluate_actions( rollouts.obs[step], None, None, action) reward = action_log_probs for info in infos: # if 'episode' in info.keys(): # episode_rewards.append(info['episode']['r']) r = 0 for key, val in info.items(): if 'reward' in key: r += val episode_rewards.append(r) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir: save_path = os.path.join(args.save_dir, 'ppo') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)] torch.save(save_model, os.path.join(save_path, args.env_name + '.pt')) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: print('Updates', j, 'num timesteps', len(episode_rewards), '\n Last training episodes: mean/median reward', '{:.1f}'.format(np.mean(episode_rewards)), '/{:.1f}'.format(np.median(episode_rewards)), 'min/max reward', '{:.1f}'.format(np.min(episode_rewards)), '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy', dist_entropy, 'value loss', value_loss, 'action loss', action_loss) if len(episode_rewards) > 1: acc_steps.append(total_num_steps) acc_scores.append(np.mean(episode_rewards)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _ = actor_critic.act( obs, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print('Evaluation using', len(eval_episode_rewards), 'episodes: mean reward', '{:.5f}\n'.format(np.mean(eval_episode_rewards))) scores_file_name = args.scores_dir + '/learner_scores_' + args.expe + '.npy' steps_file_name = args.scores_dir + '/learner_steps_' + args.expe + '.npy' np.save(scores_file_name, np.array(acc_scores)) np.save(steps_file_name, np.array(acc_steps))
def __init__(self, args, im_log_dir): self.im_log_dir = im_log_dir self.log_dir = args.load_dir env_name = args.env_name if torch.cuda.is_available() and not args.no_cuda: args.cuda = True device = torch.device('cuda') map_location = torch.device('cuda') else: args.cuda = False device = torch.device('cpu') map_location = torch.device('cpu') try: checkpoint = torch.load(os.path.join(args.load_dir, env_name + '.tar'), map_location=map_location) except FileNotFoundError: print('load-dir does not start with valid gym environment id, using command line args') env_name = args.env_name checkpoint = torch.load(os.path.join(args.load_dir, env_name + '.tar'), map_location=map_location) saved_args = checkpoint['args'] past_frames = checkpoint['n_frames'] args.past_frames = past_frames env_name = saved_args.env_name if 'Micropolis' in env_name: args.power_puzzle = saved_args.power_puzzle if not args.evaluate and not 'GoLMulti' in env_name: # assume we just want to observe/interact w/ a single env. args.num_proc = 1 dummy_args = args envs = make_vec_envs(env_name, args.seed + 1000, args.num_processes, None, args.load_dir, args.add_timestep, device=device, allow_early_resets=False, args=dummy_args) print(args.load_dir) if isinstance(envs.observation_space, gym.spaces.Discrete): in_width = 1 num_inputs = envs.observation_space.n elif isinstance(envs.observation_space, gym.spaces.Box): if len(envs.observation_space.shape) == 3: in_w = envs.observation_space.shape[1] in_h = envs.observation_space.shape[2] else: in_w = 1 in_h = 1 num_inputs = envs.observation_space.shape[0] if isinstance(envs.action_space, gym.spaces.Discrete): out_w = 1 out_h = 1 num_actions = int(envs.action_space.n // (in_w * in_h)) #if 'Micropolis' in env_name: # num_actions = env.venv.venv.envs[0].num_tools #elif 'GameOfLife' in env_name: # num_actions = 1 #else: # num_actions = env.action_space.n elif isinstance(envs.action_space, gym.spaces.Box): out_w = envs.action_space.shape[0] out_h = envs.action_space.shape[1] num_actions = envs.action_space.shape[-1] # We need to use the same statistics for normalization as used in training #actor_critic, ob_rms = \ # torch.load(os.path.join(args.load_dir, args.env_name + ".pt")) if saved_args.model == 'fractal': saved_args.model = 'FractalNet' actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'map_width': args.map_width, 'recurrent': args.recurrent_policy, 'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs, 'out_w': out_w, 'out_h': out_h }, curiosity=args.curiosity, algo=saved_args.algo, model=saved_args.model, args=saved_args) actor_critic.to(device) torch.nn.Module.dump_patches = True actor_critic.load_state_dict(checkpoint['model_state_dict']) ob_rms = checkpoint['ob_rms'] if 'fractal' in args.model.lower(): new_recs = args.n_recs - saved_args.n_recs for nr in range(new_recs): actor_critic.base.auto_expand() print('expanded network:\n', actor_critic.base) if args.active_column is not None \ and hasattr(actor_critic.base, 'set_active_column'): actor_critic.base.set_active_column(args.active_column) vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms self.actor_critic = actor_critic self.envs = envs self.args = args
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) print('args.lr') print(args.lr) # print('args.stat_decay') # print(args.stat_decay) # sys.exit() if args.algo == 'a2c': # print('args.eps') # print(args.eps) # sys.exit() agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo in ['acktr']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, acktr=True, stat_decay=args.stat_decay) elif args.algo in ['acktr-h**o']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, acktr=True, if_homo=True, stat_decay=args.stat_decay) elif args.algo in ['acktr-h**o-noEigen']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, acktr=True, if_homo=True, stat_decay=args.stat_decay, if_eigen=False) elif args.algo in ['kbfgs']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, stat_decay=args.stat_decay) elif args.algo in ['kbfgs-h**o']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, stat_decay=args.stat_decay) elif args.algo in ['kbfgs-h**o-invertA']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, stat_decay=args.stat_decay, if_invert_A=True) elif args.algo in ['kbfgs-h**o-invertA-decoupledDecay']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, stat_decay_A=args.stat_decay_A, stat_decay_G=args.stat_decay_G, if_invert_A=True, if_decoupled_decay=True) elif args.algo in ['kbfgs-h**o-momentumGrad']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, if_momentumGrad=True, stat_decay=args.stat_decay) elif args.algo in ['kbfgs-h**o-noClip']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, if_clip=False, stat_decay=args.stat_decay) else: print('unknown args.algo for ' + args.algo) sys.exit() rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) record_rewards = [] record_num_steps = [] print('num_updates') print(num_updates) total_num_steps = 0 start = time.time() for j in range(num_updates): print('j') print(j) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], 
rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: # print('info.keys()') # print(info.keys()) if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) print('info[episode][r]') print(info['episode']['r']) record_rewards.append(info['episode']['r']) # print('total_num_steps') # print(total_num_steps) # print('total_num_steps + (step + 1) * args.num_processes') # print(total_num_steps + (step + 1) * args.num_processes) record_num_steps.append(total_num_steps + (step + 1) * args.num_processes) # sys.exit() # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy, update_signal = agent.update( rollouts) if update_signal == -1: # sys.exit() break rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) 
except IOError: pass print('record_rewards') print(record_rewards) dir_with_params = args.env_name + '/' +\ args.algo + '/' +\ 'eps_' + str(args.eps) + '/' +\ 'lr_' + str(args.lr) + '/' +\ 'stat_decay_' + str(args.stat_decay) + '/' # saving_dir = './result/' + args.env_name + '/' + args.algo + '/' saving_dir = './result/' + dir_with_params if not os.path.isdir(saving_dir): os.makedirs(saving_dir) import pickle with open(saving_dir + 'result.pkl', 'wb') as handle: pickle.dump( { 'record_rewards': record_rewards, 'record_num_steps': record_num_steps }, handle) print('args.log_dir') print(args.log_dir) print('os.listdir(args.log_dir)') print(os.listdir(args.log_dir)) # saving_dir_monitor = './result_monitor/' + args.env_name + '/' + args.algo + '/' saving_dir_monitor = './result_monitor/' + dir_with_params if os.path.isdir(saving_dir_monitor): import shutil shutil.rmtree(saving_dir_monitor) if not os.path.isdir(saving_dir_monitor): os.makedirs(saving_dir_monitor) print('saving_dir_monitor') print(saving_dir_monitor) import shutil for file_name in os.listdir(args.log_dir): full_file_name = os.path.join(args.log_dir, file_name) print('full_file_name') print(full_file_name) print('os.path.isfile(full_file_name)') print(os.path.isfile(full_file_name)) if os.path.isfile(full_file_name): shutil.copy(full_file_name, saving_dir_monitor) # print('os.listdir(saving_dir_monitor)') # print(os.listdir(saving_dir_monitor)) # print('len(os.listdir(saving_dir_monitor))') # print(len(os.listdir(saving_dir_monitor))) # print('args.num_processes') # print(args.num_processes) assert len(os.listdir(saving_dir_monitor)) == args.num_processes
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(model,
                    args.clip_param,
                    args.value_loss_coef,
                    args.entropy_coef,
                    initial_lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])  # why use obs from rollouts??? (seems dubious)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward,
                            masks, bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print(
                "Updates {}, num timesteps {},\n"
                " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " min/max reward {:.1f}/{:.1f}\n"
                " dist_entropy {:.1f}, value/action loss {:.1f}/{:.1f}\n"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
                                        args.seed, device)
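# The call to evaluate(...) at the end of the loop above refers to a helper
# that is not shown in this file. Below is a minimal sketch of what such a
# helper could look like, reusing the make_env / utils.get_vec_normalize
# helpers assumed by main(); the original project's evaluation routine may
# differ (for instance, it may use a deterministic agent.predict instead of
# agent.sample, which is the only sampling method shown above).
import numpy as np
import torch


def evaluate(agent, ob_rms, env_name, seed, device, num_episodes=10):
    eval_env = make_env(env_name, seed + 1, None)

    vec_norm = utils.get_vec_normalize(eval_env)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms  # reuse training-time normalization stats

    episode_rewards = []
    obs = eval_env.reset()
    while len(episode_rewards) < num_episodes:
        with torch.no_grad():
            _, action, _ = agent.sample(obs)
        obs, _, done, infos = eval_env.step(action)
        for info in infos:
            if 'episode' in info.keys():
                episode_rewards.append(info['episode']['r'])

    eval_env.close()
    mean_reward = np.mean(episode_rewards)
    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(episode_rewards), mean_reward))
    return mean_reward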
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    # num_updates and eval_log_dir are assumed to be defined at module level
    # in the original script.
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n"
                " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " min/max reward {:.1f}/{:.1f}\n"
                " dist_entropy {:.1f}, value/action loss {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)
    with open(log_dir + 'extras.csv', "w") as file:
        file.write("n, value_loss\n")

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    model = Policy(envs.observation_space.shape,
                   envs.action_space.n,
                   extra_kwargs={'use_backpack': args.algo == 'tdprop'})
    model.to(device)

    if args.algo == 'tdprop':
        from algo.sarsa_tdprop import SARSA
        agent = SARSA(model,
                      lr=args.lr,
                      eps=args.eps,
                      max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1,
                      beta_2=args.beta_2,
                      n=args.num_steps,
                      num_processes=args.num_processes,
                      gamma=args.gamma)
    else:
        from algo.sarsa import SARSA
        agent = SARSA(model,
                      lr=args.lr,
                      eps=args.eps,
                      max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1,
                      beta_2=args.beta_2,
                      algo=args.algo)

    explore_policy = utils.eps_greedy

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              model.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                qs = model(rollouts.obs[step])
                _, dist = explore_policy(qs, args.exploration)
                actions = dist.sample().unsqueeze(-1)
                value = qs.gather(-1, actions)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, torch.FloatTensor([0.0]), actions, value,
                            value, reward, masks, bad_masks)

        with torch.no_grad():
            next_qs = model(rollouts.obs[-1])
            next_probs, _ = explore_policy(next_qs, args.exploration)
            next_value = (next_probs * next_qs).sum(-1).unsqueeze(-1)

        rollouts.compute_returns(next_value, args.gamma)

        value_loss = agent.update(rollouts, explore_policy, args.exploration)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1):
            save_path = os.path.join(args.log_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                list(model.parameters()),
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                ("Updates {}, num timesteps {}, FPS {}\n" +
                 "Last {} training episodes: mean/median reward {:.1f}/{:.1f}" +
                 ", min/max reward {:.1f}/{:.1f}\n" +
                 "entropy {:.2f}, value loss {:.4f}")
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards),
                        dist.entropy().mean().item(), value_loss))
            with open(log_dir + 'extras.csv', "a") as file:
                file.write(
                    str(total_num_steps) + ", " + str(value_loss) + "\n")
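# utils.eps_greedy is used above but not shown. Below is a small sketch of an
# epsilon-greedy helper with the interface the loop assumes: it returns the
# per-action probabilities and a Categorical distribution over actions, so
# that (probs * qs).sum(-1) gives the Expected-SARSA bootstrap value. This is
# an illustration; the project's own utils.eps_greedy may differ.
import torch
from torch.distributions import Categorical


def eps_greedy(qs, epsilon):
    num_actions = qs.size(-1)
    greedy = qs.argmax(dim=-1, keepdim=True)
    # epsilon/num_actions mass everywhere, remaining mass on the greedy action.
    probs = torch.full_like(qs, epsilon / num_actions)
    probs.scatter_(-1, greedy, 1.0 - epsilon + epsilon / num_actions)
    return probs, Categorical(probs=probs)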
def __init__(self, args, actor_critic, device, envs=None, vec_norm=None, frozen=False): ''' frozen: we are not in the main training loop, but evaluating frozen model separately''' if frozen: self.win_eval = None past_steps = args.past_steps self.frozen = frozen #eval_args.render = True self.device = device #if args.model == 'fractal': # for i in range(-1, args.n_recs): # eval_log_dir = args.log_dir + "_eval_col_{}".format(i) # try: # os.makedirs(eval_log_dir) # except OSError: # files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv')) # for f in files: # os.remove(f) # setattr(self, 'eval_log_dir_col_{}'.format(i), eval_log_dir) if frozen: if 'GameOfLife' in args.env_name: self.eval_log_dir = args.log_dir + "/eval_{}-steps_w{}_{}rec_{}s_{}pl".format(past_steps, args.map_width, args.n_recs, args.max_step, args.prob_life, '.1f') else: self.eval_log_dir = args.log_dir + "/eval_{}-steps_w{}_{}rec_{}s".format(past_steps, args.map_width, args.n_recs, args.max_step, '.1f') merge_col_logs = True else: self.eval_log_dir = args.log_dir + "_eval" merge_col_logs = False try: os.makedirs(self.eval_log_dir) except OSError: files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv')) files += glob.glob(os.path.join(self.eval_log_dir, '*_eval.csv')) if args.overwrite: for f in files: os.remove(f) elif files: merge_col_logs = True self.args = args self.actor_critic = actor_critic self.num_eval_processes = args.num_processes if envs: self.eval_envs = envs self.vec_norm = vec_norm else: #print('making envs in Evaluator: ', self.args.env_name, self.args.seed + self.num_eval_processes, self.num_eval_processes, # self.args.gamma, self.eval_log_dir, self.args.add_timestep, self.device, True, self.args) self.eval_envs = make_vec_envs( self.args.env_name, self.args.seed + self.num_eval_processes, self.num_eval_processes, self.args.gamma, self.eval_log_dir, self.args.add_timestep, self.device, False, args=self.args) self.vec_norm = get_vec_normalize(self.eval_envs) if self.vec_norm is not None: self.vec_norm.eval() self.vec_norm.ob_rms = get_vec_normalize(self.eval_envs).ob_rms self.tstart = time.time() fieldnames = ['r', 'l', 't'] model = actor_critic.base if args.model == 'FractalNet' or args.model =='fractal': n_cols = model.n_cols else: n_cols = 0 self.plotter = Plotter(n_cols, self.eval_log_dir, self.num_eval_processes, max_steps=self.args.max_step) eval_cols = range(-1, n_cols) if args.model == 'fixed' and model.RAND: eval_cols = model.eval_recs if eval_cols is not None: for i in eval_cols: log_file = '{}/col_{}_eval.csv'.format(self.eval_log_dir, i) if merge_col_logs and os.path.exists(log_file): merge_col_log = True else: merge_col_log = False if merge_col_log: if len(eval_cols) > 1 and i == eval_cols[-2] and self.args.auto_expand: # problem if we saved model after auto-expanding, without first evaluating! 
# for the newly added column, we duplicate the last col.'s records new_col_log_file = '{}/col_{}_eval.csv'.format(self.eval_log_dir, i + 1) copyfile(log_file, new_col_log_file) old_log = '{}_old'.format(log_file) os.rename(log_file, old_log) log_file_col = open(log_file, mode='w') setattr(self, 'log_file_col_{}'.format(i), log_file_col) writer_col = csv.DictWriter(log_file_col, fieldnames=fieldnames) setattr(self, 'writer_col_{}'.format(i), writer_col) if merge_col_log: with open(old_log, newline='') as old: reader = csv.DictReader(old, fieldnames=('r', 'l', 't')) h = 0 try: # in case of null bytes resulting from interrupted logging for row in reader: if h > 1: row['t'] = 0.0001 * h # HACK: false times for past logs to maintain order writer_col.writerow(row) h += 1 except csv.Error: h_i = 0 for row in reader: if h_i > h: row['t'] = 0.0001 * h_i # HACK: false times for past logs to maintain order writer_col.writerow(row) h_i += 1 os.remove(old_log) else: writer_col.writeheader() log_file_col.flush()
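# Hedged usage sketch: reading back one of the col_{i}_eval.csv logs written by
# the DictWriter above and summarizing mean episode reward. The path argument is
# illustrative; fieldnames mirror the ('r', 'l', 't') convention used here.
import csv
import numpy as np

def summarize_col_log(path):
    rewards = []
    with open(path, newline='') as f:
        for row in csv.DictReader(f, fieldnames=('r', 'l', 't')):
            try:
                rewards.append(float(row['r']))
            except (TypeError, ValueError):
                continue  # skip the header row and any malformed/interrupted lines
    return np.mean(rewards) if rewards else float('nan')

# e.g. summarize_col_log('{}/col_-1_eval.csv'.format(eval_log_dir))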
def main(): ''' Train PPO policies on each of the training environments. ''' args = get_args() try: os.makedirs(args.log_dir) except OSError: pass torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args, device) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': False}) actor_critic.to(device) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) ep_reward = np.zeros(args.num_processes) episode_rewards = deque(maxlen=100) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obs reward and next obs obs, reward, done, infos = envs.step(action) if 'spaceship' in args.env_name: # spaceship, swimmer for i in range(len(done)): if done[i]: episode_rewards.append(reward[i].item()) # elif 'swimmer' in args.env_name: else: for i in range(len(done)): ep_reward[i] += reward[i].numpy().item() if done[i]: episode_rewards.append(ep_reward[i]) ep_reward[i] = 0 # if 'ant' in args.env_name: # for info in infos: # if 'episode' in info.keys(): # episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, True, args.gamma, args.gae_lambda, True) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": try: os.makedirs(args.save_dir) except OSError: pass torch.save( actor_critic.state_dict(), os.path.join(args.save_dir, "ppo.{}.env{}.seed{}.pt"\ .format(args.env_name, args.default_ind, args.seed)) ) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps print("\nUpdates {}, num timesteps {}, Last {} training episodes: \ \n mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}" .format(j, total_num_steps, len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards))) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, device) envs.close()
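# Hedged sketch of the generalized-advantage-estimation pass that
# rollouts.compute_returns(next_value, True, gamma, gae_lambda, True) is assumed
# to perform above. The tensor layout here is illustrative (time-major,
# (T, N, 1)); under this convention masks[t] is 0 when the episode ended at step
# t, and bad_masks[t] is 0 when that end was a time-limit truncation.
import torch

def gae_returns_sketch(rewards, values, masks, bad_masks, next_value, gamma, gae_lambda):
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)  # (T + 1, N, 1)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * gae_lambda * masks[t] * gae
        gae = gae * bad_masks[t]  # drop the advantage across truncated (time-limit) boundaries
        returns[t] = gae + values[t]
    return returns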
def main(): actor_critic = False agent = False past_steps = 0 try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: if args.overwrite: os.remove(f) else: pass torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None win_eval = None import torch.multiprocessing as multiprocessing multiprocessing.set_start_method('spawn') torch.manual_seed(args.seed) envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, None, args=args) if args.cuda: torch.cuda.manual_seed(args.seed) num_actions = 1 actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'map_width': args.map_width, 'num_actions': num_actions, 'recurrent': args.recurrent_policy }, curiosity=args.curiosity, algo=args.algo, model=args.model, args=args) evaluator = None if not agent: if args.algo == 'a2c': agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, curiosity=args.curiosity, args=args) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, acktr=True, curiosity=args.curiosity, args=args) #saved_model = os.path.join(args.save_dir, args.env_name + '.pt') saved_model = os.path.join(args.save_dir, args.env_name + '.tar') if os.path.exists(saved_model) and not args.overwrite: checkpoint = torch.load(saved_model) actor_critic.load_state_dict(checkpoint['model_state_dict']) actor_critic.to(device) agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) past_steps = checkpoint['past_steps'] ob_rms = checkpoint['ob_rms'] past_steps = next(iter( agent.optimizer.state_dict()['state'].values()))['step'] saved_args = checkpoint['args'] new_recs = args.n_recs - saved_args.n_recs for nr in range(new_recs): actor_critic.base.auto_expand() if saved_args.n_recs > args.n_recs: print('applying {} fractal expansions to network'.format( saved_args.n_recs - args.n_recs)) print('Resuming from step {}'.format(past_steps)) #print(type(next(iter((torch.load(saved_model)))))) #actor_critic, ob_rms = \ # torch.load(saved_model) #agent = \ # torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt')) #if not agent.optimizer.state_dict()['state'].values(): # past_steps = 0 #else: # raise Exception vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms actor_critic.to(device) if 'LSTM' in args.model: recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size( ) else: recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size if args.curiosity: rollouts = CuriosityRolloutStorage( args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args) else: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, args=args) obs = envs.reset() rollouts.obs[0].copy_(obs) 
rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() model = actor_critic.base done = False for j in range(past_steps, num_updates): if args.model == 'fractal' and args.drop_path: model.set_drop_path() if args.model == 'fixed' and model.RAND: model.num_recursions = random.randint(1, model.map_width * 2) player_act = None for step in range(args.num_steps): # if type(done) is not bool: # if done.any(): # obs = envs.reset() # elif done: # obs = env.reset() # Sample actions with torch.no_grad(): value, action, action_log_probs, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], player_act=player_act, icm_enabled=args.curiosity, deterministic=False) # Observe reward and next obs obs, reward, done, infos = envs.step(action) player_act = None if args.render: if infos[0]: if 'player_move' in infos[0].keys(): player_act = infos[0]['player_move'] if args.curiosity: # run icm with torch.no_grad(): feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act( (rollouts.obs[step], obs, action_bin)) intrinsic_reward = args.eta * ( (feature_state - feature_state_pred).pow(2)).sum() / 2. if args.no_reward: reward = 0 reward += intrinsic_reward.cpu() for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.curiosity: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks, feature_state, feature_state_pred, action_bin, action_dist_pred) else: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.curiosity: value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update( rollouts) else: value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None) save_model = copy.deepcopy(actor_critic) save_agent = copy.deepcopy(agent) if args.cuda: save_model.cpu() optim_save = save_agent.optimizer.state_dict() # experimental: torch.save( { 'past_steps': step, 'model_state_dict': save_model.state_dict(), 'optimizer_state_dict': optim_save, 'ob_rms': ob_rms, 'args': args }, os.path.join(save_path, args.env_name + ".tar")) #save_model = [save_model, # getattr(get_vec_normalize(envs), 'ob_rms', None)] #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) #save_agent = copy.deepcopy(agent) #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt')) #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if not dist_entropy: dist_entropy = 0 if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \ dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},".format( j, 
total_num_steps, int((total_num_steps - past_steps * args.num_processes * args.num_steps) / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if args.curiosity: print("fwd/inv icm loss {:.1f}/{:.1f}\n".format( fwd_loss, inv_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): if evaluator is None: evaluator = Evaluator(args, actor_critic, device) model = evaluator.actor_critic.base if args.model == 'fractal': n_cols = model.n_cols if args.rule == 'wide1' and args.n_recs > 3: col_step = 3 else: col_step = 1 col_idx = [-1, *range(0, n_cols, col_step)] for i in col_idx: evaluator.evaluate(column=i) #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes * args.max_step # making sure the evaluator plots the '-1'st column (the overall net) win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, args.algo, args.num_frames, n_graphs=col_idx) elif args.model == 'fixed' and model.RAND: for i in model.eval_recs: evaluator.evaluate(num_recursions=i) win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, args.algo, args.num_frames, n_graphs=model.eval_recs) else: evaluator.evaluate(column=None) win_eval = visdom_plot( viz, win_eval, evaluator.eval_log_dir, graph_name, args.algo, args.num_frames * 20 / (args.eval_interval * args.num_steps)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, graph_name, args.algo, args.num_frames) except IOError: pass
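# Hedged sketch of the kind of call the project's visdom_plot helper wraps:
# pushing a reward curve to a (possibly new) Visdom window. Assumes a Visdom
# server is reachable on args.port; argument names here are illustrative.
import numpy as np
from visdom import Visdom

def plot_curve_sketch(viz, win, steps, rewards, title):
    return viz.line(X=np.asarray(steps), Y=np.asarray(rewards), win=win,
                    opts=dict(title=title, xlabel='timesteps', ylabel='reward'))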
def __init__(self, args, actor_critic, device): eval_args = args #eval_args.render = True self.device = device #if args.model == 'fractal': # for i in range(-1, args.n_recs): # eval_log_dir = args.log_dir + "_eval_col_{}".format(i) # try: # os.makedirs(eval_log_dir) # except OSError: # files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv')) # for f in files: # os.remove(f) # setattr(self, 'eval_log_dir_col_{}'.format(i), eval_log_dir) self.eval_log_dir = args.log_dir + "_eval" merge_col_logs = False try: os.makedirs(self.eval_log_dir) except OSError: files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv')) if args.overwrite: for f in files: os.remove(f) elif files: merge_col_logs = True self.num_eval_processes = 20 self.eval_envs = make_vec_envs(eval_args.env_name, eval_args.seed + self.num_eval_processes, self.num_eval_processes, eval_args.gamma, self.eval_log_dir, eval_args.add_timestep, self.device, True, args=eval_args) self.vec_norm = get_vec_normalize(self.eval_envs) if self.vec_norm is not None: self.vec_norm.eval() self.vec_norm.ob_rms = get_vec_normalize(self.eval_envs).ob_rms self.actor_critic = actor_critic self.tstart = time.time() fieldnames = ['r', 'l', 't'] model = actor_critic.base if args.model == 'fractal': n_cols = model.n_cols eval_cols = range(-1, n_cols) elif args.model == 'fixed' and model.RAND: eval_cols = model.eval_recs else: eval_cols = None if eval_cols is not None: for i in eval_cols: log_file = '{}/col_{}_eval.csv'.format(self.eval_log_dir, i) if merge_col_logs: old_log = '{}_old'.format(log_file) os.rename(log_file, old_log) log_file_col = open(log_file, mode='w') setattr(self, 'log_file_col_{}'.format(i), log_file_col) writer_col = csv.DictWriter(log_file_col, fieldnames=fieldnames) setattr(self, 'writer_col_{}'.format(i), writer_col) if merge_col_logs: with open(old_log, newline='') as old: reader = csv.DictReader(old, fieldnames=('r', 'l', 't')) h = 0 for row in reader: if h > 1: row['t'] = 0.0001 * h # HACK: false times for past logs to maintain order writer_col.writerow(row) h += 1 os.remove(old_log) else: writer_col.writeheader() log_file_col.flush() self.args = eval_args
def __init__(self): import random import gym_city import game_of_life self.fieldnames = self.get_fieldnames() self.n_frames = 0 args = get_args() args.log_dir = args.save_dir + '/logs' assert args.algo in ['a2c', 'ppo', 'acktr'] if args.recurrent_policy: assert args.algo in ['a2c', 'ppo'], \ 'Recurrent policy is not implemented for ACKTR' torch.manual_seed(args.seed) if args.cuda: print('CUDA ENABLED') torch.cuda.manual_seed(args.seed) graph_name = args.save_dir.split('trained_models/')[1].replace( '/', ' ') self.graph_name = graph_name actor_critic = False agent = False past_frames = 0 try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: if args.overwrite: os.remove(f) else: pass torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") self.device = device if args.vis: from visdom import Visdom viz = Visdom(port=args.port) self.viz = viz win = None self.win = win win_eval = None self.win_eval = win_eval print('env name: {}'.format(args.env_name)) if 'GameOfLife' in args.env_name: num_actions = 1 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, None, args=args) if isinstance(envs.observation_space, gym.spaces.Discrete): num_inputs = envs.observation_space.n elif isinstance(envs.observation_space, gym.spaces.Box): if 'golmulti' in args.env_name.lower(): multi_env = True observation_space_shape = envs.observation_space.shape[1:] else: multi_env = False observation_space_shape = envs.observation_space.shape self.multi_env = multi_env if len(observation_space_shape) == 3: in_w = observation_space_shape[1] in_h = observation_space_shape[2] else: in_w = 1 in_h = 1 num_inputs = observation_space_shape[0] if isinstance(envs.action_space, gym.spaces.Discrete) or\ isinstance(envs.action_space, gym.spaces.Box): out_w = args.map_width out_h = args.map_width if 'Micropolis' in args.env_name: #otherwise it's set if args.power_puzzle: num_actions = 1 else: num_actions = 19 # TODO: have this already from env elif 'GameOfLife' in args.env_name: num_actions = 1 else: num_actions = envs.action_space.n elif isinstance(envs.action_space, gym.spaces.Box): if len(envs.action_space.shape) == 3: out_w = envs.action_space.shape[1] out_h = envs.action_space.shape[2] elif len(envs.action_space.shape) == 1: out_w = 1 out_h = 1 num_actions = envs.action_space.shape[-1] print('num actions {}'.format(num_actions)) if args.auto_expand: args.n_recs -= 1 actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'map_width': args.map_width, 'num_actions': num_actions, 'recurrent': args.recurrent_policy, 'prebuild': args.prebuild, 'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs, 'out_w': out_w, 'out_h': out_h }, curiosity=args.curiosity, algo=args.algo, model=args.model, args=args) if not agent: agent = init_agent(actor_critic, args) if args.auto_expand: args.n_recs += 1 evaluator = None self.evaluator = evaluator vec_norm = get_vec_normalize(envs) self.vec_norm = vec_norm #saved_model = os.path.join(args.save_dir, args.env_name + '.pt') if args.load_dir: saved_model = os.path.join(args.load_dir, args.env_name + '.tar') else: saved_model = os.path.join(args.save_dir, args.env_name + '.tar') self.checkpoint = None if os.path.exists(saved_model) and not args.overwrite: #print('current actor_critic params: {}'.format(actor_critic.parameters())) checkpoint = torch.load(saved_model) self.checkpoint = checkpoint saved_args = 
checkpoint['args'] actor_critic.load_state_dict(checkpoint['model_state_dict']) opt = agent.optimizer.state_dict() opt_load = checkpoint['optimizer_state_dict'] for o, l in zip(opt, opt_load): #print(o, l) param = opt[o] param_load = opt_load[l] #print('current: {}'.format(param), 'load: {}'.format(param_load)) #print(param_load.keys()) #params = param_load[0]['params'] #param[0]['params'] = params #for m, n in zip(param, param_load): # for p, q in zip(m, n): # print(p, q) # if type(m[p]) == list: # print(len(m[p]), len(n[q])) # agent.optimizer[m][p] = m[q] #print(agent.optimizer.state_dict()['param_groups']) #print('\n') #print(checkpoint['model_state_dict']) actor_critic.to(self.device) #actor_critic.cuda() #agent = init_agent(actor_critic, saved_args) agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) if args.auto_expand: if not args.n_recs - saved_args.n_recs == 1: print('can expand by 1 rec only from saved model, not {}'. format(args.n_recs - saved_args.n_recs)) raise Exception actor_critic.base.auto_expand() print('expanded net: \n{}'.format(actor_critic.base)) # TODO: Are we losing something crucial here? Probably not ideal. agent = init_agent(actor_critic, args) past_frames = checkpoint['n_frames'] ob_rms = checkpoint['ob_rms'] #past_steps = next(iter(agent.optimizer.state_dict()['state'].values()))['step'] print('Resuming from frame {}'.format(past_frames)) #print(type(next(iter((torch.load(saved_model)))))) #actor_critic, ob_rms = \ # torch.load(saved_model) #agent = \ # torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt')) #if not agent.optimizer.state_dict()['state'].values(): # past_steps = 0 #else: # raise Exception if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms saved_args.num_frames = args.num_frames saved_args.vis_interval = args.vis_interval saved_args.eval_interval = args.eval_interval saved_args.overwrite = args.overwrite saved_args.n_recs = args.n_recs saved_args.intra_shr = args.intra_shr saved_args.inter_shr = args.inter_shr saved_args.map_width = args.map_width saved_args.render = args.render saved_args.print_map = args.print_map saved_args.load_dir = args.load_dir saved_args.experiment_name = args.experiment_name saved_args.log_dir = args.log_dir saved_args.save_dir = args.save_dir saved_args.num_processes = args.num_processes saved_args.n_chan = args.n_chan saved_args.prebuild = args.prebuild args = saved_args actor_critic.to(device) updates_remaining = int(args.num_frames - past_frames) // ( args.num_steps * args.num_processes) self.n_frames = self.n_frames + past_frames self.past_frames = past_frames if 'LSTM' in args.model: recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size( ) else: recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size if args.curiosity: rollouts = CuriosityRolloutStorage( args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args) else: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, args=args) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() self.model = model = actor_critic.base self.reset_eval = False plotter = None env_param_bounds = envs.get_param_bounds() # in case we want to change this dynamically in the future (e.g., we may # not know how much traffic the agent can possibly produce 
in Micropolis) envs.set_param_bounds(env_param_bounds) # start with default bounds if args.model == 'FractalNet' or args.model == 'fractal': n_cols = model.n_cols if args.rule == 'wide1' and args.n_recs > 3: col_step = 3 else: col_step = 1 else: n_cols = 0 col_step = 1 self.col_step = col_step self.updates_remaining = updates_remaining self.envs = envs self.start = start self.rollouts = rollouts self.args = args self.actor_critic = actor_critic self.plotter = plotter self.agent = agent self.episode_rewards = episode_rewards self.n_cols = n_cols
def train(self): evaluator = self.evaluator episode_rewards = self.episode_rewards args = self.args actor_critic = self.actor_critic rollouts = self.rollouts agent = self.agent envs = self.envs plotter = self.plotter n_train = self.n_train start = self.start plotter = self.plotter n_cols = self.n_cols model = self.model device = self.device vec_norm = self.vec_norm n_frames = self.n_frames if self.reset_eval: obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) self.reset_eval = False if args.model == 'FractalNet' and args.drop_path: model.set_drop_path() if args.model == 'fixed' and model.RAND: model.num_recursions = random.randint(1, model.map_width * 2) self.player_act = None for self.n_step in range(args.num_steps): # Sample actions self.step() with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.curiosity: value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update( rollouts) else: value_loss, action_loss, dist_entropy = agent.update(rollouts) envs.dist_entropy = dist_entropy rollouts.after_update() total_num_steps = (n_train + 1) * args.num_processes * args.num_steps if not dist_entropy: dist_entropy = 0 #print(episode_rewards) #if torch.max(rollouts.rewards) > 0: # print(rollouts.rewards) if args.log and n_train % args.log_interval == 0 and len( episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.6f}/{:.6f}, min/max reward {:.6f}/{:.6f}\n \ dist entropy {:.6f}, val/act loss {:.6f}/{:.6f},".format( n_train, total_num_steps, int((self.n_frames - self.past_frames) / (end - start)), len(episode_rewards), round(np.mean(episode_rewards), 6), round(np.median(episode_rewards), 6), round(np.min(episode_rewards), 6), round(np.max(episode_rewards), 6), round(dist_entropy, 6), round(value_loss, 6), round(action_loss, 6))) if args.curiosity: print("fwd/inv icm loss {:.1f}/{:.1f}\n".format( fwd_loss, inv_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and n_train % args.eval_interval == 0): if evaluator is None: evaluator = Evaluator(args, actor_critic, device, envs=envs, vec_norm=vec_norm, fieldnames=self.fieldnames) self.evaluator = evaluator col_idx = [-1, *[i for i in range(0, n_cols, self.col_step)]] for i in col_idx: evaluator.evaluate(column=i) #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes * args.max_step # making sure the evaluator plots the '-1'st column (the overall net) viz = self.viz win_eval = self.win_eval graph_name = self.graph_name if args.vis: #and n_train % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win_eval = evaluator.plotter.visdom_plot( viz, win_eval, evaluator.eval_log_dir, graph_name, args.algo, args.num_frames, n_graphs=col_idx) except IOError: pass #elif args.model == 'fixed' and model.RAND: # for i in model.eval_recs: # evaluator.evaluate(num_recursions=i) # win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, # args.algo, args.num_frames, n_graphs=model.eval_recs) #else: # evaluator.evaluate(column=-1) # win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, # args.algo, args.num_frames) self.reset_eval = True if args.save and n_train % args.save_interval == 0 and args.save_dir != "": save_path = 
os.path.join(args.save_dir) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None) save_model = copy.deepcopy(actor_critic) save_agent = copy.deepcopy(agent) if args.cuda: save_model.cpu() optim_save = save_agent.optimizer.state_dict() self.agent = agent self.save_model = save_model self.optim_save = optim_save self.args = args self.ob_rms = ob_rms torch.save(self.get_save_dict(), os.path.join(save_path, args.env_name + ".tar")) #save_model = [save_model, # getattr(get_vec_normalize(envs), 'ob_rms', None)] #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) #save_agent = copy.deepcopy(agent) #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt')) #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt")) print('model saved at {}'.format(save_path)) if args.vis and n_train % args.vis_interval == 0: if plotter is None: plotter = Plotter(n_cols, args.log_dir, args.num_processes) try: # Sometimes monitor doesn't properly flush the outputs viz = self.viz win = self.win graph_name = self.graph_name win = plotter.visdom_plot(viz, win, args.log_dir, graph_name, args.algo, args.num_frames) except IOError: pass
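# Hedged sketch of the get_save_dict helper the torch.save call above relies on
# (the real method is defined elsewhere in the project). Keys follow the
# checkpoint fields read back in __init__: model/optimizer state, ob_rms, the
# frame counter, and the args namespace.
def get_save_dict_sketch(self):
    return {
        'model_state_dict': self.save_model.state_dict(),
        'optimizer_state_dict': self.optim_save,
        'ob_rms': self.ob_rms,
        'n_frames': self.n_frames,
        'args': self.args,
    }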
def main(): import random import gym_micropolis import game_of_life args = get_args() args.log_dir = args.save_dir + '/logs' assert args.algo in ['a2c', 'ppo', 'acktr'] if args.recurrent_policy: assert args.algo in ['a2c', 'ppo'], \ 'Recurrent policy is not implemented for ACKTR' num_updates = int(args.num_frames) // args.num_steps // args.num_processes torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) graph_name = args.save_dir.split('trained_models/')[1].replace('/', ' ') actor_critic = False agent = False past_steps = 0 try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: if args.overwrite: os.remove(f) else: pass torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None win_eval = None if 'GameOfLife' in args.env_name: print('env name: {}'.format(args.env_name)) num_actions = 1 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, None, args=args) if isinstance(envs.observation_space, gym.spaces.Discrete): num_inputs = envs.observation_space.n elif isinstance(envs.observation_space, gym.spaces.Box): if len(envs.observation_space.shape) == 3: in_w = envs.observation_space.shape[1] in_h = envs.observation_space.shape[2] else: in_w = 1 in_h = 1 num_inputs = envs.observation_space.shape[0] if isinstance(envs.action_space, gym.spaces.Discrete): out_w = 1 out_h = 1 if 'Micropolis' in args.env_name: #otherwise it's set if args.power_puzzle: num_actions = 1 else: num_actions = 19 # TODO: have this already from env elif 'GameOfLife' in args.env_name: num_actions = 1 else: num_actions = envs.action_space.n elif isinstance(envs.action_space, gym.spaces.Box): if len(envs.action_space.shape) == 3: out_w = envs.action_space.shape[1] out_h = envs.action_space.shape[2] elif len(envs.action_space.shape) == 1: out_w = 1 out_h = 1 num_actions = envs.action_space.shape[-1] print('num actions {}'.format(num_actions)) if args.auto_expand: args.n_recs -= 1 actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'map_width': args.map_width, 'num_actions': num_actions, 'recurrent': args.recurrent_policy, 'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs, 'out_w': out_w, 'out_h': out_h}, curiosity=args.curiosity, algo=args.algo, model=args.model, args=args) if args.auto_expand: args.n_recs += 1 evaluator = None if not agent: agent = init_agent(actor_critic, args) #saved_model = os.path.join(args.save_dir, args.env_name + '.pt') if args.load_dir: saved_model = os.path.join(args.load_dir, args.env_name + '.tar') else: saved_model = os.path.join(args.save_dir, args.env_name + '.tar') vec_norm = get_vec_normalize(envs) if os.path.exists(saved_model) and not args.overwrite: checkpoint = torch.load(saved_model) saved_args = checkpoint['args'] actor_critic.load_state_dict(checkpoint['model_state_dict']) #for o, l in zip(agent.optimizer.state_dict, checkpoint['optimizer_state_dict']): # print(o, l) #print(agent.optimizer.state_dict()['param_groups']) #print('\n') #print(checkpoint['model_state_dict']) actor_critic.to(device) actor_critic.cuda() #agent = init_agent(actor_critic, saved_args) agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) if args.auto_expand: if not args.n_recs - saved_args.n_recs == 1: print('can expand by 1 rec only from saved model, not {}'.format(args.n_recs - saved_args.n_recs)) 
raise Exception actor_critic.base.auto_expand() print('expanded net: \n{}'.format(actor_critic.base)) past_steps = checkpoint['past_steps'] ob_rms = checkpoint['ob_rms'] past_steps = next(iter(agent.optimizer.state_dict()['state'].values()))['step'] print('Resuming from step {}'.format(past_steps)) #print(type(next(iter((torch.load(saved_model)))))) #actor_critic, ob_rms = \ # torch.load(saved_model) #agent = \ # torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt')) #if not agent.optimizer.state_dict()['state'].values(): # past_steps = 0 #else: # raise Exception if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms saved_args.num_frames = args.num_frames saved_args.vis_interval = args.vis_interval saved_args.eval_interval = args.eval_interval saved_args.overwrite = args.overwrite saved_args.n_recs = args.n_recs saved_args.intra_shr = args.intra_shr saved_args.inter_shr = args.inter_shr saved_args.map_width = args.map_width saved_args.render = args.render saved_args.print_map = args.print_map saved_args.load_dir = args.load_dir saved_args.experiment_name = args.experiment_name saved_args.log_dir = args.log_dir saved_args.save_dir = args.save_dir args = saved_args actor_critic.to(device) if 'LSTM' in args.model: recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size() else: recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size if args.curiosity: rollouts = CuriosityRolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args) else: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, args=args) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() model = actor_critic.base reset_eval = False plotter = None if args.model == 'FractalNet' or args.model == 'fractal': n_cols = model.n_cols if args.rule == 'wide1' and args.n_recs > 3: col_step = 3 else: col_step = 1 else: n_cols = 0 col_step = 1 for j in range(past_steps, num_updates): if reset_eval: print('post eval reset') obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) reset_eval = False #if np.random.rand(1) < 0.1: # envs.venv.venv.remotes[1].send(('setRewardWeights', None)) if args.model == 'FractalNet' and args.drop_path: #if args.intra_shr and args.inter_shr: # n_recs = np.randint # model.set_n_recs() model.set_drop_path() if args.model == 'fixed' and model.RAND: model.num_recursions = random.randint(1, model.map_width * 2) player_act = None for step in range(args.num_steps): # Sample actions with torch.no_grad(): if args.render: if args.num_processes == 1: if not ('Micropolis' in args.env_name or 'GameOfLife' in args.env_name): envs.venv.venv.render() else: pass else: if not ('Micropolis' in args.env_name or 'GameOfLife' in args.env_name): envs.render() envs.venv.venv.render() else: pass #envs.venv.venv.remotes[0].send(('render', None)) #envs.venv.venv.remotes[0].recv() value, action, action_log_probs, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], player_act=player_act, icm_enabled=args.curiosity, deterministic=False) # Observe reward and next obs obs, reward, done, infos = envs.step(action) player_act = None if args.render: if infos[0]: if 'player_move' in infos[0].keys(): player_act = infos[0]['player_move'] if 
args.curiosity: # run icm with torch.no_grad(): feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act( (rollouts.obs[step], obs, action_bin) ) intrinsic_reward = args.eta * ((feature_state - feature_state_pred).pow(2)).sum() / 2. if args.no_reward: reward = 0 reward += intrinsic_reward.cpu() for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.curiosity: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks, feature_state, feature_state_pred, action_bin, action_dist_pred) else: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.curiosity: value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(rollouts) else: value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() total_num_steps = (j + 1) * args.num_processes * args.num_steps if not dist_entropy: dist_entropy = 0 if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \ dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},". format(j, total_num_steps, int((total_num_steps - past_steps * args.num_processes * args.num_steps) / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if args.curiosity: print("fwd/inv icm loss {:.1f}/{:.1f}\n". 
format( fwd_loss, inv_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): if evaluator is None: evaluator = Evaluator(args, actor_critic, device, envs=envs, vec_norm=vec_norm) model = evaluator.actor_critic.base col_idx = [-1, *range(0, n_cols, col_step)] for i in col_idx: evaluator.evaluate(column=i) #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes * args.max_step # making sure the evaluator plots the '-1'st column (the overall net) if args.vis: #and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win_eval = evaluator.plotter.visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, args.algo, args.num_frames, n_graphs= col_idx) except IOError: pass #elif args.model == 'fixed' and model.RAND: # for i in model.eval_recs: # evaluator.evaluate(num_recursions=i) # win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, # args.algo, args.num_frames, n_graphs=model.eval_recs) #else: # evaluator.evaluate(column=-1) # win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, # args.algo, args.num_frames) reset_eval = True if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None) save_model = copy.deepcopy(actor_critic) save_agent = copy.deepcopy(agent) if args.cuda: save_model.cpu() optim_save = save_agent.optimizer.state_dict() # experimental: torch.save({ 'past_steps': next(iter(agent.optimizer.state_dict()['state'].values()))['step'], 'model_state_dict': save_model.state_dict(), 'optimizer_state_dict': optim_save, 'ob_rms': ob_rms, 'args': args }, os.path.join(save_path, args.env_name + ".tar")) #save_model = [save_model, # getattr(get_vec_normalize(envs), 'ob_rms', None)] #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) #save_agent = copy.deepcopy(agent) #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt')) #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt")) if args.vis and j % args.vis_interval == 0: if plotter is None: plotter = Plotter(n_cols, args.log_dir, args.num_processes) try: # Sometimes monitor doesn't properly flush the outputs win = plotter.visdom_plot(viz, win, args.log_dir, graph_name, args.algo, args.num_frames) except IOError: pass
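# Hedged sketch of the 'past_steps' lookup used when checkpointing above: Adam
# keeps a per-parameter 'step' counter in its state dict, so the first entry's
# counter can serve as a global update count. Newer PyTorch stores 'step' as a
# zero-dim tensor, hence the conversion.
import torch

def optimizer_step_count(optimizer):
    state = optimizer.state_dict()['state']
    if not state:
        return 0  # optimizer has not stepped yet
    step = next(iter(state.values())).get('step', 0)
    return int(step.item()) if torch.is_tensor(step) else int(step)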
def main(): device = 'cpu' acc_steps = [] acc_scores = [] torch.set_num_threads(1) print('here') if args.env_name == 'Reacher-v2': rbf1 = build_features_reacher2(.2, 5, 2) len_rbf = rbf1._K len_features = len_rbf + 1 if args.env_name == 'Hopper-v2': len_features = 3 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space) actor_critic.to(device) agent = PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, len_features) print('here2') obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = collections.deque(maxlen=10) num_updates = 20 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule(agent.optimizer, j, num_updates, args.lr) agent.clip_param = args.clip_param * (1 - j / float(num_updates)) # Prepare demos demo_actions = np.zeros( (1, args.num_processes, envs.action_space.shape[0])) demo_states = np.zeros( (1, args.num_processes, envs.observation_space.shape[0])) demo_features = np.zeros((1, args.num_processes, len_features)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) # obs, reward and next obs demo_actions = np.concatenate( [demo_actions, action.reshape(1, args.num_processes, -1)], 0) demo_states = np.concatenate([ demo_states, rollouts.obs[step].reshape( 1, args.num_processes, -1) ], 0) feat_rewards = np.zeros((args.num_processes, len_features)) if args.env_name == 'Hopper-v2': if args.num_processes > 1: pos_before = envs.get_sim_data() obs, reward, done, infos = envs.step(action) if args.env_name == 'Hopper-v2': if args.num_processes > 1: pos_after = envs.get_sim_data() for num_p in range(args.num_processes): feat_1 = pos_after[num_p] - pos_before[num_p] feat_2 = 0 if not done[num_p]: feat_2 = 1 # feat_2 = np.array([1 for _ in range(args.num_processes)]) feat_3 = np.array( [np.linalg.norm(action[num_p], ord=2)**2]).flatten() feat_rewards[num_p] = np.array( [feat_1, feat_2, feat_3]) if args.env_name == 'Reacher-v2': if args.num_processes > 1: body_data = envs.get_body_data() for num_p in range(args.num_processes): rbf1_ = rbf1(body_data[num_p][:-1]) rbf4_ = np.array( [np.linalg.norm(action[num_p], ord=2)**2]) feat_rewards[num_p] = np.concatenate( (rbf1_.reshape(-1), rbf4_)) else: rbf1_ = rbf1( (envs.envs[0].env.env.get_body_com("fingertip") - envs.envs[0].env.env.get_body_com("target"))[:-1]) rbf4_ = np.array([-np.square(action[0]).sum()]) feat_rewards[0] = np.concatenate( (rbf1_.reshape(-1), rbf4_)) demo_features = np.concatenate([ demo_features, feat_rewards.reshape(1, args.num_processes, -1) ], 0) if step > 1 and step % 1000 == 0: done = [True for _ in range(args.num_processes)] for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, \ value, reward, masks, feat_rewards) # Save demos: action_file_name = demos_expe_dir + '/actions_step_' + str(j) + '.npy' state_file_name = demos_expe_dir + '/states_step_' + str(j) + '.npy' rew_feat_file_name = demos_expe_dir + '/rew_feat_step_' + str( j) + '.npy' policy_file_name = demos_expe_dir + '/policy_step_' + str(j) + '.pth' np.save(action_file_name, demo_actions) np.save(state_file_name, demo_states) np.save(rew_feat_file_name, demo_features) torch.save(actor_critic.state_dict(), policy_file_name) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir: save_path = os.path.join(args.save_dir, 'ppo') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + '.pt')) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: print('Updates', j, 'num timesteps', len(episode_rewards), '\n Last training episodes: mean/median reward', '{:.1f}'.format(np.mean(episode_rewards)), '/{:.1f}'.format(np.median(episode_rewards)), 'min/max reward', '{:.1f}'.format(np.min(episode_rewards)), '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy', dist_entropy, 'value loss', value_loss, 'action loss', action_loss) if len(episode_rewards) > 1: acc_steps.append(total_num_steps) acc_scores.append(np.mean(episode_rewards)) #print(acc_scores) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _ = actor_critic.act(obs, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print('Evaluation using', len(eval_episode_rewards), 'episodes: mean reward', '{:.5f}\n'.format(np.mean(eval_episode_rewards))) scores_file_name = args.scores_dir + '/learner_scores_' + args.env_name + '_' + args.expe + '.npy' steps_file_name = args.scores_dir + '/learner_steps_' + args.env_name + '_' + args.expe + '.npy' np.save(scores_file_name, np.array(acc_scores)) np.save(steps_file_name, np.array(acc_steps))
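# Hedged usage sketch: loading one update's worth of the demonstrations saved
# above. Arrays are time-major, shaped (num_steps + 1, num_processes, dim); the
# leading zero-initialized row added before concatenation is dropped.
# demos_expe_dir is whatever directory the training script wrote to.
import numpy as np

def load_demos_sketch(demos_expe_dir, j):
    actions = np.load('{}/actions_step_{}.npy'.format(demos_expe_dir, j))
    states = np.load('{}/states_step_{}.npy'.format(demos_expe_dir, j))
    features = np.load('{}/rew_feat_step_{}.npy'.format(demos_expe_dir, j))
    return actions[1:], states[1:], features[1:]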
def main(): saved_model = os.path.join(args.save_dir, args.env_name + '.pt') if os.path.exists(saved_model) and not args.overwrite: actor_critic, ob_rms = \ torch.load(saved_model) agent = \ torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt')) for i in agent.optimizer.state_dict(): print(dir(agent.optimizer)) print(getattr(agent.optimizer, 'steps')) print(agent.optimizer.state_dict()[i]) past_steps = agent.optimizer.steps else: actor_critic = False agent = False past_steps = 0 try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None win_eval = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, None, args=args) if actor_critic: pass # vec_norm = get_vec_normalize(envs) # if vec_norm is not None: # vec_norm.eval() # vec_norm.ob_rms = ob_rms else: actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'map_width': args.map_width, 'num_actions': 18, 'recurrent': args.recurrent_policy}, curiosity=args.curiosity, algo=args.algo, model=args.model, args=args) actor_critic.to(device) evaluator = None if not agent: if args.algo == 'a2c': agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, curiosity=args.curiosity, args=args) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, acktr=True, curiosity=args.curiosity, args=args) if args.curiosity: rollouts = CuriosityRolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args) else: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, args=args) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates - past_steps): if args.drop_path: actor_critic.base.get_drop_path() player_act = None for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_probs, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], player_act=player_act, icm_enabled=args.curiosity) # Observe reward and next obs obs, reward, done, infos = envs.step(action) player_act = None if args.render: if infos[0]: if 'player_move' in infos[0].keys(): player_act = infos[0]['player_move'] if args.curiosity: # run icm with torch.no_grad(): feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act( (rollouts.obs[step], obs, action_bin) ) intrinsic_reward = args.eta * ((feature_state - feature_state_pred).pow(2)).sum() / 2. 
if args.no_reward: reward = 0 reward += intrinsic_reward.cpu() for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.curiosity: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks, feature_state, feature_state_pred, action_bin, action_dist_pred) else: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.curiosity: value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(rollouts) else: value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) save_agent = copy.deepcopy(agent) torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt')) #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if not dist_entropy: dist_entropy = 0 if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \ dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},". format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if args.curiosity: print("fwd/inv icm loss {:.1f}/{:.1f}\n". format( fwd_loss, inv_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): if evaluator is None: evaluator = Evaluator(args, actor_critic, device) if args.model == 'fractal': n_cols = evaluator.actor_critic.base.n_cols for i in range(-1, n_cols): evaluator.evaluate(column=i) #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes * args.max_step win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, args.env_name, args.algo, args.num_frames, n_graphs=args.n_recs) else: evaluator.evaluate(column=None) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
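# Hedged sketch of the curiosity bonus used in the loops above: an eta-scaled
# half squared error between predicted and observed next-state features
# (ICM-style forward-model surprise). The sum over all feature dimensions
# matches the inline expression; a per-process reward would need a per-row sum.
import torch

def intrinsic_reward_sketch(feature_state, feature_state_pred, eta):
    return eta * 0.5 * (feature_state - feature_state_pred).pow(2).sum()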
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") experiment_name = args.env_name + '-' + args.algo + '-' + datetime.datetime.now( ).strftime("%Y-%m-%d-%H-%M-%S-%f") log_dir, eval_log_dir, save_dir = setup_dirs(experiment_name, args.log_dir, args.save_dir) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, log_dir, args.add_timestep, device, False, frame_skip=args.frame_skip) if args.load_path: actor_critic, _ob_rms = torch.load(args.load_path) vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.train() vec_norm.ob_rms = _ob_rms actor_critic.train() else: actor_critic = Policy(envs.observation_space.shape, envs.action_space, beta=args.beta_dist, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo.startswith('a2c'): agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, lr_schedule=args.lr_schedule, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo.startswith('ppo'): agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, lr_schedule=args.lr_schedule, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.algo.endswith('sil'): agent = algo.SIL(agent, update_ratio=args.sil_update_ratio, epochs=args.sil_epochs, batch_size=args.sil_batch_size, beta=args.sil_beta, value_loss_coef=args.sil_value_loss_coef, entropy_coef=args.sil_entropy_coef) replay = ReplayStorage(10000, num_processes=args.num_processes, gamma=args.gamma, prio_alpha=args.sil_alpha, obs_shape=envs.observation_space.shape, action_space=envs.action_space, recurrent_hidden_state_size=actor_critic. recurrent_hidden_state_size, device=device) else: replay = None action_high = torch.from_numpy(envs.action_space.high).to(device) action_low = torch.from_numpy(envs.action_space.low).to(device) action_mid = 0.5 * (action_high + action_low) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) benchmark_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): # sample actions value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) if args.clip_action and isinstance(envs.action_space, gym.spaces.Box): clipped_action = action.clone() if args.shift_action: # FIXME experimenting with this, so far resulting in # faster learning when clipping guassian continuous # output (vs leaving centred at 0 and unscaled) clipped_action = 0.5 * clipped_action + action_mid clipped_action = torch.max( torch.min(clipped_action, action_high), action_low) else: clipped_action = action # act in environment and observe obs, reward, done, infos = envs.step(clipped_action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) if 'rb' in info['episode']: benchmark_rewards.append(info['episode']['rb']) # If done then clean the history of observations. 
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step], action,
                              reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        train_eprew = np.mean(episode_rewards)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} episodes: mean/med {:.1f}/{:.1f}, min/max reward {:.2f}/{:.2f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), train_eprew,
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end='')
            if len(benchmark_rewards):
                print(", benchmark {:.1f}/{:.1f}, {:.1f}/{:.1f}".format(
                    np.mean(benchmark_rewards), np.median(benchmark_rewards),
                    np.min(benchmark_rewards), np.max(benchmark_rewards)),
                      end='')
            print()

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                clipped_action = action
                if args.clip_action and isinstance(envs.action_space,
                                                   gym.spaces.Box):
                    if args.shift_action:
                        clipped_action = 0.5 * clipped_action + action_mid
                    clipped_action = torch.max(
                        torch.min(clipped_action, action_high), action_low)

                obs, reward, done, infos = eval_envs.step(clipped_action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            eval_eprew = np.mean(eval_episode_rewards)
            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), eval_eprew))

        if len(episode_rewards) and j % args.save_interval == 0 and save_dir != "":
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]
            ep_rewstr = ("%d" % train_eprew).replace("-", "n")
            save_filename = os.path.join(
                save_dir, './checkpoint-%d-%s.pt' % (j, ep_rewstr))
            torch.save(save_model, save_filename)

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, args.env_name, args.algo,
                                  args.num_frames)
            except IOError:
                pass
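# --- Illustrative sketch (not part of the scripts above) -------------------
# The args.clip_action / args.shift_action handling above maps a roughly
# zero-centred Gaussian policy output into a gym Box action space: optionally
# scale by 0.5 and re-centre at the midpoint of the bounds, then clamp to
# [low, high]. A standalone version of that transform; the helper name is
# ours, not the repo's.
import gym
import torch


def clip_and_shift(action, space, shift=True):
    high = torch.as_tensor(space.high, dtype=action.dtype)
    low = torch.as_tensor(space.low, dtype=action.dtype)
    if shift:
        # Re-centre the zero-mean output at the midpoint of the bounds.
        action = 0.5 * action + 0.5 * (high + low)
    # Clamp element-wise into the Box.
    return torch.max(torch.min(action, high), low)


if __name__ == '__main__':
    box = gym.spaces.Box(low=-2.0, high=2.0, shape=(3,))
    raw = torch.tensor([[-3.0, 0.0, 3.0]])
    print(clip_and_shift(raw, box))  # tensor([[-1.5000,  0.0000,  1.5000]])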
args = parser.parse_args()
args.det = not args.non_det

env = make_vec_envs(args.env_name, args.seed + 1000, 1, None, None,
                    args.add_timestep, device='cpu',
                    allow_early_resets=False)

# Get a render function
render_func = get_render_func(env)

# We need to use the same statistics for normalization as used in training
actor_critic, ob_rms = \
    torch.load(os.path.join(args.load_dir, args.env_name + ".pt"))

vec_norm = get_vec_normalize(env)
if vec_norm is not None:
    vec_norm.eval()
    vec_norm.ob_rms = ob_rms

recurrent_hidden_states = torch.zeros(1,
                                      actor_critic.recurrent_hidden_state_size)
masks = torch.zeros(1, 1)

if render_func is not None:
    render_func('human')

obs = env.reset()

if args.env_name.find('Bullet') > -1:
    import pybullet as p
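# --- Illustrative sketch (not part of the script above) --------------------
# The ob_rms hand-off above matters because VecNormalize-style wrappers
# standardise observations with running mean/variance statistics collected
# during training; at evaluation time the wrapper is put in eval() mode so
# those statistics are applied but no longer updated. A minimal stand-in for
# that behaviour; the class and function below mirror the pattern only and
# are not baselines' actual implementation.
import numpy as np


class RunningMeanStd:
    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, x):
        # Standard parallel-variance combine of the batch with the running stats.
        batch_mean, batch_var, n = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        tot = self.count + n
        self.mean = self.mean + delta * n / tot
        self.var = (self.var * self.count + batch_var * n +
                    delta ** 2 * self.count * n / tot) / tot
        self.count = tot


def normalize_obs(obs, ob_rms, clip=10.0, eps=1e-8):
    # Eval mode: use the stored statistics, do not update them.
    return np.clip((obs - ob_rms.mean) / np.sqrt(ob_rms.var + eps),
                   -clip, clip)


if __name__ == '__main__':
    rms = RunningMeanStd(shape=(4,))
    rms.update(np.random.randn(100, 4) * 3.0 + 1.0)  # "training" phase
    print(normalize_obs(np.ones((1, 4)), rms).shape)  # (1, 4)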
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        vizz = Visdom(port=args.port)
        win = None
        winloss = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    # Initialize bw model
    if args.bw:
        bw_model = bw_module(actor_critic, args, agent.optimizer,
                             envs.action_space, envs.observation_space)

    vis_timesteps = []
    vis_loss = []

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            # Add stuff to the buffer
            if args.bw:
                bw_model.step(rollouts.obs[step].detach().cpu().numpy(),
                              action.detach().cpu().numpy(),
                              reward.detach().cpu().numpy(), done,
                              obs.detach().cpu().numpy())

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # Do BW steps
        if args.bw and (j % args.n_a2c == 0):
            if not args.consistency:
                l_bw, l_imi = 0.0, 0.0
                for _ in range(args.n_bw):
                    l_bw += bw_model.train_bw_model(j)
                l_bw /= args.n_bw
                for _ in range(args.n_imi):
                    l_imi += bw_model.train_imitation(j)
                l_imi /= args.n_imi
            else:
                l_bw, l_fw = 0.0, 0.0
                for _ in range(args.n_bw):
                    l_bw_, l_fw_ = bw_model.train_bw_model(j)
                    l_bw += l_bw_
                    l_fw += l_fw_
                l_bw /= args.n_bw
                l_fw /= args.n_bw

                l_imi, l_cons = 0.0, 0.0
                for _ in range(args.n_imi):
                    l_imi_, l_cons_ = bw_model.train_imitation(j)
                    l_imi += l_imi_
                    l_cons += l_cons_  # accumulate (was a no-op `l_cons_ += l_cons_`)
                l_imi /= args.n_imi
                l_cons /= args.n_imi

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                env_name = args.env_name
                if args.bw:
                    env_name += 'BW'
                win = visdom_plot(viz, win, args.log_dir, env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass

        # Save to Visdom plots
        if args.vis and (j % args.vis_interval == 0):
            if args.bw and args.consistency:
                vis_loss.append(
                    [value_loss, action_loss, l_bw, l_imi, l_fw, l_cons])
                legend = [
                    'Value loss', 'Action loss', 'BW Loss', 'IMI loss',
                    'FW Loss', 'CONST loss'
                ]
                title = args.env_name + '-' + 'bw' + '-' + 'consistency' + args.title
            elif args.bw:
                vis_loss.append([value_loss, action_loss, l_bw, l_imi])
                legend = ['Value loss', 'Action loss', 'BW Loss', 'IMI loss']
                title = args.env_name + '-' + 'bw' + args.title
            else:
                vis_loss.append([value_loss, action_loss])
                legend = ['Value loss', 'Action loss']
                title = args.env_name + '-' + 'vanilla'

            vis_timesteps.append(
                (j + 1) * (args.num_processes * args.num_steps))
            # vis_rewards.append(final_rewards.mean())
            # vis_rewards.append(np.mean(reward_queue))
            # if win is None:
            #     win = vizz.line(Y=np.array(vis_rewards), X=np.array(vis_timesteps),
            #                     opts=dict(title=title, xlabel='Timesteps',
            #                               ylabel='Avg Rewards'))
            # vizz.line(Y=np.array(vis_rewards), X=np.array(vis_timesteps), win=win,
            #           update='replace', opts=dict(title=title, xlabel='Timesteps',
            #                                       ylabel='Avg Rewards'))

            if winloss is None:
                winloss = vizz.line(Y=np.array(vis_loss),
                                    X=np.array(vis_timesteps),
                                    opts=dict(title=title,
                                              xlabel='Timesteps',
                                              ylabel='Losses',
                                              legend=legend))
            vizz.line(Y=np.array(vis_loss),
                      X=np.array(vis_timesteps),
                      win=winloss,
                      update='replace',
                      opts=dict(title=title,
                                xlabel='Timesteps',
                                ylabel='Losses',
                                legend=legend))
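# --- Illustrative sketch (not part of the script above) --------------------
# The plotting code above follows the usual visdom pattern: the first call to
# viz.line() creates a window and returns its handle, and later calls pass
# win=<handle> with update='replace' (or 'append') to redraw the same window.
# A minimal standalone version of that pattern, assuming a visdom server is
# reachable on the default port; the helper name and the dummy curve values
# are ours.
import numpy as np
from visdom import Visdom


def plot_losses(viz, win, xs, ys, legend, title):
    opts = dict(title=title, xlabel='Timesteps', ylabel='Losses',
                legend=legend)
    if win is None:
        # First call: create the window and keep its handle.
        return viz.line(Y=np.array(ys), X=np.array(xs), opts=opts)
    # Subsequent calls: redraw the existing window in place.
    viz.line(Y=np.array(ys), X=np.array(xs), win=win, update='replace',
             opts=opts)
    return win


if __name__ == '__main__':
    viz = Visdom(port=8097)
    win = None
    xs, ys = [], []
    for step in range(1, 4):
        xs.append(step * 1000)
        ys.append([1.0 / step, 0.5 / step])  # e.g. value loss, action loss
        win = plot_losses(viz, win, xs, ys,
                          legend=['Value loss', 'Action loss'],
                          title='demo-losses')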