def evaluate(actor_critic, env_name, seed, num_processes):
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes)

    eval_episode_rewards = []
    obs = eval_envs.reset()
    sum_re = torch.zeros(num_processes, 1)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            _, action, _ = actor_critic.act(obs, deterministic=True)

        # Observe reward and next obs
        obs, reward, done, infos = eval_envs.step(action)
        sum_re += reward

        if any(done):
            for i in range(len(done)):
                if done[i]:
                    eval_episode_rewards.append(sum_re[i].item())
                    sum_re[i] *= 0

    eval_envs.close()

    log = " Evaluation using {} episodes: mean reward {:.5f}".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards))
    return log
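
# Quick usage sketch (hypothetical, assuming the make_vec_envs and Policy
# definitions used by the PPO script further below): build envs, create a
# policy, and print the evaluation summary string returned above.
if __name__ == '__main__':
    _envs = make_vec_envs('PongNoFrameskip-v4', 1, 4)
    _policy = Policy(_envs.observation_space.shape, _envs.action_space)
    print(evaluate(_policy, 'PongNoFrameskip-v4', 1, 4))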
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes, device,
             is_limit_action=False):
    # print('start making eval envs')
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              device, gamma=None)
    # print('end of making')
    norm_envs = get_vec_normalize(eval_envs)
    norm_envs.eval()
    norm_envs.ob_rms = ob_rms

    eval_episode_rewards = []
    obs = eval_envs.reset()
    # print(obs)
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)
    sum_re = torch.zeros(num_processes, 1)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs,
                eval_recurrent_hidden_states,
                eval_masks,
                deterministic=True)
            # action = action + 1
            # print(action)

        # Observe reward and next obs
        if is_limit_action:
            obs, reward, done, infos = eval_envs.step(action + 1)
        else:
            obs, reward, done, infos = eval_envs.step(action)
        sum_re += reward

        if any(done):
            # print(infos)
            for i in range(len(done)):
                if done[i]:
                    eval_episode_rewards.append(sum_re[i].item())
                    # print(done)
                    # print(sum_re[i])
                    sum_re[i] *= 0

    eval_envs.close()

    log = " Evaluation using {} episodes: mean reward {:.5f}".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards))
    return log
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes, eval_log_dir,
             device, custom_gym, save_path):
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              None, eval_log_dir, device, True, custom_gym)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []
    eval_episode_length = []
    eval_episode_success_rate = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    while len(eval_episode_rewards) < num_processes * 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs,
                eval_recurrent_hidden_states,
                eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, _, done, infos = eval_envs.step(action)

        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32,
            device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])
                eval_episode_length.append(info['episode']['l'])
                eval_episode_success_rate.append(
                    info['was_successful_trajectory'])

    eval_envs.close()

    print(
        " Evaluation using {} episodes: mean reward {:.5f}, mean_length {:.2f}, mean_success {:.2f} \n"
        .format(len(eval_episode_rewards), np.mean(eval_episode_rewards),
                np.mean(eval_episode_length),
                np.mean(eval_episode_success_rate)))

    # Keep a checkpoint of the model with the best evaluation success rate so far.
    if actor_critic.max_eval_success_rate <= np.mean(eval_episode_success_rate):
        actor_critic.max_eval_success_rate = np.mean(eval_episode_success_rate)
        torch.save([
            actor_critic,
            getattr(utils.get_vec_normalize(eval_envs), 'ob_rms', None)
        ], os.path.join(save_path, str(seed) + "_best_test.pt"))
def __init__(self, **args):
    torch.set_num_threads(1)

    self.load_dir = args['load_dir']
    self.det = args['deterministic_evaluation']
    self.algorithm = args['algorithm']
    self.env_name = args['env_name']
    self.grayscale = args['grayscale']
    self.skip_frame = args['skip_frame']
    self.num_frame_stack = args['num_frame_stack']
    self.scale = args['reward_scaling']
    self.seed = args['seed']

    try:
        os.makedirs(args['log_dir'])
    except OSError:
        files = glob.glob(os.path.join(args['log_dir'], '*.monitor.csv'))
        for f in files:
            os.remove(f)

    self.eval_log_dir = args['log_dir'] + "_eval"
    try:
        os.makedirs(self.eval_log_dir)
    except OSError:
        files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    self.env = make_vec_envs(self.env_name, self.seed + 1000, 1, None, None,
                             'cpu', False, self.grayscale, self.skip_frame,
                             self.scale,
                             num_frame_stack=self.num_frame_stack)

    # Get a render function
    self.render_func = get_render_func(self.env)

    # We need to use the same statistics for normalization as used in training
    self.actor_critic, self.ob_rms = \
        torch.load(os.path.join(self.load_dir, self.algorithm,
                                self.env_name + ".pt"),
                   map_location='cpu')
    self.actor_critic.to('cpu')

    self.vec_norm = get_vec_normalize(self.env)
    if self.vec_norm is not None:
        self.vec_norm.eval()
        self.vec_norm.ob_rms = self.ob_rms
def test():
    from envs import make_vec_envs

    envs = make_vec_envs('PongNoFrameskip-v4', 2018, 2, 0.99, './gym/', True,
                         'cuda:0', False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': False})

    print(actor_critic.get_weight_vector().shape)
    print(sum(p.numel() for p in actor_critic.parameters() if p.requires_grad))

    zero = np.zeros(actor_critic.get_weight_vector().shape)
    actor_critic.set_weight_vector(zero, device='cuda:0')
def __init__(self, args, actor_critic, device):
    eval_args = args
    #eval_args.render = True
    self.device = device
    #if args.model == 'fractal':
    #    for i in range(-1, args.n_recs):
    #        eval_log_dir = args.log_dir + "_eval_col_{}".format(i)
    #        try:
    #            os.makedirs(eval_log_dir)
    #        except OSError:
    #            files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv'))
    #            for f in files:
    #                os.remove(f)
    #        setattr(self, 'eval_log_dir_col_{}'.format(i), eval_log_dir)
    self.eval_log_dir = args.log_dir + "_eval"
    try:
        os.makedirs(self.eval_log_dir)
    except OSError:
        files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    self.num_eval_processes = 2
    self.eval_envs = make_vec_envs(
        eval_args.env_name, eval_args.seed + self.num_eval_processes,
        self.num_eval_processes, eval_args.gamma, self.eval_log_dir,
        eval_args.add_timestep, self.device, True, args=eval_args)

    self.vec_norm = get_vec_normalize(self.eval_envs)
    if self.vec_norm is not None:
        self.vec_norm.eval()
        self.vec_norm.ob_rms = get_vec_normalize(self.eval_envs).ob_rms

    self.actor_critic = actor_critic
    self.tstart = time.time()

    fieldnames = ['r', 'l', 't']
    if args.model == 'fractal':
        n_cols = actor_critic.base.n_cols
        for i in range(-1, n_cols):
            log_file_col = open('{}/col_{}_eval.csv'.format(self.eval_log_dir, i),
                                mode='w')
            setattr(self, 'log_file_col_{}'.format(i), log_file_col)
            writer_col = csv.DictWriter(log_file_col, fieldnames=fieldnames)
            setattr(self, 'writer_col_{}'.format(i), writer_col)
            writer_col.writeheader()
            log_file_col.flush()
    else:
        self.log_file = open('{}/col_evals.csv'.format(self.eval_log_dir),
                             mode='w')
        self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
        self.writer.writeheader()
        self.log_file.flush()

    self.args = eval_args
def main():
    num_episodes = int(args.num_eval_episodes)
    args.device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.set_num_threads(1)

    envs = make_vec_envs(args)
    obs, infos = envs.reset()

    for ep_num in range(num_episodes):
        for step in range(args.max_episode_length):
            action = torch.randint(0, 3, (args.num_processes, ))
            obs, rew, done, infos = envs.step(action)
            if done:
                break

    print("Test successfully completed")
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes, eval_log_dir,
             device):
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              None, eval_log_dir, device, True)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs,
                eval_recurrent_hidden_states,
                eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, _, done, infos = eval_envs.step(action)

        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32,
            device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()

    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards)))
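
# Usage sketch (hypothetical helper name): in the training scripts further
# below, this evaluate() is called with the observation-normalization
# statistics taken from the training envs, along the lines of:
def run_periodic_eval(actor_critic, envs, args, eval_log_dir, device):
    ob_rms = utils.get_vec_normalize(envs).ob_rms
    evaluate(actor_critic, ob_rms, args.env_name, args.seed,
             args.num_processes, eval_log_dir, device)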
def main():
    log_name = 'ppo_no_input_process'
    train_log = Log(log_name + '_train_log')
    evl_log = Log(log_name + '_evaluation_log')

    torch.set_num_threads(1)

    envs = make_vec_envs(args_env_name, args_seed, args_num_processes)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    agent = PPO(actor_critic,
                args_clip_param,
                args_ppo_epoch,
                args_num_mini_batch,
                args_value_loss_coef,
                args_entropy_coef,
                lr=args_lr,
                eps=args_eps,
                max_grad_norm=args_max_grad_norm)

    rollouts = RolloutStorage(args_num_steps, args_num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)

    num_updates = int(args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):
        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob \
                    = actor_critic.act(rollouts.obs[step])

            obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        sum_re[i] *= 0

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1])

        rollouts.compute_returns(next_value, args_gamma)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = "E {}, N_steps {}, FPS {} mean/median" \
                        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
                        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
                            j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss)
            # print(logstring)
            train_log.log(logstring)

        # if True:
        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ev_result = evaluate(actor_critic, args_env_name, args_seed,
                                 args_num_processes)
            ev_log_string = 'steps:' + str(total_num_steps) + '. ' + ev_result
            evl_log.log(ev_log_string)
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    """
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    """

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)
    start = time.time()

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            """
            for info in infos:
                if 'episode' in info.keys():
                    print(reward)
                    episode_rewards.append(info['episode']['r'])
            """
            # FIXME: works only for environments with sparse rewards
            for idx, eps_done in enumerate(done):
                if eps_done:
                    episode_rewards.append(reward[idx])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            print('Saving model')
            print()

            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n"
                .format(
                    j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards),
                    np.count_nonzero(np.greater(episode_rewards, 0)) /
                    len(episode_rewards)))

        if args.eval_interval is not None and len(
                episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to remove updates
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip(
                            (obs - self.ob_rms.mean) /
                            np.sqrt(self.ob_rms.var + self.epsilon),
                            -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        """
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
        """

    envs.close()
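
# Standalone sketch of the VecNormalize observation filter that the `_obfilt`
# hack above overrides (illustrative only; the epsilon and clipob defaults are
# assumptions, not taken from this codebase):
def normalize_obs(obs, ob_mean, ob_var, epsilon=1e-8, clipob=10.0):
    # Standardize with running mean/variance statistics, then clip to a fixed range.
    return np.clip((obs - ob_mean) / np.sqrt(ob_var + epsilon), -clipob, clipob)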
device = torch.device("cuda" if use_cuda else "cpu")

from collections import deque

# num_envs = 1
# env_name = 'BreakoutNoFrameskip-v4'
env_name = 'PongNoFrameskip-v4'

# baselines' env.make
from envs import make_vec_envs

num_steps = 5
num_processes = 16

envs = make_vec_envs(env_name, 1, num_processes, 0.99,
                     '/home/realiti/Desktop/tmp', device, False)  # fix for ubuntu


def init(module, weight_init, bias_init, gain=1):
    weight_init(module.weight.data, gain=gain)
    bias_init(module.bias.data)
    return module


class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)


class Model(nn.Module):
parser.add_argument('--no-cuda',
                    action='store_true',
                    default=False,
                    help='disables CUDA')
parser.add_argument('--no-realtime',
                    action='store_true',
                    default=False,
                    help='disables realtime mode and rendering for obt env')
args = parser.parse_args()

args.cuda = not args.no_cuda and torch.cuda.is_available()
args.realtime = not args.no_realtime

torch.set_num_threads(1)
device = torch.device("cuda:0" if args.cuda else "cpu")

num_env = 1
env = make_vec_envs(args.env_name,
                    args.seed + 1000,
                    num_env,
                    gamma=None,
                    no_norm=args.no_norm,
                    num_stack=args.num_stack,
                    log_dir=None,
                    add_timestep=args.add_timestep,
                    device=device,
                    eval=True,
                    allow_early_resets=False,
                    realtime=args.realtime)

# Get a render function
render_func = None
tmp_env = env
while True:
    if hasattr(tmp_env, 'envs'):
        render_func = tmp_env.envs[0].render
        break
    elif hasattr(tmp_env, 'venv'):
        tmp_env = tmp_env.venv
    elif hasattr(tmp_env, 'env'):
        tmp_env = tmp_env.env
    else:
        break
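
# The wrapper-unwrapping loop above recurs in several of these scripts; a helper
# of the kind referenced elsewhere as get_render_func() could be sketched as
# follows (an assumption, not necessarily the exact upstream implementation):
def get_render_func(venv):
    # Recursively unwrap vectorized-env wrappers until a render handle is found.
    if hasattr(venv, 'envs'):
        return venv.envs[0].render
    elif hasattr(venv, 'venv'):
        return get_render_func(venv.venv)
    elif hasattr(venv, 'env'):
        return get_render_func(venv.env)
    return None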
def main():
    args = get_args()
    args.num_processes = 16
    args.env_name = 'BreakoutNoFrameskip-v4'

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = A2C_ACKTR(actor_critic,
                          args.value_loss_coef,
                          args.entropy_coef,
                          lr=args.lr,
                          eps=args.eps,
                          alpha=args.alpha,
                          max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
def __init__(self, args, actor_critic, device, envs=None, vec_norm=None,
             frozen=False):
    '''frozen: we are not in the main training loop, but evaluating a frozen
    model separately'''
    if frozen:
        self.win_eval = None
    past_steps = args.past_steps
    self.frozen = frozen
    #eval_args.render = True
    self.device = device
    #if args.model == 'fractal':
    #    for i in range(-1, args.n_recs):
    #        eval_log_dir = args.log_dir + "_eval_col_{}".format(i)
    #        try:
    #            os.makedirs(eval_log_dir)
    #        except OSError:
    #            files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv'))
    #            for f in files:
    #                os.remove(f)
    #        setattr(self, 'eval_log_dir_col_{}'.format(i), eval_log_dir)
    if frozen:
        if 'GameOfLife' in args.env_name:
            self.eval_log_dir = args.log_dir + "/eval_{}-steps_w{}_{}rec_{}s_{}pl".format(
                past_steps, args.map_width, args.n_recs, args.max_step,
                args.prob_life, '.1f')
        else:
            self.eval_log_dir = args.log_dir + "/eval_{}-steps_w{}_{}rec_{}s".format(
                past_steps, args.map_width, args.n_recs, args.max_step, '.1f')
        merge_col_logs = True
    else:
        self.eval_log_dir = args.log_dir + "_eval"
        merge_col_logs = False
    try:
        os.makedirs(self.eval_log_dir)
    except OSError:
        files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
        files += glob.glob(os.path.join(self.eval_log_dir, '*_eval.csv'))
        if args.overwrite:
            for f in files:
                os.remove(f)
        elif files:
            merge_col_logs = True

    self.args = args
    self.actor_critic = actor_critic
    self.num_eval_processes = args.num_processes
    if envs:
        self.eval_envs = envs
        self.vec_norm = vec_norm
    else:
        #print('making envs in Evaluator: ', self.args.env_name,
        #      self.args.seed + self.num_eval_processes,
        #      self.num_eval_processes, self.args.gamma, self.eval_log_dir,
        #      self.args.add_timestep, self.device, True, self.args)
        self.eval_envs = make_vec_envs(
            self.args.env_name, self.args.seed + self.num_eval_processes,
            self.num_eval_processes, self.args.gamma, self.eval_log_dir,
            self.args.add_timestep, self.device, False, args=self.args)
        self.vec_norm = get_vec_normalize(self.eval_envs)
        if self.vec_norm is not None:
            self.vec_norm.eval()
            self.vec_norm.ob_rms = get_vec_normalize(self.eval_envs).ob_rms

    self.tstart = time.time()

    fieldnames = ['r', 'l', 't']
    model = actor_critic.base
    if args.model == 'FractalNet' or args.model == 'fractal':
        n_cols = model.n_cols
    else:
        n_cols = 0
    self.plotter = Plotter(n_cols, self.eval_log_dir, self.num_eval_processes,
                           max_steps=self.args.max_step)
    eval_cols = range(-1, n_cols)
    if args.model == 'fixed' and model.RAND:
        eval_cols = model.eval_recs
    if eval_cols is not None:
        for i in eval_cols:
            log_file = '{}/col_{}_eval.csv'.format(self.eval_log_dir, i)
            if merge_col_logs and os.path.exists(log_file):
                merge_col_log = True
            else:
                merge_col_log = False
            if merge_col_log:
                if len(eval_cols) > 1 and i == eval_cols[-2] and self.args.auto_expand:
                    # problem if we saved model after auto-expanding, without
                    # first evaluating!
                    # for the newly added column, we duplicate the last col.'s records
                    new_col_log_file = '{}/col_{}_eval.csv'.format(
                        self.eval_log_dir, i + 1)
                    copyfile(log_file, new_col_log_file)
                old_log = '{}_old'.format(log_file)
                os.rename(log_file, old_log)
            log_file_col = open(log_file, mode='w')
            setattr(self, 'log_file_col_{}'.format(i), log_file_col)
            writer_col = csv.DictWriter(log_file_col, fieldnames=fieldnames)
            setattr(self, 'writer_col_{}'.format(i), writer_col)
            if merge_col_log:
                with open(old_log, newline='') as old:
                    reader = csv.DictReader(old, fieldnames=('r', 'l', 't'))
                    h = 0
                    try:
                        # in case of null bytes resulting from interrupted logging
                        for row in reader:
                            if h > 1:
                                row['t'] = 0.0001 * h  # HACK: false times for past logs to maintain order
                                writer_col.writerow(row)
                            h += 1
                    except csv.Error:
                        h_i = 0
                        for row in reader:
                            if h_i > h:
                                row['t'] = 0.0001 * h_i  # HACK: false times for past logs to maintain order
                                writer_col.writerow(row)
                            h_i += 1
                os.remove(old_log)
            else:
                writer_col.writeheader()
            log_file_col.flush()
def main(): import random import gym_micropolis import game_of_life args = get_args() args.log_dir = args.save_dir + '/logs' assert args.algo in ['a2c', 'ppo', 'acktr'] if args.recurrent_policy: assert args.algo in ['a2c', 'ppo'], \ 'Recurrent policy is not implemented for ACKTR' num_updates = int(args.num_frames) // args.num_steps // args.num_processes torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) graph_name = args.save_dir.split('trained_models/')[1].replace('/', ' ') actor_critic = False agent = False past_steps = 0 try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: if args.overwrite: os.remove(f) else: pass torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None win_eval = None if 'GameOfLife' in args.env_name: print('env name: {}'.format(args.env_name)) num_actions = 1 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, None, args=args) if isinstance(envs.observation_space, gym.spaces.Discrete): num_inputs = envs.observation_space.n elif isinstance(envs.observation_space, gym.spaces.Box): if len(envs.observation_space.shape) == 3: in_w = envs.observation_space.shape[1] in_h = envs.observation_space.shape[2] else: in_w = 1 in_h = 1 num_inputs = envs.observation_space.shape[0] if isinstance(envs.action_space, gym.spaces.Discrete): out_w = 1 out_h = 1 if 'Micropolis' in args.env_name: #otherwise it's set if args.power_puzzle: num_actions = 1 else: num_actions = 19 # TODO: have this already from env elif 'GameOfLife' in args.env_name: num_actions = 1 else: num_actions = envs.action_space.n elif isinstance(envs.action_space, gym.spaces.Box): if len(envs.action_space.shape) == 3: out_w = envs.action_space.shape[1] out_h = envs.action_space.shape[2] elif len(envs.action_space.shape) == 1: out_w = 1 out_h = 1 num_actions = envs.action_space.shape[-1] print('num actions {}'.format(num_actions)) if args.auto_expand: args.n_recs -= 1 actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'map_width': args.map_width, 'num_actions': num_actions, 'recurrent': args.recurrent_policy, 'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs, 'out_w': out_w, 'out_h': out_h}, curiosity=args.curiosity, algo=args.algo, model=args.model, args=args) if args.auto_expand: args.n_recs += 1 evaluator = None if not agent: agent = init_agent(actor_critic, args) #saved_model = os.path.join(args.save_dir, args.env_name + '.pt') if args.load_dir: saved_model = os.path.join(args.load_dir, args.env_name + '.tar') else: saved_model = os.path.join(args.save_dir, args.env_name + '.tar') vec_norm = get_vec_normalize(envs) if os.path.exists(saved_model) and not args.overwrite: checkpoint = torch.load(saved_model) saved_args = checkpoint['args'] actor_critic.load_state_dict(checkpoint['model_state_dict']) #for o, l in zip(agent.optimizer.state_dict, checkpoint['optimizer_state_dict']): # print(o, l) #print(agent.optimizer.state_dict()['param_groups']) #print('\n') #print(checkpoint['model_state_dict']) actor_critic.to(device) actor_critic.cuda() #agent = init_agent(actor_critic, saved_args) agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) if args.auto_expand: if not args.n_recs - saved_args.n_recs == 1: print('can expand by 1 rec only from saved model, not {}'.format(args.n_recs - saved_args.n_recs)) 
raise Exception actor_critic.base.auto_expand() print('expanded net: \n{}'.format(actor_critic.base)) past_steps = checkpoint['past_steps'] ob_rms = checkpoint['ob_rms'] past_steps = next(iter(agent.optimizer.state_dict()['state'].values()))['step'] print('Resuming from step {}'.format(past_steps)) #print(type(next(iter((torch.load(saved_model)))))) #actor_critic, ob_rms = \ # torch.load(saved_model) #agent = \ # torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt')) #if not agent.optimizer.state_dict()['state'].values(): # past_steps = 0 #else: # raise Exception if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms saved_args.num_frames = args.num_frames saved_args.vis_interval = args.vis_interval saved_args.eval_interval = args.eval_interval saved_args.overwrite = args.overwrite saved_args.n_recs = args.n_recs saved_args.intra_shr = args.intra_shr saved_args.inter_shr = args.inter_shr saved_args.map_width = args.map_width saved_args.render = args.render saved_args.print_map = args.print_map saved_args.load_dir = args.load_dir saved_args.experiment_name = args.experiment_name saved_args.log_dir = args.log_dir saved_args.save_dir = args.save_dir args = saved_args actor_critic.to(device) if 'LSTM' in args.model: recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size() else: recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size if args.curiosity: rollouts = CuriosityRolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args) else: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, args=args) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() model = actor_critic.base reset_eval = False plotter = None if args.model == 'FractalNet' or args.model == 'fractal': n_cols = model.n_cols if args.rule == 'wide1' and args.n_recs > 3: col_step = 3 else: col_step = 1 else: n_cols = 0 col_step = 1 for j in range(past_steps, num_updates): if reset_eval: print('post eval reset') obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) reset_eval = False #if np.random.rand(1) < 0.1: # envs.venv.venv.remotes[1].send(('setRewardWeights', None)) if args.model == 'FractalNet' and args.drop_path: #if args.intra_shr and args.inter_shr: # n_recs = np.randint # model.set_n_recs() model.set_drop_path() if args.model == 'fixed' and model.RAND: model.num_recursions = random.randint(1, model.map_width * 2) player_act = None for step in range(args.num_steps): # Sample actions with torch.no_grad(): if args.render: if args.num_processes == 1: if not ('Micropolis' in args.env_name or 'GameOfLife' in args.env_name): envs.venv.venv.render() else: pass else: if not ('Micropolis' in args.env_name or 'GameOfLife' in args.env_name): envs.render() envs.venv.venv.render() else: pass #envs.venv.venv.remotes[0].send(('render', None)) #envs.venv.venv.remotes[0].recv() value, action, action_log_probs, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], player_act=player_act, icm_enabled=args.curiosity, deterministic=False) # Observe reward and next obs obs, reward, done, infos = envs.step(action) player_act = None if args.render: if infos[0]: if 'player_move' in infos[0].keys(): player_act = infos[0]['player_move'] if 
args.curiosity: # run icm with torch.no_grad(): feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act( (rollouts.obs[step], obs, action_bin) ) intrinsic_reward = args.eta * ((feature_state - feature_state_pred).pow(2)).sum() / 2. if args.no_reward: reward = 0 reward += intrinsic_reward.cpu() for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.curiosity: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks, feature_state, feature_state_pred, action_bin, action_dist_pred) else: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.curiosity: value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(rollouts) else: value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() total_num_steps = (j + 1) * args.num_processes * args.num_steps if not dist_entropy: dist_entropy = 0 if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \ dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},". format(j, total_num_steps, int((total_num_steps - past_steps * args.num_processes * args.num_steps) / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if args.curiosity: print("fwd/inv icm loss {:.1f}/{:.1f}\n". 
format( fwd_loss, inv_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): if evaluator is None: evaluator = Evaluator(args, actor_critic, device, envs=envs, vec_norm=vec_norm) model = evaluator.actor_critic.base col_idx = [-1, *range(0, n_cols, col_step)] for i in col_idx: evaluator.evaluate(column=i) #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes * args.max_step # making sure the evaluator plots the '-1'st column (the overall net) if args.vis: #and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win_eval = evaluator.plotter.visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, args.algo, args.num_frames, n_graphs= col_idx) except IOError: pass #elif args.model == 'fixed' and model.RAND: # for i in model.eval_recs: # evaluator.evaluate(num_recursions=i) # win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, # args.algo, args.num_frames, n_graphs=model.eval_recs) #else: # evaluator.evaluate(column=-1) # win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name, # args.algo, args.num_frames) reset_eval = True if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None) save_model = copy.deepcopy(actor_critic) save_agent = copy.deepcopy(agent) if args.cuda: save_model.cpu() optim_save = save_agent.optimizer.state_dict() # experimental: torch.save({ 'past_steps': next(iter(agent.optimizer.state_dict()['state'].values()))['step'], 'model_state_dict': save_model.state_dict(), 'optimizer_state_dict': optim_save, 'ob_rms': ob_rms, 'args': args }, os.path.join(save_path, args.env_name + ".tar")) #save_model = [save_model, # getattr(get_vec_normalize(envs), 'ob_rms', None)] #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) #save_agent = copy.deepcopy(agent) #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt')) #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt")) if args.vis and j % args.vis_interval == 0: if plotter is None: plotter = Plotter(n_cols, args.log_dir, args.num_processes) try: # Sometimes monitor doesn't properly flush the outputs win = plotter.visdom_plot(viz, win, args.log_dir, graph_name, args.algo, args.num_frames) except IOError: pass
#past_steps = checkpoint['past_steps']
#args.past_steps = past_steps
env_name = saved_args.env_name
if 'Micropolis' in env_name:
    args.power_puzzle = saved_args.power_puzzle
if not args.evaluate and not 'GoLMulti' in env_name:
    # assume we just want to observe/interact w/ a single env.
    args.num_proc = 1
dummy_args = args
env = make_vec_envs(env_name, args.seed + 1000, 1, None, args.load_dir,
                    args.add_timestep, device=device,
                    allow_early_resets=False, args=dummy_args)
print(args.load_dir)

# Get a render function
# render_func = get_render_func(env)

if isinstance(env.observation_space, gym.spaces.Discrete):
    in_width = 1
    num_inputs = env.observation_space.n
elif isinstance(env.observation_space, gym.spaces.Box):
    if len(env.observation_space.shape) == 3:
        in_w = env.observation_space.shape[1]
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    average_actor_critic = Policy(
        envs.observation_space.shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    average_actor_critic.load_state_dict(actor_critic.state_dict())
    actor_critic.to(device)
    average_actor_critic.to(device)

    agent = algo.ACER_AGENT(actor_critic,
                            average_actor_critic,
                            args.value_loss_coef,
                            args.entropy_coef,
                            args.gamma,
                            args.clip,
                            args.no_trust_region,
                            args.alpha,
                            args.delta,
                            lr=args.lr,
                            eps=args.eps,
                            rms_alpha=args.rms_alpha,
                            max_grad_norm=args.max_grad_norm)

    buffer = Buffer(args.num_steps, args.num_processes,
                    envs.observation_space.shape, envs.action_space,
                    actor_critic.recurrent_hidden_state_size, args.buffer_size)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    off_rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    off_rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    acer = algo.ACER(actor_critic, rollouts, off_rollouts, buffer,
                     episode_rewards, agent, envs)

    start = time.time()
    for j in range(num_updates):
        # On-policy ACER
        value_loss, action_loss, dist_entropy = acer.call(on_policy=True)

        if args.replay_ratio > 0 and buffer.has_atleast(args.replay_start):
            # Off-policy ACER
            n = np.random.poisson(args.replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \nLast {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\ndist_entropy {:.1f}, value/action loss {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            eval_episode_rewards = []

            obs = eval_envs.reset().to(device)
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, _, _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, _, done, infos = eval_envs.step(action)
                obs = obs.to(device)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done]).to(device)

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
def main():
    torch.manual_seed(args_seed)
    torch.cuda.manual_seed_all(args_seed)

    device = torch.device("cuda:0" if args_cuda else "cpu")

    train_log = Log(log_name + '_train_log')
    evl_log = Log(log_name + '_evaluation_log')

    torch.set_num_threads(1)

    envs = make_vec_envs(args_env_name, args_seed, args_num_processes, device,
                         gamma=args_gamma)
    # norm_envs = get_vec_normalize(envs)
    # norm_envs = envs
    # norm_envs.eval()
    # norm_envs.ob_rms = 1
    # print(envs.ob_rms)

    if is_limit_action:
        envs.action_space.n = 3
    print('Number of Actions:', envs.action_space.n)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args_recurrent_policy})
    actor_critic.to(device)
    # print(actor_critic.is_recurrent)
    # print(actor_critic.gru)

    agent = PPO(actor_critic,
                args_clip_param,
                args_ppo_epoch,
                args_num_mini_batch,
                args_value_loss_coef,
                args_entropy_coef,
                lr=args_lr,
                eps=args_eps,
                max_grad_norm=args_max_grad_norm,
                use_clipped_value_loss=args_use_clipped_value_loss)

    rollouts = RolloutStorage(args_num_steps, args_num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    # print(obs)

    num_updates = int(args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):
        if args_use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, args_lr)

        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                # print(action)
                # action = action + 1
                # print(action)

            if is_limit_action:
                obs, reward, done, infos = envs.step(action + 1)
            else:
                obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        # print(done)
                        # print(sum_re[i])
                        sum_re[i] *= 0

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args_gamma, args_use_gae,
                                 args_gae_lambda)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = "E {}, N_steps {}, FPS {} mean/median" \
                        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
                        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
                            j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss)
            # print(logstring)
            train_log.log(logstring)

        # if True:
        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ob_rms = get_vec_normalize(envs).ob_rms
            ev_result = evaluate(actor_critic, ob_rms, args_env_name,
                                 args_seed, args_num_processes, device,
                                 is_limit_action=is_limit_action)
            ev_log_string = 'steps:' + str(total_num_steps) + '. ' + ev_result
            evl_log.log(ev_log_string)
    default='PongNoFrameskip-v4',
    help='environment to train on (default: PongNoFrameskip-v4)')
parser.add_argument(
    '--load-dir',
    default='./trained_models/',
    help='directory to save agent logs (default: ./trained_models/)')
parser.add_argument('--add-timestep',
                    action='store_true',
                    default=False,
                    help='add timestep to observations')
args = parser.parse_args()

env = make_vec_envs(args.env_name, args.seed + 1000, 1, None, None,
                    args.add_timestep, device='cpu')

# Get a render function
render_func = None
tmp_env = env
while True:
    if hasattr(tmp_env, 'envs'):
        render_func = tmp_env.envs[0].render
        break
    elif hasattr(tmp_env, 'venv'):
        tmp_env = tmp_env.venv
    elif hasattr(tmp_env, 'env'):
        tmp_env = tmp_env.env
def main():
    saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
    if os.path.exists(saved_model) and not args.overwrite:
        actor_critic, ob_rms = \
            torch.load(saved_model)
        agent = \
            torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
        for i in agent.optimizer.state_dict():
            print(dir(agent.optimizer))
            print(getattr(agent.optimizer, 'steps'))
            print(agent.optimizer.state_dict()[i])
        past_steps = agent.optimizer.steps
    else:
        actor_critic = False
        agent = False
        past_steps = 0

    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        win_eval = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False, None, args=args)

    if actor_critic:
        pass
        # vec_norm = get_vec_normalize(envs)
        # if vec_norm is not None:
        #     vec_norm.eval()
        #     vec_norm.ob_rms = ob_rms
    else:
        actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                              base_kwargs={'map_width': args.map_width,
                                           'num_actions': 18,
                                           'recurrent': args.recurrent_policy},
                              curiosity=args.curiosity, algo=args.algo,
                              model=args.model, args=args)
    actor_critic.to(device)

    evaluator = None

    if not agent:
        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic,
                                            args.value_loss_coef,
                                            args.entropy_coef,
                                            lr=args.lr,
                                            eps=args.eps,
                                            alpha=args.alpha,
                                            max_grad_norm=args.max_grad_norm,
                                            curiosity=args.curiosity,
                                            args=args)
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic,
                                            args.value_loss_coef,
                                            args.entropy_coef,
                                            lr=args.lr,
                                            eps=args.eps,
                                            alpha=args.alpha,
                                            max_grad_norm=args.max_grad_norm,
                                            acktr=True,
                                            curiosity=args.curiosity,
                                            args=args)

    if args.curiosity:
        rollouts = CuriosityRolloutStorage(
            args.num_steps, args.num_processes, envs.observation_space.shape,
            envs.action_space, actor_critic.recurrent_hidden_state_size,
            actor_critic.base.feature_state_size(), args=args)
    else:
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size,
                                  args=args)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates - past_steps):
        if args.drop_path:
            actor_critic.base.get_drop_path()
        player_act = None
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_probs, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    player_act=player_act,
                    icm_enabled=args.curiosity)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            player_act = None
            if args.render:
                if infos[0]:
                    if 'player_move' in infos[0].keys():
                        player_act = infos[0]['player_move']

            if args.curiosity:
                # run icm
                with torch.no_grad():
                    feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act(
                        (rollouts.obs[step], obs, action_bin))
                intrinsic_reward = args.eta * (
                    (feature_state - feature_state_pred).pow(2)).sum() / 2.
                if args.no_reward:
                    reward = 0
                reward += intrinsic_reward.cpu()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if args.curiosity:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks,
                                feature_state, feature_state_pred, action_bin,
                                action_dist_pred)
            else:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.curiosity:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            save_agent = copy.deepcopy(agent)
            torch.save(save_agent,
                       os.path.join(save_path, args.env_name + '_agent.pt'))
            #torch.save(actor_critic.state_dict(),
            #           os.path.join(save_path, args.env_name + "_weights.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if not dist_entropy:
            dist_entropy = 0
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \
dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},".format(
                j, total_num_steps, int(total_num_steps / (end - start)),
                len(episode_rewards), np.mean(episode_rewards),
                np.median(episode_rewards), np.min(episode_rewards),
                np.max(episode_rewards), dist_entropy, value_loss,
                action_loss))
            if args.curiosity:
                print("fwd/inv icm loss {:.1f}/{:.1f}\n".format(
                    fwd_loss, inv_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            if evaluator is None:
                evaluator = Evaluator(args, actor_critic, device)

            if args.model == 'fractal':
                n_cols = evaluator.actor_critic.base.n_cols
                for i in range(-1, n_cols):
                    evaluator.evaluate(column=i)
                #num_eval_frames = (args.num_frames // (args.num_steps *
                #    args.eval_interval * args.num_processes)) * \
                #    args.num_processes * args.max_step
                win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir,
                                       args.env_name, args.algo,
                                       args.num_frames, n_graphs=args.n_recs)
            else:
                evaluator.evaluate(column=None)

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         args.custom_gym)

    base = SEVN
    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'ppo':
        agent = PPO(actor_critic,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode_length = deque(maxlen=10)
    episode_success_rate = deque(maxlen=100)
    episode_total = 0

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    episode_success_rate.append(
                        info['was_successful_trajectory'])
                    episode_total += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()

            writer.add_scalars('Train/Episode Reward', {
                "Reward Mean": np.mean(episode_rewards),
                "Reward Min": np.min(episode_rewards),
                "Reward Max": np.max(episode_rewards)
            }, global_step=total_num_steps)
            writer.add_scalars('Train/Episode Length', {
                "Episode Length Mean": np.mean(episode_length),
                "Episode Length Min": np.min(episode_length),
                "Episode Length Max": np.max(episode_length)
            }, global_step=total_num_steps)
            writer.add_scalar("Train/Episode Reward Mean",
                              np.mean(episode_rewards),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Length Mean",
                              np.mean(episode_length),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Success Rate",
                              np.mean(episode_success_rate),
                              global_step=total_num_steps)

            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main(_):
    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)

    with open(f"configs/{FLAGS.algo}.yaml") as file:
        kwargs = yaml.load(file, Loader=yaml.FullLoader)

    os.makedirs(FLAGS.logs_dir, exist_ok=True)

    tf.random.set_seed(FLAGS.seed)

    envs = make_vec_envs(FLAGS.env_name, FLAGS.seed, kwargs['num_processes'],
                         FLAGS.logs_dir)

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    def get_obs():
        return envs.stackedobs

    def env_step(action):
        next_obs, reward, done, _ = envs.step(action)
        return next_obs, reward.astype(np.float32), done.astype(np.float32)

    batch_size = kwargs['num_steps'] * kwargs['num_processes']

    if FLAGS.algo == 'ppo':
        actor_critic = PPO((-1, *envs.observation_space.shape),
                           envs.action_space.n, FLAGS.entropy_coef,
                           FLAGS.value_loss_coef, FLAGS.gamma, **kwargs)
    else:
        del kwargs['num_processes']
        actor_critic = A2C((-1, *envs.observation_space.shape),
                           envs.action_space.n, FLAGS.entropy_coef,
                           FLAGS.value_loss_coef, FLAGS.gamma, **kwargs)

    num_updates = FLAGS.max_timesteps // batch_size

    val_loss, act_loss, ent_loss = 0, 0, 0

    hparam_str = utils.get_haram_str(env_name=FLAGS.env_name, seed=FLAGS.seed)
    writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.save_dir, 'tb', hparam_str))
    writer.set_as_default()

    envs.reset()
    for i in tqdm(range(num_updates), unit_scale=batch_size, smoothing=0.1):
        actor_critic.set_learning_rate(kwargs['learning_rate'] *
                                       (1.0 - i / num_updates))

        value_loss, action_loss, entropy_loss = actor_critic.update(
            env_step, get_obs)

        val_loss += value_loss
        act_loss += action_loss
        ent_loss += entropy_loss

        if i % FLAGS.log_interval == 0 and i > 0:
            tf.summary.scalar("losses/value_loss",
                              val_loss / FLAGS.log_interval,
                              step=batch_size * i)
            tf.summary.scalar("losses/action_loss",
                              act_loss / FLAGS.log_interval,
                              step=batch_size * i)
            tf.summary.scalar("losses/entropy_loss",
                              ent_loss / FLAGS.log_interval,
                              step=batch_size * i)
            tf.summary.flush()

            val_loss = 0
            act_loss = 0
            ent_loss = 0
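
# Sketch of the linear learning-rate decay applied above, in the form referenced
# as update_linear_schedule() in the PyTorch scripts here (an illustrative
# re-implementation under that assumption, not necessarily the exact helper):
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    # Decay the learning rate linearly from initial_lr down to zero over training.
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr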
def main(): print('Preparing parameters') torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") # print('Initializing visdom') # if args.vis: # from visdom import Visdom # viz = Visdom(port=args.port) # win = None print('Creating envs') envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) print('Creating network') actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) print('Initializing PPO') agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) print('Memory') rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) # ===================== TB visualisation ================= writer = SummaryWriter() last_index = 0 print('Starting ! ') start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) writer.add_scalar('Agents metrics/Policy loss', action_loss, j) writer.add_scalar('Agents metrics/Value loss', value_loss, j) writer.add_scalar('Agents metrics/Entropy loss', dist_entropy, j) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs # win, tx, ty = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) tx, ty = get_reward_log(args.log_dir) if tx != None and ty != None: max_index = len(tx) for ind_iter in range(last_index, max_index): writer.add_scalar('Reward', ty[ind_iter], tx[ind_iter]) last_index = max_index # tx, ty = get_reward_log(viz, win, 
# args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
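# Hedged sketch of what a reward-log reader like get_reward_log might do, assuming
# log_dir holds OpenAI-baselines-style *.monitor.csv files (one '#'-prefixed JSON
# header line, then rows with episode reward 'r', length 'l' and wall time 't').
# get_reward_log_sketch is an illustrative name, not the function used above.
import glob
import pandas as pd

def get_reward_log_sketch(log_dir):
    frames = []
    for path in glob.glob(log_dir + "/*.monitor.csv"):
        frames.append(pd.read_csv(path, skiprows=1))  # skip the JSON header line
    if not frames:
        return None, None
    data = pd.concat(frames).sort_values("t")
    timesteps = data["l"].cumsum().values  # cumulative environment steps
    rewards = data["r"].values             # per-episode rewards
    return timesteps, rewards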
def main(): device = 'cpu' acc_steps = [] acc_scores = [] torch.set_num_threads(1) envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) # get cloned policy and recovered reward function policy_reward_dir = args.rewards_dir policy_dir = args.policies_dir policy_reward = Policy(envs.observation_space.shape, envs.action_space) policy_reward_file_name = policy_reward_dir + '/reward_' + args.expe + '.pth' policy_reward_sd = torch.load(policy_reward_file_name) policy_reward.load_state_dict(policy_reward_sd) actor_critic = Policy(envs.observation_space.shape, envs.action_space) policy_file_name = policy_dir + '/last_policy_' + args.expe + '.pth' policy_sd = torch.load(policy_file_name) actor_critic.load_state_dict(policy_sd) actor_critic.to(device) agent = PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = collections.deque(maxlen=10) for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule(agent.optimizer, j, num_updates, args.lr) agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) obs, _, done, infos = envs.step(action) if step > 1 and step % 1000 == 0: done = True # use infered reward: with torch.no_grad(): # _, reward = shapes(rollouts.obs[step], 0) _, action_log_probs, _, _ = policy_reward.evaluate_actions( rollouts.obs[step], None, None, action) reward = action_log_probs for info in infos: # if 'episode' in info.keys(): # episode_rewards.append(info['episode']['r']) r = 0 for key, val in info.items(): if 'reward' in key: r += val episode_rewards.append(r) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir: save_path = os.path.join(args.save_dir, 'ppo') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)] torch.save(save_model, os.path.join(save_path, args.env_name + '.pt')) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: print('Updates', j, 'num timesteps', len(episode_rewards), '\n Last training episodes: mean/median reward', '{:.1f}'.format(np.mean(episode_rewards)), '/{:.1f}'.format(np.median(episode_rewards)), 'min/max reward', '{:.1f}'.format(np.min(episode_rewards)), '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy', dist_entropy, 'value loss', value_loss, 'action loss', action_loss) if len(episode_rewards) > 1: acc_steps.append(total_num_steps) acc_scores.append(np.mean(episode_rewards)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _ = actor_critic.act( obs, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print('Evaluation using', len(eval_episode_rewards), 'episodes: mean reward', '{:.5f}\n'.format(np.mean(eval_episode_rewards))) scores_file_name = args.scores_dir + '/learner_scores_' + args.expe + '.npy' steps_file_name = args.scores_dir + '/learner_steps_' + args.expe + '.npy' np.save(scores_file_name, np.array(acc_scores)) np.save(steps_file_name, np.array(acc_steps))
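# Hedged sketch of the linear anneals used in the loop above: update_linear_schedule
# is assumed to shrink the optimizer learning rate from args.lr towards zero over
# num_updates, mirroring the clip_param decay written out inline.
def update_linear_schedule_sketch(optimizer, update_idx, num_updates, initial_lr):
    lr = initial_lr * (1.0 - update_idx / float(num_updates))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr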
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() 
eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Observe reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done], dtype=torch.float32, device=device) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes the monitor doesn't properly flush its outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
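# Hedged sketch of the ob_rms hand-off above: the evaluation envs reuse the running
# observation mean/std gathered during training, and eval() freezes those statistics
# so evaluation rollouts do not keep updating them. RunningObsNormSketch is an
# illustrative stand-in, not the VecNormalize wrapper itself.
import numpy as np

class RunningObsNormSketch:
    def __init__(self, shape, clip=10.0):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4
        self.clip = clip
        self.training = True

    def eval(self):
        # Freeze the statistics for evaluation rollouts.
        self.training = False

    def normalize(self, obs):
        if self.training:
            self._update(obs)
        return np.clip((obs - self.mean) / np.sqrt(self.var + 1e-8),
                       -self.clip, self.clip)

    def _update(self, batch):
        # Parallel (Chan et al.) update of running mean and variance.
        batch_mean, batch_var, n = batch.mean(0), batch.var(0), batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + n
        self.mean = self.mean + delta * n / total
        self.var = (self.var * self.count + batch_var * n
                    + delta ** 2 * self.count * n / total) / total
        self.count = total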
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") name = "compiled_dataset_08131950" #add 50 back in embed_dim = 300 # switch this later!! embed_size = embed_dim with open('data/' + name + '_all_instructions', 'rb') as f: all_instructions = pickle.load(f) vocab, vocab_weights = build_vocabulary(all_instructions, name, embed_dim) vocab.add_word('<pad>') vocab.add_word('<start>') vocab.add_word('<end>') vocab.add_word('<unk>') envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, vocabulary=vocab) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) #print(args.num_env_steps) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes #print(num_updates) for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.model_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): env = make_vec_envs(args.env_name, args.seed + 101, 1, None, None, device, False, vocabulary=vocab) recurrent_hidden_states = torch.zeros( 1, actor_critic.recurrent_hidden_state_size) masks = torch.zeros(1, 1) obs = env.reset() count = {} for i in range(100): tot_steps = obs[0, 0].item() for step in range(98): with torch.no_grad(): value, action, _, recurrent_hidden_states = actor_critic.act( obs, recurrent_hidden_states, masks, True) # Obser reward and next obs obs, reward, done, _ = env.step(action) if done: if tot_steps in count: count[tot_steps][0] = count[tot_steps][0] + 1 count[tot_steps][1] = count[tot_steps][1] + 1 else: count[tot_steps] = [1, 1] break if not done: obs = env.reset() if tot_steps in count: count[tot_steps][0] = count[tot_steps][0] + 0 count[tot_steps][1] = count[tot_steps][1] + 1 else: count[tot_steps] = [0, 1] #f=open(os.path.join(save_path, args.model_name) + ".txt", "a+") filename = os.path.join(save_path, args.model_name) + ".txt" if os.path.exists(filename): append_write = 'a' # append if already exists else: append_write = 'w' # make a new file if not f = open(filename, append_write) f.write(str(j) + "\n") f.write(str(count) + "\n") f.close()
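# Hedged sketch of the GAIL reward substitution in the warm-up loop above: after the
# discriminator has been updated on expert and policy transitions, the stored
# environment rewards are overwritten with a discriminator-based score, e.g.
# log D(s, a) - log(1 - D(s, a)), so the policy is rewarded for transitions the
# discriminator mistakes for expert data. predict_reward_sketch is illustrative, not
# the repo's discr.predict_reward.
import torch

def predict_reward_sketch(discriminator_logits):
    d = torch.sigmoid(discriminator_logits)
    return torch.log(d + 1e-8) - torch.log(1.0 - d + 1e-8)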
OUTER_BATCHSIZE = 10000 INNER_BATCHSIZE = 10000 NUM_PROCESS = 1 torch.set_num_threads(NUM_PROCESS) set_seed(SEED) device = torch.device("cuda:0" if CUDA else "cpu") logdir = "./GD_STORM_LVC/%s/batchsize%d_innersize%d_seed%d_lrcritic%f_lractorinit%f_freq_%d" % ( str(ENV_NAME), OUTER_BATCHSIZE, INNER_BATCHSIZE, SEED, CRITIC_LR, ACTOR_LR, NUM_INNER) writer = SummaryWriter(log_dir=logdir) envs = make_vec_envs(env_name=ENV_NAME, seed=SEED, num_processes=NUM_PROCESS, gamma=GAMMA, log_dir='./env_log/', device=device, allow_early_resets=True) actor = Policy(num_inputs=envs.observation_space.shape[0], num_outputs=envs.action_space.shape[0], hidden_size=64) critic = Value(num_inputs=envs.observation_space.shape[0], hidden_size=64) actor.to(device) critic.to(device) agent = STORM_LVC(actor=actor, critic=critic, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR, alpha_initial=1)
def main( _run, _log, num_env_steps, env_name, seed, algorithm, dummy_vecenv, time_limit, wrappers, save_dir, eval_dir, loss_dir, log_interval, save_interval, eval_interval, ): if loss_dir: loss_dir = path.expanduser(loss_dir.format(id=str(_run._id))) utils.cleanup_log_dir(loss_dir) writer = SummaryWriter(loss_dir) else: writer = None eval_dir = path.expanduser(eval_dir.format(id=str(_run._id))) save_dir = path.expanduser(save_dir.format(id=str(_run._id))) utils.cleanup_log_dir(eval_dir) utils.cleanup_log_dir(save_dir) torch.set_num_threads(1) envs = make_vec_envs( env_name, seed, dummy_vecenv, algorithm["num_processes"], time_limit, wrappers, algorithm["device"], ) agents = [ A2C(i, osp, asp) for i, (osp, asp) in enumerate(zip(envs.observation_space, envs.action_space)) ] obs = envs.reset() for i in range(len(obs)): agents[i].storage.obs[0].copy_(obs[i]) agents[i].storage.to(algorithm["device"]) start = time.time() num_updates = ( int(num_env_steps) // algorithm["num_steps"] // algorithm["num_processes"] ) all_infos = deque(maxlen=10) for j in range(1, num_updates + 1): for step in range(algorithm["num_steps"]): # Sample actions with torch.no_grad(): n_value, n_action, n_action_log_prob, n_recurrent_hidden_states = zip( *[ agent.model.act( agent.storage.obs[step], agent.storage.recurrent_hidden_states[step], agent.storage.masks[step], ) for agent in agents ] ) # Obser reward and next obs obs, reward, done, infos = envs.step(n_action) # envs.envs[0].render() # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [ [0.0] if info.get("TimeLimit.truncated", False) else [1.0] for info in infos ] ) for i in range(len(agents)): agents[i].storage.insert( obs[i], n_recurrent_hidden_states[i], n_action[i], n_action_log_prob[i], n_value[i], reward[:, i].unsqueeze(1), masks, bad_masks, ) for info in infos: if info: all_infos.append(info) # value_loss, action_loss, dist_entropy = agent.update(rollouts) for agent in agents: agent.compute_returns() for agent in agents: loss = agent.update([a.storage for a in agents]) for k, v in loss.items(): if writer: writer.add_scalar(f"agent{agent.agent_id}/{k}", v, j) for agent in agents: agent.storage.after_update() if j % log_interval == 0 and len(all_infos) > 1: squashed = _squash_info(all_infos) total_num_steps = ( (j + 1) * algorithm["num_processes"] * algorithm["num_steps"] ) end = time.time() _log.info( f"Updates {j}, num timesteps {total_num_steps}, FPS {int(total_num_steps / (end - start))}" ) _log.info( f"Last {len(all_infos)} training episodes mean reward {squashed['episode_reward'].sum():.3f}" ) for k, v in squashed.items(): _run.log_scalar(k, v, j) all_infos.clear() if save_interval is not None and ( j > 0 and j % save_interval == 0 or j == num_updates ): cur_save_dir = path.join(save_dir, f"u{j}") for agent in agents: save_at = path.join(cur_save_dir, f"agent{agent.agent_id}") os.makedirs(save_at, exist_ok=True) agent.save(save_at) archive_name = shutil.make_archive(cur_save_dir, "xztar", save_dir, f"u{j}") shutil.rmtree(cur_save_dir) _run.add_artifact(archive_name) if eval_interval is not None and ( j > 0 and j % eval_interval == 0 or j == num_updates ): evaluate( agents, os.path.join(eval_dir, f"u{j}"), ) videos = glob.glob(os.path.join(eval_dir, f"u{j}") + "/*.mp4") for i, v in enumerate(videos): _run.add_artifact(v, f"u{j}.{i}.mp4") envs.close()
parser.add_argument('--log-interval', type=int, default=10, help='log interval, one log per n updates (default: 10)') parser.add_argument('--env-name', default='PongNoFrameskip-v4', help='environment to train on (default: PongNoFrameskip-v4)') parser.add_argument('--load-dir', default='./trained_models/', help='directory from which to load the trained agent (default: ./trained_models/)') parser.add_argument('--add-timestep', action='store_true', default=False, help='add timestep to observations') parser.add_argument('--non-det', action='store_true', default=False, help='whether to use a non-deterministic policy') args = parser.parse_args() args.det = not args.non_det env = make_vec_envs(args.env_name, args.seed + 1000, 1, None, None, args.add_timestep, device='cpu', allow_early_resets=False) # Get a render function render_func = get_render_func(env) # We need to use the same statistics for normalization as used in training actor_critic, ob_rms = torch.load(os.path.join(args.load_dir, args.env_name + ".pt")) vec_norm = get_vec_normalize(env) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms recurrent_hidden_states = torch.zeros(1, actor_critic.recurrent_hidden_state_size)
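# A minimal sketch (not necessarily the original script) of the rollout/render loop
# that would typically follow the setup above: act deterministically (unless
# --non-det), step the single evaluation env, reset the mask on episode end, and
# render each frame when a render function is available.
obs = env.reset()
masks = torch.zeros(1, 1)
while True:
    with torch.no_grad():
        value, action, _, recurrent_hidden_states = actor_critic.act(
            obs, recurrent_hidden_states, masks, deterministic=args.det)
    obs, reward, done, _ = env.step(action)
    masks.fill_(0.0 if done else 1.0)
    if render_func is not None:
        render_func('human')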
def evaluate( agents, monitor_dir, episodes_per_eval, env_name, seed, wrappers, dummy_vecenv, time_limit, algorithm, _log, ): device = algorithm["device"] eval_envs = make_vec_envs( env_name, seed, dummy_vecenv, episodes_per_eval, time_limit, wrappers, device, monitor_dir=monitor_dir, ) n_obs = eval_envs.reset() n_recurrent_hidden_states = [ torch.zeros( episodes_per_eval, agent.model.recurrent_hidden_state_size, device=device ) for agent in agents ] masks = torch.zeros(episodes_per_eval, 1, device=device) all_infos = [] while len(all_infos) < episodes_per_eval: with torch.no_grad(): _, n_action, _, n_recurrent_hidden_states = zip( *[ agent.model.act( n_obs[agent.agent_id], recurrent_hidden_states, masks ) for agent, recurrent_hidden_states in zip( agents, n_recurrent_hidden_states ) ] ) # Observe reward and next obs n_obs, _, done, infos = eval_envs.step(n_action) masks = torch.tensor( [[0.0] if done_ else [1.0] for done_ in done], dtype=torch.float32, device=device, ) all_infos.extend([i for i in infos if i]) eval_envs.close() info = _squash_info(all_infos) _log.info( f"Evaluation using {len(all_infos)} episodes: mean reward {info['episode_reward']:.5f}\n" )
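# Hedged sketch of what an aggregation helper like _squash_info might do with the
# collected episode infos: average every field across episodes, so scalar fields
# become means and per-agent arrays (e.g. episode_reward) stay arrays averaged
# element-wise. _squash_info_sketch is illustrative, not the actual helper.
import numpy as np

def _squash_info_sketch(all_infos):
    keys = set().union(*(info.keys() for info in all_infos))
    keys.discard("TimeLimit.truncated")
    return {
        key: np.mean([np.asarray(info[key]) for info in all_infos if key in info], axis=0)
        for key in keys
    }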