def init_policies(observation_space, action_space, base_kwargs, num_agents, base):
    """Build three independent lists of per-agent policies.

    Every policy is constructed from ``observation_space.shape[1:]``. With a
    single agent the given ``action_space`` is used unchanged; otherwise each
    policy receives ``Discrete(action_space.nvec[0])``.

    Args:
        observation_space: Space whose ``shape`` has its first axis dropped
            (presumably the agent axis -- TODO confirm).
        action_space: Action space; must expose ``nvec`` when
            ``num_agents != 1``.
        base_kwargs: Forwarded to ``Policy`` as ``base_kwargs``.
        num_agents: Number of policies in each returned list.
        base: Value resolved through ``get_base`` for every policy.

    Returns:
        Tuple ``(actor_critics, shared_cpu_actor_critics,
        shared_cpu_actor_critics_env_actor)``. The last two lists hold the
        results of ``Policy(...).share_memory()``.
    """

    def build_policy():
        # Arguments are evaluated in the same order as a direct Policy(...) call.
        obs_shape = observation_space.shape[1:]
        if num_agents == 1:
            agent_action_space = action_space
        else:
            agent_action_space = Discrete(action_space.nvec[0])
        return Policy(obs_shape,
                      agent_action_space,
                      base=get_base(base),
                      base_kwargs=base_kwargs)

    actor_critics = [build_policy() for _ in range(num_agents)]
    shared_cpu_actor_critics = [
        build_policy().share_memory() for _ in range(num_agents)
    ]
    shared_cpu_actor_critics_env_actor = [
        build_policy().share_memory() for _ in range(num_agents)
    ]

    # Count trainable parameters of the first policy only.
    pytorch_total_params = 0
    for param in actor_critics[0].parameters():
        if param.requires_grad:
            pytorch_total_params += param.numel()
    print('number of params ', pytorch_total_params)

    return (actor_critics, shared_cpu_actor_critics,
            shared_cpu_actor_critics_env_actor)
def render_growspace_with_ddpg():
    """Render the continuous GrowSpace environment driven by a new policy.

    Builds a single-process vectorised ``GrowSpaceEnv-Continuous-v0``
    environment and a non-recurrent ``Policy`` (no weights are loaded in this
    function), then steps and renders the environment forever using
    deterministic actions. The function never returns.

    NOTE(review): despite the name, nothing DDPG-specific appears in this
    block -- confirm the intended agent.
    """
    seed = 123
    num_processes = 1
    gamma = 0.99
    log_dir = "."
    custom_gym = "growspace"
    recurrent_policy = False

    # Use the GPU when one is present, otherwise fall back to the CPU so the
    # script also runs on CUDA-less machines.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    envs = make_vec_envs("GrowSpaceEnv-Continuous-v0", seed, num_processes,
                         gamma, log_dir, device, False, custom_gym)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': recurrent_policy})
    actor_critic.to(device)

    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    obs = envs.reset()
    while True:
        with torch.no_grad():
            # Only the action is needed for rendering.
            _, action, _, _ = actor_critic.act(obs,
                                               eval_recurrent_hidden_states,
                                               eval_masks,
                                               deterministic=True)
        obs, _, _, _ = envs.step(action)
        envs.render()
def __init__(self, new_constants=None):
    """Configure the trainer: constants, device, envs, policy, PPO, storage.

    Args:
        new_constants: Optional dict of overrides. Only keys that already
            exist in ``self.constants`` are applied; unknown keys are
            ignored. ``None`` (the default) means "no overrides".

    Raises:
        AssertionError: If ``new_constants`` is not a dict, or if
            ``self.constants["env"]`` is ``None``.

    NOTE(review): ``self.constants`` is mutated in place; if it is a
    class-level dict the overrides are shared between instances -- confirm.
    """
    # Update constants
    if new_constants is None:
        # Avoid a shared mutable default argument.
        new_constants = {}
    assert isinstance(new_constants, dict)
    for key, value in new_constants.items():
        if key in self.constants:
            self.constants[key] = value

    # Clear existing logs
    if self.constants["clear_logs"]:
        for fname in glob.glob("*_log_*.txt"):
            os.remove(fname)
            print('Removed: %s' % fname)

    # set device
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.manual_seed(self.constants["seed"])
    torch.cuda.manual_seed_all(self.constants["seed"])

    # construct envs
    args = self.args_k("seed", "num_processes", "gamma", "log_dir")
    assert self.constants["env"] is not None
    if isinstance(self.constants["env"], str):
        # Gym env
        self.envs = make_vec_envs(self.constants["env"], *args, self.device,
                                  False)
    else:
        # Custom env (TODO: multi-process?)
        self.envs = make_vec_envs_custom(self.constants, self.device,
                                         self.constants["env"])

    # construct actor critic
    env_args = (self.envs.observation_space.shape, self.envs.action_space)
    self.actor_critic = Policy(
        *env_args,
        base_kwargs={'recurrent': self.constants["recurrent_policy"]},
    ).to(self.device)

    # construct PPO
    args = self.args_k("clip_param", "ppo_epoch", "minibatch_size",
                       "value_loss_coef", "entropy_coef")
    kwargs = self.kwargs_k("lr", "eps", "max_grad_norm")
    self.agent = algo.PPO(self.actor_critic, *args, **kwargs)

    # rollout storage / experiences
    args = self.args_k("forward_steps", "num_processes")
    self.rollouts = RolloutStorage(
        *args, *env_args, self.actor_critic.recurrent_hidden_state_size)
def initialize_warm_up_batch(args, device):
    """Create one evaluated sample and one scalarization per warm-up weight.

    Weight vectors are produced by ``generate_weights_batch_dfs``. For each
    of them a new policy and PPO agent are built, the normalisation
    statistics of a freshly created vectorised environment are copied, and
    the resulting ``Sample`` is evaluated.

    Args:
        args: Run configuration (``env_name``, ``obj_num``, weight range,
            PPO hyper-parameters, ...).
        device: Device the policies are moved to.

    Returns:
        Tuple ``(sample_batch, scalarization_batch)`` of equal length.

    Raises:
        NotImplementedError: If ``args.algo`` is not ``'ppo'`` (raised while
            processing the first weight vector).
    """
    # using evenly distributed weights for warm-up stage
    weights_batch = []
    generate_weights_batch_dfs(0, args.obj_num, args.min_weight,
                               args.max_weight, args.delta_weight, [],
                               weights_batch)
    sample_batch = []
    scalarization_batch = []

    # temp_env is only used for initialization
    temp_env = gym.make(args.env_name)
    try:
        for weights in weights_batch:
            actor_critic = Policy(temp_env.observation_space.shape,
                                  temp_env.action_space,
                                  base_kwargs={'layernorm': args.layernorm},
                                  obj_num=args.obj_num)
            actor_critic.to(device).double()

            if args.algo == 'ppo':
                agent = algo.PPO(actor_critic,
                                 args.clip_param,
                                 args.ppo_epoch,
                                 args.num_mini_batch,
                                 args.value_loss_coef,
                                 args.entropy_coef,
                                 lr=args.lr,
                                 eps=1e-5,
                                 max_grad_norm=args.max_grad_norm)
            else:
                # NOTE: other algorithms are not supported yet
                raise NotImplementedError

            envs = make_vec_envs(env_name=args.env_name,
                                 seed=args.seed,
                                 num_processes=args.num_processes,
                                 gamma=args.gamma,
                                 log_dir=None,
                                 device=device,
                                 allow_early_resets=False,
                                 obj_rms=args.obj_rms,
                                 ob_rms=args.ob_rms)
            try:
                # Snapshot the normalisation statistics before closing.
                env_params = {}
                env_params['ob_rms'] = deepcopy(
                    envs.ob_rms) if envs.ob_rms is not None else None
                env_params['ret_rms'] = deepcopy(
                    envs.ret_rms) if envs.ret_rms is not None else None
                env_params['obj_rms'] = deepcopy(
                    envs.obj_rms) if envs.obj_rms is not None else None
            finally:
                # Release the vectorised env even if the copy fails.
                envs.close()

            scalarization = WeightedSumScalarization(num_objs=args.obj_num,
                                                     weights=weights)
            sample = Sample(env_params, actor_critic, agent, optgraph_id=-1)
            objs = evaluation(args, sample)
            sample.objs = objs

            sample_batch.append(sample)
            scalarization_batch.append(scalarization)
    finally:
        # Always release the temporary env, including on error paths.
        temp_env.close()

    return sample_batch, scalarization_batch
def ppo_experiment(args):
    """Set up environments, policy, PPO agent and storage, then train.

    Seeds torch, optionally enables deterministic cuDNN, creates training and
    test vectorised environments (the test ones are seeded with
    ``args.seed + args.num_processes`` and receive ``None`` as gamma), and
    hands everything to ``train_ppo``.

    Args:
        args: Run configuration; see the attribute accesses below for the
            fields that are read.
    """
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    env_kwargs = {'timestep': args.timestep}
    # "push" and "soccer" environments share the same parameter preset.
    if "push" in args.env_name or "soccer" in args.env_name:
        env_kwargs['params'] = 'random_goal_unconstrained'

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         env_kwargs=env_kwargs)
    test_envs = make_vec_envs(args.env_name,
                              args.seed + args.num_processes,
                              args.num_processes,
                              None,
                              args.log_dir,
                              device,
                              False,
                              env_kwargs=env_kwargs)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    train_ppo(actor_critic, agent, rollouts, envs, test_envs, args)
def main():
    """Parse arguments, build envs and policy, then run testing or training.

    In test mode a single process is used and wandb is disabled. When
    ``args.load_path`` is set, the freshly built policy is replaced by the
    object returned from ``torch.load``.

    NOTE(review): ``device`` is read but never assigned in this function;
    presumably it is a module-level name -- confirm.
    """
    parser = extend_arguments(get_parser())
    args = parser.parse_args()
    configs = common.config.get_config(args.env, args.experiment_name)

    supported_algs = ['a2c', 'ppo', 'acktr', 'sac']
    recurrent_algs = ['a2c', 'ppo']
    assert args.alg in supported_algs
    if args.recurrent_policy:
        assert args.alg in recurrent_algs, \
            'Recurrent policy is not implemented for ACKTR'

    if args.test:
        args.num_processes = 1
        args.use_wandb = False

    logger = setup_logger(args.verbose, args.model_name, configs.log_directory)
    torch.set_num_threads(1)

    # Seed both the CPU and every CUDA generator.
    seed = args.seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    if args.use_wandb:
        import wandb
        # Resume only when an id of a previous run was supplied.
        resume_wandb = args.wandb_resume_id is not None
        wandb.init(config=args,
                   resume=resume_wandb,
                   id=args.wandb_resume_id,
                   project='rl',
                   name=args.experiment_name)

    # Make environments (env_vector[0] is wrapped for evaluation).
    envs, env_vector = make_vec_envs_pytorch(args.env,
                                             return_evn_vector=True,
                                             device=device,
                                             log_dir=configs.log_directory,
                                             **vars(args))
    eval_envs = wrap_env_pytorch(env_vector[0], args.gamma, device)

    policy_base_kwargs = {
        'recurrent': args.recurrent_policy,
        'hidden_size': args.hidden_layer_size,
    }
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs=policy_base_kwargs)

    # Optionally replace the new policy with a saved one.
    if args.load_path is not None:
        logger.info("loading model: {}".format(args.load_path))
        actor_critic = torch.load(args.load_path)
    actor_critic.to(device)

    if not args.test:
        train(envs, env_vector, eval_envs, actor_critic, args, configs, logger)
    else:
        test(eval_envs, actor_critic, args, logger)
def load_alg():
    """Rebuild a policy from a saved one and create its PPO agent and storage.

    Loads a ``(policy, ob_rms)`` pair from ``model_path``, builds a new
    ``Policy`` for ``envs``, transfers the loaded policy into it through
    ``custom_loader`` and then drops the loaded object.

    Reads ``model_path``, ``envs``, ``args`` and ``device`` from the
    enclosing scope.

    Returns:
        Tuple ``(actor_critic, ob_rms, agent, rollouts)``.
    """
    checkpoint_path = os.path.join(model_path)
    actor_critic_2, ob_rms = torch.load(checkpoint_path)

    policy_kwargs = {'recurrent': args.recurrent_policy}
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs=policy_kwargs).to(device)

    custom_loader(actor_critic, actor_critic_2)
    # The loaded policy is no longer needed once it has been transferred.
    del actor_critic_2

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm,
                     weight_decay=args.weight_decay)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    return actor_critic, ob_rms, agent, rollouts
def initialize_policy(envs):
    """Create a policy for ``envs`` and the PPO agent that trains it.

    Hyper-parameters are read from ``args`` in the enclosing scope.

    Args:
        envs: Object exposing ``obs_shape`` and ``action_space``.

    Returns:
        Tuple ``(actor_critic, agent)``.
    """
    base_kwargs = {
        'recurrent': args.recurrent_policy,
        'hidden_size': args.policy_hidden_size,
        'init_gain': args.init_gain,
    }
    actor_critic = Policy(envs.obs_shape,
                          envs.action_space,
                          base_kwargs=base_kwargs)
    actor_critic.to(args.device)

    ppo_positional = (args.clip_param, args.ppo_epoch, args.num_mini_batch,
                      args.value_loss_coef, args.entropy_coef)
    agent = algo.PPO(actor_critic,
                     *ppo_positional,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    return actor_critic, agent
def get_agent(args, envs, encoder, device):
    """Build an encoder-backed policy and its PPO agent.

    The policy uses ``SimpleBase`` with the given ``encoder`` and an
    observation shape of ``[encoder.feature_size]``.

    Args:
        args: Configuration carrying the ``ppo_*`` hyper-parameters.
        envs: Environment(s) providing ``action_space``.
        encoder: Encoder passed to the policy base.
        device: Device the policy is moved to.

    Returns:
        Tuple ``(agent, actor_critic)`` -- note the agent comes first.
    """
    feature_shape = [encoder.feature_size]
    actor_critic = Policy(feature_shape,
                          envs.action_space,
                          base=SimpleBase,
                          base_kwargs={"encoder": encoder})
    actor_critic.to(device)

    optimiser_kwargs = {
        'lr': args.ppo_lr,
        'eps': args.ppo_eps,
        'max_grad_norm': args.ppo_max_grad_norm,
    }
    agent = algo.PPO(actor_critic, args.ppo_clip_param, args.ppo_epoch,
                     args.ppo_num_mini_batch, args.ppo_value_loss_coef,
                     args.ppo_entropy_coef, **optimiser_kwargs)

    return agent, actor_critic
def job(rank, args, device, shared_model):
    """Collect ``args.num_steps`` transitions with a copy of the shared model.

    A single gym environment is created and seeded with ``args.seed + rank``.
    A local policy is initialised from ``shared_model``'s state dict and used
    to fill a ``RolloutStorage``. The mean of the (at most 10) most recent
    finished-episode returns is printed together with ``rank``.

    Args:
        rank: Worker index; offsets the environment seed.
        args: Run configuration (``env_name``, ``seed``, ``num_steps``, ...).
        device: Device for the policy and the storage.
        shared_model: Model whose ``state_dict()`` initialises the policy.

    Returns:
        The filled ``RolloutStorage``.
    """
    episode_rewards = deque(maxlen=10)

    envs = gym.make(args.env_name)
    envs.seed(args.seed + rank)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)
    actor_critic.load_state_dict(shared_model.state_dict())

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    obs = torch.from_numpy(obs)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    acc_r = 0
    done = [False]
    for step in range(args.num_steps):
        if done[0]:
            episode_rewards.append(acc_r)
            obs = envs.reset()
            obs = torch.from_numpy(obs)
            # The policy acts on rollouts.obs[step] below, so the first
            # observation of the new episode has to be stored at the current
            # step rather than at index 0.
            rollouts.obs[step].copy_(obs)
            rollouts.to(device)
            acc_r = 0

        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = \
                actor_critic.act(rollouts.obs[step],
                                 rollouts.recurrent_hidden_states[step],
                                 rollouts.masks[step])

        # Observe reward and next obs. Move the action to host memory first:
        # Tensor.numpy() is only defined for CPU tensors.
        target_action = action.cpu().numpy()[0]
        obs, reward, done, infos = envs.step(target_action)
        acc_r += reward

        obs = torch.from_numpy(obs).float().to(device)
        reward = torch.from_numpy(np.array([reward])).unsqueeze(dim=1).float()
        done = [done]
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])
        bad_masks = torch.FloatTensor([[1.0] for done_ in done])
        rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob,
                        value, reward, masks, bad_masks)

    # No finished episode yet: report nan without numpy's empty-mean warning.
    mean_reward = np.mean(episode_rewards) if episode_rewards else float('nan')
    print(rank, mean_reward)
    return rollouts
def main():
    """Train an actor-critic agent with optional self-supervised attention.

    Steps performed:
      1. Seed torch, create the logger, the TensorBoard writer and the
         vectorised environments.
      2. Build the policy, then load weights from ``config.MODEL_FILE`` or,
         failing that, resume from ``checkpoint.pth`` in the output directory.
      3. Create the A2C / PPO / ACKTR agent and the rollout storage.
      4. Run the update loop with periodic logging, evaluation and
         checkpointing, and finally save the last model state.

    Configuration is read from the module-level ``config``; ``args`` is only
    used for ``args.cfg`` and logging.

    Raises:
        ValueError: If ``config.algo`` is not one of ``a2c``, ``ppo``,
            ``acktr``.
    """
    args = get_args()

    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    if config.cuda and torch.cuda.is_available() and config.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    logger, final_output_dir, tb_log_dir = create_logger(config,
                                                         args.cfg,
                                                         'train',
                                                         seed=config.seed)
    eval_log_dir = final_output_dir + "_eval"
    utils.cleanup_log_dir(final_output_dir)
    utils.cleanup_log_dir(eval_log_dir)

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    writer = SummaryWriter(tb_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:" + config.GPUS if config.cuda else "cpu")

    width = height = 84
    envs = make_vec_envs(config.env_name,
                         config.seed,
                         config.num_processes,
                         config.gamma,
                         final_output_dir,
                         device,
                         False,
                         width=width,
                         height=height,
                         ram_wrapper=False)

    # create agent
    base_kwargs = {
        'recurrent': config.recurrent_policy,
        'hidden_size': config.hidden_size,
        'feat_from_selfsup_attention': config.feat_from_selfsup_attention,
        'feat_add_selfsup_attention': config.feat_add_selfsup_attention,
        'feat_mul_selfsup_attention_mask':
            config.feat_mul_selfsup_attention_mask,
        'selfsup_attention_num_keypoints':
            config.SELFSUP_ATTENTION.NUM_KEYPOINTS,
        'selfsup_attention_gauss_std': config.SELFSUP_ATTENTION.GAUSS_STD,
        'selfsup_attention_fix': config.selfsup_attention_fix,
        'selfsup_attention_fix_keypointer':
            config.selfsup_attention_fix_keypointer,
        'selfsup_attention_pretrain': config.selfsup_attention_pretrain,
        'selfsup_attention_keyp_maps_pool':
            config.selfsup_attention_keyp_maps_pool,
        'selfsup_attention_image_feat_only':
            config.selfsup_attention_image_feat_only,
        'selfsup_attention_feat_masked': config.selfsup_attention_feat_masked,
        'selfsup_attention_feat_masked_residual':
            config.selfsup_attention_feat_masked_residual,
        'selfsup_attention_feat_load_pretrained':
            config.selfsup_attention_feat_load_pretrained,
        'use_layer_norm': config.use_layer_norm,
        'selfsup_attention_keyp_cls_agnostic':
            config.SELFSUP_ATTENTION.KEYPOINTER_CLS_AGNOSTIC,
        'selfsup_attention_feat_use_ln':
            config.SELFSUP_ATTENTION.USE_LAYER_NORM,
        'selfsup_attention_use_instance_norm':
            config.SELFSUP_ATTENTION.USE_INSTANCE_NORM,
        'feat_mul_selfsup_attention_mask_residual':
            config.feat_mul_selfsup_attention_mask_residual,
        'bottom_up_form_objects': config.bottom_up_form_objects,
        'bottom_up_form_num_of_objects': config.bottom_up_form_num_of_objects,
        'gaussian_std': config.gaussian_std,
        'train_selfsup_attention': config.train_selfsup_attention,
        'block_selfsup_attention_grad': config.block_selfsup_attention_grad,
        'sep_bg_fg_feat': config.sep_bg_fg_feat,
        'mask_threshold': config.mask_threshold,
        'fix_feature': config.fix_feature,
    }
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs=base_kwargs)

    # init / load parameter
    # `checkpoint` stays None unless a resume checkpoint was actually loaded,
    # so the optimizer restore further down cannot hit an unbound name.
    checkpoint = None
    if config.MODEL_FILE:
        logger.info('=> loading model from {}'.format(config.MODEL_FILE))
        state_dict = torch.load(config.MODEL_FILE)
        # Entries whose key contains 'dist' are not loaded.
        state_dict = OrderedDict(
            (_k, _v) for _k, _v in state_dict.items() if 'dist' not in _k)
        actor_critic.load_state_dict(state_dict, strict=False)
    elif config.RESUME:
        checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')
        if os.path.exists(checkpoint_file):
            logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
            checkpoint = torch.load(checkpoint_file)
            actor_critic.load_state_dict(checkpoint['state_dict'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_file, checkpoint['epoch']))

    actor_critic.to(device)

    if config.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            config.value_loss_coef,
            config.entropy_coef,
            lr=config.lr,
            eps=config.eps,
            alpha=config.alpha,
            max_grad_norm=config.max_grad_norm,
            train_selfsup_attention=config.train_selfsup_attention)
    elif config.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         config.clip_param,
                         config.ppo_epoch,
                         config.num_mini_batch,
                         config.value_loss_coef,
                         config.entropy_coef,
                         lr=config.lr,
                         eps=config.eps,
                         max_grad_norm=config.max_grad_norm)
    elif config.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic,
            config.value_loss_coef,
            config.entropy_coef,
            acktr=True,
            train_selfsup_attention=config.train_selfsup_attention,
            max_grad_norm=config.max_grad_norm)
    else:
        # Fail with a clear message instead of an unbound `agent` later on.
        raise ValueError('unsupported algo: {!r}'.format(config.algo))

    # rollouts: environment
    rollouts = RolloutStorage(
        config.num_steps,
        config.num_processes,
        envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size,
        keep_buffer=config.train_selfsup_attention,
        buffer_size=config.train_selfsup_attention_buffer_size)

    if config.RESUME and checkpoint is not None:
        agent.optimizer.load_state_dict(checkpoint['optimizer'])

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        config.num_env_steps) // config.num_steps // config.num_processes
    best_perf = 0.0
    best_model = False
    # A checkpoint may be written before the first evaluation has run; give
    # the recorded performance a defined value for that case.
    perf_indicator = best_perf
    # Remains None until the self-supervised attention update has run.
    selfsup_attention_loss = None
    print('num updates', num_updates, 'num steps', config.num_steps)

    for j in range(num_updates):

        if config.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if config.algo == "acktr" else config.lr)

        for step in range(config.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = \
                    actor_critic.act(rollouts.obs[step],
                                     rollouts.recurrent_hidden_states[step],
                                     rollouts.masks[step])
                # act() returns a (hidden_states, meta) pair in the last
                # slot; the meta part is not used here.
                recurrent_hidden_states, _ = recurrent_hidden_states

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])

            # No object locations are collected in this loop, so None is
            # always passed on.
            rollouts.insert(obs,
                            recurrent_hidden_states,
                            action,
                            action_log_prob,
                            value,
                            reward,
                            masks,
                            bad_masks,
                            objects_loc=None)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1],
            ).detach()

        rollouts.compute_returns(next_value, config.use_gae, config.gamma,
                                 config.gae_lambda,
                                 config.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # The attention update only starts after the first 16 policy updates.
        train_attention_now = config.train_selfsup_attention and j > 15
        if train_attention_now:
            for _iter in range(config.num_steps // 5):
                frame_x, frame_y = rollouts.generate_pair_image()
                selfsup_attention_loss, _, _ = agent.update_selfsup_attention(
                    frame_x, frame_y, config.SELFSUP_ATTENTION)

        if j % config.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * config.num_processes * config.num_steps
            end = time.time()
            msg = ('Updates {}, num timesteps {}, FPS {} \n'
                   'Last {} training episodes: mean/median reward {:.1f}/{:.1f} '
                   'min/max reward {:.1f}/{:.1f} '
                   'dist entropy {:.1f}, value loss {:.1f}, action loss {:.1f}\n'
                   ).format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss)
            if train_attention_now and selfsup_attention_loss is not None:
                msg = msg + 'selfsup attention loss {:.5f}\n'.format(
                    selfsup_attention_loss)
            logger.info(msg)

        if (config.eval_interval is not None and len(episode_rewards) > 1
                and j % config.eval_interval == 0):
            total_num_steps = (j + 1) * config.num_processes * config.num_steps
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            eval_mean_score, eval_max_score, eval_scores = evaluate(
                actor_critic,
                ob_rms,
                config.env_name,
                config.seed,
                config.num_processes,
                eval_log_dir,
                device,
                width=width,
                height=height)

            perf_indicator = eval_mean_score
            if perf_indicator > best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            # record test scores
            out_s = "TEST: {}, {}, {}, {}\n".format(
                str(total_num_steps), str(eval_mean_score),
                str(eval_max_score),
                [str(_eval_scores) for _eval_scores in eval_scores])
            with open(os.path.join(final_output_dir, 'test_scores'),
                      'a+') as f:
                print(out_s, end="", file=f)
            logger.info(out_s)

            writer.add_scalar('data/mean_score', eval_mean_score,
                              total_num_steps)
            writer.add_scalar('data/max_score', eval_max_score,
                              total_num_steps)
            writer.add_scalars('test', {'mean_score': eval_mean_score},
                               total_num_steps)

        # save for every interval-th episode or for the last epoch
        if (j % config.save_interval == 0
                or j == num_updates - 1) and config.save_dir != "":
            logger.info(
                "=> saving checkpoint to {}".format(final_output_dir))
            epoch = j / config.save_interval
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': get_model_name(config),
                    'state_dict': actor_critic.state_dict(),
                    'perf': perf_indicator,
                    'optimizer': agent.optimizer.state_dict(),
                    'ob_rms': getattr(utils.get_vec_normalize(envs), 'ob_rms',
                                      None)
                }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(actor_critic.state_dict(), final_model_state_file)

    # export_scalars_to_json needs results from add scalars
    writer.export_scalars_to_json(os.path.join(tb_log_dir, 'all_scalars.json'))
    writer.close()
def main():
    """Train a PPO policy against a learned (conditional) GAIL discriminator.

    Loads expert, test and ground-truth pickles from ``args.experts_dir``,
    then alternates between collecting transitions, updating the
    discriminator, relabelling rewards with it and updating the policy.
    Models are saved and an action-distribution metric is printed at the
    configured intervals.

    NOTE(review): ``discr`` is only created when ``args.cgail`` is true, but
    it is used unconditionally in the update loop -- running without
    ``--cgail`` fails on the first discriminator update. Confirm whether a
    plain GAIL discriminator was meant to be created in that case.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    actor_critic = Policy(STATE_DIM, ACTION_DIM, USER_DIM)
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    if args.cgail:
        discr = cgail.Discriminator(STATE_DIM,
                                    ACTION_DIM,
                                    USER_DIM,
                                    device,
                                    lr=args.D_lr)

    train_file_name = os.path.join(args.experts_dir, "expert_traj.pkl")
    test_file_name = os.path.join(args.experts_dir, "test_traj.pkl")
    ground_file_name = os.path.join(args.experts_dir, "exp_loc.pkl")

    # Context managers make sure the pickle files are closed again.
    with open(train_file_name, 'rb') as train_file:
        expert_st, expert_ur, expert_ac = pickle.load(train_file)
    train_load = data_utils.TensorDataset(
        torch.from_numpy(np.asarray(expert_st)),
        torch.from_numpy(np.asarray(expert_ur)),
        torch.from_numpy(np.asarray(expert_ac)))
    gail_train_loader = torch.utils.data.DataLoader(
        train_load, batch_size=args.gail_batch_size, shuffle=True)

    with open(test_file_name, 'rb') as test_file:
        test_st, test_ur, test_ac = pickle.load(test_file)
    test_load = data_utils.TensorDataset(
        torch.from_numpy(np.asarray(test_st)),
        torch.from_numpy(np.asarray(test_ur)),
        torch.from_numpy(np.asarray(test_ac)))
    test_loader = torch.utils.data.DataLoader(test_load,
                                              batch_size=args.gail_batch_size,
                                              shuffle=True)

    with open(ground_file_name, 'rb') as ground_file:
        exp_loc = pickle.load(ground_file)

    envs = make_vec_envs(expert_st, expert_ur, args.seed, args.num_processes,
                         args.gamma, device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              STATE_DIM * 5, USER_DIM, ACTION_DIM)

    obs, user = envs.reset()
    rollouts.obs[0].copy_(obs[0])
    rollouts.user[0].copy_(user[0])
    rollouts.to(device)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.user[step])

            # Observe reward and next obs.
            # NOTE(review): for action 9 `obs` keeps its previous value and
            # is inserted again -- confirm this is the intended handling.
            if action.item() != 9:
                obs = decide_next_state(action, rollouts.obs[step][0], 1)
            # Identity check: `!= None` on a tensor is an element-wise or
            # version-dependent comparison.
            if obs is not None:
                rollouts.insert(obs, rollouts.user[step], action,
                                action_log_prob, value)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.user[-1]).detach()

        gail_epoch = args.gail_epoch
        if j < 10:
            gail_epoch = 100  # Warm up
        for _ in range(gail_epoch):
            discr.update(gail_train_loader, rollouts)

        for step in range(args.num_steps):
            # Branch on the command-line flag; the bare name `cgail` is the
            # imported module and therefore always truthy.
            if args.cgail:
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.user[step],
                    rollouts.actions[step], args.gamma)
            else:
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma)

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, str(args.lr),
                                     str(args.gail_batch_size),
                                     "entropy_" + str(args.entropy_coef),
                                     "D_lr" + str(args.D_lr))
            # An already existing directory is fine; other failures surface
            # here instead of at the torch.save call below.
            os.makedirs(save_path, exist_ok=True)

            torch.save(actor_critic,
                       os.path.join(save_path, "ac_{}.pt".format(j)))
            torch.save(discr, os.path.join(save_path, "D_{}.pt".format(j)))

        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {}".format(
                j, total_num_steps, int(total_num_steps / (end - start))))

            # Histogram of predicted actions per (x, y) taken from the first
            # two input features.
            out_loc = {}
            for inputs, test_user, _labels in test_loader:
                inputs = inputs.float()
                test_user = test_user.float()
                output = actor_critic.act(inputs, test_user)[1].tolist()
                for row in range(inputs.size(0)):
                    loc = (int(inputs[row][0].item()),
                           int(inputs[row][1].item()))
                    if loc not in out_loc:
                        out_loc[loc] = np.zeros(10)
                    out_loc[loc][output[row]] += 1

            # Compare the normalised histograms with the expert ones for
            # every location present in both.
            target = []
            ground = []
            for key in out_loc:
                o1 = out_loc[key].copy()
                o1 /= sum(o1)
                if key in exp_loc:
                    o2 = np.zeros(10)
                    for b, w in exp_loc[key].items():
                        o2[b] += w
                    o2 /= sum(o2)
                    target.append(o1)
                    ground.append(o2)
            k, _kls = cross_entropy(target, ground)
            print(k)
def __init__(self, learner, envs, maxat, maxupd, targ_policy, args, device="cpu", hidden_sizes=(64, 64), activation=nn.Tanh, rand_select=False):
    """Set up the attacker's state, its imitation learner and its critic.

    Args:
        learner: The learner under attack; its type (``algo.A2C_ACKTR`` or
            ``algo.PPO``) selects the type of ``self.im_learner``.
        envs: Environment(s) providing observation and action spaces.
        maxat: Stored as ``self.maxat``.
        maxupd: Stored as ``self.maxupd``.
        targ_policy: Stored as ``self.targ_policy``.
        args: Configuration; attack and optimiser settings are read from it.
        device: Device for the attack policy and the critic.
        hidden_sizes: Hidden sizes passed to ``Value``.
        activation: Activation passed to ``Value``.
        rand_select: Stored as ``self.rand_select``.

    NOTE(review): ``self.im_learner`` is only set for the two learner types
    above, and ``self.action_dim`` only for discrete action spaces.
    """
    super(TargAttacker, self).__init__()

    obs_space = envs.observation_space
    act_space = envs.action_space

    # Plain bookkeeping, kept in the original assignment order.
    self.args = args
    self.targ_policy = targ_policy
    self.learner = learner
    self.gamma = args.gamma
    self.device = device
    self.radius = args.radius
    self.frac = args.frac
    self.stepsize = args.stepsize
    self.maxiter = args.maxiter
    self.maxat = maxat
    self.maxupd = maxupd
    self.delta = args.delta
    self.dist_thres = args.dist_thres
    self.rand_select = rand_select

    self.disc_action = isinstance(act_space, Discrete)
    if self.disc_action:
        self.action_dim = act_space.n

    # The imitation learner gets its own new policy of the same shape.
    attack_policy = Policy(obs_space.shape,
                           act_space,
                           base_kwargs={'recurrent': args.recurrent_policy})
    attack_policy.to(device)

    optimiser_kwargs = dict(lr=args.lr,
                            eps=args.eps,
                            max_grad_norm=args.max_grad_norm)
    if isinstance(learner, algo.A2C_ACKTR):
        self.im_learner = algo.A2C_ACKTR(attack_policy,
                                         args.value_loss_coef,
                                         args.entropy_coef,
                                         alpha=args.alpha,
                                         acktr=learner.acktr,
                                         **optimiser_kwargs)
    elif isinstance(learner, algo.PPO):
        self.im_learner = algo.PPO(attack_policy, args.clip_param,
                                   args.ppo_epoch, args.num_mini_batch,
                                   args.value_loss_coef, args.entropy_coef,
                                   **optimiser_kwargs)
    self.cp_net()

    critic = Value(obs_space.shape[0], hidden_sizes, activation)
    self.critic = critic.to(device)
    self.critic_optim = optim.Adam(self.critic.parameters(), lr=args.lr)

    self.dist_list = np.array([])
    self.attack_num = 0
    self.update_num = 0
    self.state_buffer = None
    self.state_buffer_limit = 100
def main(repeat_num):
    """Evaluate a saved policy/instinct pair ``repeat_num`` times.

    Builds a single evaluation environment on the CPU, loads the policy,
    the instinct network and the observation statistics from a hard-coded
    directory, and prints fitness and hazard collisions for every evaluation
    run.

    Args:
        repeat_num: Number of evaluation runs.

    NOTE(review): ``env_name`` is not assigned in this function (the local
    assignment is commented out); presumably it is a module-level name --
    confirm.
    """
    args = get_args()

    print("start the train function")
    args.init_sigma = 0.6
    args.lr = 0.001

    device = torch.device("cpu")

    # Init the environment
    # env_name = "Safexp-PointGoal1-v0"
    eval_envs = make_vec_envs(env_name,
                              np.random.randint(2**32),
                              1,
                              args.gamma,
                              None,
                              device,
                              allow_early_resets=True,
                              normalize=args.norm_vectors)
    obs_shape = eval_envs.observation_space.shape

    # This policy and the instinct network built below are replaced by the
    # loaded models further down.
    actor_critic_policy = init_default_ppo(eval_envs, log(args.init_sigma))

    # Prepare modified action space for instinct: the observation grows by
    # the action dimension and the action space by one extra component.
    inst_action_space = deepcopy(eval_envs.action_space)
    inst_obs_shape = list(obs_shape)
    inst_obs_shape[0] = inst_obs_shape[0] + eval_envs.action_space.shape[0]
    inst_action_space.shape = list(inst_action_space.shape)
    inst_action_space.shape[0] = inst_action_space.shape[0] + 1
    inst_action_space.shape = tuple(inst_action_space.shape)
    actor_critic_instinct = Policy(tuple(inst_obs_shape),
                                   inst_action_space,
                                   init_log_std=log(args.init_sigma),
                                   base_kwargs={'recurrent': False})

    # All artefacts of the evaluated run live in the same directory.
    model_dir = ("/home/calavera/pulled_from_server/"
                 "evaluate_instinct_all_inputs_task_switch_button/"
                 "real_safety_tasks_easier/"
                 "sweep_eval_hazard_param_BOX_more_space/hh_10")

    actor_critic_policy = torch.load(f"{model_dir}/model_rl_policy_latest.pt")
    actor_critic_instinct = torch.load(
        f"{model_dir}/model_rl_instinct_latest.pt")

    ob_rms = utils.get_vec_normalize(eval_envs)
    if ob_rms is not None:
        ob_rms = ob_rms.ob_rms
    # The statistics read from the environment above are overridden by the
    # pickled ones; the file is closed as soon as it has been read.
    with open(f"{model_dir}/ob_rms.p", "rb") as ob_rms_file:
        ob_rms = pickle.load(ob_rms_file)

    for _ in range(repeat_num):
        fits, info = evaluate(EvalActorCritic(actor_critic_policy,
                                              actor_critic_instinct),
                              ob_rms,
                              eval_envs,
                              1,
                              reward_cost_combinator,
                              device,
                              instinct_on=True,
                              visualise=True)
        visualise_values_over_path(info['plot_info'])
        print(f"{info['hazard_collisions']}")
        print(
            f"fitness; {fits.item()}; hazard_collisions; {info['hazard_collisions']}\n"
        )
def main():
    """Train one actor-critic per agent in a parallel multi-agent environment.

    Creates ``args.agent_num`` policies, one learner and one rollout storage
    per agent, collects ``args.num_steps`` transitions per update from
    ``make_parallel_env`` and updates every agent on its own rollout.
    Checkpoints are written to ``<save_dir>/<algo><model_dir>/agent_<k>.pt``.

    Every agent's storage receives the flattened joint observation
    (``share_obs``) in addition to its own observation slice.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_parallel_env(args.env_name, args.num_processes, args.seed,
                             True)

    # One policy network per agent.
    actor_critic = []
    for i in range(args.agent_num):
        ac = Policy(
            envs.observation_space[0].shape,
            envs.action_space[0],
            agent_num=args.agent_num,
            agent_i=i,
            base_kwargs={'recurrent': args.recurrent_policy})
        ac.to(device)
        actor_critic.append(ac)

    # NOTE(review): only the 'ppo' branch builds a per-agent list; the rest of
    # this function indexes ``agent[i]``, so 'a2c'/'acktr' look unsupported in
    # the multi-agent setting -- confirm before using them.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = []
        for i in range(args.agent_num):
            agent.append(algo.PPO(
                actor_critic[i],
                i,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm,
                model_dir=args.model_dir))
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    # NOTE(review): the GAIL code paths below still use the single-agent API
    # (``envs.observation_space.shape``, ``rollouts.rewards``) while the
    # spaces and rollouts are indexed per agent everywhere else -- confirm.
    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir, "trajs_{}.pt".format(
                args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=4, subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    # One rollout storage per agent.
    rollouts = []
    for i in range(args.agent_num):
        rollout = RolloutStorage(args.num_steps, args.num_processes,
                                 envs.observation_space[0].shape,
                                 envs.action_space[0],
                                 actor_critic[i].recurrent_hidden_state_size,
                                 args.agent_num, i)
        rollouts.append(rollout)

    obs = envs.reset()
    for i in range(args.agent_num):
        rollouts[i].share_obs[0].copy_(
            torch.tensor(obs.reshape(args.num_processes, -1)))
        rollouts[i].obs[0].copy_(torch.tensor(obs[:, i, :]))
        rollouts[i].to(device)

    episode_rewards = deque(maxlen=10)

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print(num_updates)
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            for i in range(args.agent_num):
                utils.update_linear_schedule(
                    agent[i].optimizer, j, num_updates,
                    agent[i].optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            value_list, action_list = [], []
            action_log_prob_list, recurrent_hidden_states_list = [], []
            with torch.no_grad():
                for i in range(args.agent_num):
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic[i].act(
                        rollouts[i].share_obs[step],
                        rollouts[i].obs[step],
                        rollouts[i].recurrent_hidden_states[step],
                        rollouts[i].masks[step])
                    value_list.append(value)
                    action_list.append(action)
                    action_log_prob_list.append(action_log_prob)
                    recurrent_hidden_states_list.append(recurrent_hidden_states)

            # Observe reward and next obs. The environment takes, for each
            # process, a list with one one-hot action vector per agent.
            action = []
            for i in range(args.num_processes):
                one_env_action = []
                for k in range(args.agent_num):
                    one_hot_action = np.zeros(envs.action_space[0].n)
                    one_hot_action[action_list[k][i]] = 1
                    one_env_action.append(one_hot_action)
                action.append(one_env_action)

            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # ``done`` is ignored here: masks are constant ones and the
            # environment is reset after every update instead.
            masks = torch.ones(args.num_processes, 1)
            bad_masks = torch.ones(args.num_processes, 1)

            for i in range(args.agent_num):
                rollouts[i].insert(
                    torch.tensor(obs.reshape(args.num_processes, -1)),
                    torch.tensor(obs[:, i, :]),
                    # Each agent stores its OWN hidden state (previously the
                    # last agent's state was stored for every agent).
                    recurrent_hidden_states_list[i],
                    action_list[i],
                    action_log_prob_list[i],
                    value_list[i],
                    torch.tensor(reward[:, i].reshape(-1, 1)),
                    masks,
                    bad_masks)

        with torch.no_grad():
            next_value_list = []
            for i in range(args.agent_num):
                next_value = actor_critic[i].get_value(
                    rollouts[i].share_obs[-1],
                    rollouts[i].obs[-1],
                    rollouts[i].recurrent_hidden_states[-1],
                    rollouts[i].masks[-1]).detach()
                next_value_list.append(next_value)

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        for i in range(args.agent_num):
            rollouts[i].compute_returns(next_value_list[i], args.use_gae,
                                        args.gamma, args.gae_lambda,
                                        args.use_proper_time_limits)

        for i in range(args.agent_num):
            value_loss, action_loss, dist_entropy = agent[i].update(rollouts[i])
            if i == 0:
                print("value loss: " + str(value_loss))

        # Instead of rollouts.after_update(), restart the environments and
        # re-seed the first slot of every storage.
        obs = envs.reset()
        for i in range(args.agent_num):
            rollouts[i].share_obs[0].copy_(
                torch.tensor(obs.reshape(args.num_processes, -1)))
            rollouts[i].obs[0].copy_(torch.tensor(obs[:, i, :]))
            rollouts[i].to(device)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path + args.model_dir, exist_ok=True)
            for i in range(args.agent_num):
                torch.save([
                    actor_critic[i],
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], save_path + args.model_dir + '/agent_%i' % (i + 1) + ".pt")
def main():
    """Train an actor-critic agent and log baseline reward infos to TensorBoard.

    Reads its configuration from the module-level ``args`` and
    ``num_updates``. Per update it collects ``args.num_steps`` transitions,
    updates the agent, optionally saves the model, writes the per-episode
    ``basline_*`` statistics reported by the environment ``info`` dicts to
    the ``SummaryWriter`` and periodically runs a one-episode evaluation with
    the ``TCN_ENV_*`` environment variables switched to evaluation values.
    """
    tb_path = os.path.join(os.path.expanduser(args.log_dir), "tensorboard_log")
    makedir_if_not_exists(tb_path)
    writer = SummaryWriter(tb_path)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    if args.start_tb:
        _tb_task(tb_path, port=5013)
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_eps = 0  # number of finished training episodes
    for j in range(num_updates):
        # Per-episode values gathered during this update, keyed by metric.
        num_steps_basline_info = defaultdict(list)
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # Per-step baseline rewards of the episode currently in progress.
        env_basline_info = defaultdict(list)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                # additional baseline reward info reported by the env
                if 'basline_rw_mse' in info:
                    env_basline_info['rw_mse'].append(info['basline_rw_mse'])
                    env_basline_info['rw_rec'].append(info['basline_rw_rec'])
                if 'basline_rw_tcn' in info:
                    env_basline_info['rw_tcn'].append(info['basline_rw_tcn'])
                if 'episode' in info.keys():
                    # end of episode
                    episode_rewards.append(info['episode']['r'])
                    num_steps_basline_info['len_episode'].append(
                        info['episode']['l'])
                    # distance of the pushed block
                    num_steps_basline_info['push_distance'].append(
                        info['basline_rw_push_dist'])
                    # sum the per-step baseline rewards over the episode
                    for k, step_vals in env_basline_info.items():
                        num_steps_basline_info[k].append(np.sum(step_vals))
                    num_eps += 1
                    env_basline_info = defaultdict(list)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # write baseline infos for tcn
        writer_step = total_num_steps
        for k, vals_step_eps in num_steps_basline_info.items():
            writer.add_scalar('basline/' + k, np.mean(vals_step_eps),
                              writer_step)
        writer.add_scalar('basline/episodes', num_eps, writer_step)

        # No episode may have finished during this update; int(nan) would
        # raise, so report NaN explicitly in that case.
        episode_lengths = num_steps_basline_info.get('len_episode')
        len_eps = (int(np.mean(episode_lengths))
                   if episode_lengths else float('nan'))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            log.info(
                "Updates {}, num timesteps {}, FPS {} Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, len eps {}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), len_eps, dist_entropy,
                        value_loss, action_loss))

        # Evaluate on the last update (j never reaches num_updates inside
        # range(num_updates)) or every eval_interval-th update.
        if j == num_updates - 1 or (args.eval_interval is not None
                                    and len(episode_rewards) > 1
                                    and j % args.eval_interval == 0):

            vid_log_dir = os.getenv('TCN_ENV_VID_LOG_FOLDER',
                                    '/tmp/env_tcn/train_vid')
            vid_log_inter = os.getenv('TCN_ENV_VID_LOG_INTERVAL',
                                      train_vid_log_iter)
            os.environ['TCN_ENV_VID_LOG_FOLDER'] = "eval_vid"
            os.environ['TCN_ENV_VID_LOG_INTERVAL'] = '1'
            os.environ['TCN_ENV_EVAL_EPISODE'] = '1'
            eval_episode_rewards = []
            try:
                # no stdout / logging while the evaluation env is running
                with open(os.devnull, "w") as devnull, \
                        redirect_stdout(devnull), suppress_logging():
                    # eval envs (a single process)
                    eval_envs = make_vec_envs(args.env_name,
                                              args.seed + args.num_processes,
                                              1, args.gamma, eval_log_dir,
                                              args.add_timestep, device, True)

                    vec_norm = get_vec_normalize(eval_envs)
                    if vec_norm is not None:
                        vec_norm.eval()
                        vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

                    obs = eval_envs.reset()
                    # Sized for the single eval process, not the training
                    # process count.
                    eval_recurrent_hidden_states = torch.zeros(
                        1,
                        actor_critic.recurrent_hidden_state_size,
                        device=device)
                    eval_masks = torch.zeros(1, 1, device=device)

                    while len(eval_episode_rewards) < 1:
                        with torch.no_grad():
                            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                                obs,
                                eval_recurrent_hidden_states,
                                eval_masks,
                                deterministic=True)

                        # Observe reward and next obs
                        obs, reward, done, infos = eval_envs.step(action)

                        eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                                   for done_ in done],
                                                  dtype=torch.float32,
                                                  device=device)
                        for info in infos:
                            if 'episode' in info.keys():
                                eval_episode_rewards.append(
                                    info['episode']['r'])

                    eval_envs.close()
            finally:
                # Restore the training values even if evaluation failed.
                # os.environ only accepts strings, and the fallback default
                # above may not be one.
                os.environ['TCN_ENV_VID_LOG_FOLDER'] = vid_log_dir
                os.environ['TCN_ENV_EVAL_EPISODE'] = '0'
                os.environ['TCN_ENV_VID_LOG_INTERVAL'] = str(vid_log_inter)

            writer.add_scalar('eval/rw', np.mean(eval_episode_rewards), j)
            log.info(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                td_plot(writer, args.log_dir)
            except IOError:
                print("plt error")
def main():
    """Train an actor-critic agent and store the learning curves as a result.

    Reads its configuration from the module-level ``args`` and
    ``num_updates``. Metrics of every logged update are appended to the
    module-level ``ALL_*`` lists; after training they are merged with
    ``ARGUMENTS`` and stored through ``ro.Experiment.add_result`` using the
    last logged mean reward as the result value.
    """
    ARGUMENTS.update(vars(args))
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        # NOTE(review): the PPO clip decay is gated on the *learning-rate*
        # decay flag; a dedicated clip-decay flag may have been intended --
        # confirm.
        if args.algo == 'ppo' and args.use_linear_lr_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         len(episode_rewards),
                         np.mean(episode_rewards),
                         np.median(episode_rewards),
                         np.min(episode_rewards),
                         np.max(episode_rewards), dist_entropy,
                         value_loss, action_loss))
            ALL_UPDATES.append(j)
            ALL_TIMESTEPS.append(total_num_steps)
            ALL_FPS.append(int(total_num_steps / (end - start)))
            ALL_MEAN_REWARDS.append(np.mean(episode_rewards))
            ALL_MEDIAN_REWARDS.append(np.median(episode_rewards))
            ALL_MIN_REWARDS.append(np.min(episode_rewards))
            ALL_MAX_REWARDS.append(np.max(episode_rewards))
            ALL_DIST_ENTROPY.append(dist_entropy)
            ALL_VALUE_LOSS.append(value_loss)
            ALL_ACTION_LOSS.append(action_loss)

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(
                args.env_name, args.seed + args.num_processes,
                args.num_processes, args.gamma, eval_log_dir,
                args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                # Keep the masks on the same device as the hidden state and
                # the observations; a CPU tensor breaks recurrent policies
                # when running on CUDA.
                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          dtype=torch.float32,
                                          device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".
                  format(len(eval_episode_rewards),
                         np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass

    # Save the results
    name = ARGUMENTS['env_name'] + '-' + ARGUMENTS['algo'] + '-' + ARGUMENTS['experiment'] + '-grad_noise' + str(ARGUMENTS['gradient_noise'])
    experiment = ro.Experiment(name, directory='results')
    data = {
        'updates': ALL_UPDATES,
        'timesteps': ALL_TIMESTEPS,
        'fps': ALL_FPS,
        'mean_rewards': ALL_MEAN_REWARDS,
        'median_rewards': ALL_MEDIAN_REWARDS,
        'min_rewards': ALL_MIN_REWARDS,
        'max_rewards': ALL_MAX_REWARDS,
        'dist_entropy': ALL_DIST_ENTROPY,
        'value_loss': ALL_VALUE_LOSS,
        'action_loss': ALL_ACTION_LOSS,
    }
    data.update(ARGUMENTS)
    # NOTE(review): raises IndexError when nothing was ever logged (fewer
    # than two finished episodes at every log interval).
    result = data['mean_rewards'][-1]
    experiment.add_result(result, data)
def onpolicy_main():
    """Run on-policy (A2C / PPO) training with per-update world re-creation.

    Reads its configuration from the module-level ``args``, ``env_kwargs``,
    ``pretrained_policy_load`` and ``knob_noisy``. For environments whose
    name contains ``'doorenv'`` the vision settings are read from the wrapped
    environment and, when a vision network input is configured, observations
    are converted with ``actor_critic.obs2inputs``. After every update the
    vectorised environment is closed and created again (domain
    randomisation). Losses and the mean episode reward are written to a
    ``SummaryWriter``.
    """
    print("onpolicy main")

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make vector env
    envs = make_vec_envs(
        args.env_name,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        env_kwargs=env_kwargs,
    )

    # ugly way to access the environment attributes through the wrappers
    if args.env_name.find('doorenv') > -1:
        if args.num_processes > 1:
            visionnet_input = envs.venv.venv.visionnet_input
            nn = envs.venv.venv.nn
            env_name = envs.venv.venv.xml_path
        else:
            visionnet_input = envs.venv.venv.envs[
                0].env.env.env.visionnet_input
            nn = envs.venv.venv.envs[0].env.env.env.nn
            env_name = envs.venv.venv.envs[0].env.env.env.xml_path
        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs.observation_space
        visionnet_input = None
        nn = None

    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy(dummy_obs.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})

    if visionnet_input:
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()
    actor_critic.nn = nn
    actor_critic.to(device)

    # disable normalizer (guarded: there may be no normalisation wrapper)
    vec_norm = get_vec_normalize(envs)
    if vec_norm is not None:
        vec_norm.eval()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              dummy_obs.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    full_obs = envs.reset()
    # The leading action-dimension entries of the observation are used as
    # the reference state for the (currently disabled) position control.
    initial_state = full_obs[:, :envs.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs, 0)
        else:
            obs = full_obs

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        # Developer toggle: actions are applied as offsets to the current
        # state when enabled.
        pos_control = False

        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                next_action = action

            if pos_control:
                frame_skip = 2
                if step % (512 / frame_skip - 1) == 0:
                    current_state = initial_state
                next_action = current_state + next_action
                for kk in range(frame_skip):
                    full_obs, reward, done, infos = envs.step(next_action)
                current_state = full_obs[:, :envs.action_space.shape[0]]
            else:
                full_obs, reward, done, infos = envs.step(next_action)

            # convert img to obs if door_env and using visionnet
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                if knob_noisy:
                    obs = add_noise(full_obs, j)
                else:
                    obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        # The mean of an empty window is NaN (plus a RuntimeWarning); only
        # log once at least one episode has finished.
        if episode_rewards:
            writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ],
                       os.path.join(
                           save_path, args.env_name +
                           "_{}.{}.pt".format(args.save_name, j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # getattr: there may be no normalisation wrapper.
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = True  # Domain Randomization
        ################## for multiprocess world change ######################
        if DR:
            print("changing world")

            envs.close_extras()
            envs.close()
            del envs

            envs = make_vec_envs(
                args.env_name,
                args.seed,
                args.num_processes,
                args.gamma,
                args.log_dir,
                device,
                False,
                env_kwargs=env_kwargs,
            )

            full_obs = envs.reset()
            # NOTE(review): ``obs`` from the new world is not copied into
            # rollouts.obs[0], and the new normaliser is not switched to
            # eval mode, so the next rollout starts from the previous
            # world's last observation -- confirm this is intended.
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
def actor(actor_rank, action_logits, values, observations, step_dones,
          act_in_progs, done_list, shared_cpu_actor_critics_env_actor, device,
          observation_space, action_space, please_load_model_actor, args,
          actor_lock):
    """
    Actor grabs observations from the observation buffer and performs the
    forward pass. Then the actor writes the action logits and values to the
    action and value buffers.

    The function polls until no entry of ``done_list`` is ``False``.

    Args:
        actor_rank: actor's id.
        action_logits: A shared PyTorch tensor served as an action buffer.
        values: A shared PyTorch tensor served as a value buffer.
        observations: A shared PyTorch tensor served as an observation buffer.
        step_dones: A shared array to indicate environment processes finish
            one environment step.
        act_in_progs: A shared array to indicate the observation is being
            processed by an actor.
        done_list: A shared list that indicates if environment processes
            finish all steps.
        shared_cpu_actor_critics_env_actor: Shared models between actor and
            environment processes. Actor processes will load models from
            environment process 0.
        device: CPU/GPU device.
        observation_space: The OpenAI gym observation space of the environment.
        action_space: The OpenAI gym action space of the environment.
        please_load_model_actor: A shared array between actors and the
            environment process 0. When an updated model is available, it is
            set to one. Once an actor has finished loading the updated model,
            it sets its slot to zero.
        args: command line arguments.
        actor_lock: A lock to prevent actors from grabbing data which is
            already being processed by other actors.

    Returns:
        None
    """
    if args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    # NOTE(review): the original formatting was lost; seeding is assumed to be
    # unconditional -- confirm.
    torch.manual_seed(args.seed)
    base_kwargs = {'recurrent': args.recurrent_policy,
                   'hidden_size': args.hidden_size}
    # View the shared arrays as int32 numpy arrays with one flag per
    # environment process (no copy: writes go to the shared buffer).
    act_in_progs_np = np.frombuffer(act_in_progs.get_obj(), dtype=np.int32)
    step_dones_np = np.frombuffer(step_dones.get_obj(), dtype=np.int32)
    step_dones_np = step_dones_np.reshape(args.num_processes)
    act_in_progs_np = act_in_progs_np.reshape(args.num_processes)
    # One local model per agent, initialised from the shared CPU models.
    models = [Policy(
        observation_space.shape[1:],
        action_space if args.num_agents == 1 else Discrete(
            action_space.nvec[0]),
        base=get_base(args.base),
        base_kwargs=base_kwargs).to(device) for _ in range(args.num_agents)]
    for agent_idx in range(args.num_agents):
        stat_dict = shared_cpu_actor_critics_env_actor[agent_idx].state_dict()
        models[agent_idx].load_state_dict(stat_dict)
    steps = 0
    while False in done_list:  # polling
        # Reload the weights when an update has been signalled for this actor.
        if please_load_model_actor[actor_rank] == 1:
            for agent_idx in range(args.num_agents):
                stat_dict = shared_cpu_actor_critics_env_actor[agent_idx].state_dict(
                )
                models[agent_idx].load_state_dict(stat_dict)
            please_load_model_actor[actor_rank] = 0
        if args.cuda_deterministic:
            # Claim, under the lock, every environment that finished a step
            # and is not yet being processed by another actor.
            with actor_lock:
                step_done_not_prog_np = np.logical_and(
                    step_dones_np, act_in_progs_np == 0)
                ranks = np.where(step_done_not_prog_np == 1)[0]
                act_in_progs_np[ranks] = 1
            if ranks.size > 0:
                steps += 1
                # Deterministic mode forwards one environment at a time.
                for agent_idx in range(args.num_agents):
                    for env_rank in ranks:  # new code
                        with torch.no_grad():
                            obs = copy.deepcopy(
                                observations[env_rank:env_rank + 1].to(device))  # new
                            kargs = obs[:, agent_idx], None, None
                            value, action, action_log_prob, action_logit = models[agent_idx].act(
                                *kargs)
                        action_logits[env_rank, agent_idx] = action_logit.cpu()  # new
                        values[env_rank, agent_idx] = value.cpu()  # new
                        # NOTE(review): nesting reconstructed from collapsed
                        # source; as written the flags are cleared inside the
                        # agent loop, i.e. before later agents have written
                        # their outputs -- confirm for num_agents > 1.
                        step_dones_np[env_rank] = 0  # new
                        act_in_progs_np[env_rank] = 0  # new
        else:
            # Same claiming protocol as above.
            with actor_lock:
                step_done_not_prog_np = np.logical_and(
                    step_dones_np, act_in_progs_np == 0)
                ranks = np.where(step_done_not_prog_np == 1)[0]
                act_in_progs_np[ranks] = 1
            if ranks.size > 0:
                # Batched forward pass over all claimed environments.
                obs = observations[ranks].clone().to(device)
                for agent_idx in range(args.num_agents):
                    with torch.no_grad():
                        kargs = obs[:, agent_idx], None, None
                        value, action, action_log_prob, action_logit = models[agent_idx].act(
                            *kargs)
                    action_logits[ranks, agent_idx] = action_logit.cpu()
                    values[ranks, agent_idx] = value.cpu()
                # Release the claimed environments once every agent is done.
                step_dones_np[ranks] = 0
                act_in_progs_np[ranks] = 0
    print('Done actor ', actor_rank)
def train_ppo_from_scratch(args):
    """Train a PPO agent from a freshly initialised policy and track rewards.

    Seeds torch, builds the vectorised environments, a non-recurrent policy
    with ``hidden_size=32``, a PPO agent and rollout storage, then runs
    ``num_env_steps // num_steps // num_processes`` update iterations.

    Args:
        args: Namespace of hyper-parameters. The attributes read here are
            ``seed``, ``cuda``, ``cuda_deterministic``, ``log_dir``,
            ``env_name``, ``num_processes``, ``gamma``, ``clip_param``,
            ``ppo_epoch``, ``num_mini_batch``, ``value_loss_coef``,
            ``entropy_coef``, ``lr``, ``eps``, ``max_grad_norm``,
            ``num_steps``, ``num_env_steps``, ``use_linear_lr_decay``,
            ``use_gae``, ``gae_lambda``, ``use_proper_time_limits``,
            ``save_interval``, ``save_dir``, ``algo``, ``log_interval`` and
            ``eval_interval``.

    Returns:
        A tuple ``(episode_reward_means, episode_reward_times)``: the mean of
        the most recent (at most 10) episode rewards at every logging point,
        and the total number of environment steps at that point.
    """
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(2)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, True)

    # 2-layer fully connected network
    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': False, 'hidden_size': 32})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Sliding window over the most recent finished episodes.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = (int(args.num_env_steps) // args.num_steps
                   // args.num_processes)

    episode_reward_means = []
    episode_reward_times = []

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                (value, action, action_log_prob,
                 recurrent_hidden_states) = actor_critic.act(
                     rollouts.obs[step],
                     rollouts.recurrent_hidden_states[step],
                     rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        # The update returns (value_loss, action_loss, dist_entropy); the log
        # line below never rendered them, so they are not bound here.
        agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok replaces the former blanket `except OSError: pass`,
            # so genuine failures (e.g. permissions) surface here.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards)))
            episode_reward_means.append(np.mean(episode_rewards))
            episode_reward_times.append(total_num_steps)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # Same guarded lookup as the save branch: get_vec_normalize may
            # yield an object without `ob_rms` (or None).
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    print(episode_reward_means, episode_reward_times)
    return episode_reward_means, episode_reward_times
def main():
    """Train an A2C / PPO / ACKTR agent, optionally with GAIL rewards.

    Reads hyper-parameters from ``get_args()``, builds the vectorised
    environments, the policy, the selected algorithm and rollout storage,
    then alternates rollout collection and policy updates. Checkpoints are
    written every ``save_interval`` updates (and on the last one), progress
    is printed every ``log_interval`` updates, and ``evaluate`` is invoked
    every ``eval_interval`` updates.

    Raises:
        NotImplementedError: if ``args.algo`` is not one of ``'a2c'``,
            ``'ppo'`` or ``'acktr'``.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.env_name.startswith("lab_"):
        # Only the gym id is needed here; the second return value is unused.
        gym_name, _ = make_lab_env(args.env_name)
        args.env_name = gym_name

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)
    else:
        # Previously fell through and crashed later with a NameError on
        # `agent`; fail at the point of the mistake instead.
        raise NotImplementedError(
            "Unsupported algorithm: {!r}".format(args.algo))

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir, "trajs_{}.pt".format(
                args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=4, subsample_frequency=20)
        # Only drop the trailing partial batch when there is more than one
        # batch worth of expert data.
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Sliding window over the most recent finished episodes.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = (int(args.num_env_steps) // args.num_steps
                   // args.num_processes)

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                (value, action, action_log_prob,
                 recurrent_hidden_states) = actor_critic.act(
                     rollouts.obs[step],
                     rollouts.recurrent_hidden_states[step],
                     rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            # Warm up: the first 10 updates use 100 discriminator epochs.
            gail_epoch = 100 if j < 10 else args.gail_epoch
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Overwrite the stored rewards with the discriminator's output.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        # The update returns (value_loss, action_loss, dist_entropy); the log
        # line below never rendered them, so they are not bound here.
        agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok replaces the former blanket `except OSError: pass`.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # Same guarded lookup as the save branch: get_vec_normalize may
            # yield an object without `ob_rms` (or None).
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main():
    """Train one A2C agent jointly on two environments ("games").

    Builds two sets of vectorised environments (``args.env_name`` and
    ``args.env_name2``), two ``Policy`` objects that are constructed with the
    same ``CNNBase`` instance, and a single ``algo.A2C_ACKTR`` agent that is
    updated with the rollouts of both games. Each game gets its own rollout
    storage, reward window, checkpoint file and evaluation log directory.

    Raises:
        NotImplementedError: if ``args.algo`` is not ``'a2c'``.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir + args.env_name)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    log_dir2 = os.path.expanduser(args.log_dir2 + args.env_name2)
    # Fixed: was derived from `log_dir`, so both games shared (and wiped)
    # the same evaluation directory.
    eval_log_dir2 = log_dir2 + "_eval"
    utils.cleanup_log_dir(log_dir2)
    utils.cleanup_log_dir(eval_log_dir2)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    import json
    file_path = "config.json"
    # Context manager so the config file handle is closed deterministically.
    with open(file_path, 'r') as config_file:
        setup_json = json.load(config_file)
    env_conf = setup_json["Default"]
    for conf_name in setup_json:
        if conf_name in args.env_name:
            env_conf = setup_json[conf_name]
    # NOTE(review): env_conf is selected from args.env_name only and is then
    # reused for the second game as well - confirm this is intended.

    # 1 game
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, env_conf, False)
    # 2 game
    envs2 = make_vec_envs(args.env_name2, args.seed, args.num_processes,
                          args.gamma, args.log_dir2, device, env_conf, False)

    # NOTE(review): the loaded checkpoint is currently unused because the
    # load_state_dict calls that consumed it are disabled; the load is kept
    # so a missing file still fails early. torch.load unpickles the file, so
    # only point it at trusted checkpoints.
    save_model, ob_rms = torch.load('./trained_models/PongNoFrameskip-v4.pt')

    from a2c_ppo_acktr.cnn import CNNBase

    # A single CNNBase instance is passed to both policies below.
    a = CNNBase(envs.observation_space.shape[0], recurrent=False)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base=a,
    )
    actor_critic.to(device)

    actor_critic2 = Policy(envs2.observation_space.shape,
                           envs2.action_space,
                           base=a)
    actor_critic2.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               actor_critic2,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    else:
        # Previously fell through and crashed later with a NameError on
        # `agent`; fail at the point of the mistake instead.
        raise NotImplementedError(
            "Unsupported algorithm: {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    rollouts2 = RolloutStorage(args.num_steps, args.num_processes,
                               envs2.observation_space.shape,
                               envs2.action_space,
                               actor_critic2.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    obs2 = envs2.reset()
    rollouts2.obs[0].copy_(obs2)
    rollouts2.to(device)

    # Sliding windows over the most recent finished episodes of each game.
    episode_rewards = deque(maxlen=10)
    episode_rewards2 = deque(maxlen=10)

    start = time.time()
    num_updates = (int(args.num_env_steps) // args.num_steps
                   // args.num_processes)

    for j in range(num_updates):
        # Linear LR decay is disabled (commented out) in this variant.

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                (value, action, action_log_prob, recurrent_hidden_states,
                 _) = actor_critic.act(
                     rollouts.obs[step],
                     rollouts.recurrent_hidden_states[step],
                     rollouts.masks[step])
                (value2, action2, action_log_prob2, recurrent_hidden_states2,
                 _) = actor_critic2.act(
                     rollouts2.obs[step],
                     rollouts2.recurrent_hidden_states[step],
                     rollouts2.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            obs2, reward2, done2, infos2 = envs2.step(action2)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])
            for info2 in infos2:
                if 'episode' in info2:
                    episode_rewards2.append(info2['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            masks2 = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done2])
            bad_masks2 = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info2 else [1.0]
                 for info2 in infos2])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            rollouts2.insert(obs2, recurrent_hidden_states2, action2,
                             action_log_prob2, value2, reward2, masks2,
                             bad_masks2)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
            next_value2 = actor_critic2.get_value(
                rollouts2.obs[-1], rollouts2.recurrent_hidden_states[-1],
                rollouts2.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        rollouts2.compute_returns(next_value2, args.use_gae, args.gamma,
                                  args.gae_lambda,
                                  args.use_proper_time_limits)

        # The update returns six loss/entropy values (three per game); the
        # log lines below never rendered them, so they are not bound here.
        agent.update(rollouts, rollouts2)

        rollouts.after_update()
        rollouts2.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok replaces the former blanket `except OSError: pass`.
            os.makedirs(save_path, exist_ok=True)

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

            # Fixed: the attribute was looked up as 'ob_rms2', which is not
            # the name read everywhere else, so None was always stored.
            torch.save([
                actor_critic2,
                getattr(utils.get_vec_normalize(envs2), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name2 + ".pt"))

        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # Each game is guarded by its own window: the second report used
            # to run under the first game's guard and np.min/np.max raise on
            # an empty sequence.
            for window in (episode_rewards, episode_rewards2):
                if len(window) > 1:
                    print(
                        "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                        .format(j, total_num_steps,
                                int(total_num_steps / (end - start)),
                                len(window), np.mean(window),
                                np.median(window), np.min(window),
                                np.max(window)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # Guarded lookups, matching the save branch above.
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

            ob_rms2 = getattr(utils.get_vec_normalize(envs2), 'ob_rms', None)
            evaluate(actor_critic2, ob_rms2, args.env_name2, args.seed,
                     args.num_processes, eval_log_dir2, device)
def train(train_states,
          run_dir,
          num_env_steps,
          eval_env_steps,
          writer,
          writer_name,
          args,
          init_model=None):
    """Train (or continue training) a policy until ``num_env_steps`` is reached.

    Args:
        train_states: Passed through to ``make_vec_envs`` and ``evaluate``.
        run_dir: Directory of this run; ``videos/`` and ``ckpts/``
            sub-directories are created inside it.
        num_env_steps: Training stops once the environment step counter
            reaches this value.
        eval_env_steps: Step budget handed to ``evaluate``; evaluation is
            skipped entirely when it is not positive.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        writer_name: Suffix used in the scalar tags.
        args: Namespace of hyper-parameters and intervals.
        init_model: Optional ``(actor_critic, env_step, model_name)`` triple
            to resume from; a fresh ``Policy`` is built when falsy.

    Returns:
        ``((actor_critic, env_step, run_name), eval_score, eval_dict)`` where
        the last two are ``None`` when no final evaluation was run.

    Raises:
        NotImplementedError: for an unknown ``args.algo`` or an entry of
            ``args.obs_keys`` other than ``'video'`` / ``'audio'``.
    """
    envs = make_vec_envs(train_states, args.seed, args.num_processes,
                         args.gamma, 'cpu', 'train', args)

    if init_model:
        actor_critic, env_step, model_name = init_model
        print(f" [load] Loaded model {model_name} at step {env_step}")
    else:
        actor_critic = Policy(envs.observation_space,
                              args.obs_process,
                              args.obs_module,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
        env_step = 0
    actor_critic.to(args.device)

    run_name = run_dir.replace('/', '_')

    # exist_ok replaces the former blanket `except OSError: pass`, so real
    # failures (e.g. permissions) are no longer hidden.
    vid_save_dir = f"{run_dir}/videos/"
    os.makedirs(vid_save_dir, exist_ok=True)
    ckpt_save_dir = f"{run_dir}/ckpts/"
    os.makedirs(ckpt_save_dir, exist_ok=True)

    def save_checkpoint():
        """Write the current model to the checkpoint dir; return its path."""
        ckpt_path = os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt")
        torch.save([
            actor_critic,
            env_step,
            run_name,
        ], ckpt_path)
        return ckpt_path

    if args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         args.device,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=False)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=True)
    else:
        raise NotImplementedError

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    # (Writing the model graph to tensorboard via writer.add_graph is
    # disabled; the eval()/train() toggle around it is kept as-is.)
    actor_critic.eval()
    actor_critic.train()

    for k in rollouts.obs.keys():
        rollouts.obs[k][0].copy_(obs[k][0])

    # NOTE(review): nothing in this function appends to episode_rewards, so
    # the "Last N episodes" console line below never fires - confirm whether
    # per-episode returns should be recorded here.
    episode_rewards = deque(maxlen=10)

    num_updates = num_env_steps // args.num_steps // args.num_processes
    batch_size = args.num_steps * args.num_processes
    start = time.time()

    while env_step < num_env_steps:
        s = time.time()

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            # Fixed: this call referenced an undefined name `j`. The update
            # index is derived from the step counter, which advances by
            # batch_size per iteration; max(..., 1) avoids a zero divisor
            # when num_env_steps is smaller than one batch.
            update_idx = env_step // batch_size
            utils.update_linear_schedule(
                agent.optimizer, update_idx, max(num_updates, 1),
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                (value, action, action_log_prob, recurrent_hidden_states,
                 _) = actor_critic.act(
                     {
                         k: rollouts.obs[k][step].float().to(args.device)
                         for k in rollouts.obs.keys()
                     }, rollouts.recurrent_hidden_states[step].to(
                         args.device), rollouts.masks[step].to(args.device))
                value = value.cpu()
                action = action.cpu()
                action_log_prob = action_log_prob.cpu()
                recurrent_hidden_states = recurrent_hidden_states.cpu()

            # Observe reward and next obs
            obs, reward, dones, infos = envs.step(action)

            for done, info in zip(dones, infos):
                env_state = info['env_state'][1]
                if done:
                    writer.add_scalar(f'train_episode_x/{env_state}',
                                      info['max_x'], env_step)
                    writer.add_scalar(f'train_episode_%/{env_state}',
                                      info['max_x'] / info['lvl_max_x'] * 100,
                                      env_step)
                    writer.add_scalar(f'train_episode_r/{env_state}',
                                      info['sum_r'], env_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in dones])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                {
                    k: rollouts.obs[k][-1].float().to(args.device)
                    for k in rollouts.obs.keys()
                }, rollouts.recurrent_hidden_states[-1].to(args.device),
                rollouts.masks[-1].to(args.device)).detach().cpu()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        env_step += batch_size
        fps = batch_size / (time.time() - s)

        # L2 norm over all parameter gradients left by the last update.
        total_norm = 0
        for p in actor_critic.parameters():
            if p.grad is not None:
                total_norm += p.grad.data.norm(2).item()**2
        total_norm = total_norm**(1. / 2)

        # Same norm, restricted to each observation module.
        obs_norm = {}
        for obs_name in args.obs_keys:
            if obs_name == 'video':
                md = actor_critic.base.video_module
            elif obs_name == 'audio':
                md = actor_critic.base.audio_module
            else:
                raise NotImplementedError
            t_norm = 0
            for p in md.parameters():
                if p.grad is not None:
                    t_norm += p.grad.data.norm(2).item()**2
            obs_norm[obs_name] = t_norm**(1. / 2)

        # Step counter before this batch; an interval "fires" when the
        # integer quotient by that interval changed across the batch.
        prev_env_step = max(0, env_step + 1 - batch_size)

        # write training metrics for this batch, usually takes 0.003s
        if ((env_step + 1) // args.write_interval >
                prev_env_step // args.write_interval):
            writer.add_scalar(f'grad_norm/{writer_name}', total_norm,
                              env_step)
            writer.add_scalar(f'fps/{writer_name}', fps, env_step)
            writer.add_scalar(f'value_loss/{writer_name}',
                              value_loss / batch_size, env_step)
            writer.add_scalar(f'action_loss/{writer_name}',
                              action_loss / batch_size, env_step)
            writer.add_scalar(f'dist_entropy/{writer_name}',
                              dist_entropy / batch_size, env_step)
            writer.add_scalar(f'cpu_usage/{writer_name}',
                              psutil.cpu_percent(), env_step)
            writer.add_scalar(f'cpu_mem/{writer_name}',
                              psutil.virtual_memory()._asdict()['percent'],
                              env_step)
            for obs_name in args.obs_keys:
                writer.add_scalar(f'grad_norm_{obs_name}/{writer_name}',
                                  obs_norm[obs_name], env_step)

        # print log to console
        if ((env_step + 1) // args.log_interval >
                prev_env_step // args.log_interval):
            end = time.time()
            print(" [log] Env step {} of {}: {:.1f}s, {:.1f}fps".format(
                env_step + 1, num_env_steps, end - start, fps))
            if len(episode_rewards) > 0:
                print(
                    " Last {} episodes: mean/med reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards),
                            np.max(episode_rewards)))
            print(
                " dist_entropy {:.5f}, value_loss {:.6f}, action_loss {:.6f}, grad_norm {:.6f}"
                .format(dist_entropy, value_loss, action_loss, total_norm))
            # Restart the timer so the next report shows time since this one.
            start = time.time()

        # save model to ckpt
        if ((env_step + 1) // args.save_interval >
                prev_env_step // args.save_interval):
            save_checkpoint()
            print(f" [save] Saved model at step {env_step+1}.")

        # save model to ckpt and run evaluation if eval_interval and not
        # final iteration in training loop
        if ((env_step + 1) // args.eval_interval >
                prev_env_step // args.eval_interval
            ) and env_step < num_env_steps and eval_env_steps > 0:
            save_checkpoint()
            print(f" [save] Saved model at step {env_step+1}.")

            envs.close()
            del envs  # close does not actually get rid of envs, need to del
            actor_critic.eval()
            eval_score, e_dict = evaluate(train_states, actor_critic,
                                          eval_env_steps, env_step, writer,
                                          vid_save_dir, args.vid_tb_steps,
                                          args.vid_file_steps,
                                          args.obs_viz_layer, args)
            print(f" [eval] Evaluation score: {eval_score}")
            writer.add_scalar('eval_score', eval_score, env_step)

            actor_critic.train()
            envs = make_vec_envs(train_states, args.seed, args.num_processes,
                                 args.gamma, 'cpu', 'train', args)

            obs = envs.reset()
            # TODO: does this work? Do we need to increment env_step, and why
            # are the observations (not hidden states) inserted at index 0?
            for k in rollouts.obs.keys():
                rollouts.obs[k][0].copy_(obs[k][0])

    # final model save
    final_model_path = save_checkpoint()
    print(
        f" [save] Final model saved at step {env_step+1} to {final_model_path}"
    )

    # final model eval
    envs.close()
    del envs
    eval_score = None
    eval_dict = None
    if eval_env_steps > 0:
        eval_score, eval_dict = evaluate(train_states, actor_critic,
                                         eval_env_steps, env_step, writer,
                                         vid_save_dir, args.vid_tb_steps,
                                         args.vid_file_steps,
                                         args.obs_viz_layer, args)
        print(f" [eval] Final model evaluation score: {eval_score:.3f}")

    return (actor_critic, env_step, run_name), eval_score, eval_dict
def train():
    """Run the RL^2-style PPO training loop configured by the module-level ``args``.

    Asks for confirmation when ``args.log_dir`` already exists, configures
    the logger, dumps the parameters to ``params.json`` and then performs
    ``args.num_updates`` iterations of: environment reset, rollout
    collection, return computation, policy update, metric logging and the
    periodic model saving, rewarder fitting and visualisation steps.

    Returns:
        None. Returns early, without training, if the user answers ``n`` to
        the overwrite prompt.
    """
    # Handles of spawned visualisation processes (kept, never waited on).
    processes = []
    if os.path.isdir(args.log_dir):
        ans = input('{} exists\ncontinue and overwrite? y/n: '.format(
            args.log_dir))
        if ans == 'n':
            return

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    # Context manager so params.json is flushed and closed right away
    # (the handle was previously left open for the whole run).
    with open(os.path.join(args.log_dir, 'params.json'), 'w') as params_file:
        json.dump(vars(args), params_file)

    torch.set_num_threads(2)

    # Wall-clock accumulators reported with every log dump.
    start = time.time()
    policy_update_time, policy_forward_time = 0, 0
    step_time_env, step_time_total, step_time_rewarder = 0, 0, 0
    visualize_time = 0
    rewarder_fit_time = 0

    envs = RL2EnvInterface(args)
    if args.look:
        looker = Looker(args.log_dir)

    actor_critic = Policy(envs.obs_shape,
                          envs.action_space,
                          base=RL2Base,
                          base_kwargs={
                              'recurrent': True,
                              'num_act_dim': envs.action_space.shape[0]
                          })
    actor_critic.to(args.device)
    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.obs_shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    rollouts.to(args.device)

    def copy_obs_into_beginning_of_storage(obs):
        """Copy the four observation components into slot 0 of the storage."""
        obs_raw, obs_act, obs_rew, obs_flag = obs
        rollouts.obs[0].copy_(obs_raw)
        rollouts.obs_act[0].copy_(obs_act)
        rollouts.obs_rew[0].copy_(obs_rew)
        rollouts.obs_flag[0].copy_(obs_flag)

    for j in range(args.num_updates):
        obs = envs.reset()
        copy_obs_into_beginning_of_storage(obs)

        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, args.num_updates,
                                   args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (
                1 - j / float(args.num_updates))

        # Per-episode-slot accumulators within a trial.
        episode_returns = [0] * args.trial_length
        episode_final_reward = [0] * args.trial_length
        i_episode = 0

        log_marginal = 0
        lambda_log_s_given_z = 0

        for step in range(args.num_steps):
            # Sample actions
            policy_forward_start = time.time()
            with torch.no_grad():
                (value, action, action_log_prob,
                 recurrent_hidden_states) = actor_critic.act(
                     rollouts.get_obs(step),
                     rollouts.recurrent_hidden_states[step],
                     rollouts.masks[step])
            policy_forward_time += time.time() - policy_forward_start

            # Observe reward and next obs
            step_total_start = time.time()
            obs, reward, done, info = envs.step(action)
            step_time_total += time.time() - step_total_start
            step_time_env += info['step_time_env']
            step_time_rewarder += info['reward_time']
            log_marginal += info['log_marginal'].sum().item()
            lambda_log_s_given_z += info['lambda_log_s_given_z'].sum().item()

            episode_returns[i_episode] += reward.sum().item()
            if all(done['episode']):
                episode_final_reward[i_episode] += reward.sum().item()
                i_episode = (i_episode + 1) % args.trial_length

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done['trial']])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # The last step of a rollout must end a trial in every process.
        assert all(done['trial'])

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.get_obs(-1), rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        policy_update_start = time.time()
        if (args.rewarder != 'supervised' and envs.rewarder.fit_counter == 0
                and not args.vae_load):
            # Skip the policy update while this condition holds.
            value_loss, action_loss, dist_entropy = 0, 0, 0
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
        policy_update_time += time.time() - policy_update_start
        rollouts.after_update()

        # metrics
        trajectories_pre = envs.trajectories_pre_current_update
        state_entropy_pre = calculate_state_entropy(args, trajectories_pre)

        trajectories_post = envs.trajectories_post_current_update
        state_entropy_post = calculate_state_entropy(args, trajectories_post)

        steps_per_trial = args.trial_length * args.episode_length
        return_avg = rollouts.rewards.sum() / args.trials_per_update
        reward_avg = return_avg / steps_per_trial
        log_marginal_avg = (log_marginal / args.trials_per_update
                            / steps_per_trial)
        lambda_log_s_given_z_avg = (lambda_log_s_given_z
                                    / args.trials_per_update
                                    / steps_per_trial)

        num_steps = (j + 1) * args.num_steps * args.num_processes
        num_episodes = num_steps // args.episode_length
        num_trials = num_episodes // args.trial_length

        logger.logkv('state_entropy_pre', state_entropy_pre)
        logger.logkv('state_entropy_post', state_entropy_post)
        logger.logkv('value_loss', value_loss)
        logger.logkv('action_loss', action_loss)
        logger.logkv('dist_entropy', dist_entropy)
        logger.logkv('return_avg', return_avg.item())
        logger.logkv('reward_avg', reward_avg.item())
        logger.logkv('steps', num_steps)
        logger.logkv('episodes', num_episodes)
        logger.logkv('trials', num_trials)
        logger.logkv('policy_updates', (j + 1))
        logger.logkv('time', time.time() - start)
        logger.logkv('policy_forward_time', policy_forward_time)
        logger.logkv('policy_update_time', policy_update_time)
        logger.logkv('step_time_rewarder', step_time_rewarder)
        logger.logkv('step_time_env', step_time_env)
        logger.logkv('step_time_total', step_time_total)
        logger.logkv('visualize_time', visualize_time)
        logger.logkv('rewarder_fit_time', rewarder_fit_time)
        logger.logkv('log_marginal_avg', log_marginal_avg)
        logger.logkv('lambda_log_s_given_z_avg', lambda_log_s_given_z_avg)
        for i_episode in range(args.trial_length):
            logger.logkv(
                'episode_return_avg_{}'.format(i_episode),
                episode_returns[i_episode] / args.trials_per_update)
            logger.logkv(
                'episode_final_reward_{}'.format(i_episode),
                episode_final_reward[i_episode] / args.trials_per_update)

        is_last_update = j == args.num_updates - 1

        if (j % args.save_period == 0
                or is_last_update) and args.log_dir != '':
            save_model(args, actor_critic, envs, iteration=j)

        if not args.vae_freeze and j % args.rewarder_fit_period == 0:
            rewarder_fit_start = time.time()
            envs.fit_rewarder()
            rewarder_fit_time += time.time() - rewarder_fit_start

        if (j % args.vis_period == 0
                or is_last_update) and args.log_dir != '':
            visualize_start = time.time()
            if args.look:
                (eval_return_avg, eval_episode_returns,
                 eval_episode_final_reward) = looker.look(iteration=j)
                logger.logkv('eval_return_avg', eval_return_avg)
                for i_episode in range(args.trial_length):
                    logger.logkv(
                        'eval_episode_return_avg_{}'.format(i_episode),
                        eval_episode_returns[i_episode]
                        / args.trials_per_update)
                    logger.logkv(
                        'eval_episode_final_reward_{}'.format(i_episode),
                        eval_episode_final_reward[i_episode]
                        / args.trials_per_update)

            if args.plot:
                # Argument list instead of a shell string: a log dir with
                # spaces or shell metacharacters is passed through verbatim.
                p = Popen(
                    ['python', 'visualize.py', '--log-dir', args.log_dir])
                processes.append(p)

            visualize_time += time.time() - visualize_start

        logger.dumpkvs()
def main():
    """Train an A2C/PPO/ACKTR agent, optionally with a TDM intrinsic bonus.

    Configuration is read from the module-level ``args`` and the number of
    policy updates from the module-level ``num_updates`` (neither is defined
    inside this function).

    When ``args.use_tdm`` is set, a ``TemporalDifferenceModule`` is trained on
    randomly collected trajectories before policy training starts, and its
    bonus (scaled by the selected beta schedule) is added to the environment
    reward at every step.

    Raises:
        ValueError: if ``args.beta_schedule`` or ``args.bonus_func`` is not one
            of the supported names while ``args.use_tdm`` is set.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         True)

    frame_skip = 4  # frame skip
    if args.tb_dir[-1] != '/':
        args.tb_dir = args.tb_dir + '/'
    logger = Logger(args.tb_dir)
    logger.write_settings(args)

    if args.use_tdm:
        # beta scheduler: maps the global step index to the bonus weight
        beta_schedules = {
            'const': lambda x: float(args.beta_int),
            'sqrt': lambda x: 1. / np.sqrt(x + 2),
            'log': lambda x: 1. / np.log(x + 2),
            'linear': lambda x: 1. / (x + 2),
        }
        if args.beta_schedule not in beta_schedules:
            # Previously this fell through and failed later with a NameError.
            raise ValueError(
                "beta_schedule {} not supported".format(args.beta_schedule))
        beta_func = beta_schedules[args.beta_schedule]

        # bonus function variations
        bonus_funcs = {
            'linear': lambda x: x + 1,
            'square': lambda x: (x + 1)**2,
            'sqrt': lambda x: (x + 1)**(1 / 2),
            'log': lambda x: np.log(x + 1),
        }
        if args.bonus_func not in bonus_funcs:
            raise ValueError(
                "bonus_func {} not supported".format(args.bonus_func))
        bonus_func = bonus_funcs[args.bonus_func]

        # temporal difference module
        tdm = TemporalDifferenceModule(
            inputSize=2 * int(envs.observation_space.shape[0]),
            outputSize=args.time_intervals,
            num_fc_layers=int(args.num_layers),
            depth_fc_layers=int(args.fc_width),
            lr=float(args.opt_lr),
            buffer_max_length=args.buffer_max_length,
            buffer_RL_ratio=args.buffer_RL_ratio,
            frame_skip=frame_skip,
            tdm_epoch=args.tdm_epoch,
            tdm_batchsize=args.tdm_batchsize,
            logger=logger,
            bonus_func=bonus_func).to(device)

        # collect random trajectories
        sample_collector = CollectSamples(envs,
                                          args.num_processes,
                                          initial=True)
        tdm.buffer_rand = sample_collector.collect_trajectories(
            args.num_rollouts, args.steps_per_rollout)

        # initial training
        tdm.update()

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # Environment steps taken once this update has finished.
        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # acting
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs; keep the previous observation
            # because the bonus is computed from the (old, new) pair.
            obs_old = obs.clone()
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            # compute intrinsic bonus
            if args.use_tdm:
                tdm.symm_eval = step == args.num_steps - 1
                reward_int = tdm.compute_bonus(obs_old, obs).float()
                reward += beta_func(
                    step + j * args.num_steps) * reward_int.cpu().unsqueeze(1)

                if (j % args.log_interval == 0) and (step
                                                     == args.num_steps - 1):
                    logger.add_reward_intrinsic(reward_int, total_num_steps)

            # saving to buffer.
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # saving to buffer and periodic updating parameters
        if args.use_tdm:
            tdm.buffer_RL_temp.append((rollouts.obs, rollouts.masks))
            if j % args.num_steps == 0 and j > 0:
                tdm.update()

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save every 1-million steps and after the last update
        is_last_update = j == num_updates - 1
        if (total_num_steps % 1e6 == 0
                or is_last_update) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            # The two names used to be swapped: every periodic checkpoint
            # overwrote "_final.pt" and the last one got the step-count name.
            if is_last_update:
                save_here = os.path.join(save_path,
                                         args.env_name + "_final.pt")
            else:
                save_here = os.path.join(
                    save_path, args.env_name +
                    "_step_{}M.pt".format(total_num_steps // 1e6))
            torch.save(save_model, save_here)  # saved policy.

        # printing outputs
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

            logger.add_reward(episode_rewards, total_num_steps)

        # NOTE: periodic evaluation and visdom plotting were commented out in
        # this script and are not performed.

    # training finished: persist the logger contents
    logger.save()
def main(args):
    """Train an A2C/PPO/ACKTR agent and periodically save, log and evaluate.

    Args:
        args: parsed command-line namespace. The fields read here include
            ``log_dir``, ``algo``, ``env_name``, ``seed``, ``num_processes``,
            ``num_steps``, ``num_env_steps``, ``save_dir``, ``save_interval``,
            ``log_interval``, ``eval_interval``, ``eval_render`` and ``vis``.

    Checkpoints are written to ``<save_dir>/<algo>/<env_name>-AvgRwrd<N>.pt``
    where ``N`` is the integer mean of the most recent episode returns (0 when
    no episode has finished yet).
    """
    # Create the training and evaluation log directories; when one already
    # exists, clear the stale monitor files it contains instead.
    eval_log_dir = args.log_dir + "_eval"
    for directory in (args.log_dir, eval_log_dir):
        try:
            os.makedirs(directory)
        except OSError:
            for stale in glob.glob(os.path.join(directory, '*.monitor.csv')):
                os.remove(stale)

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    if args.eval_render:
        render_env = make_vec_envs(args.env_name,
                                   args.seed,
                                   1,
                                   None,
                                   None,
                                   args.add_timestep,
                                   device='cpu',
                                   allow_early_resets=False)

    torch.set_num_threads(1)

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Uses gpu/cuda by default
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # Only if running visdom
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    # Set up actor_critic
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # Set algorithm with actor critic and use to learn
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            # The mean of an empty deque is NaN and int(NaN) raises
            # ValueError. That is reachable at j == 0, when a save is due but
            # no episode may have finished yet, so fall back to 0.
            avg_reward = int(np.mean(episode_rewards)) if episode_rewards else 0

            torch.save(
                save_model,
                os.path.join(
                    save_path,
                    args.env_name + "-AvgRwrd" + str(avg_reward) + ".pt"))
            print("Saving Model")

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # Logs every log_interval updates
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            # Evaluate with the observation statistics gathered in training.
            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            if args.eval_render:
                show_model(render_env, actor_critic)

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
def main():
    """Train an A2C/PPO/ACKTR agent, keeping a rolling window of snapshots.

    Configuration comes from the module-level ``args``; ``num_updates`` and
    ``eval_log_dir`` are also read from module scope (none of them is defined
    inside this function).

    A policy can be warm-started from ``args.load_policy``. Snapshots are
    written to ``<save_dir>/<algo>/<env_name>epoch_<j>.pt`` and only the 100
    most recent ones written by this run are kept. Training and evaluation
    summaries are printed and appended to ``<save_dir>/<algo>/log_info.txt``.
    """
    max_snapshots = 100

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    if args.load_policy is not None:
        # NOTE(review): torch.load unpickles arbitrary objects; only load
        # policy files from a trusted source.
        actor_critic, ob_rms = torch.load(args.load_policy)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(
        maxlen=(args.num_processes if args.num_processes > 10 else 10))

    start = time.time()

    # Paths of the snapshots written by this run, oldest first.
    snapshot_paths = deque()

    try:
        os.makedirs(os.path.join(args.save_dir, args.algo))
    except OSError:
        pass

    # The context manager guarantees the log file is closed even when
    # training aborts with an exception.
    with open(os.path.join(args.save_dir, args.algo, 'log_info.txt'),
              'w') as log_out_file:
        for j in range(num_updates):

            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                if args.algo == "acktr":
                    # use optimizer's learning rate since it's hard-coded in kfac.py
                    update_linear_schedule(agent.optimizer, j, num_updates,
                                           agent.optimizer.lr)
                else:
                    update_linear_schedule(agent.optimizer, j, num_updates,
                                           args.lr)

            if args.algo == 'ppo' and args.use_linear_clip_decay:
                agent.clip_param = args.clip_param * (1 -
                                                      j / float(num_updates))

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = envs.step(action)

                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_prob, value, reward, masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1]).detach()

            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.tau)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

            # save for every interval-th episode or for the last epoch
            if (j % args.save_interval == 0
                    or j == num_updates - 1) and args.save_dir != "":
                save_path = os.path.join(args.save_dir, args.algo)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                # A really ugly way to save a model to CPU
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()

                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None)
                ]

                snapshot_file = os.path.join(
                    save_path, args.env_name + "epoch_{:07d}.pt".format(j))
                torch.save(save_model, snapshot_file)
                snapshot_paths.append(snapshot_file)

                # Drop the oldest snapshot once the window is exceeded. The
                # previous code ran ``rm`` on a ".py" name derived from the
                # save count, which never matched a saved ".pt" file.
                if len(snapshot_paths) > max_snapshots:
                    try:
                        os.remove(snapshot_paths.popleft())
                    except OSError:
                        pass

            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            if j % args.log_interval == 0 and len(episode_rewards) > 1:
                end = time.time()
                log_info = "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".\
                    format(j, total_num_steps,
                           int(total_num_steps / (end - start)),
                           len(episode_rewards),
                           np.mean(episode_rewards),
                           np.median(episode_rewards),
                           np.min(episode_rewards),
                           np.max(episode_rewards), dist_entropy,
                           value_loss, action_loss)
                print(log_info)
                sys.stdout.flush()
                log_out_file.write(log_info)
                log_out_file.flush()

            if (args.eval_interval is not None and len(episode_rewards) > 1
                    and j % args.eval_interval == 0):
                eval_envs = make_vec_envs(args.env_name,
                                          args.seed + args.num_processes,
                                          args.num_processes, args.gamma,
                                          eval_log_dir, args.add_timestep,
                                          device, True)

                # Evaluate with the observation statistics from training.
                vec_norm = get_vec_normalize(eval_envs)
                if vec_norm is not None:
                    vec_norm.eval()
                    vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

                eval_episode_rewards = []

                obs = eval_envs.reset()
                eval_recurrent_hidden_states = torch.zeros(
                    args.num_processes,
                    actor_critic.recurrent_hidden_state_size,
                    device=device)
                eval_masks = torch.zeros(args.num_processes, 1, device=device)

                while len(eval_episode_rewards) < 10:
                    with torch.no_grad():
                        _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                            obs,
                            eval_recurrent_hidden_states,
                            eval_masks,
                            deterministic=True)

                    # Observe reward and next obs
                    obs, reward, done, infos = eval_envs.step(action)

                    eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                    for done_ in done])
                    for info in infos:
                        if 'episode' in info.keys():
                            eval_episode_rewards.append(info['episode']['r'])

                eval_envs.close()

                eval_info = " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards))
                print(eval_info)
                log_out_file.write(eval_info)
                log_out_file.flush()
                sys.stdout.flush()

            if args.vis and j % args.vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs
                    win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                      args.algo, args.num_env_steps)
                except IOError:
                    pass
def main():
    """Train an agent while recording per-update trajectories and rewards.

    On top of the usual A2C/PPO/ACKTR (optionally GAIL) training loop this
    script:

    * collects the observations, actions, ``infos[0]['myrewards']`` values and
      done flags of every update;
    * once more than 40% of the updates have run, accumulates them and, when
      ``args.save_expert`` is set, flushes every 100 accumulated updates to
      ``./data/<env_name>/trajs_<env_name>_<count>.h5``;
    * at each logging step stores all episode returns and the per-update
      reward arrays in ``<save_dir>/<algo>/<env_name>_<seed>.npz`` (skipped
      when ``args.save_dir`` is empty).
    """
    all_episode_rewards = []  # record of every finished episode's return (6/29)
    all_temp_rewards = []  # record of the per-update reward arrays (6/29)

    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('num_updates ', num_updates)
    print('num_steps ', args.num_steps)

    # Output location of the recorded trajectories; ``count`` numbers the
    # h5 files written so far.
    count = 0
    h5_path = './data/' + args.env_name
    if not os.path.exists(h5_path):
        os.makedirs(h5_path)
    h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (count)
    data = {}
    data['states'] = []
    data['actions'] = []
    data['rewards'] = []
    data['done'] = []
    data['lengths'] = []

    # Resolve the checkpoint directory once, up front. It used to be assigned
    # only inside the checkpoint branch, so the np.savez call in the logging
    # branch hit a NameError whenever save_dir was empty.
    save_path = os.path.join(args.save_dir, args.algo)
    if args.save_dir != "":
        try:
            os.makedirs(save_path)
        except OSError:
            pass

    for j in range(num_updates):
        # per-update buffers, one entry per environment step (num-steps)
        temp_states = []
        temp_actions = []
        temp_rewards = []
        temp_done = []

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # One-off shape dump of the policy inputs/outputs.
            if j == 0 and step == 0:
                print('obs ', type(rollouts.obs[step]),
                      rollouts.obs[step].shape)
                print('hidden_states ',
                      type(rollouts.recurrent_hidden_states[step]),
                      rollouts.recurrent_hidden_states[step].shape)
                print('action ', type(action), action.shape)
                print('action prob ', type(action_log_prob),
                      action_log_prob.shape)
                print('-' * 20)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            temp_states += [np.array(rollouts.obs[step].cpu())]
            temp_actions += [np.array(action.cpu())]
            # For halfcheetah the returned reward cannot be used directly
            # (6/29), so the value reported in the first env's info is
            # recorded instead.
            temp_rewards += [np.array([infos[0]['myrewards']])]
            temp_done += [np.array(done)]

            # One-off shape dump of the environment outputs.
            if j == 0 and step == 0:
                print('obs ', type(obs), obs.shape)
                print('reward ', type(reward), reward.shape)
                print('done ', type(done), done.shape)
                print('infos ', len(infos))
                for k, v in infos[0].items():
                    print(k, v.shape)
                print()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    all_episode_rewards += [info['episode']['r']]  # record 6/29

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        temp_lengths = len(temp_states)
        temp_states = np.concatenate(temp_states)
        temp_actions = np.concatenate(temp_actions)
        temp_rewards = np.concatenate(temp_rewards)
        temp_done = np.concatenate(temp_done)

        # Only the updates after the first 40% of training are accumulated.
        if j > int(0.4 * num_updates):
            data['states'] += [temp_states]
            data['actions'] += [temp_actions]
            data['rewards'] += [temp_rewards]
            data['lengths'] += [temp_lengths]
            data['done'] += [temp_done]

            if args.save_expert and len(data['states']) >= 100:
                with h5py.File(h5_filename, 'w') as f:
                    f['states'] = np.array(data['states'])
                    f['actions'] = np.array(data['actions'])
                    f['rewards'] = np.array(data['rewards'])
                    f['done'] = np.array(data['done'])
                    f['lengths'] = np.array(data['lengths'])

                # Move on to the next file and start a fresh batch.
                count += 1
                h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (
                    count)
                data['states'] = []
                data['actions'] = []
                data['rewards'] = []
                data['done'] = []
                data['lengths'] = []

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace the stored rewards with the discriminator's prediction.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + "_%d.pt" % (args.seed)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            print("temp rewards size", temp_rewards.shape, "mean",
                  np.mean(temp_rewards), "min", np.min(temp_rewards), "max",
                  np.max(temp_rewards))
            all_temp_rewards += [temp_rewards]
            # Save the records (6/29); nothing is written when saving is
            # disabled through an empty save_dir.
            if args.save_dir != "":
                np.savez(os.path.join(save_path,
                                      args.env_name + "_%d" % (args.seed)),
                         episode=all_episode_rewards,
                         timestep=all_temp_rewards)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    # NOTE(review): the original continued with a triple-quoted block of
    # disabled code that is cut off in this view and never closed:
    #     '''data['states'] = np.array(data['states'])
    # It is kept here only as a comment; confirm against the full file.
def main():
    """Train an agent on the flat discrete "lift" task with TensorBoard logs.

    Scalars are written under ``logs/<save_name>``. Whenever the mean of the
    recent episode returns (window of 100) improves on the best value seen so
    far, the policy is saved to ``<save_dir>/<algo>/<save_name>.pt``.
    """
    args = get_args()
    writer = SummaryWriter(os.path.join('logs', args.save_name), )
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(
        basic_env.BasicFlatDiscreteEnv,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        task='lift',
        gripper_type='RobotiqThreeFingerDexterousGripper',
        robot='Panda',
        controller='JOINT_TORQUE' if args.vel else 'JOINT_POSITION',
        horizon=1000,
        reward_shaping=True)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base=Surreal,
        # base=OpenAI,
        # base=MLP_ATTN,
        base_kwargs={
            'recurrent':
            args.recurrent_policy,
            # 'dims': basic_env.BasicFlatEnv().modality_dims
            'config':
            dict(act='relu' if args.relu else 'tanh',
                 rec=args.rec,
                 fc=args.fc)
        })
    print(actor_critic)
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        # Only drop the last partial batch when the dataset holds more than
        # one batch.
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    # Start below every possible mean return. With the former initial value
    # of 0 a task with negative returns never produced a checkpoint.
    best_reward = float('-inf')
    steps_per_update = args.num_processes * args.num_steps

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
        # Log against the environment steps taken so far; without an explicit
        # step every learning-rate point is written at the same position.
        writer.add_scalar('lr', agent.optimizer.param_groups[0]['lr'],
                          j * steps_per_update)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace the stored rewards with the discriminator's prediction.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        end = time.time()
        total_num_steps = (j + 1) * steps_per_update
        if len(episode_rewards) > 1:
            writer.add_scalar('loss/value', value_loss, total_num_steps)
            writer.add_scalar('loss/policy', action_loss, total_num_steps)
            writer.add_scalar('experiment/num_updates', j, total_num_steps)
            writer.add_scalar('experiment/FPS',
                              int(total_num_steps / (end - start)),
                              total_num_steps)
            writer.add_scalar('experiment/EPISODE MEAN',
                              np.mean(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MEDIAN',
                              np.median(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MIN',
                              np.min(episode_rewards), total_num_steps)
            # Tag was misspelled 'EPSIDOE MAX', unlike its three siblings.
            writer.add_scalar('experiment/EPISODE MAX',
                              np.max(episode_rewards), total_num_steps)

        rollouts.after_update()

        # save whenever the running mean return reaches a new best
        if len(episode_rewards) > 1 and args.save_dir != "":
            rew = np.mean(episode_rewards)
            if rew > best_reward:
                best_reward = rew
                print('saved with best reward', rew)

                save_path = os.path.join(args.save_dir, args.algo)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
                ], os.path.join(save_path, args.save_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # Same None-tolerant lookup as the checkpoint code above, so an
            # environment stack without a normalizer does not crash here.
            obs_rms = getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
            evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
    writer.close()
def get_agent(agent_name, args, obs_space, input_structure, act_space, save_dir, n_ref=0, is_ref=False):
    """Build a policy network and the training algorithm wrapped around it.

    The policy variant is chosen from ``args.use_attention`` /
    ``args.use_linear`` (falling back to the default ``Policy`` base), and the
    algorithm is chosen from ``args.algo``.

    Args:
        agent_name: Forwarded as the first argument of ``algo.PPO``; unused by
            the other algorithms.
        args: Namespace-like object holding the hyper-parameters read below
            (``algo``, ``lr``, ``eps``, ``value_loss_coef``, ...).
        obs_space: Observation space; only its ``shape`` is read.
        input_structure: Forwarded to the attention base as
            ``input_structure``; only used when ``args.use_attention`` is set.
        act_space: Action space handed to ``Policy`` unchanged.
        save_dir: Forwarded to ``algo.PPO`` and ``algo.LoadedDiCE``.
        n_ref: Sizes the default base's ``critic_dim`` as ``n_ref * 2 + 1``.
        is_ref: Forwarded to the default policy base and to ``algo.PPO``.

    Returns:
        A ``(actor_critic, agent)`` tuple.

    Raises:
        ValueError: If ``args.algo`` is not one of ``'a2c'``, ``'ppo'``,
            ``'acktr'``, ``'loaded-dice'`` or ``'hessian'``. The policy has
            already been constructed by the time this is raised.
    """
    # Resolved at call time rather than at module import time.
    from a2c_ppo_acktr import algo
    from a2c_ppo_acktr.model import Policy, AttentionBase, LinearBase

    # ---- policy network -------------------------------------------------
    if args.use_attention:
        obs_shape = obs_space.shape
        attention_kwargs = {
            'recurrent': args.recurrent_policy,
            'input_structure': input_structure,
        }
        policy = Policy(obs_shape, act_space, AttentionBase,
                        base_kwargs=attention_kwargs)
    elif args.use_linear:
        policy = Policy(obs_space.shape, act_space, LinearBase)
    else:
        obs_shape = obs_space.shape
        output_activation = args.action_activation
        default_kwargs = {
            'recurrent': args.recurrent_policy,
            'critic_dim': n_ref * 2 + 1,
            'is_ref': is_ref,
            'predict_reward': args.use_reward_predictor,
            'timestep_mask': args.use_timestep_mask,
            "rnd": args.use_rnd,
            'hidden_size': args.hidden_size,
            'activation': args.activation,
        }
        policy = Policy(obs_shape, act_space,
                        action_activation=output_activation,
                        base_kwargs=default_kwargs)

    # ---- training algorithm ---------------------------------------------
    algo_name = args.algo

    if algo_name == 'a2c':
        return policy, algo.A2C_ACKTR(
            policy,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm,
        )

    if algo_name == 'ppo':
        return policy, algo.PPO(
            agent_name,
            policy,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            clip_grad_norm=not args.no_grad_norm_clip,
            task=args.task,
            direction=args.direction,
            save_dir=save_dir,
            args=args,
            is_ref=is_ref,
        )

    if algo_name == 'acktr':
        return policy, algo.A2C_ACKTR(
            policy,
            args.value_loss_coef,
            args.entropy_coef,
            acktr=True,
        )

    if algo_name == 'loaded-dice':
        return policy, algo.LoadedDiCE(
            policy,
            args.dice_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            args.dice_lambda,
            args.episode_steps,
            args.dice_task,
            lr=args.lr,
            eps=args.eps,
            save_dir=save_dir,
        )

    if algo_name == 'hessian':
        return policy, algo.Hessian(
            policy,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            clip_grad_norm=not args.no_grad_norm_clip,
            task=args.task,
            direction=args.direction,
            args=args,
        )

    raise ValueError("algo {} not supported".format(algo_name))