def __init__(self, envs, args):
    self.envs = envs
    self.args = args

    obs_shape = self.envs.observation_space.shape
    self.obs_shape = (obs_shape[0] * self.args.num_stack, *obs_shape[1:])

    self.actor_critic = self.select_network()
    self.optimizer = self.select_optimizer()
    if self.args.cuda:
        self.actor_critic.cuda()

    self.action_shape = 1 if self.envs.action_space.__class__.__name__ == "Discrete" \
        else self.envs.action_space.shape[0]

    self.current_obs = torch.zeros(self.args.num_processes, *self.obs_shape)
    obs = self.envs.reset()
    self.update_current_obs(obs)

    self.rollouts = RolloutStorage(self.args.num_steps, self.args.num_processes,
                                   self.obs_shape, self.envs.action_space,
                                   self.actor_critic.state_size)
    self.rollouts.observations[0].copy_(self.current_obs)

    # These variables are used to compute average rewards for all processes.
    self.episode_rewards = torch.zeros([self.args.num_processes, 1])
    self.final_rewards = torch.zeros([self.args.num_processes, 1])

    if self.args.cuda:
        self.current_obs = self.current_obs.cuda()
        self.rollouts.cuda()

    if self.args.vis:
        from visdom import Visdom
        self.viz = Visdom(port=args.port)
        self.win = None
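# Illustrative sketch, not from the original repository: the constructor above calls
# self.update_current_obs(obs) without showing it. Based on the equivalent closures that
# appear in the main() snippets later in this collection, a minimal standalone version of
# that frame-stacking helper could look like this; the function and argument names are
# placeholders.
import numpy as np
import torch


def update_current_obs(obs, current_obs, raw_obs_dim0, num_stack):
    """Shift the stacked-frame buffer and write the newest observation into the last slot.

    current_obs is assumed to have shape (num_processes, raw_obs_dim0 * num_stack, ...).
    """
    obs = torch.from_numpy(np.asarray(obs)).float()
    if num_stack > 1:
        # drop the oldest frame, keep the most recent (num_stack - 1) frames
        current_obs[:, :-raw_obs_dim0] = current_obs[:, raw_obs_dim0:]
    current_obs[:, -raw_obs_dim0:] = obs
    return current_obs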
def __init__(self, hparams):
    self.obs_shape = hparams['obs_shape']
    self.n_actions = hparams['n_actions']
    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']

    self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions)  # .cuda()

    # Storing rollouts
    self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                   self.obs_shape, self.n_actions)

    # if self.cuda:
    self.actor_critic.cuda()
    self.rollouts.cuda()

    self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                lr=hparams['lr'], eps=hparams['eps'])

    self.hparams = hparams
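# Illustrative only: the constructor above reads a fixed set of keys from `hparams`.
# A configuration dict with those keys could look like the following; the values are
# placeholders, not the settings used by the original experiments.
hparams = {
    'obs_shape': (4, 84, 84),      # stacked frames, height, width
    'n_actions': 6,
    'use_gae': True,
    'gamma': 0.99,
    'tau': 0.95,
    'num_steps': 5,
    'num_processes': 16,
    'value_loss_coef': 0.5,
    'entropy_coef': 0.01,
    'cuda': True,
    'opt': 'adam',
    'grad_clip': 0.5,
    'lr': 7e-4,
    'eps': 1e-5,
}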
def __init__(self, net, env, params, is_cuda=True, seed=42, log_dir=abspath("/data/patrik")):
    super().__init__()

    # constants
    self.timestamp = strftime("%Y-%m-%d %H_%M_%S", gmtime())
    self.seed = seed
    self.is_cuda = torch.cuda.is_available() and is_cuda

    # parameters
    self.params = params

    """Logger"""
    self.logger = TemporalLogger(self.params.env_name, self.timestamp, log_dir,
                                 *["rewards", "features"])
    self.checkpointer = AgentCheckpointer(self.params.env_name, self.params.num_updates,
                                          self.timestamp)

    """Environment"""
    self.env = env
    self.storage = RolloutStorage(self.params.rollout_size, self.params.num_envs,
                                  self.env.observation_space.shape[0:-1],
                                  self.params.n_stack, is_cuda=self.is_cuda)

    """Network"""
    self.net = net
    if self.is_cuda:
        self.net = self.net.cuda()
def __init__(self, args):
    self.args = args
    self.device = torch.device('cuda') if args.cuda else torch.device('cpu')

    dummy_env = gym.make(self.args.env_name)
    self.actor = ACNet(dummy_env.action_space.n, args.feedforward)
    del dummy_env

    if args.load_dir is not None:
        actorState = torch.load(args.load_dir, map_location=lambda storage, loc: storage)
        if args.continue_training:
            self.actor.load_state_dict(actorState)
            print("Loaded pretrained model successfully")
        if args.transfer:
            self.actor.load_autoturn_model(actorState)

    if args.cuda:
        self.actor.cuda()
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.lr)

    self.env_list = [make_env(self.args.env_name, self.args.seed, i)
                     for i in range(self.args.num_processes)]
    if self.args.num_processes > 1:
        self.envs = gym_vecenv.SubprocVecEnv(self.env_list)
    else:
        self.envs = gym_vecenv.DummyVecEnv(self.env_list)
    if len(self.envs.observation_space.shape) == 1:
        self.envs = gym_vecenv.VecNormalize(self.envs)

    self.obs_shape = self.envs.observation_space.shape
    self.obs_shape = (self.obs_shape[0] * args.num_stack, *self.obs_shape[1:])
    self.state_shape = 1 if args.feedforward else 256

    self.rollouts = RolloutStorage(self.args.num_fwd_steps, self.args.num_processes,
                                   self.obs_shape, self.envs.action_space, self.state_shape)
    self.num_updates = int(args.num_frames) // args.num_fwd_steps // args.num_processes
    self.current_obs = torch.zeros(self.args.num_processes, *self.obs_shape)

    self.writer = SummaryWriter(log_dir=self.args.save_dir)
    self.fortress_threshold = 650
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.actor_optimizer, mode='max', factor=0.2, patience=15, verbose=True,
        threshold=1e-3, threshold_mode='rel')
def reset_envs(storage_length):
    rollouts = RolloutStorage(num_tasks, storage_length, num_processes_per_task,
                              obs_shape, envs.action_space, loss)
    current_obs = torch.zeros(num_tasks, num_processes_per_task, *obs_shape)
    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, num_stack, num_tasks,
                       num_processes_per_task)
    for task in range(num_tasks):
        rollouts.obs[task, 0].copy_(current_obs[task])
    if cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_tasks, num_processes_per_task, 1])
    final_rewards = torch.zeros([num_tasks, num_processes_per_task, 1])
    episode_length = torch.zeros([num_tasks, num_processes_per_task, 1])
    final_length = torch.zeros([num_tasks, num_processes_per_task, 1])
    episode_terminations = torch.zeros([num_tasks, num_processes_per_task, 1])
    final_terminations = torch.zeros([num_tasks, num_processes_per_task, 1])
    master_terminations = torch.zeros([num_tasks, num_processes_per_task, 1])
    final_master_terminations = torch.zeros([num_tasks, num_processes_per_task, 1])

    return (rollouts, current_obs, episode_rewards, final_rewards,
            episode_length, final_length, episode_terminations,
            final_terminations, master_terminations, final_master_terminations)
def __init__(self, hparams):
    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']
    # self.next_state_pred_ = hparams['next_state_pred_']

    # Policy and Value network
    # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
    #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], hparams['action_space'])
    # else:
    self.actor_critic = CNNPolicy(self.obs_shape[0], hparams['action_space'],
                                  hparams['n_contexts'])

    # Storing rollouts
    self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                   self.obs_shape, hparams['action_space'])

    if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

    # Optimizer
    if self.opt == 'rms':
        self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'], eps=hparams['eps'],
                                       alpha=hparams['alpha'])
    elif self.opt == 'adam':
        self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                    lr=hparams['lr'], eps=hparams['eps'])
    elif self.opt == 'sgd':
        self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                   lr=hparams['lr'], momentum=hparams['mom'])
    else:
        print('no opt specified')

    self.action_shape = 1

    if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams['grad_var_']:
        self.rollouts_list = RolloutStorage_list()

    self.hparams = hparams
def initialize(self, args, obs_shape, action_space, num_training_per_episode):
    self._optimizer = optim.Adam(self._actor_critic.parameters(), args.lr, eps=args.eps)
    self._rollout = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                                   action_space, self._actor_critic.state_size,
                                   num_training_per_episode)
def __init__(self, envs, hparams):
    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']

    if hparams['dropout'] == True:
        print('CNNPolicy_dropout2')
        actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # actor_critic = CNNPolicy_dropout(self.obs_shape[0], envs.action_space)
    elif len(envs.observation_space.shape) == 3:
        print('CNNPolicy2')
        actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    self.action_shape = action_shape

    rollouts = RolloutStorage(self.num_steps, self.num_processes,
                              self.obs_shape, envs.action_space)
    # it has a self.state that is [steps, processes, obs]
    # steps is used to compute expected reward

    if self.cuda:
        actor_critic.cuda()
        rollouts.cuda()

    if self.opt == 'rms':
        self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'],
                                       eps=hparams['eps'], alpha=hparams['alpha'])
    elif self.opt == 'adam':
        self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'],
                                    eps=hparams['eps'])
    elif self.opt == 'sgd':
        self.optimizer = optim.SGD(params=actor_critic.parameters(), lr=hparams['lr'],
                                   momentum=hparams['mom'])
    else:
        print('no opt specified')

    self.actor_critic = actor_critic
    self.rollouts = rollouts
    self.rollouts_list = RolloutStorage_list()
def __init__(self, envs, hparams):
    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.ppo_epoch = hparams['ppo_epoch']
    self.batch_size = hparams['batch_size']
    self.clip_param = hparams['clip_param']

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    self.action_shape = action_shape

    rollouts = RolloutStorage(self.num_steps, self.num_processes,
                              self.obs_shape, envs.action_space)
    # it has a self.state that is [steps, processes, obs]
    # steps is used to compute expected reward

    if self.cuda:
        actor_critic.cuda()
        rollouts.cuda()

    self.eps = hparams['eps']
    # self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'],
    #                                eps=hparams['eps'], alpha=hparams['alpha'])
    self.optimizer = optim.Adam(params=actor_critic.parameters(),
                                lr=hparams['lr'], eps=hparams['eps'])

    # if hparams['lr_schedule'] == 'linear':
    self.init_lr = hparams['lr']
    self.final_lr = hparams['final_lr']
    # lr_func = lambda epoch: max(init_lr * (1. - (epoch / 500.)), final_lr)
    # self.optimizer2 = lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lr_func)
    # self.current_lr = hparams['lr']

    self.actor_critic = actor_critic
    self.rollouts = rollouts
    self.old_model = copy.deepcopy(self.actor_critic)
def __init__(self, trial_context: PyTorchTrialContext) -> None:
    self.context = trial_context
    self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
    # self.logger = TorchWriter()

    self.n_stack = self.context.get_hparam("n_stack")
    self.env_name = self.context.get_hparam("env_name")
    self.num_envs = self.context.get_hparam("num_envs")
    self.rollout_size = self.context.get_hparam("rollout_size")
    self.curiousity = self.context.get_hparam("curiousity")
    self.lr = self.context.get_hparam("lr")
    self.icm_beta = self.context.get_hparam("icm_beta")
    self.value_coeff = self.context.get_hparam("value_coeff")
    self.entropy_coeff = self.context.get_hparam("entropy_coeff")
    self.max_grad_norm = self.context.get_hparam("max_grad_norm")

    env = make_atari_env(self.env_name, num_env=self.num_envs, seed=42)
    self.env = VecFrameStack(env, n_stack=self.n_stack)
    eval_env = make_atari_env(self.env_name, num_env=1, seed=42)
    self.eval_env = VecFrameStack(eval_env, n_stack=self.n_stack)

    # constants
    self.in_size = self.context.get_hparam("in_size")  # in_size
    self.num_actions = env.action_space.n

    def init_(m):
        return init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))

    self.feat_enc_net = self.context.Model(FeatureEncoderNet(self.n_stack, self.in_size))
    self.actor = self.context.Model(
        init_(nn.Linear(self.feat_enc_net.hidden_size, self.num_actions)))
    self.critic = self.context.Model(
        init_(nn.Linear(self.feat_enc_net.hidden_size, 1)))
    self.set_recurrent_buffers(self.num_envs)

    params = list(self.feat_enc_net.parameters()) + list(self.actor.parameters()) + \
        list(self.critic.parameters())
    self.opt = self.context.Optimizer(torch.optim.Adam(params, self.lr))

    self.is_cuda = torch.cuda.is_available()
    self.storage = RolloutStorage(self.rollout_size, self.num_envs,
                                  self.env.observation_space.shape[0:-1], self.n_stack,
                                  is_cuda=self.is_cuda, value_coeff=self.value_coeff,
                                  entropy_coeff=self.entropy_coeff)

    obs = self.env.reset()
    self.storage.states[0].copy_(self.storage.obs2tensor(obs))

    self.writer = SummaryWriter(log_dir="/tmp/tensorboard")
    self.global_eval_count = 0
def __init__(self, net, env, num_envs, n_stack, rollout_size=5, num_updates=2500000,
             max_grad_norm=0.5, value_coeff=0.5, entropy_coeff=0.02,
             tensorboard_log=False, log_path="./log", is_cuda=True, seed=42):
    super().__init__()

    # constants
    self.num_envs = num_envs
    self.rollout_size = rollout_size
    self.num_updates = num_updates
    self.n_stack = n_stack
    self.seed = seed
    self.max_grad_norm = max_grad_norm
    # loss scaling coefficients
    self.is_cuda = torch.cuda.is_available() and is_cuda

    # objects
    """Tensorboard logger"""
    self.writer = SummaryWriter(comment="statistics", log_dir=log_path) \
        if tensorboard_log else None

    """Environment"""
    self.env = env
    self.storage = RolloutStorage(self.rollout_size, self.num_envs,
                                  self.env.observation_space.shape[0:-1], self.n_stack,
                                  is_cuda=self.is_cuda, value_coeff=value_coeff,
                                  entropy_coeff=entropy_coeff, writer=self.writer)

    """Network"""
    self.net = net
    self.net.a2c.writer = self.writer
    if self.is_cuda:
        self.net = self.net.cuda()
def __init__(self, args):
    self.args = args
    self.device = torch.device('cuda') if args.cuda and torch.cuda.is_available() \
        else torch.device('cpu')

    if self.args.env_name == 'ant':
        from rllab.envs.mujoco.ant_env import AntEnv
        env = AntEnv()
        # set the target velocity direction (for learning sub-policies)
        env.velocity_dir = self.args.velocity_dir
        env.penalty = self.args.penalty
        # use gym environment observation
        env.use_gym_obs = self.args.use_gym_obs
        # use gym environment reward
        env.use_gym_reward = self.args.use_gym_reward
    elif self.args.env_name == 'swimmer':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = SwimmerEnv()
        env.velocity_dir = self.args.velocity_dir
    else:
        raise NotImplementedError

    self.env = normalize(env)
    self.reset_env()
    self.obs_shape = self.env.observation_space.shape

    self.actor_critic = self.select_network().to(self.device)
    self.optimizer = self.select_optimizer()

    # list of RolloutStorage objects
    self.episodes_rollout = []
    # concatenation of all episodes' rollouts
    self.rollouts = RolloutStorage(self.device)

    # this directory is used for tensorboardX only
    self.writer = SummaryWriter(args.log_dir + self.args.velocity_dir)
    self.episodes = 0
    self.episode_steps = []
    self.train_rewards = []
def __init__(self, envs, cuda, num_steps, num_processes, obs_shape, lr, eps, alpha,
             use_gae, gamma, tau, value_loss_coef, entropy_coef):
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if cuda:
        actor_critic.cuda()

    # if args.algo == 'a2c':
    # pass eps and alpha by keyword: optim.RMSprop's positional order is
    # (params, lr, alpha, eps, ...), so positional eps/alpha would be swapped
    self.optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space)
    # it has a self.state that is [steps, processes, obs]
    # steps is used to compute expected reward
    if cuda:
        rollouts.cuda()

    self.actor_critic = actor_critic
    self.rollouts = rollouts
    self.use_gae = use_gae
    self.gamma = gamma
    self.tau = tau
    self.obs_shape = obs_shape
    self.action_shape = action_shape
    self.num_steps = num_steps
    self.num_processes = num_processes
    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef
def initialize(self, args, obs_shape, action_space, num_training_per_episode,
               num_episodes, total_steps, num_epoch, optimizer_state_dict,
               num_steps, uniform_v, uniform_v_prior):
    params = self._actor_critic.parameters()
    self._optimizer = optim.Adam(params, lr=args.lr, eps=args.eps)
    if optimizer_state_dict:
        self._optimizer.load_state_dict(optimizer_state_dict)
    if args.use_lr_scheduler:
        self._scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self._optimizer, mode='min', verbose=True)
    self._rollout = RolloutStorage(num_steps, args.num_processes, obs_shape, action_space,
                                   self._actor_critic.state_size, num_training_per_episode)
    self.num_episodes = num_episodes
    self.total_steps = total_steps
    self.num_epoch = num_epoch
    self.uniform_v = uniform_v
    self.uniform_v_prior = uniform_v_prior
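# Illustrative only: ReduceLROnPlateau, as created above, changes the learning rate only
# when it is stepped with a monitored metric. A self-contained sketch of that pattern
# (the model, data, and loss below are placeholders, not part of the original agent):
import torch
import torch.optim as optim

model = torch.nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-3, eps=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

for epoch in range(10):
    loss = model(torch.randn(8, 4)).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # the scheduler reacts only to the monitored quantity, e.g. a validation or value loss
    scheduler.step(loss.item())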
def rollout_episode(self, test=False, render=False):
    rollout = RolloutStorage(self.device)
    self.reset_env()
    step = 0
    done = False

    while not done:
        step += 1
        with torch.no_grad():
            value, action, action_logprob = self.actor_critic.act(
                self.current_obs, deterministic=test is True)
        cpu_actions = action.data.squeeze(1).cpu().numpy()[0]

        next_obs, reward, done, info = self.env.step(cpu_actions)
        next_obs = torch.Tensor(next_obs).view(1, -1).to(self.device)
        if render:
            self.env.render()

        # a constant reward scaling factor can be introduced to stabilise training
        # and prevent large value losses
        r = reward * self.args.reward_scale

        done = done or step == self.args.episode_max_length
        mask = 1.0 if not done else 0.0
        rollout.insert(self.current_obs, action.data, r, value.data,
                       action_logprob.data, mask)
        self.current_obs.copy_(next_obs)

    if not test:
        next_value = self.actor_critic(self.current_obs)[0].data
        rollout.compute_returns(next_value, self.args.use_gae, self.args.gamma,
                                self.args.tau)

    self.episode_steps.append(step)
    return rollout
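# Illustrative only: rollout.compute_returns(next_value, use_gae, gamma, tau) is assumed
# here to fill a returns buffer either with plain discounted returns or with GAE-style
# returns. A standalone sketch of that computation (a common formulation; the actual
# RolloutStorage implementations referenced by these snippets may lay out their buffers
# differently):
import torch


def compute_returns(rewards, values, masks, next_value, use_gae, gamma, tau):
    """rewards, values, masks: [T, N, 1]; next_value: [N, 1];
    masks[t] is 0 where the episode ended at step t, else 1."""
    T = rewards.size(0)
    returns = torch.zeros(T + 1, *rewards.size()[1:])
    if use_gae:
        values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
        gae = torch.zeros_like(next_value)
        for t in reversed(range(T)):
            delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
            gae = delta + gamma * tau * masks[t] * gae
            returns[t] = gae + values[t]
    else:
        returns[-1] = next_value
        for t in reversed(range(T)):
            returns[t] = rewards[t] + gamma * returns[t + 1] * masks[t]
    return returns[:-1]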
def __init__(
    self,
    agent_id,
    obs_space,
    action_space,
    lr,
    adam_eps,
    recurrent_policy,
    num_steps,
    num_processes,
    device,
):
    self.agent_id = agent_id
    self.obs_size = flatdim(obs_space)
    self.action_size = flatdim(action_space)
    self.obs_space = obs_space
    self.action_space = action_space

    self.model = Policy(
        obs_space,
        action_space,
        base_kwargs={"recurrent": recurrent_policy},
    )

    self.storage = RolloutStorage(
        obs_space,
        action_space,
        self.model.recurrent_hidden_state_size,
        num_steps,
        num_processes,
    )

    self.model.to(device)
    self.optimizer = optim.Adam(self.model.parameters(), lr, eps=adam_eps)
    # self.intr_stats = RunningStats()
    self.saveables = {
        "model": self.model,
        "optimizer": self.optimizer,
    }
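# Illustrative only: flatdim() from gym.spaces reduces a (possibly structured) space to a
# flat dimension, which is what obs_size and action_size above hold. A small runnable
# example with placeholder spaces:
from gym.spaces import Box, Discrete
from gym.spaces.utils import flatdim

obs_space = Box(low=-1.0, high=1.0, shape=(3, 4))
action_space = Discrete(5)
print(flatdim(obs_space), flatdim(action_space))  # 12 5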
def get_storage(env, num_steps, num_processes, obs_shape, action_space):
    rollouts = RolloutStorage(num_steps, num_processes, obs_shape, action_space)
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = env.reset()
    current_obs = update_current_obs(obs, current_obs, env)
    rollouts.observations[0].copy_(current_obs)

    storage = {
        'rollouts': rollouts,
        'episode_rewards': episode_rewards,
        'final_rewards': final_rewards,
        'current_obs': current_obs
    }
    return storage
elif args.algo == 'ppo':
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                     args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps,
                     max_grad_norm=args.max_grad_norm)
elif args.algo == 'acktr':
    agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef,
                           acktr=True)

rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                          env.robot_dict[args.env_name].action_space,
                          actor_critic.state_size)
current_obs = torch.zeros(args.num_processes, *obs_shape)

def update_current_obs(obs):
    shape_dim0 = env.robot_dict[args.env_name].observation_space.shape[0]
    obs = torch.from_numpy(obs).float()
    if args.num_stack > 1:
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs

obs = env.reset()
update_current_obs(obs)
class a2c(object):

    def __init__(self, envs, hparams):
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        # self.next_state_pred_ = hparams['next_state_pred_']

        # Policy and Value network
        # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
        # else:
        self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(),
                                           lr=hparams['lr'], eps=hparams['eps'],
                                           alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.action_shape = 1

        if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams['grad_var_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, next_state_pred, done=None):
        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)
        # self.rollouts.insert_state_pred(next_state_pred)
        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * action_log_probs).mean()

        cost = (action_loss + value_loss * self.value_loss_coef
                - dist_entropy.mean() * self.entropy_coef)  # *10.
        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()
    # avg empowerment rather than avg error
    def update2(self, discrim_error, discrim_error_reverse):
        # discrim_error: [S,P]
        discrim_error_reverse = discrim_error_reverse.view(self.num_steps,
                                                           self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)  # [S,P,1]
        discrim_error = discrim_error.view(self.num_steps, self.num_processes, 1)

        # val_to_maximize = (-discrim_error + discrim_error_reverse)/2. - action_log_probs.detach()  # [S,P,1]
        val_to_maximize = -discrim_error - action_log_probs.detach()  # [S,P,1]
        val_to_maximize = val_to_maximize.view(self.num_steps, self.num_processes)  # [S,P]

        discrim_error_unmodified = val_to_maximize.data.clone()
        discrim_error = val_to_maximize.data
        # self.returns[-1] = next_value
        divide_by = torch.ones(self.num_processes).cuda()
        for step in reversed(range(discrim_error.size(0) - 1)):
            divide_by += 1
            ttmp = discrim_error_unmodified[step + 1] * self.gamma * \
                torch.squeeze(self.rollouts.masks[step + 1])
            discrim_error_unmodified[step] = ttmp + discrim_error_unmodified[step]
            discrim_error[step] = discrim_error_unmodified[step] / divide_by
            divide_by = divide_by * torch.squeeze(self.rollouts.masks[step + 1])
        val_to_maximize = Variable(discrim_error.view(self.num_steps, self.num_processes, 1))

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        # val_to_maximize = (-discrim_error + discrim_error_reverse.detach())/2. - action_log_probs.detach()
        baseline = torch.mean(val_to_maximize)
        advantages = val_to_maximize - baseline  # - values
        # value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * action_log_probs).mean()

        # cost = action_loss - dist_entropy.mean()*self.entropy_coef + value_loss*self.value_loss_coef
        cost = action_loss

        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor "
          "(see envs.py) or visdom plot to get true rewards")
    print("#######")
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))
            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()
                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()
                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()
                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()
            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages,
                                                                  args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages,
                                                                     args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model,
                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, "
                  "policy loss {:.5f}".format(
                      j, total_num_steps, int(total_num_steps / (end - start)),
                      final_rewards.mean(), final_rewards.median(),
                      final_rewards.min(), final_rewards.max(),
                      dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
            except IOError:
                pass
def __init__(self, envs, hparams):
    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']

    # Policy and Value network
    # if hparams['dropout'] == True:
    #     print('CNNPolicy_dropout2')
    #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
    # elif len(envs.observation_space.shape) == 3:
    #     print('CNNPolicy2')
    #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
    # else:
    #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)
    if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0],
                                                             envs.action_space)
    else:
        self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

    # Storing rollouts
    self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                   self.obs_shape, envs.action_space)

    if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

    # Optimizer
    if self.opt == 'rms':
        self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'], eps=hparams['eps'],
                                       alpha=hparams['alpha'])
    elif self.opt == 'adam':
        self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                    lr=hparams['lr'], eps=hparams['eps'])
    elif self.opt == 'sgd':
        self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                   lr=hparams['lr'], momentum=hparams['mom'])
    else:
        print('no opt specified')

    # if envs.action_space.__class__.__name__ == "Discrete":
    #     action_shape = 1
    # else:
    #     action_shape = envs.action_space.shape[0]
    # self.action_shape = action_shape
    self.action_shape = 1
    # if __:
    #     self.deterministic_action = 0
    # else:
    #     self.deterministic_action = 0

    if hparams['gif_'] or hparams['ls_']:
        self.rollouts_list = RolloutStorage_list()

    self.hparams = hparams
def main():
    torch.set_num_threads(1)
    if args.vis:
        summary_writer = tf.summary.FileWriter(args.save_dir)

    envs = [make_env(i, args=args) for i in range(args.num_processes)]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1 and args.env_name not in ['OverCooked']:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    def get_onehot(num_class, action):
        one_hot = np.zeros(num_class)
        one_hot[action] = 1
        one_hot = torch.from_numpy(one_hot).float()
        return one_hot

    if args.policy_type == 'shared_policy':
        actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]

        if args.cuda:
            actor_critic.cuda()

        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef,
                                   lr=args.lr, eps=args.eps, alpha=args.alpha,
                                   max_grad_norm=args.max_grad_norm)
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                             args.num_mini_batch, args.value_loss_coef, args.entropy_coef,
                             lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef,
                                   acktr=True)

        rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                                  envs.action_space, actor_critic.state_size)
        current_obs = torch.zeros(args.num_processes, *obs_shape)

        obs = envs.reset()
        update_current_obs(obs)
        rollouts.observations[0].copy_(current_obs)

        episode_reward_raw = 0.0
        final_reward_raw = 0.0

        if args.cuda:
            current_obs = current_obs.cuda()
            rollouts.cuda()

        # try to load checkpoint
        try:
            num_trained_frames = np.load(args.save_dir + '/num_trained_frames.npy')[0]
            try:
                actor_critic.load_state_dict(
                    torch.load(args.save_dir + '/trained_learner.pth'))
                print('Load learner previous point: Succeeded')
            except Exception as e:
                print('Load learner previous point: Failed')
        except Exception as e:
            num_trained_frames = 0
        print('Learner has been trained to step: ' + str(num_trained_frames))

        start = time.time()
        j = 0
        while True:
            if num_trained_frames > args.num_frames:
                break

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, states = actor_critic.act(
                        rollouts.observations[step], rollouts.states[step],
                        rollouts.masks[step])
                cpu_actions = action.squeeze(1).cpu().numpy()

                # Observe reward and next obs
                obs, reward_raw, done, info = envs.step(cpu_actions)
                episode_reward_raw += reward_raw[0]
                if done[0]:
                    final_reward_raw = episode_reward_raw
                    episode_reward_raw = 0.0
                reward = np.sign(reward_raw)
                reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                if args.cuda:
                    masks = masks.cuda()
                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs)
                rollouts.insert(current_obs, states, action, action_log_prob, value,
                                reward, masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(rollouts.observations[-1],
                                                    rollouts.states[-1],
                                                    rollouts.masks[-1]).detach()
            rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)
            rollouts.after_update()

            num_trained_frames += (args.num_steps * args.num_processes)
            j += 1

            # save checkpoint
            if j % args.save_interval == 0 and args.save_dir != "":
                try:
                    np.save(args.save_dir + '/num_trained_frames.npy',
                            np.array([num_trained_frames]))
                    actor_critic.save_model(save_path=args.save_dir)
                except Exception as e:
                    print("Save checkpoint failed")

            # print info
            if j % args.log_interval == 0:
                end = time.time()
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                print("[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours".format(
                    num_trained_frames, args.num_frames,
                    int(num_trained_frames / (end - start)), final_reward_raw,
                    (end - start) / num_trained_frames *
                    (args.num_frames - num_trained_frames) / 60.0 / 60.0))

            # visualize results
            if args.vis and j % args.vis_interval == 0:
                '''we use tensorboard since it's better when comparing plots'''
                summary = tf.Summary()
                summary.value.add(tag='final_reward_raw', simple_value=final_reward_raw)
                summary.value.add(tag='value_loss', simple_value=value_loss)
                summary.value.add(tag='action_loss', simple_value=action_loss)
                summary.value.add(tag='dist_entropy', simple_value=dist_entropy)
                summary_writer.add_summary(summary, num_trained_frames)
                summary_writer.flush()

    elif args.policy_type == 'hierarchical_policy':
        num_subpolicy = args.num_subpolicy
        update_interval = args.hierarchy_interval
        while len(num_subpolicy) < args.num_hierarchy - 1:
            num_subpolicy.append(num_subpolicy[-1])
        while len(update_interval) < args.num_hierarchy - 1:
            update_interval.append(update_interval[-1])
        if args.num_hierarchy == 1:
            update_interval = [1]
            num_subpolicy = [envs.action_space.n]
        # print(envs.action_space.n)
        # print(stop)

        actor_critic = {}
        rollouts = {}
        actor_critic['top'] = EHRL_Policy(obs_shape, space.Discrete(num_subpolicy[-1]),
                                          np.zeros(1), 128, args.recurrent_policy, 'top')
        rollouts['top'] = EHRL_RolloutStorage(int(args.num_steps / update_interval[-1]),
                                              args.num_processes, obs_shape,
                                              space.Discrete(num_subpolicy[-1]), np.zeros(1),
                                              actor_critic['top'].state_size)
        for hie_id in range(args.num_hierarchy - 1):
            if hie_id > 0:
                actor_critic[str(hie_id)] = EHRL_Policy(
                    obs_shape, space.Discrete(num_subpolicy[hie_id - 1]),
                    np.zeros(num_subpolicy[hie_id]), 128, args.recurrent_policy, str(hie_id))
                rollouts[str(hie_id)] = EHRL_RolloutStorage(
                    int(args.num_steps / update_interval[hie_id - 1]), args.num_processes,
                    obs_shape, space.Discrete(num_subpolicy[hie_id - 1]),
                    np.zeros(num_subpolicy[hie_id]), actor_critic[str(hie_id)].state_size)
            else:
                actor_critic[str(hie_id)] = EHRL_Policy(
                    obs_shape, envs.action_space, np.zeros(num_subpolicy[hie_id]), 128,
                    args.recurrent_policy, str(hie_id))
                rollouts[str(hie_id)] = EHRL_RolloutStorage(
                    args.num_steps, args.num_processes, obs_shape, envs.action_space,
                    np.zeros(num_subpolicy[hie_id]), actor_critic[str(hie_id)].state_size)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        if args.cuda:
            for key in actor_critic:
                actor_critic[key].cuda()

        agent = {}
        for ac_key in actor_critic:
            if args.algo == 'a2c':
                agent[ac_key] = algo.A2C_ACKTR(actor_critic[ac_key], args.value_loss_coef,
                                               args.entropy_coef, lr=args.lr, eps=args.eps,
                                               alpha=args.alpha,
                                               max_grad_norm=args.max_grad_norm)
            elif args.algo == 'ppo':
                agent[ac_key] = algo.PPO(actor_critic[ac_key], args.clip_param,
                                         args.ppo_epoch, args.num_mini_batch,
                                         args.value_loss_coef, args.entropy_coef,
                                         lr=args.lr, eps=args.eps,
                                         max_grad_norm=args.max_grad_norm)
            elif args.algo == 'acktr':
                agent[ac_key] = algo.A2C_ACKTR(actor_critic[ac_key], args.value_loss_coef,
                                               args.entropy_coef, acktr=True)

        current_obs = torch.zeros(args.num_processes, *obs_shape)
        obs = envs.reset()
        update_current_obs(obs)
        for obs_key in rollouts:
            rollouts[obs_key].observations[0].copy_(current_obs)

        episode_reward_raw = 0.0
        final_reward_raw = 0.0

        if args.cuda:
            current_obs = current_obs.cuda()
            for rol_key in rollouts:
                rollouts[rol_key].cuda()

        # try to load checkpoint
        try:
            num_trained_frames = np.load(args.save_dir + '/num_trained_frames.npy')[0]
            try:
                for save_key in actor_critic:
                    actor_critic[save_key].load_state_dict(
                        torch.load(args.save_dir + '/trained_learner_' + save_key + '.pth'))
                print('Load learner previous point: Succeeded')
            except Exception as e:
                print('Load learner previous point: Failed')
        except Exception as e:
            num_trained_frames = 0
        print('Learner has been trained to step: ' + str(num_trained_frames))

        start = time.time()
        j = 0
        onehot_mem = {}
        reward_mem = {}
        if args.num_hierarchy > 1:
            update_flag = np.zeros(args.num_hierarchy - 1, dtype=np.uint8)
        else:
            update_flag = np.zeros(1, dtype=np.uint8)
        step_count = 0
        value = {}
        next_value = {}
        action = {}
        action_log_prob = {}
        states = {}

        while True:
            if num_trained_frames > args.num_frames:
                break
            step_count = 0
            for step in range(args.num_steps):
                if step_count % update_interval[-1] == 0:
                    with torch.no_grad():
                        value['top'], action['top'], action_log_prob['top'], states['top'] = \
                            actor_critic['top'].act(
                                rollouts['top'].observations[update_flag[-1]],
                                rollouts['top'].one_hot[update_flag[-1]],
                                rollouts['top'].states[update_flag[-1]],
                                rollouts['top'].masks[update_flag[-1]])
                    update_flag[-1] += 1
                    onehot_mem[str(args.num_hierarchy - 1)] = get_onehot(num_subpolicy[-1],
                                                                         action['top'])
                    onehot_mem[str(args.num_hierarchy)] = get_onehot(1, 0)
                if len(update_interval) > 1:
                    for interval_id in range(len(update_interval) - 1):
                        if step_count % update_interval[interval_id] == 0:
                            with torch.no_grad():
                                value[str(interval_id + 1)], action[str(interval_id + 1)], \
                                    action_log_prob[str(interval_id + 1)], \
                                    states[str(interval_id + 1)] = \
                                    actor_critic[str(interval_id + 1)].act(
                                        rollouts[str(interval_id + 1)].observations[update_flag[interval_id]],
                                        rollouts[str(interval_id + 1)].one_hot[update_flag[-1]],
                                        rollouts[str(interval_id + 1)].states[update_flag[interval_id]],
                                        rollouts[str(interval_id + 1)].masks[update_flag[interval_id]])
                            update_flag[interval_id] += 1
                            onehot_mem[str(interval_id + 1)] = get_onehot(
                                num_subpolicy[interval_id], action[str(interval_id + 1)])

                # Sample actions
                if args.num_hierarchy > 1:
                    with torch.no_grad():
                        value['0'], action['0'], action_log_prob['0'], states['0'] = \
                            actor_critic['0'].act(rollouts['0'].observations[step],
                                                  rollouts['0'].one_hot[step],
                                                  rollouts['0'].states[step],
                                                  rollouts['0'].masks[step])
                    cpu_actions = action['0'].squeeze(1).cpu().numpy()
                else:
                    cpu_actions = action['top'].squeeze(1).cpu().numpy()

                # Observe reward and next obs
                obs, reward_raw, done, info = envs.step(cpu_actions)
                for reward_id in range(args.num_hierarchy - 1):
                    try:
                        reward_mem[str(reward_id)] += [reward_raw[0]]
                    except Exception as e:
                        reward_mem[str(reward_id)] = reward_raw[0]
                episode_reward_raw += reward_raw[0]
                if done[0]:
                    final_reward_raw = episode_reward_raw
                    episode_reward_raw = 0.0
                reward = np.sign(reward_raw)
                reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                if args.cuda:
                    masks = masks.cuda()
                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks
                update_current_obs(obs)

                if args.num_hierarchy > 1:
                    rollouts['0'].insert(current_obs, states['0'], action['0'],
                                         onehot_mem['1'], action_log_prob['0'], value['0'],
                                         reward, masks)
                if step_count % update_interval[-1] == 0:
                    if args.num_hierarchy > 1:
                        reward_mean = np.mean(np.array(reward_mem[str(args.num_hierarchy - 2)]))
                        reward_mean = torch.from_numpy(np.ones(1) * reward_mean).float()
                        rollouts['top'].insert(current_obs, states['top'], action['top'],
                                               onehot_mem[str(args.num_hierarchy)],
                                               action_log_prob['top'], value['top'],
                                               reward_mean, masks)
                        reward_mem[str(args.num_hierarchy - 2)] = []
                    else:
                        rollouts['top'].insert(current_obs, states['top'], action['top'],
                                               onehot_mem[str(args.num_hierarchy)],
                                               action_log_prob['top'], value['top'],
                                               reward, masks)
                if len(update_interval) > 1:
                    for interval_id in range(len(update_interval) - 1):
                        if step_count % update_interval[interval_id] == 0 or done[0]:
                            reward_mean = np.mean(np.array(reward_mem[str(interval_id)]))
                            reward_mean = torch.from_numpy(np.ones(1) * reward_mean).float()
                            rollouts[str(interval_id + 1)].insert(
                                current_obs, states[str(interval_id + 1)],
                                action[str(interval_id + 1)], onehot_mem[str(interval_id + 2)],
                                action_log_prob[str(interval_id + 1)],
                                value[str(interval_id + 1)], reward_mean, masks)
                            reward_mem[str(interval_id)] = []
                step_count += 1

            if args.num_hierarchy > 1:
                with torch.no_grad():
                    next_value['0'] = actor_critic['0'].get_value(
                        rollouts['0'].observations[-1], rollouts['0'].one_hot[-1],
                        rollouts['0'].states[-1], rollouts['0'].masks[-1]).detach()
                rollouts['0'].compute_returns(next_value['0'], args.use_gae, args.gamma,
                                              args.tau)
                value_loss, action_loss, dist_entropy = agent['0'].update(rollouts['0'],
                                                                          add_onehot=True)
                rollouts['0'].after_update()

            with torch.no_grad():
                next_value['top'] = actor_critic['top'].get_value(
                    rollouts['top'].observations[-1], rollouts['top'].one_hot[-1],
                    rollouts['top'].states[-1], rollouts['top'].masks[-1]).detach()
            rollouts['top'].compute_returns(next_value['top'], args.use_gae, args.gamma,
                                            args.tau)
            if args.num_hierarchy > 1:
                _, _, _ = agent['top'].update(rollouts['top'], add_onehot=True)
            else:
                value_loss, action_loss, dist_entropy = agent['top'].update(rollouts['top'],
                                                                            add_onehot=True)
            rollouts['top'].after_update()
            update_flag[-1] = 0

            if len(update_interval) > 1:
                for interval_id in range(len(update_interval) - 1):
                    with torch.no_grad():
                        next_value[str(interval_id + 1)] = actor_critic[str(interval_id + 1)].get_value(
                            rollouts[str(interval_id + 1)].observations[-1],
                            rollouts[str(interval_id + 1)].one_hot[-1],
                            rollouts[str(interval_id + 1)].states[-1],
                            rollouts[str(interval_id + 1)].masks[-1]).detach()
                    rollouts[str(interval_id + 1)].compute_returns(
                        next_value[str(interval_id + 1)], args.use_gae, args.gamma, args.tau)
                    _, _, _ = agent[str(interval_id + 1)].update(
                        rollouts[str(interval_id + 1)], add_onehot=True)
                    rollouts[str(interval_id + 1)].after_update()
                    update_flag[interval_id] = 0

            num_trained_frames += (args.num_steps * args.num_processes)
            j += 1

            # save checkpoint
            if j % args.save_interval == 0 and args.save_dir != "":
                try:
                    np.save(args.save_dir + '/num_trained_frames.npy',
                            np.array([num_trained_frames]))
                    for key_store in actor_critic:
                        actor_critic[key_store].save_model(save_path=args.save_dir)
                except Exception as e:
                    print("Save checkpoint failed")

            # print info
            if j % args.log_interval == 0:
                end = time.time()
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                print("[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours".format(
                    num_trained_frames, args.num_frames,
                    int(num_trained_frames / (end - start)), final_reward_raw,
                    (end - start) / num_trained_frames *
                    (args.num_frames - num_trained_frames) / 60.0 / 60.0))

            # visualize results
            if args.vis and j % args.vis_interval == 0:
                '''we use tensorboard since it's better when comparing plots'''
                summary = tf.Summary()
                summary.value.add(tag='final_reward_raw', simple_value=final_reward_raw)
                summary.value.add(tag='value_loss', simple_value=value_loss)
                summary.value.add(tag='action_loss', simple_value=action_loss)
                summary.value.add(tag='dist_entropy', simple_value=dist_entropy)
                summary_writer.add_summary(summary, num_trained_frames)
                summary_writer.flush()
class a2c(object): def __init__(self, hparams): self.use_gae = hparams['use_gae'] self.gamma = hparams['gamma'] self.tau = hparams['tau'] self.obs_shape = hparams['obs_shape'] self.num_steps = hparams['num_steps'] self.num_processes = hparams['num_processes'] self.value_loss_coef = hparams['value_loss_coef'] self.entropy_coef = hparams['entropy_coef'] self.cuda = hparams['cuda'] self.opt = hparams['opt'] self.grad_clip = hparams['grad_clip'] self.action_size = hparams['action_size'] # Policy and Value network # if hparams['dropout'] == True: # print ('CNNPolicy_dropout2') # self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space) # elif len(envs.observation_space.shape) == 3: # print ('CNNPolicy2') # self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space) # else: # self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space) # if 'traj_action_mask' in hparams and hparams['traj_action_mask']: # self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space) # else: # self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space) self.actor_critic = CNNPolicy(self.obs_shape[0], self.action_size) # #for batch norm # self.actor_critic.train() #self.actor_critic.eval() # Storing rollouts self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, self.action_size) if self.cuda: self.actor_critic.cuda() self.rollouts.cuda() #Optimizer if self.opt == 'rms': self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha']) elif self.opt == 'adam': self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps']) elif self.opt == 'sgd': self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom']) else: print ('no opt specified') # if envs.action_space.__class__.__name__ == "Discrete": # action_shape = 1 # else: # action_shape = envs.action_space.shape[0] # self.action_shape = action_shape # self.action_shape = 1 # # if __: # # self.deterministic_action = 0 # # else: # # self.deterministic_action = 0 # if hparams['gif_'] or hparams['ls_']: # self.rollouts_list = RolloutStorage_list() self.hparams = hparams def act(self, current_state): # value, action = self.actor_critic.act(current_state) # [] [] [P,1] [P] # print ('aaa') # print (self.actor_critic.act(current_state)) value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state) # print ('lll') return value, action, action_log_probs, dist_entropy def insert_first_state(self, current_state): self.rollouts.states[0].copy_(current_state) def insert_data(self, step, current_state, action, value, reward, masks, action_log_probs, dist_entropy):#, done): self.rollouts.insert(step, current_state, action, value, reward, masks, action_log_probs, dist_entropy) if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']: self.actor_critic.reset_mask(done) def update(self): next_value = self.actor_critic(Variable(self.rollouts.states[-1] / 255., volatile=True))[0].data self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau) # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions( # Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)), # Variable(self.rollouts.actions.view(-1, self.action_shape))) values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1) action_log_probs = 
torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

self.rollouts.value_preds = []
self.rollouts.action_log_probs = []
self.rollouts.dist_entropy = []

advantages = Variable(self.rollouts.returns[:-1]) - values
value_loss = advantages.pow(2).mean()
action_loss = -(Variable(advantages.data) * action_log_probs).mean()

self.optimizer.zero_grad()
cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef
cost.backward()
nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
self.optimizer.step()
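# For reference, a self-contained restatement of the objective assembled in update()
# above: advantages = returns - values, the value loss is the mean squared advantage,
# the policy loss is -E[advantage.detach() * log pi(a|s)], and entropy is subtracted to
# encourage exploration. Coefficient values below are illustrative defaults.
import torch

def a2c_loss(returns, values, action_log_probs, dist_entropy,
             value_loss_coef=0.5, entropy_coef=0.01):
    # all inputs: [num_steps, num_processes, 1]
    advantages = returns - values
    value_loss = advantages.pow(2).mean()
    # detach the advantages so the policy gradient does not flow into the critic
    action_loss = -(advantages.detach() * action_log_probs).mean()
    return action_loss + value_loss * value_loss_coef - dist_entropy.mean() * entropy_coef

# tiny usage example with dummy tensors
T, P = 5, 2
loss = a2c_loss(torch.randn(T, P, 1),
                torch.randn(T, P, 1, requires_grad=True),
                torch.randn(T, P, 1, requires_grad=True),
                torch.zeros(T, P, 1))
loss.backward()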
def __init__(self, hparams): self.use_gae = hparams['use_gae'] self.gamma = hparams['gamma'] self.tau = hparams['tau'] self.obs_shape = hparams['obs_shape'] self.num_steps = hparams['num_steps'] self.num_processes = hparams['num_processes'] self.value_loss_coef = hparams['value_loss_coef'] self.entropy_coef = hparams['entropy_coef'] self.cuda = hparams['cuda'] self.opt = hparams['opt'] self.grad_clip = hparams['grad_clip'] self.action_size = hparams['action_size'] # Policy and Value network # if hparams['dropout'] == True: # print ('CNNPolicy_dropout2') # self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space) # elif len(envs.observation_space.shape) == 3: # print ('CNNPolicy2') # self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space) # else: # self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space) # if 'traj_action_mask' in hparams and hparams['traj_action_mask']: # self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space) # else: # self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space) self.actor_critic = CNNPolicy(self.obs_shape[0], self.action_size) # #for batch norm # self.actor_critic.train() #self.actor_critic.eval() # Storing rollouts self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, self.action_size) if self.cuda: self.actor_critic.cuda() self.rollouts.cuda() #Optimizer if self.opt == 'rms': self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha']) elif self.opt == 'adam': self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps']) elif self.opt == 'sgd': self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom']) else: print ('no opt specified') # if envs.action_space.__class__.__name__ == "Discrete": # action_shape = 1 # else: # action_shape = envs.action_space.shape[0] # self.action_shape = action_shape # self.action_shape = 1 # # if __: # # self.deterministic_action = 0 # # else: # # self.deterministic_action = 0 # if hparams['gif_'] or hparams['ls_']: # self.rollouts_list = RolloutStorage_list() self.hparams = hparams
help='gae lambda parameter (default: 0.95)')
parser.add_argument('--gamma', type=float, default=0.99,
                    help='discount factor for rewards (default: 0.99)')
parser.add_argument('--num-steps', type=int, default=20,
                    help='number of forward steps in A2C (default: 20)')
parser.add_argument('--num-processes', type=int, default=1,
                    help='how many training CPU processes to use (default: 1)')
args = parser.parse_args()

rollouts = RolloutStorage(args.num_steps, args.num_processes, 17, 6)

if not os.path.exists("./csvfiles/"):
    os.makedirs("./csvfiles/")


class AddBias(nn.Module):
    def __init__(self, bias):
        super(AddBias, self).__init__()
        self._bias = nn.Parameter(bias.unsqueeze(1))

    def forward(self, x):
        bias = self._bias.t().view(1, -1)
        return x + bias
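# AddBias is typically used to give a diagonal Gaussian policy head a learned,
# state-independent log-std (this mirrors the pytorch-a2c-ppo-acktr style the snippet
# appears to follow; the head below is an illustrative sketch, not code from above).
import torch
import torch.nn as nn

class DiagGaussianHead(nn.Module):
    """Action mean from a linear layer; log-std as a learned bias added to zeros."""
    def __init__(self, num_inputs, num_outputs):
        super(DiagGaussianHead, self).__init__()
        self.fc_mean = nn.Linear(num_inputs, num_outputs)
        self.logstd = AddBias(torch.zeros(num_outputs))

    def forward(self, x):
        action_mean = self.fc_mean(x)
        action_logstd = self.logstd(torch.zeros(action_mean.size()))
        return torch.distributions.Normal(action_mean, action_logstd.exp())

# usage: dist = DiagGaussianHead(17, 6)(torch.randn(4, 17)); action = dist.sample()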
allow_early_resets=True) actor = Policy(num_inputs=envs.observation_space.shape[0], num_outputs=envs.action_space.shape[0], hidden_size=64) critic = Value(num_inputs=envs.observation_space.shape[0], hidden_size=64) actor.to(device) critic.to(device) agent = STORM_LVC(actor=actor, critic=critic, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR, alpha_initial=1) rollouts = RolloutStorage(num_steps=OUTER_BATCHSIZE, num_processes=NUM_PROCESS, obs_shape=envs.observation_space.shape, action_space=envs.action_space, recurrent_hidden_state_size=1) inner_rollouts = RolloutStorage(num_steps=INNER_BATCHSIZE, num_processes=NUM_PROCESS, obs_shape=envs.observation_space.shape, action_space=envs.action_space, recurrent_hidden_state_size=1) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) inner_rollouts.obs[0].copy_(obs) inner_rollouts.to(device) episode_rewards = deque(maxlen=10)
def main(): print('Preparing parameters') torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") print('Creating envs: {}'.format(args.env_name)) envs = test_mp_envs(args.env_name, args.num_processes) print('Creating network') actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) print('Initializing PPO') agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) print('Memory') rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = [] num_episodes = [0 for _ in range(args.num_processes)] last_index = 0 print('Starting ! ') start = time.time() for j in tqdm(range(num_updates)): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) obs, reward, done, infos = envs.step(action) for info_num, info in enumerate(infos): if info_num == 0: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # end_episode_to_viz(writer, info, info_num, num_episodes[info_num]) num_episodes[info_num] += 1 plot_rewards(episode_rewards, args) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) losses = agent.update(rollouts) rollouts.after_update()
def main(): torch.manual_seed(args_seed) torch.cuda.manual_seed_all(args_seed) device = torch.device("cuda:0" if args_cuda else "cpu") train_log = Log(log_name+'_train_log') evl_log = Log(log_name+'_evaluation_log') torch.set_num_threads(1) envs = make_vec_envs( args_env_name, args_seed, args_num_processes, device, gamma=args_gamma) # norm_envs = get_vec_normalize(envs) # norm_envs = envs # norm_envs.eval() # norm_envs.ob_rms = 1 # print(envs.ob_rms) # ss('hi') if is_limit_action: envs.action_space.n = 3 print('Number of Actions:', envs.action_space.n) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args_recurrent_policy}) actor_critic.to(device) # print(actor_critic.is_recurrent) # print(actor_critic.gru) # ss('hi') agent = PPO( actor_critic, args_clip_param, args_ppo_epoch, args_num_mini_batch, args_value_loss_coef, args_entropy_coef, lr=args_lr, eps=args_eps, max_grad_norm=args_max_grad_norm, use_clipped_value_loss=args_use_clipped_value_loss) rollouts = RolloutStorage( args_num_steps, args_num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) # print(obs) # ss('i am over it') num_updates = int( args_num_env_steps) // args_num_steps // args_num_processes episode_rewards = deque(maxlen=10) start = time.time() sum_re = torch.zeros(args_num_processes, 1) for j in range(num_updates): if args_use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule( agent.optimizer, j, num_updates, args_lr) for step in range(args_num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # ss('dissecting actor critic. 
act')
    # print(action)
    # print()
    # action = action + 1
    # print(action)
    # ss('hoiohasdfhioas')

    if is_limit_action:
        obs, reward, done, infos = envs.step(action + 1)
    else:
        obs, reward, done, infos = envs.step(action)
    sum_re += reward

    if any(done):
        for i in range(len(done)):
            if done[i]:
                episode_rewards.append(sum_re[i].item())
                # print(done)
                # print(sum_re[i])
                sum_re[i] *= 0

    masks = torch.FloatTensor(
        [[0.0] if done_ else [1.0] for done_ in done])
    bad_masks = torch.FloatTensor(
        [[0.0] if 'bad_transition' in info.keys() else [1.0]
         for info in infos])
    rollouts.insert(obs, recurrent_hidden_states, action,
                    action_log_prob, value, reward, masks, bad_masks)

with torch.no_grad():
    next_value = actor_critic.get_value(
        rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
        rollouts.masks[-1]).detach()

rollouts.compute_returns(next_value, args_use_gae, args_gamma,
                         args_gae_lambda)

value_loss, action_loss, dist_entropy = agent.update(rollouts)
rollouts.after_update()

if j % args_log_interval == 0 and len(episode_rewards) > 1:
    total_num_steps = (j + 1) * args_num_processes * args_num_steps
    end = time.time()
    logstring = "E {}, N_steps {}, FPS {} mean/median" \
        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
            j, total_num_steps,
            int(total_num_steps / (end - start)),
            np.mean(episode_rewards), np.median(episode_rewards),
            np.min(episode_rewards), np.max(episode_rewards),
            dist_entropy, value_loss, action_loss)
    # print(logstring)
    train_log.log(logstring)

# if True:
if (args_eval_interval is not None and len(episode_rewards) > 1
        and j % args_eval_interval == 0):
    total_num_steps = (j + 1) * args_num_processes * args_num_steps
    ob_rms = get_vec_normalize(envs).ob_rms
    ev_result = evaluate(actor_critic, ob_rms, args_env_name, args_seed,
                         args_num_processes, device,
                         is_limit_action=is_limit_action)
    ev_log_string = 'steps:' + str(total_num_steps) + '. ' + ev_result
    evl_log.log(ev_log_string)
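# The bad_masks built above mark 'bad_transition' (time-limit) terminations. In the
# RolloutStorage variants this code appears to follow, they make a time-limited episode
# bootstrap from the critic's value estimate instead of being treated as a true terminal
# state. A hedged sketch of that return rule for the non-GAE case (shapes assumed):
import torch

def returns_with_time_limits(rewards, value_preds, masks, bad_masks, next_value, gamma=0.99):
    """rewards: [T, N, 1]; value_preds/masks/bad_masks: [T + 1, N, 1]; next_value: [N, 1]."""
    T = rewards.size(0)
    returns = torch.zeros(T + 1, *rewards.size()[1:])
    returns[-1] = next_value
    for step in reversed(range(T)):
        ret = returns[step + 1] * gamma * masks[step + 1] + rewards[step]
        # where the episode ended only because of a time limit (bad_mask == 0),
        # fall back to the value estimate instead of the truncated return
        returns[step] = ret * bad_masks[step + 1] + \
            (1 - bad_masks[step + 1]) * value_preds[step]
    return returns[:-1]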
def main(): print("######") print("HELLO! Returns start with infinity values") print("######") os.environ['OMP_NUM_THREADS'] = '1' if args.random_task: env_params = { 'wt': np.round(np.random.uniform(0.5, 1.0), 2), 'x': np.round(np.random.uniform(-0.1, 0.1), 2), 'y': np.round(np.random.uniform(-0.1, 0.1), 2), 'z': np.round(np.random.uniform(0.15, 0.2), 2), } else: env_params = { 'wt': args.euclidean_weight, 'x': args.goal_x, 'y': args.goal_y, 'z': args.goal_z, } envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) envs = VecNormalize(envs, ob=False) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() actor_critic.input_norm.update(rollouts.observations[0]) last_return = -np.inf best_return = -np.inf best_models = None start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) actor_critic.input_norm.update(rollouts.observations[step + 1]) next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if args.vis and j % args.vis_interval == 0: last_return = plot(logger, 
args.log_dir) if last_return > best_return: best_return = last_return try: os.makedirs(os.path.dirname(args.save_path)) except OSError: pass info = { 'return': best_return, 'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon) } # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save((save_model, env_params, info), args.save_path) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), last_return, best_return, value_loss.data[0], action_loss.data[0]))
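# The PPO branch above optimizes the clipped ("pessimistic") surrogate L^CLIP. A compact,
# self-contained restatement of that objective (tensor values below are just examples):
import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    ratio = torch.exp(new_log_probs - old_log_probs)  # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # elementwise minimum: clipping can only make the objective smaller (pessimistic bound)
    return -torch.min(surr1, surr2).mean()

# usage with dummy numbers
advantages = torch.tensor([[0.5], [-1.0]])
loss = ppo_clip_loss(torch.tensor([[-0.10], [-0.20]]),
                     torch.tensor([[-0.15], [-0.10]]),
                     advantages)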
class a2c(object): def __init__(self, hparams): self.obs_shape = hparams['obs_shape'] self.n_actions = hparams['n_actions'] self.use_gae = hparams['use_gae'] self.gamma = hparams['gamma'] self.tau = hparams['tau'] self.num_steps = hparams['num_steps'] self.num_processes = hparams['num_processes'] self.value_loss_coef = hparams['value_loss_coef'] self.entropy_coef = hparams['entropy_coef'] self.cuda = hparams['cuda'] self.opt = hparams['opt'] self.grad_clip = hparams['grad_clip'] self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions) #.cuda() # Storing rollouts self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, self.n_actions) # if self.cuda: self.actor_critic.cuda() self.rollouts.cuda() self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps']) self.hparams = hparams def act(self, current_state): # value, action = self.actor_critic.act(current_state) # [] [] [P,1] [P] value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state) return value, action, action_log_probs, dist_entropy def insert_first_state(self, current_state): self.rollouts.states[0].copy_(current_state) def insert_data(self, step, current_state, action, value, reward, masks, action_log_probs, dist_entropy):#, done): self.rollouts.insert(step, current_state, action, value, reward, masks, action_log_probs, dist_entropy) # if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']: # self.actor_critic.reset_mask(done) def update(self): next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau) # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions( # Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)), # Variable(self.rollouts.actions.view(-1, self.action_shape))) values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1) action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1) dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1) self.rollouts.value_preds = [] self.rollouts.action_log_probs = [] self.rollouts.dist_entropy = [] advantages = Variable(self.rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() self.optimizer.zero_grad() cost = action_loss + value_loss*self.value_loss_coef - dist_entropy.mean()*self.entropy_coef cost.backward() nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip) self.optimizer.step() def no_update(self): next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau) # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions( # Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)), # Variable(self.rollouts.actions.view(-1, self.action_shape))) values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1) action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1) dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1) self.rollouts.value_preds = [] self.rollouts.action_log_probs = [] self.rollouts.dist_entropy = [] advantages = Variable(self.rollouts.returns[:-1]) 
- values
value_loss = advantages.pow(2).mean()
action_loss = -(Variable(advantages.data) * action_log_probs).mean()

self.optimizer.zero_grad()
cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef
cost.backward()
nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
# self.optimizer.step()
self.optimizer.zero_grad()
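# A hedged sketch of how this a2c wrapper appears intended to be driven (the surrounding
# snippets perform the same steps inline): store the first stacked state, then alternate
# act / env.step / insert_data for num_steps, and finally call update(). `envs` (a
# vectorized env) and the frame-stacking refresh of `current_state` are assumed, not shown.
import numpy as np
import torch
from torch.autograd import Variable

def run_one_update(agent, envs, current_state, num_steps):
    agent.insert_first_state(current_state)
    for step in range(num_steps):
        # keep the graph on value / log-prob / entropy: update() backprops through them
        value, action, action_log_probs, dist_entropy = agent.act(
            Variable(agent.rollouts.states[step]))
        state, reward, done, _ = envs.step(action.data.squeeze(1).cpu().numpy())
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
        masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
        # current_state should be refreshed from `state` here (frame stacking omitted)
        agent.insert_data(step, current_state, action.data, value, reward,
                          masks, action_log_probs, dist_entropy)
    agent.update()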
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() lmdb_idx = 0 try: os.makedirs(os.path.join(args.lmdb_path, args.env_name)) os.makedirs(os.path.join(args.lmdb_path, args.env_name, 'test')) except: print('Directory already exists.') for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Observe reward and next obs # obs, reward, done, info = envs.step(cpu_actions) '''unwrapped obs, reward''' obs, reward, done, info, wr_obs, wr_reward = envs.step(cpu_actions) # sample images # img = np.squeeze(np.transpose(obs[3], (1, 2, 0)), 2) for img, rwd in zip(wr_obs, wr_reward): if rwd > 0: lmdb_idx += 1 convert_to_lmdb( img, rwd, os.path.join(args.lmdb_path, args.env_name), lmdb_idx) # Evaluate unwrapped rewards # model = Model() # model.load(args.digit_checkpoint) # model.cuda() # accuracy = digit_eval(image, length_labels, digits_labels, model) # img.show() reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
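# update_current_obs above implements frame stacking: the stacked buffer is shifted left
# by one frame and the newest observation is written into the last channel block. A small
# standalone illustration (shapes are assumptions for the example, not taken from above):
import numpy as np
import torch

num_stack, num_processes, C, H, W = 4, 2, 1, 84, 84
current_obs = torch.zeros(num_processes, C * num_stack, H, W)

def stack_frames(obs, current_obs, shape_dim0=C):
    obs = torch.from_numpy(obs).float()
    if num_stack > 1:
        # drop the oldest frame; keep the most recent num_stack - 1 frames
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    # the newest frame always occupies the last channel block
    current_obs[:, -shape_dim0:] = obs
    return current_obs

obs = np.ones((num_processes, C, H, W), dtype=np.float32)
current_obs = stack_frames(obs, current_obs)  # last channel is 1s, older frames still 0s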
def main(): print("#######") print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' print (args.cuda) print (args.num_steps) print (args.num_processes) print (args.lr) print (args.eps) print (args.alpha) print (args.use_gae) print (args.gamma) print (args.tau) print (args.value_loss_coef) print (args.entropy_coef) # fsdaf # Create environment envs = SubprocVecEnv([ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ]) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] # action_shape = action_shape # shape_dim0 = envs.observation_space.shape[0] # if args.cuda: # dtype = torch.cuda.FloatTensor # else: # dtype = torch.FloatTensor hparams = {'cuda':args.cuda, 'num_steps':args.num_steps, 'num_processes':args.num_processes, 'obs_shape':obs_shape, 'lr':args.lr, 'eps':args.eps, 'alpha':args.alpha, 'use_gae':args.use_gae, 'gamma':args.gamma, 'tau':args.tau, 'value_loss_coef':args.value_loss_coef, 'entropy_coef':args.entropy_coef} # Create agent # agent = a2c(envs, hparams) # rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space) #it has a self.state that is [steps, processes, obs] #steps is used to compute expected reward if args.cuda: actor_critic.cuda() # rollouts.cuda() optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha']) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) # Init state current_state = torch.zeros(args.num_processes, *obs_shape)#.type(dtype) def update_current_state(state):#, shape_dim0): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state # return current_state state = envs.reset() update_current_state(state)#, shape_dim0) # agent.insert_first_state(current_state) rollouts.states[0].copy_(current_state) #set the first state to current state # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda()#type(dtype) # if args.cuda: rollouts.cuda() #Begin training start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Act # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Observe reward and next state state, reward, done, info = envs.step(cpu_actions) # state:[nProcesss, ndims, height, width] # Record rewards # reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. # these final rewards are only used for printing. 
but the mask is used in the storage, dont know why yet # oh its just clearing the env that finished, and resetting its episode_reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks # return reward, masks, final_rewards, episode_rewards, current_state # Update state update_current_state(state)#, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks) rollouts.insert(step, current_state, action.data, value.data, reward, masks) #Optimize agent # agent.update() next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data # use last state to make prediction of next value if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape)) #not sure what this is rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) # this computes R = r + r+ ...+ V(t) for each step values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) # I think this aciton log prob could have been computed and stored earlier # and didnt we already store the value prediction??? values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() rollouts.states[0].copy_(rollouts.states[-1]) # the first state is now the last state of the previous # #Save model # if j % args.save_interval == 0 and args.save_dir != "": # save_path = os.path.join(args.save_dir, args.algo) # try: # os.makedirs(save_path) # except OSError: # pass # # A really ugly way to save a model to CPU # save_model = actor_critic # if args.cuda: # save_model = copy.deepcopy(actor_critic).cpu() # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) #Print updates if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps # print("Updates {}, n_timesteps {}, FPS {}, mean/median R {:.1f}/{:.1f}, min/max R {:.1f}/{:.1f}, T:{:.4f}".#, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". # format(j, total_num_steps, # int(total_num_steps / (end - start)), # final_rewards.mean(), # final_rewards.median(), # final_rewards.min(), # final_rewards.max(), # end - start))#, -dist_entropy.data[0], # # value_loss.data[0], action_loss.data[0])) # print("Upts {}, n_timesteps {}, min/med/mean/max {:.1f}/{:.1f}/{:.1f}/{:.1f}, FPS {}, T:{:.1f}". # format(j, total_num_steps, # final_rewards.min(), # final_rewards.median(), # final_rewards.mean(), # final_rewards.max(), # int(total_num_steps / (end - start)), # end - start)) if j % (args.log_interval*30) == 0: print("Upts, n_timesteps, min/med/mean/max, FPS, Time") print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}". 
format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start))
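# compute_returns above (per the inline comment) builds the bootstrapped return
# R_t = r_t + gamma * r_{t+1} + ... + gamma^k * V(s_{t+k}), with masks cutting the
# recursion at episode boundaries. A minimal sketch of the non-GAE recursion; here
# masks[t] is 0 when the episode ended at step t (indexing is simplified relative to
# the RolloutStorage layout):
import torch

def discounted_returns(rewards, masks, next_value, gamma=0.99):
    """rewards, masks: [T, N, 1]; next_value: [N, 1] critic estimate used to bootstrap."""
    T = rewards.size(0)
    returns = torch.zeros(T + 1, *rewards.size()[1:])
    returns[-1] = next_value
    for step in reversed(range(T)):
        returns[step] = returns[step + 1] * gamma * masks[step] + rewards[step]
    return returns[:-1]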
def main(): global args args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() args.vis = not args.no_vis # Set options if args.path_opt is not None: with open(args.path_opt, 'r') as handle: options = yaml.load(handle) if args.vis_path_opt is not None: with open(args.vis_path_opt, 'r') as handle: vis_options = yaml.load(handle) print('## args') pprint(vars(args)) print('## options') pprint(options) # Put alg_%s and optim_%s to alg and optim depending on commandline options['use_cuda'] = args.cuda options['trial'] = args.trial options['alg'] = options['alg_%s' % args.algo] options['optim'] = options['optim_%s' % args.algo] alg_opt = options['alg'] alg_opt['algo'] = args.algo model_opt = options['model'] env_opt = options['env'] env_opt['env-name'] = args.env_name log_opt = options['logs'] optim_opt = options['optim'] model_opt['time_scale'] = env_opt['time_scale'] if model_opt['mode'] in ['baselinewtheta', 'phasewtheta']: model_opt['theta_space_mode'] = env_opt['theta_space_mode'] model_opt['theta_sz'] = env_opt['theta_sz'] elif model_opt['mode'] in ['baseline_lowlevel', 'phase_lowlevel']: model_opt['theta_space_mode'] = env_opt['theta_space_mode'] # Check asserts assert (model_opt['mode'] in [ 'baseline', 'baseline_reverse', 'phasesimple', 'phasewstate', 'baselinewtheta', 'phasewtheta', 'baseline_lowlevel', 'phase_lowlevel', 'interpolate', 'cyclic', 'maze_baseline', 'maze_baseline_wphase' ]) assert (args.algo in ['a2c', 'ppo', 'acktr']) if model_opt['recurrent_policy']: assert args.algo in ['a2c', 'ppo' ], 'Recurrent policy is not implemented for ACKTR' # Set seed - just make the seed the trial number seed = args.trial torch.manual_seed(seed) if args.cuda: torch.cuda.manual_seed(seed) # Initialization num_updates = int(optim_opt['num_frames'] ) // alg_opt['num_steps'] // alg_opt['num_processes'] torch.set_num_threads(1) # Print warning print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") # Set logging / load previous checkpoint logpath = os.path.join(log_opt['log_base'], model_opt['mode'], log_opt['exp_name'], args.algo, args.env_name, 'trial%d' % args.trial) if len(args.resume) > 0: assert (os.path.isfile(os.path.join(logpath, args.resume))) ckpt = torch.load(os.path.join(logpath, 'ckpt.pth.tar')) start_update = ckpt['update_count'] else: # Make directory, check before overwriting if os.path.isdir(logpath): if click.confirm( 'Logs directory already exists in {}. 
Erase?'.format( logpath, default=False)): os.system('rm -rf ' + logpath) else: return os.system('mkdir -p ' + logpath) start_update = 0 # Save options and args with open(os.path.join(logpath, os.path.basename(args.path_opt)), 'w') as f: yaml.dump(options, f, default_flow_style=False) with open(os.path.join(logpath, 'args.yaml'), 'w') as f: yaml.dump(vars(args), f, default_flow_style=False) # Save git info as well os.system('git status > %s' % os.path.join(logpath, 'git_status.txt')) os.system('git diff > %s' % os.path.join(logpath, 'git_diff.txt')) os.system('git show > %s' % os.path.join(logpath, 'git_show.txt')) # Set up plotting dashboard dashboard = Dashboard(options, vis_options, logpath, vis=args.vis, port=args.port) # If interpolate mode, choose states if options['model']['mode'] == 'phase_lowlevel' and options['env'][ 'theta_space_mode'] == 'pretrain_interp': all_states = torch.load(env_opt['saved_state_file']) s1 = random.choice(all_states) s2 = random.choice(all_states) fixed_states = [s1, s2] elif model_opt['mode'] == 'interpolate': all_states = torch.load(env_opt['saved_state_file']) s1 = all_states[env_opt['s1_ind']] s2 = all_states[env_opt['s2_ind']] fixed_states = [s1, s2] else: fixed_states = None # Create environments dummy_env = make_env(args.env_name, seed, 0, logpath, options, args.verbose) dummy_env = dummy_env() envs = [ make_env(args.env_name, seed, i, logpath, options, args.verbose, fixed_states) for i in range(alg_opt['num_processes']) ] if alg_opt['num_processes'] > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) # Get theta_sz for models (if applicable) dummy_env.reset() if model_opt['mode'] == 'baseline_lowlevel': model_opt['theta_sz'] = dummy_env.env.theta_sz elif model_opt['mode'] == 'phase_lowlevel': model_opt['theta_sz'] = dummy_env.env.env.theta_sz if 'theta_sz' in model_opt: env_opt['theta_sz'] = model_opt['theta_sz'] # Get observation shape obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * env_opt['num_stack'], *obs_shape[1:]) # Do vec normalize, but mask out what we don't want altered if len(envs.observation_space.shape) == 1: ignore_mask = np.zeros(envs.observation_space.shape) if env_opt['add_timestep']: ignore_mask[-1] = 1 if model_opt['mode'] in [ 'baselinewtheta', 'phasewtheta', 'baseline_lowlevel', 'phase_lowlevel' ]: theta_sz = env_opt['theta_sz'] if env_opt['add_timestep']: ignore_mask[-(theta_sz + 1):] = 1 else: ignore_mask[-theta_sz:] = 1 if args.finetune_baseline: ignore_mask = dummy_env.unwrapped._get_obs_mask() freeze_mask, _ = dummy_env.unwrapped._get_pro_ext_mask() if env_opt['add_timestep']: ignore_mask = np.concatenate([ignore_mask, [1]]) freeze_mask = np.concatenate([freeze_mask, [0]]) ignore_mask = (ignore_mask + freeze_mask > 0).astype(float) envs = ObservationFilter(envs, ret=alg_opt['norm_ret'], has_timestep=True, noclip=env_opt['step_plus_noclip'], ignore_mask=ignore_mask, freeze_mask=freeze_mask, time_scale=env_opt['time_scale'], gamma=env_opt['gamma']) else: envs = ObservationFilter(envs, ret=alg_opt['norm_ret'], has_timestep=env_opt['add_timestep'], noclip=env_opt['step_plus_noclip'], ignore_mask=ignore_mask, time_scale=env_opt['time_scale'], gamma=env_opt['gamma']) # Set up algo monitoring alg_filename = os.path.join(logpath, 'Alg.Monitor.csv') alg_f = open(alg_filename, "wt") alg_f.write('# Alg Logging %s\n' % json.dumps({ "t_start": time.time(), 'env_id': dummy_env.spec and dummy_env.spec.id, 'mode': options['model']['mode'], 'name': options['logs']['exp_name'] })) alg_fields = ['value_loss', 
'action_loss', 'dist_entropy'] alg_logger = csv.DictWriter(alg_f, fieldnames=alg_fields) alg_logger.writeheader() alg_f.flush() # Create the policy network actor_critic = Policy(obs_shape, envs.action_space, model_opt) if args.cuda: actor_critic.cuda() # Create the agent if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, alg_opt['value_loss_coef'], alg_opt['entropy_coef'], lr=optim_opt['lr'], eps=optim_opt['eps'], alpha=optim_opt['alpha'], max_grad_norm=optim_opt['max_grad_norm']) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, alg_opt['clip_param'], alg_opt['ppo_epoch'], alg_opt['num_mini_batch'], alg_opt['value_loss_coef'], alg_opt['entropy_coef'], lr=optim_opt['lr'], eps=optim_opt['eps'], max_grad_norm=optim_opt['max_grad_norm']) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, alg_opt['value_loss_coef'], alg_opt['entropy_coef'], acktr=True) rollouts = RolloutStorage(alg_opt['num_steps'], alg_opt['num_processes'], obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(alg_opt['num_processes'], *obs_shape) # Update agent with loaded checkpoint if len(args.resume) > 0: # This should update both the policy network and the optimizer agent.load_state_dict(ckpt['agent']) # Set ob_rms envs.ob_rms = ckpt['ob_rms'] elif len(args.other_resume) > 0: ckpt = torch.load(args.other_resume) # This should update both the policy network agent.actor_critic.load_state_dict(ckpt['agent']['model']) # Set ob_rms envs.ob_rms = ckpt['ob_rms'] elif args.finetune_baseline: # Load the model based on the trial number ckpt_base = options['lowlevel']['ckpt'] ckpt_file = ckpt_base + '/trial%d/ckpt.pth.tar' % args.trial ckpt = torch.load(ckpt_file) # Make "input mask" that tells the model which inputs were the same from before and should be copied oldinput_mask, _ = dummy_env.unwrapped._get_pro_ext_mask() # This should update both the policy network agent.actor_critic.load_state_dict_special(ckpt['agent']['model'], oldinput_mask) # Set ob_rms old_rms = ckpt['ob_rms'] old_size = old_rms.mean.size if env_opt['add_timestep']: old_size -= 1 # Only copy the pro state part of it envs.ob_rms.mean[:old_size] = old_rms.mean[:old_size] envs.ob_rms.var[:old_size] = old_rms.var[:old_size] # Inline define our helper function for updating obs def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if env_opt['num_stack'] > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs # Reset our env and rollouts obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([alg_opt['num_processes'], 1]) final_rewards = torch.zeros([alg_opt['num_processes'], 1]) # Update loop start = time.time() for j in range(start_update, num_updates): for step in range(alg_opt['num_steps']): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Observe reward and next obs obs, reward, done, info = envs.step(cpu_actions) #pdb.set_trace() reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) # Update model and rollouts with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, alg_opt['use_gae'], env_opt['gamma'], alg_opt['gae_tau']) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # Add algo updates here alg_info = {} alg_info['value_loss'] = value_loss alg_info['action_loss'] = action_loss alg_info['dist_entropy'] = dist_entropy alg_logger.writerow(alg_info) alg_f.flush() # Save checkpoints total_num_steps = (j + 1) * alg_opt['num_processes'] * alg_opt['num_steps'] #save_interval = log_opt['save_interval'] * alg_opt['log_mult'] save_interval = 100 if j % save_interval == 0: # Save all of our important information save_checkpoint(logpath, agent, envs, j, total_num_steps, args.save_every, final=False) # Print log log_interval = log_opt['log_interval'] * alg_opt['log_mult'] if j % log_interval == 0: end = time.time() print( "{}: Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(options['logs']['exp_name'], j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) # Do dashboard logging vis_interval = log_opt['vis_interval'] * alg_opt['log_mult'] if args.vis and j % vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs dashboard.visdom_plot() except IOError: pass # Save final checkpoint save_checkpoint(logpath, agent, envs, j, total_num_steps, args.save_every, final=False) # Close logging file alg_f.close()
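# The ObservationFilter / ignore_mask logic above normalizes observations with running
# statistics while leaving selected dimensions (timestep, theta encoding, ...) untouched.
# ObservationFilter itself is not shown here, so the class below is only a generic,
# hedged analogue of that idea using a simple running mean/variance.
import numpy as np

class MaskedObsNorm(object):
    """Normalize dims where ignore_mask == 0; pass dims where ignore_mask == 1 through."""
    def __init__(self, obs_dim, ignore_mask, clip=10.0):
        self.mean = np.zeros(obs_dim)
        self.var = np.ones(obs_dim)
        self.count = 1e-4
        self.ignore = np.asarray(ignore_mask).astype(bool)
        self.clip = clip

    def __call__(self, obs):
        # single-sample running update of mean and variance
        self.count += 1.0
        delta = obs - self.mean
        self.mean = self.mean + delta / self.count
        self.var = self.var + (delta * (obs - self.mean) - self.var) / self.count
        normed = np.clip((obs - self.mean) / np.sqrt(self.var + 1e-8),
                         -self.clip, self.clip)
        return np.where(self.ignore, obs, normed)

# e.g. keep the trailing timestep feature of a 5-dim observation unnormalized
ob_filter = MaskedObsNorm(5, [0, 0, 0, 0, 1])
filtered = ob_filter(np.array([0.3, -1.2, 0.7, 2.0, 42.0]))  # last dim passes through unchanged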