def act(self, obs):
    with tx.device_scope(self.gpu_ids):
        if self.sleep_time > 0.0:
            time.sleep(self.sleep_time)
        if not self.frame_stack_concatenate_on_env:
            # The pixel output of the environment is a list of frames;
            # concatenate the frames into a single numpy array
            obs = copy.deepcopy(obs)
            if 'pixel' in obs:
                for key in obs['pixel']:
                    obs['pixel'][key] = np.concatenate(obs['pixel'][key], axis=0)
        # Convert to pytorch tensor
        obs_tensor = collections.OrderedDict()
        for modality in obs:
            modality_dict = collections.OrderedDict()
            for key in obs[modality]:
                modality_dict[key] = torch.tensor(
                    obs[modality][key], dtype=torch.float32).unsqueeze(0)
            obs_tensor[modality] = modality_dict

        action, _ = self.model(obs_tensor, calculate_value=False)
        if self.param_noise and self.param_noise_type == 'adaptive_normal':
            self.param_noise.compute_action_distance(obs_tensor, action)

        action = action.data.cpu().numpy()[0]
        action = action.clip(-1, 1)
        if self.agent_mode != 'eval_deterministic':
            action += self.noise()
            action = action.clip(-1, 1)
        return action
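# Illustrative sketch (not part of the agent above): how a list of stacked frames
# is flattened into a single array when frame_stack_concatenate_on_env is False.
# The frame count (4) and frame shape (3, 84, 84) are assumed placeholder values.
def _example_frame_stack_concatenation():
    import numpy as np
    frames = [np.zeros((3, 84, 84), dtype=np.float32) for _ in range(4)]
    stacked = np.concatenate(frames, axis=0)
    assert stacked.shape == (12, 84, 84)  # channels of all 4 frames end up concatenated
    return stacked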
def mytest_data_parallel(devices):
    """
    Runs in a new process because otherwise nvidia-smi cannot be emptied;
    pytest doesn't work with multiprocessing.
    """
    dtype = torch.double
    with tx.device_scope(devices, dtype=dtype):
        net = MyNet(19, 78, 25)
        x = torch.empty(64, 19).uniform_(0, 0.1)
        y = torch.randn(64, 25)
        z = y - net(x)
        devices = tx.ids_to_devices(devices)
        assert z.device == devices[0]
        assert z.dtype == dtype
        # All GPUs without DataParallel allocated should report 0 memory usage
        # os.system('nvidia-smi')
        should_have_mem = [tx.device_to_int(d) for d in devices]
        actual_mems = tx.cuda_memory('all', mode='cache', unit='kb')
        print('IDs', should_have_mem, '\tmemory:', actual_mems)
        for i, actual_mem in enumerate(actual_mems):
            if i in should_have_mem:
                assert actual_mem > 0, ('device', i, actual_mem)
            else:
                assert actual_mem == 0, ('device', i, actual_mem)
def act(self, obs):
    '''
    Agent returns an action based on the input observation.
    If in training mode, also returns action infos, which include the current
    probability distribution, RNN hidden states, etc.

    Args:
        obs: numpy array of (1, obs_dim)

    Returns:
        action_choice: sampled or max-likelihood action to input to env
        action_info: list of auxiliary information - [onetime, persistent]
            Note: this includes the probability distribution the action is
            sampled from and the RNN hidden states
    '''
    # Note: we collect two kinds of action infos, one persistent and one onetime.
    # Persistent info is collected for every step in the rollout (i.e. policy
    # probability distribution). Onetime info is collected only for the first
    # step of a partial trajectory (i.e. RNN hidden state).
    # See ExpSenderWrapperMultiStepMovingWindowWithInfo in exp_sender_wrapper for more.
    action_info = [[], []]
    with tx.device_scope(self.gpu_ids):
        obs_tensor = {}
        for mod in obs.keys():
            obs_tensor[mod] = {}
            for k in obs[mod].keys():
                obs_tensor[mod][k] = torch.tensor(
                    obs[mod][k], dtype=torch.float32).unsqueeze(0)

        if self.rnn_config.if_rnn_policy:
            action_info[0].append(self.cells[0].squeeze(1).cpu().numpy())
            action_info[0].append(self.cells[1].squeeze(1).cpu().numpy())

        action_pd, self.cells = self.model.forward_actor_expose_cells(
            obs_tensor, self.cells)
        action_pd = action_pd.detach().cpu().numpy()
        action_pd[:, self.action_dim:] *= np.exp(self.noise)

        if self.agent_mode != 'eval_deterministic':
            action_choice = self.pd.sample(action_pd)
        else:
            action_choice = self.pd.maxprob(action_pd)
        np.clip(action_choice, -1, 1, out=action_choice)

        action_choice = action_choice.reshape((-1,))
        action_pd = action_pd.reshape((-1,))
        action_info[1].append(action_pd)
        if self.env_config.action_spec['type'] == 'discrete':
            action_choice = np.argmax(action_choice)

        if self.agent_mode != 'training':
            return action_choice
        else:
            time.sleep(self.env_config.sleep_time)
            return action_choice, action_info
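# Illustrative sketch (assumed shapes, not part of act() above) of the action_info
# layout returned in training mode for an RNN policy. The values rnn_layer=1,
# rnn_hidden=64 and action_dim=3 are placeholders; the (2 * action_dim,) shape for
# action_pd assumes DiagGauss packs mean and std side by side, as suggested by the
# slice action_pd[:, self.action_dim:] above.
#
#   action_info = [
#       [h, c],        # onetime infos: numpy arrays of shape (rnn_layer, rnn_hidden)
#       [action_pd],   # persistent info: flattened policy params, shape (2 * action_dim,)
#   ]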
def preprocess(self, batch):
    '''
    Override for learner/base/preprocess. Before learn() is called, preprocess()
    takes the batch and converts the numpy arrays to pytorch tensors. Note that
    this operation will transfer the data to gpu if a gpu is used.

    Arguments:
        batch: a batch of numpy arrays from the replay memory
    '''
    # Convert all numpy arrays to pytorch tensors, and transfer to gpu if applicable
    with tx.device_scope(self.gpu_ids):
        obs, actions, rewards, obs_next, done = (batch['obs'],
                                                 batch['actions'],
                                                 batch['rewards'],
                                                 batch['obs_next'],
                                                 batch['dones'])
        device_name = 'cpu'
        if self._num_gpus > 0:
            device_name = 'cuda'
        for modality in obs:
            for key in obs[modality]:
                if modality == 'pixel':
                    obs[modality][key] = (torch.tensor(
                        obs[modality][key], dtype=torch.uint8).to(
                            torch.device(device_name))).float().detach()
                else:
                    obs[modality][key] = (torch.tensor(
                        obs[modality][key], dtype=torch.float32).to(
                            torch.device(device_name))).detach()
        for modality in obs_next:
            for key in obs_next[modality]:
                if modality == 'pixel':
                    obs_next[modality][key] = (torch.tensor(
                        obs_next[modality][key], dtype=torch.uint8).to(
                            torch.device(device_name))).float().detach()
                else:
                    obs_next[modality][key] = (torch.tensor(
                        obs_next[modality][key], dtype=torch.float32).to(
                            torch.device(device_name))).detach()
        actions = torch.tensor(actions, dtype=torch.float32).to(
            torch.device(device_name))
        rewards = torch.tensor(rewards, dtype=torch.float32).to(
            torch.device(device_name))
        done = torch.tensor(done, dtype=torch.float32).to(
            torch.device(device_name))
        (batch['obs'], batch['actions'], batch['rewards'], batch['obs_next'],
         batch['dones']) = (obs, actions, rewards, obs_next, done)
        return batch
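# Illustrative sketch (assumed shapes, not part of preprocess above): pixel
# observations arrive from replay as uint8 and are only cast to float32 after the
# device transfer, while low-dimensional modalities are converted to float32 directly.
def _example_pixel_to_tensor():
    import numpy as np
    import torch
    pixels = np.zeros((32, 12, 84, 84), dtype=np.uint8)  # (N, C, H, W) placeholder batch
    t = torch.tensor(pixels, dtype=torch.uint8).float().detach()
    assert t.dtype == torch.float32
    return t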
def reset(self):
    '''
    Resets the LSTM hidden and cell states.
    '''
    if self.rnn_config.if_rnn_policy:
        # Note that .detach() is necessary here to prevent memory from growing
        # without bound: otherwise a rollout thousands of steps long would keep
        # previously accumulated hidden/cell states from being freed.
        with tx.device_scope(self.gpu_ids):
            self.cells = (
                torch.zeros(self.rnn_config.rnn_layer,
                            1,  # batch_size is 1
                            self.rnn_config.rnn_hidden).detach(),
                torch.zeros(self.rnn_config.rnn_layer,
                            1,  # batch_size is 1
                            self.rnn_config.rnn_hidden).detach())
def __init__(self,
             learner_config,
             env_config,
             session_config,
             agent_id,
             agent_mode,
             render=False):
    super().__init__(
        learner_config=learner_config,
        env_config=env_config,
        session_config=session_config,
        agent_id=agent_id,
        agent_mode=agent_mode,
        render=render,
    )
    self.action_dim = self.env_config.action_spec.dim[0]
    self.obs_spec = self.env_config.obs_spec
    self.use_z_filter = self.learner_config.algo.use_z_filter
    self.init_log_sig = self.learner_config.algo.consts.init_log_sig
    self.log_sig_range = self.learner_config.algo.consts.log_sig_range

    # setting agent mode
    if self.agent_mode != 'training':
        if self.env_config.stochastic_eval:
            self.agent_mode = 'eval_stochastic'
        else:
            self.agent_mode = 'eval_deterministic'

    if self.agent_mode != 'training':
        self.noise = 0
    else:
        self.noise = np.random.uniform(low=-self.log_sig_range,
                                       high=self.log_sig_range)

    self.rnn_config = self.learner_config.algo.rnn

    # GPU setup
    # TODO: deprecate
    self._num_gpus = session_config.agent.num_gpus
    if torch.cuda.is_available():
        self.gpu_ids = 'cuda:all'
        self.log.info('PPO agent is using GPU')
        # Note that the user is responsible for providing only one GPU to the program
        self.log.info('cudnn version: {}'.format(
            torch.backends.cudnn.version()))
        torch.backends.cudnn.benchmark = True
    else:
        self.gpu_ids = 'cpu'
        self.log.info('PPO agent is using CPU')

    self.pd = DiagGauss(self.action_dim)
    self.cells = None

    with tx.device_scope(self.gpu_ids):
        if self.rnn_config.if_rnn_policy:
            # Note that .detach() is necessary here to prevent memory from growing
            # without bound: otherwise a rollout thousands of steps long would keep
            # previously accumulated hidden/cell states from being freed.
            self.cells = (
                torch.zeros(self.rnn_config.rnn_layer,
                            1,  # batch_size is 1
                            self.rnn_config.rnn_hidden).detach(),
                torch.zeros(self.rnn_config.rnn_layer,
                            1,  # batch_size is 1
                            self.rnn_config.rnn_hidden).detach())

        self.model = PPOModel(
            obs_spec=self.obs_spec,
            action_dim=self.action_dim,
            model_config=self.learner_config.model,
            use_cuda=False,
            init_log_sig=self.init_log_sig,
            use_z_filter=self.use_z_filter,
            if_pixel_input=self.env_config.pixel_input,
            rnn_config=self.rnn_config,
        )
def __init__(self, learner_config, env_config, session_config):
    super().__init__(learner_config, env_config, session_config)
    self.current_iteration = 0

    # load multiple optimization instances onto a single gpu
    self.batch_size = self.learner_config.replay.batch_size
    self.discount_factor = self.learner_config.algo.gamma
    self.n_step = self.learner_config.algo.n_step
    self.is_pixel_input = self.env_config.pixel_input
    self.use_layernorm = self.learner_config.model.use_layernorm
    self.use_double_critic = self.learner_config.algo.network.use_double_critic
    self.use_action_regularization = self.learner_config.algo.network.use_action_regularization
    self.frame_stack_concatenate_on_env = self.env_config.frame_stack_concatenate_on_env

    self.log.info('Initializing DDPG learner')
    self._num_gpus = session_config.learner.num_gpus
    if not torch.cuda.is_available():
        self.gpu_ids = 'cpu'
        self.log.info('Using CPU')
    else:
        self.gpu_ids = 'cuda:all'
        self.log.info('Using GPU')
        self.log.info('cudnn version: {}'.format(
            torch.backends.cudnn.version()))
        torch.backends.cudnn.benchmark = True
        self._num_gpus = 1

    with tx.device_scope(self.gpu_ids):
        self._target_update_init()

        self.clip_actor_gradient = self.learner_config.algo.network.clip_actor_gradient
        if self.clip_actor_gradient:
            self.actor_gradient_clip_value = \
                self.learner_config.algo.network.actor_gradient_value_clip
            self.log.info('Clipping actor gradient at {}'.format(
                self.actor_gradient_clip_value))

        self.clip_critic_gradient = self.learner_config.algo.network.clip_critic_gradient
        if self.clip_critic_gradient:
            self.critic_gradient_clip_value = \
                self.learner_config.algo.network.critic_gradient_value_clip
            self.log.info('Clipping critic gradient at {}'.format(
                self.critic_gradient_clip_value))

        self.action_dim = self.env_config.action_spec.dim[0]
        self.model = DDPGModel(
            obs_spec=self.env_config.obs_spec,
            action_dim=self.action_dim,
            use_layernorm=self.use_layernorm,
            actor_fc_hidden_sizes=self.learner_config.model.actor_fc_hidden_sizes,
            critic_fc_hidden_sizes=self.learner_config.model.critic_fc_hidden_sizes,
            conv_out_channels=self.learner_config.model.conv_spec.out_channels,
            conv_kernel_sizes=self.learner_config.model.conv_spec.kernel_sizes,
            conv_strides=self.learner_config.model.conv_spec.strides,
            conv_hidden_dim=self.learner_config.model.conv_spec.hidden_output_dim,
        )
        self.model_target = DDPGModel(
            obs_spec=self.env_config.obs_spec,
            action_dim=self.action_dim,
            use_layernorm=self.use_layernorm,
            actor_fc_hidden_sizes=self.learner_config.model.actor_fc_hidden_sizes,
            critic_fc_hidden_sizes=self.learner_config.model.critic_fc_hidden_sizes,
            conv_out_channels=self.learner_config.model.conv_spec.out_channels,
            conv_kernel_sizes=self.learner_config.model.conv_spec.kernel_sizes,
            conv_strides=self.learner_config.model.conv_spec.strides,
            conv_hidden_dim=self.learner_config.model.conv_spec.hidden_output_dim,
        )
        if self.use_double_critic:
            self.model2 = DDPGModel(
                obs_spec=self.env_config.obs_spec,
                action_dim=self.action_dim,
                use_layernorm=self.use_layernorm,
                actor_fc_hidden_sizes=self.learner_config.model.actor_fc_hidden_sizes,
                critic_fc_hidden_sizes=self.learner_config.model.critic_fc_hidden_sizes,
                conv_out_channels=self.learner_config.model.conv_spec.out_channels,
                conv_kernel_sizes=self.learner_config.model.conv_spec.kernel_sizes,
                conv_strides=self.learner_config.model.conv_spec.strides,
                conv_hidden_dim=self.learner_config.model.conv_spec.hidden_output_dim,
                critic_only=True,
            )
            self.model_target2 = DDPGModel(
                obs_spec=self.env_config.obs_spec,
                action_dim=self.action_dim,
                use_layernorm=self.use_layernorm,
                actor_fc_hidden_sizes=self.learner_config.model.actor_fc_hidden_sizes,
                critic_fc_hidden_sizes=self.learner_config.model.critic_fc_hidden_sizes,
                conv_out_channels=self.learner_config.model.conv_spec.out_channels,
                conv_kernel_sizes=self.learner_config.model.conv_spec.kernel_sizes,
                conv_strides=self.learner_config.model.conv_spec.strides,
                conv_hidden_dim=self.learner_config.model.conv_spec.hidden_output_dim,
                critic_only=True,
            )

        self.critic_criterion = nn.MSELoss()

        self.log.info('Using Adam for critic with learning rate {}'.format(
            self.learner_config.algo.network.lr_critic))
        self.critic_optim = torch.optim.Adam(
            self.model.get_critic_parameters(),
            lr=self.learner_config.algo.network.lr_critic,
            weight_decay=self.learner_config.algo.network.critic_regularization  # weight regularization term
        )

        self.log.info('Using Adam for actor with learning rate {}'.format(
            self.learner_config.algo.network.lr_actor))
        self.actor_optim = torch.optim.Adam(
            self.model.get_actor_parameters(),
            lr=self.learner_config.algo.network.lr_actor,
            weight_decay=self.learner_config.algo.network.actor_regularization  # weight regularization term
        )

        if self.use_double_critic:
            self.log.info('Using Adam for critic with learning rate {}'.format(
                self.learner_config.algo.network.lr_critic))
            self.critic_optim2 = torch.optim.Adam(
                self.model2.get_critic_parameters(),
                lr=self.learner_config.algo.network.lr_critic,
                weight_decay=self.learner_config.algo.network.critic_regularization  # weight regularization term
            )

        self.log.info('Using {}-step bootstrapped return'.format(
            self.learner_config.algo.n_step))

        self.frame_stack_preprocess = FrameStackPreprocessor(
            self.env_config.frame_stacks)
        self.aggregator = SSARAggregator(self.env_config.obs_spec,
                                         self.env_config.action_spec)

        self.model_target.actor.hard_update(self.model.actor)
        self.model_target.critic.hard_update(self.model.critic)
        if self.use_double_critic:
            self.model_target2.critic.hard_update(self.model2.critic)

        self.total_learn_time = U.TimeRecorder()
        self.forward_time = U.TimeRecorder()
        self.critic_update_time = U.TimeRecorder()
        self.actor_update_time = U.TimeRecorder()
def _optimize(self, obs, actions, rewards, obs_next, done):
    '''
    Note that while the replay contains uint8, the aggregator returns
    float32 tensors.

    Arguments:
        obs: an observation from the minibatch, often represented as s_n in
            literature. Dimensionality: (N, C) for low dimensional inputs,
            (N, C, H, W) for pixel inputs
        actions: actions taken given observations obs, often represented as
            a_n in literature. Dimensionality: (N, A), where A is the
            dimensionality of a single action
        rewards: rewards received after action is taken. Dimensionality: N
        obs_next: an observation from the minibatch, often represented as
            s_{n+1} in literature
        done: 1 if obs_next is terminal, 0 otherwise. Dimensionality: N
    '''
    with tx.device_scope(self.gpu_ids):
        with self.forward_time.time():
            assert actions.max().item() <= 1.0
            assert actions.min().item() >= -1.0

            # estimate rewards using the next state: r + argmax_a Q'(s_{t+1}, u'(a))
            model_policy, next_Q_target = self.model_target.forward(obs_next)
            if self.use_action_regularization:
                # https://github.com/sfujim/TD3/blob/master/TD3.py -- action regularization
                policy_noise = 0.2
                noise_clip = 0.5
                batch_size = self.batch_size
                noise = np.clip(
                    np.random.normal(0, policy_noise,
                                     size=(batch_size, self.action_dim)),
                    -noise_clip, noise_clip)
                device_name = 'cpu'
                if self._num_gpus > 0:
                    device_name = 'cuda'
                model_policy += torch.tensor(
                    noise, dtype=torch.float32).to(device_name).detach()
                model_policy = model_policy.clamp(-1, 1).to(device_name)
            y = rewards + pow(self.discount_factor,
                              self.n_step) * next_Q_target * (1.0 - done)
            if self.use_double_critic:
                _, next_Q_target2 = self.model_target2.forward(
                    obs_next, action=model_policy)
                y2 = rewards + pow(self.discount_factor,
                                   self.n_step) * next_Q_target2 * (1.0 - done)
                y = torch.min(y, y2)
            y = y.detach()

            # compute Q(s_t, a_t)
            perception = self.model.forward_perception(obs)
            y_policy = self.model.forward_critic(perception, actions.detach())
            y_policy2 = None
            if self.use_double_critic:
                perception2 = self.model2.forward_perception(obs)
                y_policy2 = self.model2.forward_critic(
                    perception2, actions.detach())

        # critic update
        with self.critic_update_time.time():
            self.model.critic.zero_grad()
            if self.is_pixel_input:
                self.model.perception.zero_grad()
            critic_loss = self.critic_criterion(y_policy, y)
            critic_loss.backward()
            if self.clip_critic_gradient:
                self.model.critic.clip_grad_value(
                    self.critic_gradient_clip_value)
            self.critic_optim.step()

            if self.use_double_critic:
                self.model2.critic.zero_grad()
                if self.is_pixel_input:
                    self.model2.perception.zero_grad()
                critic_loss = self.critic_criterion(y_policy2, y)
                critic_loss.backward()
                if self.clip_critic_gradient:
                    self.model2.critic.clip_grad_value(
                        self.critic_gradient_clip_value)
                self.critic_optim2.step()

        # actor update
        with self.actor_update_time.time():
            self.model.actor.zero_grad()
            actor_loss = -self.model.forward_critic(
                perception.detach(),
                self.model.forward_actor(perception.detach()))
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            if self.clip_actor_gradient:
                self.model.actor.clip_grad_value(
                    self.actor_gradient_clip_value)
            self.actor_optim.step()

        tensorplex_update_dict = {
            'actor_loss': actor_loss.item(),
            'critic_loss': critic_loss.item(),
            'action_norm': actions.norm(2, 1).mean().item(),
            'rewards': rewards.mean().item(),
            'Q_target': y.mean().item(),
            'Q_policy': y_policy.mean().item(),
            'performance/forward_time': self.forward_time.avg,
            'performance/critic_update_time': self.critic_update_time.avg,
            'performance/actor_update_time': self.actor_update_time.avg,
        }
        if self.use_double_critic:
            tensorplex_update_dict['Q_policy2'] = y_policy2.mean().item()

        # (possibly) update target networks
        self._target_update()
        return tensorplex_update_dict
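# Illustrative sketch of the bootstrapped target built in _optimize above:
#     y = r + gamma^n * min(Q1'(s', a'), Q2'(s', a')) * (1 - done)
# The tensors below are random placeholders; batch size, gamma and n_step are
# assumed values, and next_q1/next_q2 stand in for the two target-critic outputs.
def _example_double_critic_target():
    import torch
    batch_size, gamma, n_step = 32, 0.99, 3
    rewards = torch.randn(batch_size, 1)
    done = torch.zeros(batch_size, 1)
    next_q1 = torch.randn(batch_size, 1)  # target critic 1 at the (noised) policy action
    next_q2 = torch.randn(batch_size, 1)  # target critic 2 at the (noised) policy action
    y = rewards + (gamma ** n_step) * torch.min(next_q1, next_q2) * (1.0 - done)
    return y.detach()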
def new_tensor(device_id, value):
    with tx.device_scope(device_id, dtype=torch.float64):
        return torch.ones(SHAPE) * value
def __init__(self,
             learner_config,
             env_config,
             session_config,
             agent_id,
             agent_mode,
             render=False):
    '''
    Constructor for DDPGAgent class.

    Important attributes:
        learner_config, env_config, session_config: experiment configurations
        agent_id: unique id in the range [0, num_agents)
        agent_mode: toggles between agent noise and deterministic behavior
    '''
    super().__init__(
        learner_config=learner_config,
        env_config=env_config,
        session_config=session_config,
        agent_id=agent_id,
        agent_mode=agent_mode,
        render=render,
    )
    self.agent_id = agent_id
    self.action_dim = self.env_config.action_spec.dim[0]
    self.obs_spec = self.env_config.obs_spec
    self.use_layernorm = self.learner_config.model.use_layernorm
    self.sleep_time = self.env_config.sleep_time

    self.param_noise = None
    self.param_noise_type = self.learner_config.algo.exploration.param_noise_type
    self.param_noise_sigma = self.learner_config.algo.exploration.param_noise_sigma
    self.param_noise_alpha = self.learner_config.algo.exploration.param_noise_alpha
    self.param_noise_target_stddev = self.learner_config.algo.exploration.param_noise_target_stddev
    self.frame_stack_concatenate_on_env = self.env_config.frame_stack_concatenate_on_env

    self.noise_type = self.learner_config.algo.exploration.noise_type
    if env_config.num_agents == 1:
        # If there is only one agent, we don't want a sigma of 0
        self.sigma = self.learner_config.algo.exploration.max_sigma / 3.0
    else:
        self.sigma = self.learner_config.algo.exploration.max_sigma * (
            float(agent_id) / (env_config.num_agents))
    # self.sigma = self.learner_config.algo.exploration.sigma
    print('Using exploration sigma', self.sigma)

    if torch.cuda.is_available():
        self.gpu_ids = 'cuda:all'
        self.log.info('DDPG agent is using GPU')
        # Note that the user is responsible for providing only one GPU to the program
        self.log.info('cudnn version: {}'.format(
            torch.backends.cudnn.version()))
        torch.backends.cudnn.benchmark = True
    else:
        self.gpu_ids = 'cpu'
        self.log.info('DDPG agent is using CPU')

    with tx.device_scope(self.gpu_ids):
        self.model = DDPGModel(
            obs_spec=self.obs_spec,
            action_dim=self.action_dim,
            use_layernorm=self.use_layernorm,
            actor_fc_hidden_sizes=self.learner_config.model.actor_fc_hidden_sizes,
            critic_fc_hidden_sizes=self.learner_config.model.critic_fc_hidden_sizes,
            conv_out_channels=self.learner_config.model.conv_spec.out_channels,
            conv_kernel_sizes=self.learner_config.model.conv_spec.kernel_sizes,
            conv_strides=self.learner_config.model.conv_spec.strides,
            conv_hidden_dim=self.learner_config.model.conv_spec.hidden_output_dim,
        )
        self.model.eval()
        self._init_noise()
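# Illustrative sketch of how the exploration sigma above spreads across a fleet of
# agents: agent 0 is (almost) greedy and higher-id agents explore more. The
# max_sigma and num_agents defaults below are assumed placeholder values.
def _example_exploration_sigmas(max_sigma=0.3, num_agents=8):
    if num_agents == 1:
        return [max_sigma / 3.0]  # avoid a sigma of exactly 0
    return [max_sigma * (float(agent_id) / num_agents)
            for agent_id in range(num_agents)]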
def __init__(self, learner_config, env_config, session_config):
    super().__init__(learner_config, env_config, session_config)

    # GPU setting
    self.current_iteration = 0
    self.global_step = 0
    if not torch.cuda.is_available():
        self.gpu_option = 'cpu'
    else:
        self.gpu_option = 'cuda:all'
    self.use_cuda = torch.cuda.is_available()

    if not self.use_cuda:
        self.log.info('Using CPU')
    else:
        self.log.info('Using GPU: {}'.format(self.gpu_option))

    # RL general parameters
    self.gamma = self.learner_config.algo.gamma
    self.lam = self.learner_config.algo.advantage.lam
    self.n_step = self.learner_config.algo.n_step
    self.use_z_filter = self.learner_config.algo.use_z_filter
    self.use_r_filter = self.learner_config.algo.use_r_filter
    self.norm_adv = self.learner_config.algo.advantage.norm_adv
    self.batch_size = self.learner_config.replay.batch_size

    self.action_dim = self.env_config.action_spec.dim[0]
    self.obs_spec = self.env_config.obs_spec
    self.init_log_sig = self.learner_config.algo.consts.init_log_sig

    # PPO parameters
    self.ppo_mode = self.learner_config.algo.ppo_mode
    self.if_rnn_policy = self.learner_config.algo.rnn.if_rnn_policy
    self.horizon = self.learner_config.algo.rnn.horizon
    self.lr_actor = self.learner_config.algo.network.lr_actor
    self.lr_critic = self.learner_config.algo.network.lr_critic
    self.epoch_policy = self.learner_config.algo.consts.epoch_policy
    self.epoch_baseline = self.learner_config.algo.consts.epoch_baseline
    self.kl_target = self.learner_config.algo.consts.kl_target
    self.adjust_threshold = self.learner_config.algo.consts.adjust_threshold
    self.reward_scale = self.learner_config.algo.advantage.reward_scale

    # PPO mode 'adapt'
    self.kl_cutoff_coeff = self.learner_config.algo.adapt_consts.kl_cutoff_coeff
    self.beta_init = self.learner_config.algo.adapt_consts.beta_init
    self.beta_range = self.learner_config.algo.adapt_consts.beta_range

    # PPO mode 'clip'
    self.clip_range = self.learner_config.algo.clip_consts.clip_range
    self.clip_epsilon_init = self.learner_config.algo.clip_consts.clip_epsilon_init

    if self.ppo_mode == 'adapt':
        self.beta = self.beta_init
        self.eta = self.kl_cutoff_coeff
        self.beta_upper = self.beta_range[1]
        self.beta_lower = self.beta_range[0]
        self.beta_adjust_threshold = self.adjust_threshold
    else:  # method == 'clip'
        self.clip_epsilon = self.clip_epsilon_init
        self.clip_adjust_threshold = self.adjust_threshold
        self.clip_upper = self.clip_range[1]
        self.clip_lower = self.clip_range[0]

    # learning rate setting:
    self.min_lr = self.learner_config.algo.network.anneal.min_lr
    self.lr_update_frequency = self.learner_config.algo.network.anneal.lr_update_frequency
    self.frames_to_anneal = self.learner_config.algo.network.anneal.frames_to_anneal
    num_updates = int(self.frames_to_anneal /
                      self.learner_config.parameter_publish.exp_interval)
    lr_scheduler = eval(self.learner_config.algo.network.anneal.lr_scheduler)

    self.exp_counter = 0
    self.kl_record = []

    with tx.device_scope(self.gpu_option):
        self.model = PPOModel(
            obs_spec=self.obs_spec,
            action_dim=self.action_dim,
            model_config=self.learner_config.model,
            use_cuda=self.use_cuda,
            init_log_sig=self.init_log_sig,
            use_z_filter=self.use_z_filter,
            if_pixel_input=self.env_config.pixel_input,
            rnn_config=self.learner_config.algo.rnn,
        )
        self.ref_target_model = PPOModel(
            obs_spec=self.obs_spec,
            action_dim=self.action_dim,
            model_config=self.learner_config.model,
            use_cuda=self.use_cuda,
            init_log_sig=self.init_log_sig,
            use_z_filter=self.use_z_filter,
            if_pixel_input=self.env_config.pixel_input,
            rnn_config=self.learner_config.algo.rnn,
        )
        self.ref_target_model.update_target_params(self.model)

        # Learning parameters and optimizer
        self.clip_actor_gradient = self.learner_config.algo.network.clip_actor_gradient
        self.actor_gradient_clip_value = self.learner_config.algo.network.actor_gradient_norm_clip
        self.clip_critic_gradient = self.learner_config.algo.network.clip_critic_gradient
        self.critic_gradient_clip_value = self.learner_config.algo.network.critic_gradient_norm_clip

        self.critic_optim = torch.optim.Adam(
            self.model.get_critic_params(),
            lr=self.lr_critic,
            weight_decay=self.learner_config.algo.network.critic_regularization)
        self.actor_optim = torch.optim.Adam(
            self.model.get_actor_params(),
            lr=self.lr_actor,
            weight_decay=self.learner_config.algo.network.actor_regularization)

        # learning rate scheduler
        self.actor_lr_scheduler = lr_scheduler(
            self.actor_optim,
            num_updates,
            update_freq=self.lr_update_frequency,
            min_lr=self.min_lr)
        self.critic_lr_scheduler = lr_scheduler(
            self.critic_optim,
            num_updates,
            update_freq=self.lr_update_frequency,
            min_lr=self.min_lr)

        # Experience Aggregator
        self.aggregator = MultistepAggregatorWithInfo(
            self.env_config.obs_spec, self.env_config.action_spec)

        # probability distribution. Gaussian only for now
        self.pd = DiagGauss(self.action_dim)

        # placeholder for RNN hidden cells
        self.cells = None

        # Reward white-filtering
        if self.use_r_filter:
            self.reward_filter = RewardFilter()
def _optimize(self, obs, actions, rewards, obs_next, persistent_infos,
              onetime_infos, dones):
    '''
    Main method for optimization. Calls _adapt/clip_update and _value_update
    epoch_policy and epoch_baseline times respectively.

    Args:
        obs: batch of observations (batch_size, N-step, obs_dim)
        obs_next: batch of next observations (batch_size, 1, obs_dim)
        actions: batch of actions (batch_size, N-step, act_dim)
        rewards: batch of rewards (batch_size, N-step)
        dones: batch of termination flags (batch_size, N-step)
        persistent_infos, onetime_infos: lists of other batched attributes that
            are tracked, such as the behavior policy, RNN hidden states, etc.

    Returns:
        dictionary of recorded statistics
    '''
    # convert everything to float tensor:
    with tx.device_scope(self.gpu_option):
        pds = persistent_infos[-1]

        if self.if_rnn_policy:
            h = (onetime_infos[0].transpose(0, 1).contiguous()).detach()
            c = (onetime_infos[1].transpose(0, 1).contiguous()).detach()
            self.cells = (h, c)

        advantages, returns = self._gae_and_return(obs, obs_next, rewards,
                                                   dones)
        advantages = advantages.detach()
        returns = returns.detach()

        if self.if_rnn_policy:
            h = self.cells[0].detach()
            c = self.cells[1].detach()
            self.cells = (h, c)
            eff_len = self.n_step - self.horizon + 1
            behave_pol = pds[:, :eff_len, :].contiguous().detach()
            actions_iter = actions[:, :eff_len, :].contiguous().detach()
        else:
            behave_pol = pds[:, 0, :].contiguous().detach()
            actions_iter = actions[:, 0, :].contiguous().detach()

        obs_iter = {}
        for mod in obs.keys():
            obs_iter[mod] = {}
            for k in obs[mod].keys():
                if self.if_rnn_policy:
                    obs_iter[mod][k] = obs[mod][k][:, :self.n_step -
                                                   self.horizon + 1, :].contiguous().detach()
                else:
                    obs_iter[mod][k] = obs[mod][k][:, 0, :].contiguous().detach()

        ref_pol = self.ref_target_model.forward_actor(obs_iter,
                                                      self.cells).detach()

        for ep in range(self.epoch_policy):
            if self.ppo_mode == 'clip':
                stats = self._clip_update(obs_iter, actions_iter, advantages,
                                          behave_pol)
            else:
                stats = self._adapt_update(obs_iter, actions_iter, advantages,
                                           behave_pol, ref_pol)
            curr_pol = self.model.forward_actor(obs_iter, self.cells).detach()
            kl = self.pd.kl(ref_pol, curr_pol).mean()
            stats['_pol_kl'] = kl.item()
            if kl.item() > self.kl_target * 4:
                break

        self.kl_record.append(stats['_pol_kl'])

        for _ in range(self.epoch_baseline):
            baseline_stats = self._value_update(obs_iter, returns)

        # Collecting metrics and updating tensorplex
        for k in baseline_stats:
            stats[k] = baseline_stats[k]

        behave_likelihood = self.pd.likelihood(actions_iter, behave_pol)
        curr_likelihood = self.pd.likelihood(actions_iter, curr_pol)

        stats['_avg_return_targ'] = returns.mean().item()
        stats['_avg_log_sig'] = self.model.actor.log_var.mean().item()
        stats['_avg_behave_likelihood'] = behave_likelihood.mean().item()
        stats['_avg_is_weight'] = (curr_likelihood /
                                   (behave_likelihood + 1e-4)).mean().item()
        stats['_ref_behave_diff'] = self.pd.kl(ref_pol, behave_pol).mean().item()
        stats['_lr'] = self.actor_lr_scheduler.get_lr()[0]

        if self.use_z_filter:
            self.model.z_update(obs_iter)
            stats['obs_running_mean'] = np.mean(
                self.model.z_filter.running_mean())
            stats['obs_running_square'] = np.mean(
                self.model.z_filter.running_square())
            stats['obs_running_std'] = np.mean(
                self.model.z_filter.running_std())
        if self.use_r_filter:
            stats['reward_mean'] = self.reward_filter.reward_mean()

        return stats
def _preprocess_batch_ppo(self, batch):
    '''
    Loads experiences from numpy into torch.FloatTensor form.

    Args:
        batch: BeneDict of experiences containing the following attributes:
            'obs' - observation
            'actions' - actions
            'rewards' - rewards
            'obs_next' - next observation
            'persistent_infos' - action policy
            'onetime_infos' - RNN hidden cells or None

    Returns:
        BeneDict of torch.FloatTensors
    '''
    with tx.device_scope(self.gpu_option):
        obs, actions, rewards, obs_next, done, persistent_infos, onetime_infos = (
            batch['obs'],
            batch['actions'],
            batch['rewards'],
            batch['obs_next'],
            batch['dones'],
            batch['persistent_infos'],
            batch['onetime_infos'],
        )

        for modality in obs:
            for key in obs[modality]:
                obs[modality][key] = (torch.tensor(
                    obs[modality][key], dtype=torch.float32)).detach()
                obs_next[modality][key] = (torch.tensor(
                    obs_next[modality][key], dtype=torch.float32)).detach()

        actions = torch.tensor(actions, dtype=torch.float32)
        rewards = torch.tensor(rewards,
                               dtype=torch.float32) * self.reward_scale
        if self.use_r_filter:
            normed_reward = self.reward_filter.forward(rewards)
            self.reward_filter.update(rewards)
            rewards = normed_reward
        done = torch.tensor(done, dtype=torch.float32)

        if persistent_infos is not None:
            for i in range(len(persistent_infos)):
                persistent_infos[i] = torch.tensor(
                    persistent_infos[i], dtype=torch.float32).detach()
        if onetime_infos is not None:
            for i in range(len(onetime_infos)):
                onetime_infos[i] = torch.tensor(
                    onetime_infos[i], dtype=torch.float32).detach()

        (
            batch['obs'],
            batch['actions'],
            batch['rewards'],
            batch['obs_next'],
            batch['dones'],
            batch['persistent_infos'],
            batch['onetime_infos'],
        ) = (obs, actions, rewards, obs_next, done, persistent_infos,
             onetime_infos)
        return batch
def _gae_and_return(self, obs, obs_next, rewards, dones):
    '''
    Computes the generalized advantage estimate (GAE) and the corresponding
    N-step return. Details of the algorithm can be found here:
    https://arxiv.org/pdf/1506.02438.pdf

    Args:
        obs: batch of observations (batch_size, N-step, obs_dim)
        obs_next: batch of next observations (batch_size, 1, obs_dim)
        rewards: batch of rewards (batch_size, N-step)
        dones: batch of termination flags (batch_size, N-step)

    Returns:
        advantages: batch of advantages (batch_size, 1)
        returns: batch of returns (batch_size, 1)
    '''
    with tx.device_scope(self.gpu_option):
        index_set = torch.tensor(range(self.n_step), dtype=torch.float32)
        gamma = torch.pow(self.gamma, index_set)
        lam = torch.pow(self.lam, index_set)

        obs_concat_var = {}
        for mod in obs.keys():
            obs_concat_var[mod] = {}
            for k in obs[mod].keys():
                obs_concat_var[mod][k] = (torch.cat(
                    [obs[mod][k], obs_next[mod][k]], dim=1))
                if not self.if_rnn_policy:
                    obs_shape = obs_concat_var[mod][k].size()
                    obs_concat_var[mod][k] = obs_concat_var[mod][k].view(
                        -1, *obs_shape[2:])

        values = self.model.forward_critic(obs_concat_var, self.cells)
        values = values.view(self.batch_size, self.n_step + 1)
        values[:, 1:] *= 1 - dones

        if self.if_rnn_policy:
            tds = rewards + self.gamma * values[:, 1:] - values[:, :-1]
            eff_len = self.n_step - self.horizon + 1
            gamma = gamma[:self.horizon]
            lam = lam[:self.horizon]

            returns = torch.zeros(self.batch_size, eff_len)
            advs = torch.zeros(self.batch_size, eff_len)
            for step in range(eff_len):
                returns[:, step] = torch.sum(
                    gamma * rewards[:, step:step + self.horizon], 1) + \
                    values[:, step + self.horizon] * (self.gamma ** self.horizon)
                advs[:, step] = torch.sum(
                    tds[:, step:step + self.horizon] * gamma * lam, 1)

            if self.norm_adv:
                std = advs.std()
                mean = advs.mean()
                advs = (advs - mean) / max(std, 1e-4)
            return advs, returns
        else:
            returns = torch.sum(gamma * rewards, 1) + \
                values[:, -1] * (self.gamma ** self.n_step)
            tds = rewards + self.gamma * values[:, 1:] - values[:, :-1]
            gae = torch.sum(tds * gamma * lam, 1)
            if self.norm_adv:
                std = gae.std()
                mean = gae.mean()
                gae = (gae - mean) / max(std, 1e-4)
            return gae.view(-1, 1), returns.view(-1, 1)
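# Illustrative sketch of the non-RNN GAE branch of _gae_and_return above, written
# with plain numpy on a single toy trajectory. gamma, lam, n_step, the rewards and
# the value estimates are all assumed placeholder values.
def _example_gae(gamma=0.99, lam=0.97):
    import numpy as np
    n_step = 4
    rewards = np.array([1.0, 0.0, 0.5, 1.0])      # (n_step,)
    values = np.array([0.9, 0.8, 0.7, 0.6, 0.5])  # (n_step + 1,), includes V(s_{N+1})
    dones = np.zeros(n_step)

    values[1:] *= 1.0 - dones                          # zero out values of terminal states
    tds = rewards + gamma * values[1:] - values[:-1]   # one-step TD residuals
    discount = (gamma * lam) ** np.arange(n_step)
    gae = np.sum(tds * discount)                       # generalized advantage estimate
    n_step_return = (np.sum(gamma ** np.arange(n_step) * rewards)
                     + values[-1] * gamma ** n_step)
    return gae, n_step_return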