def __init__(self, base_name, config):
    a2c_common.DiscreteA2CBase.__init__(self, base_name, config)
    obs_shape = self.obs_shape

    # Config passed to the network builder; note this local dict shadows the
    # incoming `config` argument, which has already been consumed by the
    # base-class constructor at this point.
    config = {
        'actions_num': self.actions_num,
        'input_shape': obs_shape,
        'num_seqs': self.num_actors * self.num_agents,
        'value_size': self.env_info.get('value_size', 1)
    }
    self.model = self.network.build(config)
    self.model.to(self.ppo_device)
    self.init_rnn_from_model(self.model)
    self.last_lr = float(self.last_lr)
    self.optimizer = optim.Adam(self.model.parameters(), float(self.last_lr),
                                eps=1e-08, weight_decay=self.weight_decay)

    if self.normalize_input:
        if isinstance(self.observation_space, gym.spaces.Dict):
            self.running_mean_std = RunningMeanStdObs(obs_shape).to(self.ppo_device)
        else:
            self.running_mean_std = RunningMeanStd(obs_shape).to(self.ppo_device)

    if self.has_central_value:
        cv_config = {
            'state_shape': self.state_shape,
            'value_size': self.value_size,
            'ppo_device': self.ppo_device,
            'num_agents': self.num_agents,
            'num_steps': self.steps_num,
            'num_actors': self.num_actors,
            'num_actions': self.actions_num,
            'seq_len': self.seq_len,
            'model': self.central_value_config['network'],
            'config': self.central_value_config,
            'writter': self.writer,
            'multi_gpu': self.multi_gpu
        }
        self.central_value_net = central_value.CentralValueTrain(**cv_config).to(self.ppo_device)

    self.use_experimental_cv = self.config.get('use_experimental_cv', False)
    self.dataset = datasets.PPODataset(self.batch_size, self.minibatch_size,
                                       self.is_discrete, self.is_rnn,
                                       self.ppo_device, self.seq_len)
    self.algo_observer.after_init(self)
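# Illustrative sketch (not part of the class above) of how the network build
# config is assembled for this discrete agent. All concrete values below are
# hypothetical placeholders; only the dict keys mirror the constructor.
example_env_info = {'value_size': 1}              # hypothetical env_info
example_num_actors, example_num_agents = 16, 1
example_obs_shape, example_actions_num = (8,), 4

example_build_config = {
    'actions_num': example_actions_num,
    'input_shape': example_obs_shape,
    'num_seqs': example_num_actors * example_num_agents,
    'value_size': example_env_info.get('value_size', 1),
}
# self.network.build(example_build_config) would return the actor-critic
# model, which is then moved onto self.ppo_device.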
def __init__(self, base_name, params):
    a2c_common.ContinuousA2CBase.__init__(self, base_name, params)
    obs_shape = self.obs_shape
    build_config = {
        'actions_num': self.actions_num,
        'input_shape': obs_shape,
        'num_seqs': self.num_actors * self.num_agents,
        'value_size': self.env_info.get('value_size', 1),
        'normalize_value': self.normalize_value,
        'normalize_input': self.normalize_input,
    }

    self.model = self.network.build(build_config)
    self.model.to(self.ppo_device)
    self.states = None
    if self.ewma_ppo:
        self.ewma_model = EwmaModel(self.model, ewma_decay=0.889)
    self.init_rnn_from_model(self.model)
    self.last_lr = float(self.last_lr)
    self.bound_loss_type = self.config.get('bound_loss_type', 'bound')  # 'regularisation' or 'bound'
    self.optimizer = optim.Adam(self.model.parameters(), float(self.last_lr),
                                eps=1e-08, weight_decay=self.weight_decay)

    if self.has_central_value:
        cv_config = {
            'state_shape': self.state_shape,
            'value_size': self.value_size,
            'ppo_device': self.ppo_device,
            'num_agents': self.num_agents,
            'horizon_length': self.horizon_length,
            'num_actors': self.num_actors,
            'num_actions': self.actions_num,
            'seq_len': self.seq_len,
            'normalize_value': self.normalize_value,
            'network': self.central_value_config['network'],
            'config': self.central_value_config,
            'writter': self.writer,
            'max_epochs': self.max_epochs,
            'multi_gpu': self.multi_gpu
        }
        self.central_value_net = central_value.CentralValueTrain(**cv_config).to(self.ppo_device)

    self.use_experimental_cv = self.config.get('use_experimental_cv', True)
    self.dataset = datasets.PPODataset(self.batch_size, self.minibatch_size,
                                       self.is_discrete, self.is_rnn,
                                       self.ppo_device, self.seq_len)
    if self.normalize_value:
        self.value_mean_std = self.central_value_net.model.value_mean_std if self.has_central_value else self.model.value_mean_std

    # PPG auxiliary phase is optional; make sure the flag exists before it is
    # read in the has_value_loss expression below.
    self.has_phasic_policy_gradients = 'phasic_policy_gradients' in self.config
    if self.has_phasic_policy_gradients:
        self.ppg_aux_loss = ppg_aux.PPGAux(self, self.config['phasic_policy_gradients'])
    self.has_value_loss = (self.has_central_value and self.use_experimental_cv) \
        or (not self.has_phasic_policy_gradients and not self.has_central_value)
    self.algo_observer.after_init(self)
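# Illustrative sketch (not part of the class above) of the optional config
# entries this constructor reads. Values are hypothetical examples, not
# library defaults, except where the fallback is noted in the comment.
example_agent_config = {
    'bound_loss_type': 'regularisation',   # falls back to 'bound' above
    'use_experimental_cv': True,           # falls back to True above
    # 'phasic_policy_gradients': {...},    # optional: enables the PPG auxiliary loss
}
# With normalize_value enabled, the agent reuses the central value net's
# value_mean_std when a central value network exists, otherwise the model's own.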
def __init__(self, state_shape, value_size, ppo_device, num_agents, num_steps,
             num_actors, num_actions, seq_len, model, config, writter, multi_gpu):
    nn.Module.__init__(self)
    self.ppo_device = ppo_device
    self.num_agents, self.num_steps, self.num_actors, self.seq_len = num_agents, num_steps, num_actors, seq_len
    self.num_actions = num_actions
    self.state_shape = state_shape
    self.value_size = value_size
    self.multi_gpu = multi_gpu
    self.truncate_grads = config.get('truncate_grads', False)
    state_config = {
        'value_size': value_size,
        'input_shape': state_shape,
        'actions_num': num_actions,
        'num_agents': num_agents,
        'num_seqs': num_actors
    }
    self.config = config
    self.model = model.build('cvalue', **state_config)
    self.lr = config['lr']
    self.mini_epoch = config['mini_epochs']
    self.mini_batch = config['minibatch_size']
    self.num_minibatches = self.num_steps * self.num_actors // self.mini_batch
    self.clip_value = config['clip_value']
    self.normalize_input = config['normalize_input']
    self.writter = writter
    self.use_joint_obs_actions = config.get('use_joint_obs_actions', False)
    self.weight_decay = config.get('weight_decay', 0.0)
    self.optimizer = torch.optim.Adam(self.model.parameters(), float(self.lr),
                                      eps=1e-08, weight_decay=self.weight_decay)
    self.frame = 0
    self.running_mean_std = None
    self.grad_norm = config.get('grad_norm', 1)
    self.e_clip = config.get('e_clip', 0.2)
    self.truncate_grad = self.config.get('truncate_grads', False)
    if self.normalize_input:
        self.running_mean_std = RunningMeanStd(state_shape)

    self.is_rnn = self.model.is_rnn()
    self.rnn_states = None
    self.batch_size = self.num_steps * self.num_actors
    if self.is_rnn:
        self.rnn_states = self.model.get_default_rnn_state()
        self.rnn_states = [s.to(self.ppo_device) for s in self.rnn_states]
        num_seqs = self.num_steps * self.num_actors // self.seq_len
        # Sequences must not be split across minibatches.
        assert ((self.num_steps * self.num_actors // self.num_minibatches) % self.seq_len == 0)
        self.mb_rnn_states = [
            torch.zeros((s.size()[0], num_seqs, s.size()[2]),
                        dtype=torch.float32, device=self.ppo_device)
            for s in self.rnn_states
        ]

    self.dataset = datasets.PPODataset(self.batch_size, self.mini_batch, True,
                                       self.is_rnn, self.ppo_device, self.seq_len)
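# Small self-contained check (hypothetical numbers) mirroring the RNN
# assertion above: the number of samples per minibatch must be divisible by
# seq_len so that sequences are never split across minibatches.
_num_steps, _num_actors, _mini_batch, _seq_len = 32, 8, 64, 4
_num_minibatches = _num_steps * _num_actors // _mini_batch          # 4 minibatches
assert (_num_steps * _num_actors // _num_minibatches) % _seq_len == 0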
def __init__(self, base_name, config):
    a2c_common.ContinuousA2CBase.__init__(self, base_name, config)
    # Reorder (W, H, C) observation shapes to channels-first.
    obs_shape = torch_ext.shape_whc_to_cwh(self.obs_shape)

    # Config passed to the network builder; this local dict shadows the
    # incoming `config` argument, which has already been consumed by the
    # base-class constructor at this point.
    config = {
        'actions_num': self.actions_num,
        'input_shape': obs_shape,
        'num_seqs': self.num_actors * self.num_agents,
        'value_size': self.env_info.get('value_size', 1)
    }
    self.model = self.network.build(config)
    self.model.to(self.ppo_device)
    self.states = None
    self.init_rnn_from_model(self.model)
    self.last_lr = float(self.last_lr)
    self.optimizer = optim.Adam(self.model.parameters(), float(self.last_lr),
                                eps=1e-07, weight_decay=self.weight_decay)

    if self.normalize_input:
        self.running_mean_std = RunningMeanStd(obs_shape).to(self.ppo_device)

    if self.has_central_value:
        cv_config = {
            'state_shape': torch_ext.shape_whc_to_cwh(self.state_shape),
            'value_size': self.value_size,
            'ppo_device': self.ppo_device,
            'num_agents': self.num_agents,
            'num_steps': self.steps_num,
            'num_actors': self.num_actors,
            'num_actions': self.actions_num,
            'seq_len': self.seq_len,
            'model': self.central_value_config['network'],
            'config': self.central_value_config,
            'writter': self.writer
        }
        self.central_value_net = central_value.CentralValueTrain(**cv_config).to(self.ppo_device)

    self.use_experimental_cv = self.config.get('use_experimental_cv', True)
    self.dataset = datasets.PPODataset(self.batch_size, self.minibatch_size,
                                       self.is_discrete, self.is_rnn,
                                       self.ppo_device, self.seq_len)
    self.algo_observer.after_init(self)
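# Illustration of the channel reordering torch_ext.shape_whc_to_cwh is used
# for above, written as a hypothetical local helper rather than the library
# implementation: 3-dimensional (W, H, C) image shapes become channels-first.
def _example_whc_to_cwh(shape):
    return (shape[2], shape[0], shape[1]) if len(shape) == 3 else shape

assert _example_whc_to_cwh((84, 84, 4)) == (4, 84, 84)
assert _example_whc_to_cwh((17,)) == (17,)    # non-image shapes pass through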
def __init__(self, state_shape, value_size, ppo_device, num_agents, horizon_length,
             num_actors, num_actions, seq_len, normalize_value, network, config,
             writter, max_epochs, multi_gpu):
    nn.Module.__init__(self)
    self.ppo_device = ppo_device
    self.num_agents, self.horizon_length, self.num_actors, self.seq_len = num_agents, horizon_length, num_actors, seq_len
    self.normalize_value = normalize_value
    self.num_actions = num_actions
    self.state_shape = state_shape
    self.value_size = value_size
    self.max_epochs = max_epochs
    self.multi_gpu = multi_gpu
    self.truncate_grads = config.get('truncate_grads', False)
    self.config = config
    self.normalize_input = config['normalize_input']
    state_config = {
        'value_size': value_size,
        'input_shape': state_shape,
        'actions_num': num_actions,
        'num_agents': num_agents,
        'num_seqs': num_actors,
        'normalize_input': self.normalize_input,
        'normalize_value': self.normalize_value,
    }
    self.model = network.build(state_config)
    self.lr = float(config['learning_rate'])
    self.linear_lr = config.get('lr_schedule') == 'linear'
    if self.linear_lr:
        self.scheduler = schedulers.LinearScheduler(self.lr,
                                                    max_steps=self.max_epochs,
                                                    apply_to_entropy=False,
                                                    start_entropy_coef=0)
    else:
        self.scheduler = schedulers.IdentityScheduler()

    self.mini_epoch = config['mini_epochs']
    self.mini_batch = config['minibatch_size']
    self.num_minibatches = self.horizon_length * self.num_actors // self.mini_batch
    self.clip_value = config['clip_value']
    self.writter = writter
    self.weight_decay = config.get('weight_decay', 0.0)
    self.optimizer = torch.optim.Adam(self.model.parameters(), float(self.lr),
                                      eps=1e-08, weight_decay=self.weight_decay)
    self.frame = 0
    self.epoch_num = 0
    self.running_mean_std = None
    self.grad_norm = config.get('grad_norm', 1)
    self.e_clip = config.get('e_clip', 0.2)
    self.truncate_grad = self.config.get('truncate_grads', False)

    self.is_rnn = self.model.is_rnn()
    self.rnn_states = None
    self.batch_size = self.horizon_length * self.num_actors
    if self.is_rnn:
        self.rnn_states = self.model.get_default_rnn_state()
        self.rnn_states = [s.to(self.ppo_device) for s in self.rnn_states]
        total_agents = self.num_actors  # * self.num_agents
        num_seqs = self.horizon_length // self.seq_len
        # Sequences must not be split across minibatches.
        assert ((self.horizon_length * total_agents // self.num_minibatches) % self.seq_len == 0)
        self.mb_rnn_states = [
            torch.zeros((num_seqs, s.size()[0], total_agents, s.size()[2]),
                        dtype=torch.float32, device=self.ppo_device)
            for s in self.rnn_states
        ]

    self.dataset = datasets.PPODataset(self.batch_size, self.mini_batch, True,
                                       self.is_rnn, self.ppo_device, self.seq_len)
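# Hypothetical shape walk-through for the per-minibatch RNN state buffers
# allocated above: each buffer is (num_seqs, rnn_layers, total_agents, hidden).
# The numbers are placeholders chosen to satisfy the divisibility assertion.
_horizon_length, _num_actors, _seq_len, _minibatch = 32, 8, 4, 64
_total_agents = _num_actors
_num_seqs = _horizon_length // _seq_len                              # 8
_num_minibatches = _horizon_length * _num_actors // _minibatch       # 4
assert (_horizon_length * _total_agents // _num_minibatches) % _seq_len == 0
_rnn_layers, _hidden_size = 1, 256                                   # hypothetical RNN sizes
_mb_state_shape = (_num_seqs, _rnn_layers, _total_agents, _hidden_size)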