def __load(self):
    # support
    self.support_dataset = Cifar100(config=self.config,
                                    file='base',
                                    mode='support',
                                    transform=transforms.Compose(
                                        [transforms.ToTensor()]))
    self.support_sampler = Sampler(
        labels=self.support_dataset.label,
        n_way=self.config['sampler']['train']['n_way'],
        k_samples=self.config['sampler']['train']['k_shot'],
        n_episodes=self.config['sampler']['train']['episodes'])
    self.support_dataloader = DataLoader(dataset=self.support_dataset,
                                         batch_sampler=self.support_sampler)

    # query
    self.query_dataset = Cifar100(config=self.config,
                                  file='base',
                                  mode='query',
                                  transform=transforms.Compose(
                                      [transforms.ToTensor()]))
    self.query_sampler = Sampler(
        labels=self.query_dataset.label,
        n_way=self.config['sampler']['train']['n_way'],
        k_samples=self.config['sampler']['train']['k_query'],
        n_episodes=self.config['sampler']['train']['episodes'])
    self.query_dataloader = DataLoader(dataset=self.query_dataset,
                                       batch_sampler=self.query_sampler)
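# A hedged sketch of how the episodic loaders built in __load above might be consumed: each batch
# from the support and query DataLoaders is one few-shot episode defined by the Sampler, so the two
# loaders are iterated in lock-step. This assumes both samplers use the same n_way and episode count
# (as in __load); compute_prototypes and prototypical_loss are hypothetical helpers for illustration.
def __train_episodes(self):
    for (support_x, support_y), (query_x, query_y) in zip(self.support_dataloader,
                                                          self.query_dataloader):
        prototypes = compute_prototypes(support_x, support_y)   # hypothetical helper
        loss = prototypical_loss(prototypes, query_x, query_y)  # hypothetical helper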
class SAC(pl.LightningModule):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.hparams = config
        self.env = env_selector(
            self.hparams)  # TODO: normalization is not required, but will it be needed?
        self.eval_env = env_selector(self.hparams, config.seed + 1)
        self.Da = self.env.action_space.flat_dim
        # includes the skill in case the env is option wrapped
        self.Do = self.env.observation_space.flat_dim

        # Value-function MLPs with ReLU hidden non-linearities, no output non-linearity,
        # Xavier init for weights and zero init for biases.
        self.qf = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        self.vf = ValueFunction(self.Do, [config.layer_size, config.layer_size])
        self.vf_target = ValueFunction(self.Do,
                                       [config.layer_size, config.layer_size])
        self.vf_target.load_state_dict(self.vf.state_dict())

        # Replay buffer for state(+skill) and action.
        self.pool = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )

        # GMM policy with K mixtures, no reparametrization trick, regularization.
        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            qf=self.qf,
            reg=config.reg,
            device=self.hparams.device)

        self.modules = [
            "Policy", self.policy, "QF", self.qf, "VF", self.vf, "VF_Target",
            self.vf_target
        ]
        # TODO: add assertion to test qf of policy and qf of model.
        self.sampler = Sampler(self.env, config.max_path_length)

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with _
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy
        self._save_full_state = config.save_full_state

        # Runs on CPU: the initial sampling was moved out of __init__ (see on_sanity_check_start below)
        # to avoid a bug in DIAYN and to use the GPU instead of the CPU. Models are transferred to the
        # GPU only by the trainer, which happens after the LightningModule __init__; this is also why
        # the wandb logger is not available here.
        # TODO: remove device logic in Policy
        self.batch_idx = None
        # torch.autograd.set_detect_anomaly(True)  # TODO: enable only when debugging; adds compute overhead

    def get_best_skill(self, policy, env, num_skills, max_path_length, n_paths=1):
        print('Finding best skill...')
        reward_list = []
        with policy.deterministic(self.hparams.deterministic_eval):
            for z in range(num_skills):
                env.reset(state=None, skill=z)
                total_returns = 0
                sampler = Sampler(env, max_path_length)
                for p in range(n_paths):
                    new_paths = sampler.sample(max_path_length, policy)
                    total_returns += new_paths[-1]['path_return']
                print('Reward for skill %d = %.3f' % (z, total_returns))
                reward_list.append(total_returns)
        best_z = np.argmax(reward_list)
        print('Best skill found: z = %d, reward = %.3f, seed = %d' %
              (best_z, reward_list[best_z], self.hparams.seed))
        return best_z

    def on_sanity_check_start(self) -> None:
        self.pool.add_samples(
            self.sampler.sample(self.hparams.min_pool_size, self.policy))
        print("Initialized Replay Buffer with %d samples" % self.pool.size)

    def __dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        dataset = RLDataset(self.pool, self.hparams.epoch_length,
                            self.hparams.batch_size)
        # TODO: figure out why the reference code uses episode length above instead of batch size

        def _init_fn(worker_id):
            np.random.seed(self.hparams.seed + worker_id)

        dataloader = DataLoader(dataset=dataset,
                                batch_size=self.hparams.batch_size,
                                num_workers=self.hparams.num_workers,
                                worker_init_fn=_init_fn)
        return dataloader

    def train_dataloader(self) -> DataLoader:
        """Get train loader"""
        return self.__dataloader()

    def val_dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        dataset = RLDataset(self.pool, 1, 1)
        dataloader = DataLoader(
            dataset=dataset,
            batch_size=1,
            # num_workers=5
        )
        return dataloader

    # def _split_obs(self, t):  # TODO: remove from DIAYN, herf, and v2?
    #     # TODO: verify that dim is 1, assert shape
    #     return torch.split(t, [self._Do, self._num_skills], 1)

    def training_step(self, batch, batch_idx, optimizer_idx) -> OrderedDict:
        states, actions, rewards, dones, next_states = batch
        self.batch_idx = batch_idx
        # TODO: vars are already FloatTensors.

        # Train policy
        if optimizer_idx == 0:
            samples = self.sampler.sample(1, self.policy)  # TODO: remove magic numbers
            self.pool.add_samples(samples)
            if samples[0]['done'] or samples[0][
                    'path_length'] == self.hparams.max_path_length:
                self.max_path_return = max(self.max_path_return,
                                           samples[0]['path_return'])
                self.last_path_return = samples[0]['path_return']

            distributions, action_samples, log_probs, corr, reg_loss = self.policy(
                states)
            assert log_probs.shape == torch.Size([action_samples.shape[0]])
            # TODO: figure out why the squash correction is not done in the policy, as the KL
            # surrogate seems to need uncorrected log probs.
            self.values = self.vf(states)
            # TODO: assert shapes
            with torch.no_grad():
                self.log_targets = self.qf(states, action_samples)
            self.scaled_log_pi = self._scale_entropy * (log_probs - corr)
            # How is this KL surrogate loss derived?
            self._kl_surrogate_loss = torch.mean(
                log_probs * (self.scaled_log_pi - self.log_targets +
                             self.values.detach()))
            self._policy_loss = reg_loss + self._kl_surrogate_loss
            self._vf_loss = 0.5 * torch.mean(
                (self.values - self.log_targets + self.scaled_log_pi)**2)

            log = {
                'max_path_return': self.max_path_return,
                'train_loss': self._policy_loss.detach().cpu().numpy(),
                'kl_loss': self._kl_surrogate_loss.detach().cpu().numpy(),
                'reg_loss': reg_loss.detach().cpu().numpy(),
                'gmm_means': torch.mean(
                    distributions.component_distribution.mean).detach().cpu().numpy(),
                'gmm_sigmas': torch.mean(
                    distributions.component_distribution.stddev).detach().cpu().numpy(),
                'vf_loss': self._vf_loss.detach().cpu().numpy(),
                'vf_value': torch.mean(self.values).detach().cpu().numpy(),
                'scaled_log_pi': torch.mean(self.scaled_log_pi).detach().cpu().numpy()
            }
            status = {
                'train_loss': self._policy_loss.detach().cpu().numpy(),
                # 'steps': torch.tensor(self.global_step),  # where does global_step come from - is it built into PL?
                'max_ret': self.max_path_return,
                'last_ret': self.last_path_return,
                'gmm_mu': torch.mean(
                    distributions.component_distribution.mean).detach().cpu().numpy(),
                'gmm_sig': torch.mean(
                    distributions.component_distribution.stddev).detach().cpu().numpy(),
                'vf_loss': self._vf_loss.detach().cpu().numpy(),
                'vf_mu': torch.mean(self.values).detach().cpu().numpy()
            }
            return OrderedDict({
                'loss': self._policy_loss + self._vf_loss,
                'log': log,
                'progress_bar': status
            })
        # TODO: is it faster if qf is also optimized simultaneously along with vf and policy?

        # Train QF
        if optimizer_idx == 1:
            self.q_values = self.qf(states, actions)
            with torch.no_grad():
                vf_next_target = self.vf_target(next_states)  # N
                ys = self._scale_reward * rewards + (
                    1 - dones) * self._discount * vf_next_target  # N
            self._td_loss = 0.5 * torch.mean((ys - self.q_values)**2)
            return OrderedDict({
                'loss': self._td_loss,
                'log': {
                    'qf_loss': self._td_loss.detach().cpu().numpy(),
                    'qf_value': torch.mean(self.q_values).detach().cpu().numpy(),
                    'rewards': torch.mean(rewards).detach().cpu().numpy()
                },
                'progress_bar': {
                    'qf_loss': self._td_loss,
                    'rewards': torch.mean(rewards).detach().cpu().numpy(),
                    'qf_mu': torch.mean(self.q_values).detach().cpu().numpy()
                }
            })
        # if self.trainer.use_dp or self.trainer.use_ddp2:
        #     loss = loss.unsqueeze(0)

    def on_batch_end(self) -> None:
        with torch.no_grad():
            for vf, vf_targ in zip(self.vf.parameters(),
                                   self.vf_target.parameters()):
                vf_targ.data.mul_(1 - self.hparams.tau)
                vf_targ.data.add_(self.hparams.tau * vf.data)

    def validation_step(self, batch, batch_idx) -> OrderedDict:
        # The actual evaluation rollout happens in validation_epoch_end; this step is a no-op.
        return OrderedDict({'val_ret': 0, 'path_len': 0})

    def validation_epoch_end(self, outputs) -> OrderedDict:
        gc.collect()
        state = self.eval_env.reset()
        print(
            datetime.datetime.now(
                dateutil.tz.tzlocal()).strftime('%Y-%m-%d-%H-%M-%S-%f-%Z'))
        path_return = 0
        path_length = 0
        self.ims = []
        with self.policy.deterministic(self.hparams.deterministic_eval):
            # TODO: add support for n_eval_iters
            for i in range(self.hparams.max_path_length):
                action = self.policy.get_actions(state.reshape((1, -1)))
                next_ob, reward, done, info = self.eval_env.step(action)
                if self.hparams.render_validation:
                    # TODO: use common resizing everywhere
                    self.ims.append(
                        cv2.resize(self.eval_env.render(mode='rgb_array'),
                                   (500, 500)))
                state = next_ob
                path_return += reward
                path_length += 1
                if done:
                    break
        self.val_path_return = path_return
        # TODO: remove the print callback for this, it is already shown in the progress bar
        return OrderedDict({
            'log': {
                'path_return': path_return,
                'path_length': path_length
            },
            'progress_bar': {
                'val_ret': path_return,
                'path_len': path_length
            }
        })

    def configure_optimizers(self) -> List[Optimizer]:
        """Initialize Adam optimizers"""
        optimizers = []
        # TODO: the policy and vf share one optimizer; figure out a more elegant way to have unlinked
        # learning rates than a multiplication factor in the loss sum. Also figure out why keeping
        # them separate doesn't increase compute time by the expected amount.
        optimizers.append(
            optim.Adam(list(self.policy.parameters()) +
                       list(self.vf.parameters()),
                       lr=self._policy_lr))
        # optimizers.append(optim.Adam(self.vf.parameters(), lr=self._vf_lr))
        optimizers.append(optim.Adam(self.qf.parameters(), lr=self._qf_lr))
        return optimizers

    def forward(self, *args, **kwargs):
        return None

    def check_modules(self):
        self.policy.cuda(self.hparams.device)
        self.vf.cuda(self.hparams.device)
        self.qf.cuda(self.hparams.device)
        self.vf_target.cuda(self.hparams.device)
        for module in (self.policy, self.vf, self.qf, self.vf_target):
            for param in module.parameters():
                print(param.data.shape, param.data.mean(), param.data.max(),
                      param.data.min(), param.data.std())
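# A hedged sketch of how this SAC LightningModule might be driven with PyTorch Lightning.
# run_training is not part of the original code; config.max_epochs and config.device are taken from
# the surrounding snippets, and the trainer flags reflect the older Lightning API this class targets.
import pytorch_lightning as pl

def run_training(config):
    model = SAC(config)                          # builds env, policy, value nets and replay pool
    trainer = pl.Trainer(
        max_epochs=config.max_epochs,            # one epoch corresponds to config.epoch_length steps
        gpus=1 if config.device != "cpu" else 0,
    )
    trainer.fit(model)                           # replay buffer is warmed up in on_sanity_check_start
    return model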
class SAC():
    def __init__(self, config: Config) -> None:
        self.hparams = config
        self.env = env_selector(self.hparams)  # TODO: ensure normalization is not required
        self.eval_env = env_selector(self.hparams, config.seed + 1)
        # TODO: add functionality to optionwrap for DIAYN
        # TODO: check all config.names to ensure they are in dict
        self.Da = self.env.action_space.flat_dim
        self.Do = self.env.observation_space.flat_dim

        # Value-function MLPs with ReLU hidden non-linearities, no output non-linearity,
        # Xavier init for weights and zero init for biases.
        self.qf = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        self.vf = ValueFunction(self.Do, [config.layer_size, config.layer_size])
        self.vf_target = ValueFunction(self.Do,
                                       [config.layer_size, config.layer_size])
        self.vf_target.load_state_dict(self.vf.state_dict())

        # Replay buffer for state(+skill) and action.
        self.pool = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )

        # GMM policy with K mixtures, no reparametrization trick, regularization.
        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            qf=self.qf,
            reg=config.reg,
            device="cpu")
        # self.policy.cuda(config.device)
        # self.vf.cuda(config.device)
        # self.qf.cuda(config.device)
        # self.vf_target.cuda(config.device)
        # TODO: add assertion to test qf of policy and qf of model.
        self.sampler = Sampler(self.env, config.max_path_length)

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with _
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy
        self._save_full_state = config.save_full_state

        # self.z = self.get_best_skill(self.policy, self.env, self.config.num_skills, self.config.max_path_length)
        # self.env.reset(None, self.z)

        # Warm up the replay buffer before training starts.
        self.pool.add_samples(self.sampler.sample(config.min_pool_size, self.policy))

        # TODO: combining vf and policy - figure out a more elegant way to have unlinked learning
        # rates than a multiplication factor in the loss sum. Also figure out why keeping them
        # separate doesn't increase compute time by the expected amount.
        self.optimizer_policy = optim.Adam(list(self.policy.parameters()),
                                           # + list(self.vf.parameters())
                                           lr=self._policy_lr)
        self.optimizer_vf = optim.Adam(self.vf.parameters(), lr=self._vf_lr)
        self.optimizer_qf = optim.Adam(self.qf.parameters(), lr=self._qf_lr)
        self.optimizer = optim.Adam(list(self.policy.parameters()) +
                                    list(self.vf.parameters()) +
                                    list(self.qf.parameters()),
                                    lr=self._policy_lr)
        # torch.autograd.set_detect_anomaly(True)

    @staticmethod
    def _squash_correction(t):
        """Receives action samples from the GMM of shape batch_size x dim_action. For each action,
        the log-probability correction requires multiplying by the inverse of the Jacobian
        determinant of the squashing function; in log space this reduces to a sum over the
        log-determinant of the diagonal Jacobian. Epsilon is added to avoid overflow in the log.
        Returns a tensor of shape batch_size."""
        # TODO: refer to the OpenAI implementation for a more numerically stable correction:
        # https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/sac/core.py
        return torch.sum(torch.log(1 - (t ** 2) + EPS), dim=1)

    def train(self):
        for epoch in range(self.hparams.max_epochs):
            for step in range(self.hparams.epoch_length):
                samples = self.sampler.sample(1, self.policy)  # TODO: remove magic numbers
                self.pool.add_samples(samples)
                if samples[0]['done'] or samples[0]['path_length'] == self.hparams.max_path_length:
                    self.max_path_return = max(self.max_path_return, samples[0]['path_return'])
                    self.last_path_return = samples[0]['path_return']

                batch = self.pool.random_batch(self.hparams.batch_size)
                states = torch.FloatTensor(batch['observations'])
                rewards = torch.FloatTensor(batch['rewards'])
                actions = torch.FloatTensor(batch['actions'])
                dones = torch.FloatTensor(batch['dones'])
                next_states = torch.FloatTensor(batch['next_observations'])

                # A single joint optimizer is used; the per-network optimizer_policy / optimizer_vf /
                # optimizer_qf steps are commented out in the original code.
                self.optimizer.zero_grad()

                # Policy loss
                distributions, action_samples, log_probs, reg_loss = self.policy(states)
                # assert log_probs.shape == torch.Size([action_samples.shape[0]])
                # TODO: figure out why the squash correction is not done in the policy, as the KL
                # surrogate seems to need uncorrected log probs.
                self.values = self.vf(states)
                with torch.no_grad():
                    self.log_targets = self.qf(states, action_samples)
                # The probability of a squashed action is not the same as the probability of the
                # unsquashed action; the correction must be subtracted from log_probs since we need
                # the inverse of the Jacobian determinant.
                corr = self._squash_correction(action_samples)
                # assert not torch.isnan(corr).any() and not torch.isinf(corr).any()
                self.scaled_log_pi = self._scale_entropy * (log_probs - corr)
                # How is this KL surrogate loss derived?
                self._kl_surrogate_loss = torch.mean(
                    log_probs * (self.scaled_log_pi - self.log_targets + self.values.detach()))
                self._policy_loss = reg_loss + self._kl_surrogate_loss

                # Value-function loss
                self._vf_loss = 0.5 * torch.mean(
                    (self.values - self.log_targets + self.scaled_log_pi) ** 2)

                # Q-function TD loss
                self.q_values = self.qf(states, actions)
                # assert (self.policy._qf(states, actions) == self.q_values).all()
                with torch.no_grad():
                    vf_next_target = self.vf_target(next_states)  # N
                    ys = self._scale_reward * rewards + (1 - dones) * self._discount * vf_next_target  # N
                self._td_loss = 0.5 * torch.mean((ys - self.q_values) ** 2)

                # TODO: code not working, need to fix bug
                self.loss = self._policy_loss + self._vf_loss + self._td_loss
                self.loss.backward()
                self.optimizer.step()

                # Polyak-average the target value function.
                with torch.no_grad():
                    for vf, vf_targ in zip(self.vf.parameters(), self.vf_target.parameters()):
                        vf_targ.data.mul_(1 - self.hparams.tau)
                        vf_targ.data.add_(self.hparams.tau * vf.data)

            print('train_loss: ', self._policy_loss.detach().numpy(),
                  'epoch: ', epoch,
                  'max_return: ', self.max_path_return,
                  'last_return: ', self.last_path_return,
                  'vf_loss: ', self._vf_loss.detach().numpy(),
                  'vf_value: ', torch.mean(self.values).detach().numpy(),
                  'qf_loss: ', self._td_loss.detach().numpy(),
                  'rewards: ', torch.mean(rewards).detach().numpy(),
                  'actions: ', torch.mean(actions).detach().numpy(),
                  'qf_value: ', torch.mean(self.q_values).detach().numpy())

            # Evaluation rollout (currently disabled).
            state = self.eval_env.reset()
            path_return = 0
            path_length = 0
            self.ims = []
            print(datetime.datetime.now(dateutil.tz.tzlocal()).strftime('%Y-%m-%d-%H-%M-%S-%f-%Z'))
            # with self.policy.deterministic(True):
            #     for i in range(self.hparams.max_path_length):
            #         action = self.policy.get_actions(state.reshape((1, -1)))
            #         next_ob, reward, done, info = self.eval_env.step(action)
            #         if self.hparams.render_validation:
            #             self.ims.append(self.eval_env.render(mode='rgb_array'))
            #         state = next_ob
            #         path_return += reward
            #         path_length += 1
            #         if done:
            #             break
            self.val_path_return = path_return
            print('path_return: ', path_return, 'path_length: ', path_length)
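# A hedged sketch of the more numerically stable squash correction referenced in the TODO inside
# _squash_correction above. It uses the identity log(1 - tanh(u)^2) = 2 * (log 2 - u - softplus(-2u))
# from the linked Spinning Up code, but note that it takes the pre-tanh samples u rather than the
# squashed actions t, so wiring it into GMMPolicy would require exposing the unsquashed samples
# (an assumption about that interface).
import math
import torch
import torch.nn.functional as F

def squash_correction_stable(u: torch.Tensor) -> torch.Tensor:
    """u: pre-tanh action samples of shape (batch_size, dim_action).
    Returns the per-sample log-det-Jacobian of tanh, shape (batch_size,)."""
    return torch.sum(2.0 * (math.log(2.0) - u - F.softplus(-2.0 * u)), dim=1)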
def train(algo, opt, model_type, batch_size, learning_rate, num_epochs, stop_at_done,
          gamma, tau, num_workers, task_name, file_index, num_actions, max_requests,
          starting_request, random_start, critic_coef, actor_coef, entropy_coef,
          output_dir, output_prefix, save_interval):
    assert model_type in MODEL_TYPES, "Invalid model type. Choices: {}".format(MODEL_TYPES)
    assert opt in OPT_TYPES, "Invalid optimizer type. Choices: {}".format(OPT_TYPES)
    assert algo in ALGOS, "Invalid algorithm. Choices: {}".format(ALGOS)
    assert task_name in TASKS, "Invalid task. Choices: {}".format(TASKS)
    assert file_index in FILE_INDEX, "Invalid file index. Choices: {}".format(FILE_INDEX)
    assert num_actions in CACHE_SIZE, "Invalid number of actions. Choices: {}".format(CACHE_SIZE)
    assert max_requests in MAX_REQUESTS, "Invalid maximum requests allowed. Choices: {}".format(MAX_REQUESTS)
    assert num_workers >= 0, "Invalid number of workers ({}). Must be at least 0.".format(num_workers)
    assert num_epochs >= 1, "Invalid number of epochs ({}). Must be at least 1.".format(num_epochs)
    assert 1 <= save_interval <= num_epochs, \
        "Invalid save interval ({}). Must be between 1 and {}".format(save_interval, num_epochs)

    num_feature = num_actions * 3

    # Set up the environment
    task_name = "Cache-Bandit-C{}-Max{}-{}-{}-v0".format(num_actions, max_requests, task_name, file_index)

    opt_construct = optim.Adam if opt == OPT_ADAM else optim.SGD

    # Create the model, optimizer and agent
    if model_type == GRU and algo == REINFORCE:
        model = GRUPolicy(num_actions, num_feature)
        optimizer = opt_construct(model.parameters(), lr=learning_rate)
        agent = Reinforce(model, optimizer, entropy_coef)
    elif model_type == GRU and algo == A2C:
        model = GRUActorCritic(num_actions, num_feature)
        optimizer = opt_construct(model.parameters(), lr=learning_rate)
        agent = AdvantageActorCritic(model, optimizer, critic_coef, actor_coef, entropy_coef)

    model = model.to(DEVICE)
    model.train()

    # Set up the sampler
    sampler = Sampler(model, task_name, num_actions, deterministic=False,
                      gamma=gamma, tau=tau, num_workers=num_workers)

    def _random_start(max_request):
        return random.randint(0, max(0, max_request - 1 - num_actions))

    get_starting_point = _random_start if random_start else lambda x: starting_request

    print(optimizer)
    print(model)
    print("Stop after a single full trajectory is completed for each epoch: {}".format(stop_at_done))

    if not os.path.isdir(output_dir):
        print("Constructing directories {}".format(output_dir))
        os.makedirs(output_dir, exist_ok=True)
    print("Output Directory: {}".format(output_dir))

    for epoch in range(num_epochs):
        print("EPOCH {} ==========================================".format(epoch))
        sampler.reset_storage()
        sampler.last_hidden_state = None
        if MAP_LOCATION == CUDA:
            torch.cuda.empty_cache()
        sampler.sample(batch_size, stop_at_done=stop_at_done,
                       starting_point=get_starting_point(sampler.max_length))
        sampler.concat_storage()
        agent.update(sampler)

        if (epoch + 1) % save_interval == 0:
            out_file = '{}/{}_{}.pkl'.format(output_dir.rstrip("/"), output_prefix, epoch)
            print("Saving model as {}".format(out_file))
            torch.save(model, out_file)

    print("DONE")
    sampler.envs.close()
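# A hedged usage sketch for the train() entry point above. Every value here is an illustrative
# placeholder, not a known-valid configuration: the asserts require that algo, opt, model_type,
# task_name, file_index, num_actions and max_requests come from the module's ALGOS, OPT_TYPES,
# MODEL_TYPES, TASKS, FILE_INDEX, CACHE_SIZE and MAX_REQUESTS constants.
if __name__ == "__main__":
    train(algo=A2C, opt=OPT_ADAM, model_type=GRU, batch_size=64, learning_rate=1e-3,
          num_epochs=100, stop_at_done=True, gamma=0.99, tau=0.95, num_workers=4,
          task_name=TASKS[0], file_index=FILE_INDEX[0], num_actions=CACHE_SIZE[0],
          max_requests=MAX_REQUESTS[0], starting_request=0, random_start=True,
          critic_coef=0.5, actor_coef=1.0, entropy_coef=0.01,
          output_dir="./checkpoints", output_prefix="a2c_gru", save_interval=10)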
dims = np.max(df.to_numpy().astype(int), axis=0) + 1

''' GET GROUND-TRUTH AND CANDIDATES '''
# get ground truth
test_ur = get_ur(test_set, context=args.context, eval=False)
val_ur = get_ur(val_set, context=args.context, eval=False)
total_train_ur = get_ur(train_set, context=args.context, eval=True)

# initial candidate item pool
item_pool = set(range(dims[0], dims[1]))
candidates_num = args.cand_num

print('=' * 50, '\n')

''' FORMAT DATA '''
sampler = Sampler(
    dims,
    num_ng=args.num_ng,
    sample_method=args.sample_method,
    sample_ratio=args.sample_ratio,
)

# negative sampling and adjacency matrix construction
neg_set, adj_mx = sampler.transform(train_set, is_training=True,
                                    context=args.context, pair_pos=None)

# create the graph structure needed if GCE is activated
if args.gce:
    if args.mh > 1:
        print(f'[ MULTI HOP {args.mh} ACTIVATED ]')
        adj_mx = adj_mx.__pow__(int(args.mh))
        smote = BorderlineSMOTE()
        train_x, train_y = smote.fit_resample(tr_x, tr_y)
        train_aux, _ = smote.fit_resample(tr_aux, tr_y)
    else:
        train_x = tr_x
        train_y = tr_y
        train_aux = tr_aux

    train_set = MeatData(train_x, train_y, train_aux, transform_x, transform_aux)
    test_set = MeatData(te_x, te_y, te_aux, transform_x, transform_aux)

    if args.sampler_type.lower() == 'binomial':
        sampler = ImbalancedDatasetSampler(train_set)
    elif args.sampler_type.lower() == 'down':
        sampler = Sampler(train_set, type='under')
    elif args.sampler_type.lower() == 'up':
        sampler = Sampler(train_set, type='over')
    else:
        sampler = None
else:
    train_set = MeatData(tr_x, tr_y, tr_aux, transform_x, transform_aux)
    test_set = MeatData(te_x, te_y, te_aux, transform_x, transform_aux)
    sampler = None

# re-weighting
if args.train_rule.lower() == 'reweight':
    beta = 0.9999
    effective_num = 1.0 - np.power(beta, cls_num_list)
    per_cls_weights = (1.0 - beta) / np.array(effective_num)
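# A hedged sketch of how the sampler chosen above would typically be wired into DataLoaders,
# assuming ImbalancedDatasetSampler and the custom over/under Sampler conform to the
# torch.utils.data.Sampler interface; args.batch_size and num_workers are assumptions.
from torch.utils.data import DataLoader

train_loader = DataLoader(train_set,
                          batch_size=args.batch_size,
                          sampler=sampler,
                          shuffle=(sampler is None),  # shuffle only when no sampler is supplied
                          num_workers=4)
test_loader = DataLoader(test_set,
                         batch_size=args.batch_size,
                         shuffle=False,
                         num_workers=4)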
class DeepLearn:
    def __init__(self):
        self.config = ConfigParser()
        self.config.read(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
            os.sep + 'config' + os.sep + 'appConfig.ini')
        self.score = 0
        self.model = None
        self.data = None
        self.labels = None
        self.sampler = Sampler()

    def init_deep_learning(self):
        self.process_data()
        self.create_model()
        self.train_model()

    def process_data(self):
        location = self.config['img']['train_data_set_location']
        self.sampler.read_and_process_images(location)
        self.data, self.labels = self.sampler.get_images_and_labels()
        # TODO: add data split for training and validation sets

    def create_model(self):
        """Creates a deep convolutional neural network model."""
        # Layer 1: Conv
        self.model = Sequential()
        self.model.add(
            Conv2D(32, (5, 5),
                   strides=(1, 1),
                   padding='same',
                   input_shape=self.data.shape[1:]))
        self.model.add(Activation('relu'))
        # Layer 2: Conv
        self.model.add(Conv2D(32, (5, 5), strides=(1, 1)))
        self.model.add(Activation('relu'))
        # Layer 3: MaxPool
        self.model.add(MaxPooling2D(pool_size=(2, 2)))
        self.model.add(Dropout(0.25))
        # Layer 4: Conv
        self.model.add(Conv2D(32, (5, 5), strides=(1, 1)))
        self.model.add(Activation('relu'))
        # Layer 5: Conv
        self.model.add(Conv2D(32, (5, 5), strides=(1, 1)))
        self.model.add(Activation('relu'))
        # Layer 6: MaxPool
        self.model.add(MaxPooling2D(pool_size=(2, 2)))
        self.model.add(Dropout(0.25))
        # Layer 7: Flatten
        self.model.add(Flatten())
        # Layer 8: Dense
        self.model.add(Dense(512))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(0.5))
        # Layer 9: Dense final classification
        self.model.add(Dense(len(label_dict.keys())))
        self.model.add(Activation('softmax'))
        # TODO: promote to logging
        print(self.model.summary())

    def train_model(self):
        """Trains the model."""
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer=SGD(
                lr=float(self.config['hyperparameters']['learning_rate']),
                momentum=float(self.config['hyperparameters']['momentum']),
                decay=float(self.config['hyperparameters']['decay']),
                nesterov=False),
            metrics=['accuracy'])

        # Split data and labels into training, validation and test sets
        x_train, x_test, y_train, y_test = train_test_split(
            self.data,
            self.labels,
            test_size=float(self.config['hyperparameters']['split']),
            random_state=42)
        x_train, x_val, y_train, y_val = train_test_split(
            x_train,
            y_train,
            test_size=float(self.config['hyperparameters']['split']),
            random_state=42)

        # One-hot encode the output labels
        y_train = to_categorical(y_train, len(label_dict.keys()))
        y_val = to_categorical(y_val, len(label_dict.keys()))
        y_test = to_categorical(y_test, len(label_dict.keys()))

        # Train
        history = self.model.fit(
            x_train,
            y_train,
            batch_size=int(self.config['hyperparameters']['batch_size']),
            epochs=int(self.config['hyperparameters']['epochs']),
            verbose=1,
            validation_data=(x_val, y_val))
        self.plot_loss_accuracy(history)

        self.score = self.model.evaluate(x_test, y_test)
        # TODO: promote to logging
        print("Accuracy %.6f" % self.score[1])
        self.model.save('kiera_trained.h5')
        with open('accuracy.txt', mode='w') as f:
            f.write(str(self.score[1]))

    def plot_loss_accuracy(self, history):
        fig, ax = plt.subplots(1, 2, figsize=(12, 6))
        ax[0].plot(history.history["loss"], 'r-x', label="Train Loss")
        ax[0].plot(history.history["val_loss"], 'b-x', label="Validation Loss")
        ax[0].legend()
        ax[0].set_title('cross_entropy loss')
        ax[0].grid(True)
        ax[1].plot(history.history["acc"], 'r-x', label="Train Accuracy")
        ax[1].plot(history.history["val_acc"], 'b-x', label="Validation Accuracy")
        ax[1].legend()
        ax[1].set_title('accuracy')
        ax[1].grid(True)
        plt.savefig('LossAndAccuracy.png')

    def get_accuracy(self):
        if not self.score:
            return "Training Not Initiated"
        return self.score[1]

    def predict(self, img):
        try:
            if not self.model:
                self.model = load_model('kiera_trained.h5')
            return label_dict[
                self.model.predict(self.sampler.process_image(img)).argmax() + 1]
        except Exception as e:
            raise Exception('Model Not Saved')
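# A hedged usage sketch for the DeepLearn class above. 'sample.jpg' is a placeholder; whether
# predict expects a path or an image array depends on Sampler.process_image, which is not shown.
if __name__ == '__main__':
    learner = DeepLearn()
    learner.init_deep_learning()          # process data, build the CNN, train and save it
    print(learner.get_accuracy())
    print(learner.predict('sample.jpg'))  # preprocessing is delegated to Sampler.process_image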
class SAC(pl.LightningModule):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.hparams = config
        self.env = env_selector(
            self.hparams)  # TODO: normalization is not required, but will it be needed?
        self.eval_env = env_selector(self.hparams, config.seed + 1)
        # TODO: check all config.names to ensure they are in dict
        self.Da = self.env.action_space.flat_dim
        self.Do = self.env.observation_space.flat_dim

        # Q-function MLPs with ReLU hidden non-linearities, no output non-linearity,
        # Xavier init for weights and zero init for biases.
        self.q1 = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        self.q2 = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        self.q1_target = ValueFunction(self.Do + self.Da,
                                       [config.layer_size, config.layer_size])
        self.q2_target = ValueFunction(self.Do + self.Da,
                                       [config.layer_size, config.layer_size])
        self.q1_target.load_state_dict(self.q1.state_dict())
        self.q2_target.load_state_dict(self.q2.state_dict())

        # Replay buffer for state(+skill) and action.
        self.pool = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )

        # GMM policy with K mixtures, with the reparametrization trick and regularization.
        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            # TODO: pass both Q functions to use the policy in deterministic mode
            qf=self.q1_target,
            reg=config.reg,
            device=self.hparams.device,
            reparametrization=True)

        # TODO: add assertion to test qf of policy and qf of model.
        self.sampler = Sampler(self.env, config.max_path_length)

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with _
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy
        self._save_full_state = config.save_full_state

        self.modules = [
            "Policy", self.policy, "Q1", self.q1, "Q2", self.q2, "Q1_target",
            self.q1_target, "Q2_target", self.q2_target
        ]

        # self.z = self.get_best_skill(self.policy, self.env, self.config.num_skills, self.config.max_path_length)
        # self.env.reset(None, self.z)

        # Runs on CPU: models are transferred to the GPU only by the trainer, which happens after the
        # LightningModule __init__; this is also why the wandb logger is not available here.
        self.batch_idx = None
        # torch.autograd.set_detect_anomaly(True)  # TODO: enable only when debugging; adds compute overhead

    def get_best_skill(self, policy, env, num_skills, max_path_length, n_paths=1):
        print('Finding best skill...')
        reward_list = []
        with policy.deterministic(self.hparams.deterministic_eval):
            for z in range(num_skills):
                env.reset(state=None, skill=z)
                total_returns = 0
                sampler = Sampler(env, max_path_length)
                for p in range(n_paths):
                    new_paths = sampler.sample(max_path_length, policy)
                    total_returns += new_paths[-1]['path_return']
                print('Reward for skill %d = %.3f' % (z, total_returns))
                reward_list.append(total_returns)
        best_z = np.argmax(reward_list)
        print('Best skill found: z = %d, reward = %.3f, seed = %d' %
              (best_z, reward_list[best_z], self.hparams.seed))
        return best_z

    def on_sanity_check_start(self) -> None:
        # self.z = self.get_best_skill(self.policy, self.env, self.hparams.num_skills,
        #                              self.hparams.max_path_length, self.hparams.num_runs)
        # self._num_skills = self.hparams.num_skills
        # self.env.reset(state=None, skill=self.z)
        # self.eval_env.reset(state=None, skill=self.z)
        # TODO: the sampler reset logic and its interaction with epoch length seem ad hoc
        # self.sampler.reset()
        if self.pool.size < self.hparams.min_pool_size:
            self.pool.add_samples(
                self.sampler.sample(self.hparams.min_pool_size, None))
        print("Initialized Replay Buffer with %d samples" % self.pool.size)

    def __dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        dataset = RLDataset(self.pool, self.hparams.epoch_length,
                            self.hparams.batch_size)
        # TODO: figure out why the reference code uses episode length above instead of batch size

        def _init_fn(worker_id):
            np.random.seed(self.hparams.seed + worker_id)

        dataloader = DataLoader(dataset=dataset,
                                batch_size=self.hparams.batch_size,
                                num_workers=self.hparams.num_workers,
                                worker_init_fn=_init_fn)
        return dataloader

    def train_dataloader(self) -> DataLoader:
        """Get train loader"""
        return self.__dataloader()

    def val_dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        dataset = RLDataset(self.pool, 1, 1)
        dataloader = DataLoader(
            dataset=dataset,
            batch_size=1,
            # num_workers=5
        )
        return dataloader

    # def _split_obs(self, t):
    #     # TODO: verify that dim is 1, assert shape
    #     return torch.split(t, [self._Do, self._num_skills], 1)

    def training_step(self, batch, batch_idx, optimizer_idx) -> OrderedDict:
        states, actions, rewards, dones, next_states = batch
        self.batch_idx = batch_idx
        # TODO: vars are already FloatTensors.

        # Train policy
        if optimizer_idx == 1:
            samples = self.sampler.sample(1, self.policy)  # TODO: remove magic numbers
            self.pool.add_samples(samples)
            if samples[0]['done'] or samples[0][
                    'path_length'] == self.hparams.max_path_length:
                self.max_path_return = max(self.max_path_return,
                                           samples[0]['path_return'])
                self.last_path_return = samples[0]['path_return']

            distributions, action_samples, log_probs, corr, reg_loss = self.policy(
                states)
            assert log_probs.shape == torch.Size([action_samples.shape[0]])
            values1 = self.q1(states, action_samples)
            values2 = self.q2(states, action_samples)
            self.value = torch.min(values1, values2)  # N
            # TODO: assert shapes; check whether this should run under torch.no_grad()
            self.scaled_log_pi = self._scale_entropy * (log_probs - corr)
            self._policy_loss = torch.mean(self.scaled_log_pi - self.value)

            log = {
                'max_path_return': torch.tensor(self.max_path_return),
                'train_loss': self._policy_loss,
                'reg_loss': reg_loss,
                'vf_value': torch.mean(self.value)
            }
            status = {
                'train_loss': self._policy_loss,
                'max_ret': torch.tensor(self.max_path_return),
                'last_ret': torch.tensor(self.last_path_return),
                'vf_mu': torch.mean(self.value)
            }
            return OrderedDict({
                'loss': self._policy_loss,
                'log': log,
                'progress_bar': status
            })

        # Train Q-functions
        if optimizer_idx == 0:
            self.q1_values = self.q1(states, actions)
            self.q2_values = self.q2(states, actions)
            with torch.no_grad():
                distributions, action_samples, log_probs, corr, reg_loss = self.policy(
                    next_states)
                q1_next_target = self.q1_target(next_states, action_samples)  # N
                q2_next_target = self.q2_target(next_states, action_samples)
                q_next_target = torch.min(q1_next_target, q2_next_target)  # N
                ys = self._scale_reward * rewards + (1 - dones) * self._discount * \
                    (q_next_target - self._scale_entropy * (log_probs - corr))  # N
            self._td1_loss = torch.mean((ys - self.q1_values)**2)
            self._td2_loss = torch.mean((ys - self.q2_values)**2)

            return OrderedDict({
                'loss': self._td1_loss + self._td2_loss,
                'log': {
                    'qf_loss': self._td1_loss + self._td2_loss,
                    'qf_value': torch.mean(self.q1_values),
                    'rewards': torch.mean(rewards)
                },
                'progress_bar': {
                    'qf_loss': self._td1_loss + self._td2_loss,
                    'rewards': torch.mean(rewards),
                    'qf_mu': torch.mean(self.q1_values),
                    'log_probs': torch.mean(log_probs - corr)
                }
            })
        # if self.trainer.use_dp or self.trainer.use_ddp2:
        #     loss = loss.unsqueeze(0)

    def on_batch_end(self) -> None:
        with torch.no_grad():
            for q1, q1_targ in zip(self.q1.parameters(),
                                   self.q1_target.parameters()):
                q1_targ.data.mul_(1 - self.hparams.tau)
                q1_targ.data.add_(self.hparams.tau * q1.data)
            for q2, q2_targ in zip(self.q2.parameters(),
                                   self.q2_target.parameters()):
                q2_targ.data.mul_(1 - self.hparams.tau)
                q2_targ.data.add_(self.hparams.tau * q2.data)

    def validation_step(self, batch, batch_idx) -> OrderedDict:
        # The actual evaluation rollout happens in validation_epoch_end; this step is a no-op.
        return OrderedDict({'val_ret': 0, 'path_len': 0})

    def validation_epoch_end(self, outputs) -> OrderedDict:
        state = self.eval_env.reset()
        print(
            datetime.datetime.now(
                dateutil.tz.tzlocal()).strftime('%Y-%m-%d-%H-%M-%S-%f-%Z'))
        path_return = 0
        path_length = 0
        self.ims = []
        with self.policy.deterministic(self.hparams.deterministic_eval):
            for i in range(self.hparams.max_path_length):
                action = self.policy.get_actions(state.reshape((1, -1)))
                next_ob, reward, done, info = self.eval_env.step(action)
                # self.eval_env.render(mode='human')
                if self.hparams.render_validation:
                    self.ims.append(self.eval_env.render(mode='rgb_array'))
                state = next_ob
                path_return += reward
                path_length += 1
                if done:
                    break
        self.val_path_return = path_return
        # TODO: remove the print callback for this, it is already shown in the progress bar
        return OrderedDict({
            'log': {
                'path_return': path_return,
                'path_length': path_length
            },
            'progress_bar': {
                'val_ret': path_return,
                'path_len': path_length
            }
        })

    def configure_optimizers(self) -> List[Optimizer]:
        """Initialize Adam optimizers"""
        optimizers = []
        # TODO: figure out a more elegant way to have unlinked learning rates than a multiplication
        # factor in the loss sum. Also figure out why keeping them separate doesn't increase compute
        # time by the expected amount.
        optimizers.append(
            optim.Adam(list(self.q1.parameters()) + list(self.q2.parameters()),
                       lr=self._qf_lr))
        optimizers.append(
            optim.Adam(self.policy.parameters(), lr=self._policy_lr))
        return optimizers

    def forward(self, *args, **kwargs):
        return None
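# A hedged refactoring sketch: the two soft-update loops in on_batch_end above could be expressed
# with a single helper. This is an optional suggestion, not part of the original code.
import torch

@torch.no_grad()
def polyak_update(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    """target <- (1 - tau) * target + tau * source, parameter-wise."""
    for src, targ in zip(source.parameters(), target.parameters()):
        targ.data.mul_(1 - tau)
        targ.data.add_(tau * src.data)

# Usage inside on_batch_end:
#     polyak_update(self.q1, self.q1_target, self.hparams.tau)
#     polyak_update(self.q2, self.q2_target, self.hparams.tau)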