def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = env
    self.action_range = [env.action_space.low, env.action_space.high]
    self.obs_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    # hyperparameters
    self.gamma = gamma
    self.tau = tau

    # initialize networks
    self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
    self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
    self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
    self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
    self.policy_net = GaussianPolicy(self.obs_dim, self.action_dim).to(self.device)

    # copy params to target param
    for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
        target_param.data.copy_(param)

    # initialize optimizers
    self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
    self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
    self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

    self.replay_buffer = Buffer(buffer_maxlen)
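# The constructor above stores `action_range`, but this snippet does not show how a
# tanh-squashed action is mapped back into the environment's bounds before stepping.
# A minimal, hypothetical helper for that step (name and placement are assumptions,
# not part of the original agent):
import numpy as np

def rescale_action(action, low, high):
    """Map an action in [-1, 1] to the box [low, high]."""
    action = np.clip(action, -1.0, 1.0)
    return low + 0.5 * (action + 1.0) * (high - low)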
def get_policy(args, env):
    N = env.observation_space.shape[0]
    M = env.action_space.shape[0]
    if args.init_policy == 'optimal':
        K = env.optimal_controller()
        mean_network = nn.Linear(*K.shape[::-1], bias=False)
        mean_network.weight.data = tensor(K)
    elif args.init_policy == 'linear':
        K = np.random.randn(M, N)
        mean_network = nn.Linear(*K.shape[::-1], bias=False)
        mean_network.weight.data = tensor(K)
    elif args.init_policy == 'linear_bias':
        K = np.random.randn(M, N)
        mean_network = nn.Linear(*K.shape[::-1], bias=True)
        mean_network.weight.data = tensor(K)
    elif args.init_policy == 'mlp':
        mean_network = get_mlp((N,) + tuple(args.hidden_sizes) + (M,), gate=nn.Tanh)
    else:
        raise Exception('unsupported policy type')
    return GaussianPolicy(N, M, mean_network, learn_std=not args.fix_std, gate_output=args.gate_output)
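# `get_policy` relies on a `get_mlp` helper that is not shown here. A minimal sketch of
# what such a builder could look like (an assumption; the real helper may add output
# activations or custom initialization):
import torch.nn as nn

def get_mlp(sizes, gate=nn.Tanh):
    """Stack Linear layers of the given sizes with `gate` activations in between."""
    layers = []
    for i in range(len(sizes) - 1):
        layers.append(nn.Linear(sizes[i], sizes[i + 1]))
        if i < len(sizes) - 2:
            layers.append(gate())
    return nn.Sequential(*layers)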
class AgentPPO: def __init__(self, params): action_size = params['action_size'] state_size = params['state_size'] buf_params = params['buf_params'] nn_params = params['nn_params'] nn_params['nn_policy']['l1'][0] = state_size nn_params['nn_policy']['l3'][1] = action_size nn_params['nn_value_function']['l1'][0] = state_size self.__policy = GaussianPolicy(nn_params['nn_policy']).to(device) self.__value_fn = ValueFunction( nn_params['nn_value_function']).to(device) self.__action_size = action_size self.__state_size = state_size self.__memory = Buffer(buf_params) self.gamma = params['gamma'] self.learning_rate_policy = params['learning_rate_policy'] self.learning_rate_value_fn = params['learning_rate_value_fn'] self.tau = params['tau'] self.updates_num = params['updates_num'] self.ppo_epochs = params['ppo_epochs'] self.baseline_epochs = params['baseline_epochs'] self.ppo_eps = params['ppo_epsilon'] self.__optimiser_policy = optim.Adam(self.__policy.parameters(), self.learning_rate_policy, weight_decay=1e-5) self.__optimiser_value_fn = optim.Adam(self.__value_fn.parameters(), self.learning_rate_value_fn) # other parameters self.agent_loss = 0.0 # Set methods def set_learning_rate(self, lr_policy, lr_value_fn): self.learning_rate_policy = lr_policy self.learning_rate_value_fn = lr_value_fn for param_group in self.__optimiser_policy.param_groups: param_group['lr'] = lr_policy for param_group in self.__optimiser_value_fn.param_groups: param_group['lr'] = lr_value_fn # Get methods def get_actor(self): return self.__policy def get_critic(self): return self.__value_fn # Other methods def step(self, state, action, reward, next_state, done, log_probs): self.__memory.add(state, action, reward, next_state, done, log_probs) if self.__memory.is_full(): experience = self.__memory.get_data() self.__update(experience) def choose_action(self, state, mode='train'): if mode == 'train': # state should be transformed to a tensor state = torch.from_numpy(np.array(state)).float().to(device) self.__policy.eval() with torch.no_grad(): actions, log_probs, mean, std = self.__policy.sample_action( state) self.__policy.train() return list(actions.cpu().numpy().squeeze()), log_probs.cpu( ).numpy(), mean.cpu().numpy(), std.cpu().numpy() elif mode == 'test': pass else: print("Invalid mode value") def reset(self, sigma): pass def __update(self, experience, batch_size=256): states, actions, rewards, next_states, dones, log_probs_old = list( experience) T = rewards.shape[0] last_return = torch.zeros(rewards.shape[1]).float().to(device) returns = torch.zeros(rewards.shape).float().to(device) for t in reversed(range(T)): last_return = rewards[t] + last_return * self.gamma * (1 - dones[t]) returns[t] = last_return states = states.view(-1, self.__state_size) actions = actions.view(-1, self.__action_size) returns = returns.view(-1, 1) dones = dones.view(-1, 1) log_probs_old = log_probs_old.view(-1, 1) updates_num = states.shape[0] // batch_size # Critic update for _ in range(self.baseline_epochs): for _ in range(updates_num): idx = np.random.randint(0, states.shape[0], batch_size) states_batch = states[idx] returns_batch = returns[idx] self.__optimiser_value_fn.zero_grad() value_pred = self.__value_fn(states_batch).view(-1, 1) loss_fn = nn.MSELoss() value_loss = loss_fn(value_pred, returns_batch) value_loss.backward() torch.nn.utils.clip_grad_norm_(self.__value_fn.parameters(), 10) self.__optimiser_value_fn.step() # Policy update for _ in range(self.ppo_epochs): for _ in range(updates_num): idx = np.random.randint(0, states.shape[0], 
batch_size)

                states_batch = states[idx]
                actions_batch = actions[idx]
                returns_batch = returns[idx]
                log_probs_old_batch = log_probs_old[idx]

                advantages = (returns_batch - self.__value_fn(states_batch).detach()).view(-1, 1)
                # add a small epsilon so a near-constant advantage batch cannot divide by zero
                advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

                self.__optimiser_policy.zero_grad()
                log_probs_batch = self.__policy.evaluate_actions(states_batch, actions_batch).view(-1, 1)
                ratio = (log_probs_batch - log_probs_old_batch).exp()
                # PPO clipped surrogate: clip the probability ratio, then scale by the advantages
                clipped_fn = torch.clamp(ratio, 1.0 - self.ppo_eps, 1.0 + self.ppo_eps) * advantages
                surrogate_fn = torch.min(ratio * advantages, clipped_fn)
                # entropy = F.kl_div(log_probs_batch.view(-1, 1), log_probs_old_batch.view(-1, 1))
                policy_loss = -surrogate_fn.mean()
                policy_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.__policy.parameters(), 10)
                self.__optimiser_policy.step()
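# Toy check of the reward-to-go recursion used in __update above,
# R_t = r_t + gamma * (1 - done_t) * R_{t+1}, on a hand-made batch:
import torch

gamma = 0.99
rewards = torch.tensor([[1.0], [1.0], [1.0]])  # T = 3, a single environment
dones = torch.tensor([[0.0], [0.0], [1.0]])

last_return = torch.zeros(1)
returns = torch.zeros_like(rewards)
for t in reversed(range(rewards.shape[0])):
    last_return = rewards[t] + gamma * (1 - dones[t]) * last_return
    returns[t] = last_return

print(returns.squeeze().tolist())  # approximately [2.9701, 1.99, 1.0]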
def train(args): # Initialize data type dtype = torch.float32 torch.set_default_dtype(dtype) device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') # Initialize environment env = gym.make(args.env_id) envname = env.spec.id obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Initialize random seeds torch.manual_seed(args.seed) np.random.seed(args.seed) env.seed(args.seed) # Initialize neural nets policy = GaussianPolicy(obs_dim, act_dim, args.hidden_size, args.activation, args.logstd) value_net = Value(obs_dim, args.hidden_size, args.activation) cvalue_net = Value(obs_dim, args.hidden_size, args.activation) policy.to(device) value_net.to(device) cvalue_net.to(device) # Initialize optimizer pi_optimizer = torch.optim.Adam(policy.parameters(), args.pi_lr) vf_optimizer = torch.optim.Adam(value_net.parameters(), args.vf_lr) cvf_optimizer = torch.optim.Adam(cvalue_net.parameters(), args.cvf_lr) # Initialize learning rate scheduler lr_lambda = lambda it: max(1.0 - it / args.max_iter_num, 0) pi_scheduler = torch.optim.lr_scheduler.LambdaLR(pi_optimizer, lr_lambda=lr_lambda) vf_scheduler = torch.optim.lr_scheduler.LambdaLR(vf_optimizer, lr_lambda=lr_lambda) cvf_scheduler = torch.optim.lr_scheduler.LambdaLR(cvf_optimizer, lr_lambda=lr_lambda) # Store hyperparameters for log hyperparams = vars(args) # Initialize RunningStat for state normalization, score queue, logger running_stat = RunningStats(clip=5) score_queue = deque(maxlen=100) cscore_queue = deque(maxlen=100) logger = Logger(hyperparams) # Get constraint bounds cost_lim = get_threshold(envname, constraint=args.constraint) # Initialize and train FOCOPS agent agent = FOCOPS(env, policy, value_net, cvalue_net, pi_optimizer, vf_optimizer, cvf_optimizer, args.num_epochs, args.mb_size, args.c_gamma, args.lam, args.delta, args.eta, args.nu, args.nu_lr, args.nu_max, cost_lim, args.l2_reg, score_queue, cscore_queue, logger) start_time = time.time() for iter in range(args.max_iter_num): # Update iteration for model agent.logger.save_model('iter', iter) # Collect trajectories data_generator = DataGenerator(obs_dim, act_dim, args.batch_size, args.max_eps_len) rollout = data_generator.run_traj(env, agent.policy, agent.value_net, agent.cvalue_net, running_stat, agent.score_queue, agent.cscore_queue, args.gamma, args.c_gamma, args.gae_lam, args.c_gae_lam, dtype, device, args.constraint) # Update FOCOPS parameters agent.update_params(rollout, dtype, device) # Update learning rates pi_scheduler.step() vf_scheduler.step() cvf_scheduler.step() # Update time and running stat agent.logger.update('time', time.time() - start_time) agent.logger.update('running_stat', running_stat) # Save and print values agent.logger.dump()
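# The three LambdaLR schedulers above multiply the base learning rate by
# max(1 - it / max_iter_num, 0), i.e. a linear decay to zero. A small illustration:
import torch

max_iter_num = 4
params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.Adam(params, lr=1e-3)
sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=lambda it: max(1.0 - it / max_iter_num, 0))

for it in range(max_iter_num):
    opt.step()
    sched.step()
    print(it, opt.param_groups[0]['lr'])  # 7.5e-4, 5e-4, 2.5e-4, 0.0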
class TRPO(BaseRL, OnPolicy): def __init__(self, env, param=None): super(TRPO, self).__init__(env, param=param) self.name = "TRPO" self.critic = ValueFunction(self.param.value, self.device) self.actor = GaussianPolicy(self.param.policy, self.device) self.steps = 0 def act(self, state, deterministic=False): self.steps += 1 with torch.no_grad(): if self.steps < self.param.DELAYED_START: action = self.env.action_space.sample() else: self.actor.eval() action = self.actor(torch.from_numpy(state).float().to( self.device), deterministic=deterministic).cpu().numpy() next_state, reward, done, _ = self.env.step(action) if not deterministic: done_bool = float( done ) #if self.episode_steps < self.env._max_episode_steps else 0 self.critic.eval() value, next_value = self.critic( torch.from_numpy(np.stack([state, next_state ])).float().to(self.device)) # value = self.critic(torch.from_numpy(state).float().to(self.device)) # next_value = self.critic(torch.from_numpy(next_state).float().to(self.device)) log_pi = self.actor.log_prob( torch.from_numpy(state).float().to(self.device), torch.from_numpy(action).float().to(self.device)) self.memory.store(state, action, reward, next_state, done_bool, value, next_value, log_pi) if done: self.memory.process_episode( maximum_entropy=self.param.MAX_ENTROPY) return next_state, reward, done @OnPolicy.loop def learn(self): rollouts = self.onPolicyData returns = rollouts['returns_gae'] if self.param.ADVANTAGE_NORMALIZATION: rollouts['advantages'] = (rollouts['advantages'] - rollouts['advantages'].mean()) / ( rollouts['advantages'].std() + 1e-5) for _ in range(self.param.EPOCHS): # Compute Advantages for _ in range(self.param.VALUE_EPOCHS): # Update Critic values = self.critic(rollouts['states']) critic_loss = F.mse_loss(values, returns) self.critic.optimize(critic_loss) # Update Actor old_log_probs = self.actor.log_prob(rollouts['states'], rollouts['actions']) pg = self.policy_gradient(rollouts) npg = self.natural_gradient(pg, rollouts) parameters, pg_norm = self.linesearch(npg, pg, rollouts) self.optimize_actor(parameters) log_probs = self.actor.log_prob(rollouts['states'], rollouts['actions']) metrics = dict() with torch.no_grad(): metrics['explained_variance'] = ( 1 - (rollouts['returns_mc'] - rollouts['values']).pow(2).sum() / (rollouts['returns_mc'] - rollouts['returns_mc'].mean()).pow(2).sum()).item() metrics['entropy'] = self.actor.entropy( rollouts['states']).mean().item() metrics['kl'] = (old_log_probs - log_probs).mean() metrics['pg_norm'] = pg_norm return metrics ################################################################ ########################## Utilities ########################### ################################################################ def optimize_actor(self, new_parameters): vector_to_parameters(new_parameters, self.actor.parameters()) def policy_gradient(self, rollouts): log_probs = self.actor.log_prob(rollouts['states'], rollouts['actions']) pg_objective = (log_probs * rollouts['advantages']).mean() pg_objective -= self.param.ENTROPY_COEFFICIENT * rollouts[ 'log_probs'].mean() return parameters_to_vector( torch.autograd.grad(pg_objective, self.actor.parameters())) def natural_gradient(self, pg, rollouts): def Hx(x): ''' Computes the Hessian-Vector product for the KL-Divergance ''' d_kl = self.get_kl(self.actor, rollouts) grads = torch.autograd.grad(d_kl, self.actor.parameters(), create_graph=True) grads = parameters_to_vector(grads) Jx = torch.sum(grads * x) Hx = torch.autograd.grad(Jx, self.actor.parameters()) Hx = 
parameters_to_vector(Hx) return Hx + self.param.CG_DAMPING * x stepdir = self.conjugate_gradient(Hx, pg, self.param.NUM_CG_ITER) stepsize = (2 * self.param.DELTA) / torch.dot(stepdir, Hx(stepdir)) return torch.sqrt(stepsize) * stepdir def gae(self, rollouts): ''' Generaized Advantage Estimation ''' states = torch.cat((rollouts.state, rollouts.next_state[-1:])) with torch.no_grad(): values = self.critic(states).numpy() rewards = rollouts.reward.numpy() deltas = rewards + self.param.GAMMA * values[1:] - values[:-1] # rrlab magic discounting returns = scipy.signal.lfilter([1], [1, float(-self.param.GAMMA)], rewards[::-1], axis=0).astype('float32') advantages = scipy.signal.lfilter( [1], [1, float(-self.param.GAMMA * self.param.LAMBDA)], deltas[::-1], axis=0).astype('float32') return torch.flip(torch.from_numpy(advantages), dims=[0]), torch.flip(torch.from_numpy(returns), dims=[0]) def get_kl(self, model, rollouts): ''' Computes the KL-Divergance between the current policy and the model passed ''' with torch.no_grad(): p_old = self.actor.policy(rollouts['states']) p_new = model.policy(rollouts['states']) d_kl = kl_divergence(p_old, p_new).sum(dim=-1, keepdim=True).mean() return d_kl def conjugate_gradient(self, A, b, n): x = torch.zeros_like(b) r = b.clone() p = r.clone() rs = torch.dot(r, r) for i in range(n): if callable(A): Ap = A(p) else: Ap = torch.matmul(A, p) alpha = rs / torch.dot(p, Ap) x += alpha * p r -= alpha * Ap rs_next = torch.dot(r, r) betta = rs_next / rs p = r + betta * p rs = rs_next if rs < 1e-10: break return x def linesearch(self, npg, pg, rollouts): params_curr = parameters_to_vector(self.actor.parameters()) for k in range(self.param.NUM_BACKTRACK): params_new = params_curr + self.param.ALPHA**k * npg model_new = deepcopy(self.actor) vector_to_parameters(params_new, model_new.parameters()) param_diff = params_new - params_curr surr_loss = torch.dot(pg, param_diff) kl_div = self.get_kl(model_new, rollouts) if surr_loss >= 0 and kl_div <= self.param.DELTA: params_curr = params_new break return params_curr, (self.param.ALPHA**k * npg).norm()
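# The conjugate-gradient routine above solves H x = g without ever forming H explicitly.
# A standalone copy of the same iteration, checked against a direct solve on a small
# symmetric positive-definite system (illustration only):
import torch

def conjugate_gradient(A, b, n, tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()
    p = r.clone()
    rs = torch.dot(r, r)
    for _ in range(n):
        Ap = A(p) if callable(A) else torch.matmul(A, p)
        alpha = rs / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        rs_next = torch.dot(r, r)
        p = r + (rs_next / rs) * p
        rs = rs_next
        if rs < tol:
            break
    return x

A = torch.tensor([[4.0, 1.0], [1.0, 3.0]])
b = torch.tensor([1.0, 2.0])
x = conjugate_gradient(A, b, n=10)
print(torch.allclose(x, torch.linalg.solve(A, b), atol=1e-5))  # True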
def compare_cost(args): set_seed(args.seed) env = LQR( #N=20, #M=12, init_scale=1.0, max_steps=args.H, # 10, 20 Sigma_s_kappa=1.0, Q_kappa=1.0, P_kappa=1.0, A_norm=1.0, B_norm=1.0, Sigma_s_scale=0.0, ) K = env.optimal_controller() mean_network = nn.Linear(*K.shape[::-1], bias=False) mean_network.weight.data = tensor(K) policy = GaussianPolicy(*K.shape[::-1], mean_network, learn_std=False, gate_output=False) # mc mc_costs = [] # individual mc_means = [] # cumulative for i in tqdm(range(args.n_trajs), 'mc'): noises = np.random.randn(env.max_steps, env.M) _, _, rewards, _, _ = rollout(env, policy, noises) mc_costs.append(-rewards.sum()) mc_means.append(np.mean(mc_costs)) # rqmc rqmc_costs = [] rqmc_means = [] rqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'trajwise') for i in tqdm(range(args.n_trajs), 'rqmc'): _, _, rewards, _, _ = rollout(env, policy, rqmc_noises[i]) rqmc_costs.append(-rewards.sum()) rqmc_means.append(np.mean(rqmc_costs)) # array rqmc arqmc_costs_dict = {} arqmc_means_dict = {} arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'ssj') #arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'array') for sorter in args.sorter: arqmc_costs = [] arqmc_means = [] sort_f = get_sorter(sorter, env) data = ArrayRQMCSampler(env, args.n_trajs, sort_f=sort_f).sample(policy, arqmc_noises) for traj in data: rewards = np.asarray(traj['rewards']) arqmc_costs.append(-rewards.sum()) arqmc_means.append(np.mean(arqmc_costs)) arqmc_costs_dict[sorter] = arqmc_costs arqmc_means_dict[sorter] = arqmc_means expected_cost = env.expected_cost(K, np.diag(np.ones(env.M))) mc_errors = np.abs(mc_means - expected_cost) rqmc_errors = np.abs(rqmc_means - expected_cost) arqmc_errors_dict = { sorter: np.abs(arqmc_means - expected_cost) for sorter, arqmc_means in arqmc_means_dict.items() } logger.info('mc: {}, rqmc: {} '.format(mc_errors[-1], rqmc_errors[-1]) + \ ' '.join(['arqmc ({}): {}'.format(sorter, arqmc_errors[-1]) for sorter, arqmc_errors in arqmc_errors_dict.items()])) info = { **vars(args), 'mc_costs': mc_costs, 'rqmc_costs': rqmc_costs, 'arqmc_costs': arqmc_costs } if args.save_fn is not None: with open(args.save_fn, 'wb') as f: dill.dump( dict(mc_errors=mc_errors, rqmc_errors=rqmc_errors, arqmc_errors_dict=arqmc_errors_dict, info=info), f) if args.show_fig: data = pd.concat([ pd.DataFrame({ 'name': 'mc', 'x': np.arange(len(mc_errors)), 'error': mc_errors, }), pd.DataFrame({ 'name': 'rqmc', 'x': np.arange(len(rqmc_errors)), 'error': rqmc_errors, }), pd.concat([ pd.DataFrame({ 'name': 'arqmc_{}'.format(sorter), 'x': np.arange(len(arqmc_errors)), 'error': arqmc_errors, }) for sorter, arqmc_errors in arqmc_errors_dict.items() ]), ]) plot = sns.lineplot(x='x', y='error', hue='name', data=data) plot.set(yscale='log') plt.show() return mc_errors, rqmc_errors, arqmc_errors_dict, info
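# The mc_means / rqmc_means / arqmc_means lists above are running (cumulative) averages
# of the per-trajectory costs; a vectorized equivalent for reference:
import numpy as np

costs = np.array([3.0, 1.0, 2.0, 6.0])
running_mean = np.cumsum(costs) / np.arange(1, len(costs) + 1)
print(running_mean)  # [3. 2. 2. 3.]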
def compare_grad(args): set_seed(args.seed) env = LQR( N=args.xu_dim[0], M=args.xu_dim[1], lims=100, init_scale=1.0, max_steps=args.H, Sigma_s_kappa=1.0, Q_kappa=1.0, P_kappa=1.0, A_norm=1.0, B_norm=1.0, Sigma_s_scale=args.noise, ) #K = env.optimal_controller() K = np.random.randn(env.M, env.N) mean_network = nn.Linear(*K.shape[::-1], bias=False) mean_network.weight.data = tensor(K) policy = GaussianPolicy(*K.shape[::-1], mean_network, learn_std=False, gate_output=False) out_set = set() # here Sigma_a = np.diag(np.ones(env.M)) mc_grads = [] for i in tqdm(range(args.n_trajs), 'mc'): noises = np.random.randn(env.max_steps, env.M) states, actions, rewards, _, _ = rollout(env, policy, noises) if len(states) < args.H: out_set.add('mc') break mc_grads.append( get_gaussian_policy_gradient(states, actions, rewards, policy, variance_reduced_loss)) mc_grads = np.asarray(mc_grads) mc_means = np.cumsum(mc_grads, axis=0) / np.arange( 1, len(mc_grads) + 1)[:, np.newaxis, np.newaxis] rqmc_grads = [] #loc = torch.zeros(env.max_steps * env.M) #scale = torch.ones(env.max_steps * env.M) #rqmc_noises = Normal_RQMC(loc, scale).sample(torch.Size([args.n_trajs])).data.numpy() rqmc_noises = uniform2normal( random_shift( ssj_uniform( args.n_trajs, args.H * env.M, ).reshape(args.n_trajs, args.H, env.M), 0, )) for i in tqdm(range(args.n_trajs), 'rqmc'): states, actions, rewards, _, _ = rollout( env, policy, rqmc_noises[i].reshape(env.max_steps, env.M)) if len(states) < args.H: out_set.add('rqmc') break rqmc_grads.append( get_gaussian_policy_gradient(states, actions, rewards, policy, variance_reduced_loss)) rqmc_grads = np.asarray(rqmc_grads) rqmc_means = np.cumsum(rqmc_grads, axis=0) / np.arange( 1, len(rqmc_grads) + 1)[:, np.newaxis, np.newaxis] arqmc_means_dict = {} #arqmc_noises = get_rqmc_noises(args.n_trajs, args.H, env.M, 'array') uniform_noises = ssj_uniform(args.n_trajs, env.M) # n_trajs , action_dim arqmc_noises = uniform2normal( random_shift(np.expand_dims(uniform_noises, 1).repeat(args.H, 1), 0)) # n_trajs, horizon, action_dim for sorter in args.sorter: arqmc_grads = [] sort_f = get_sorter(sorter, env, K) data = ArrayRQMCSampler(env, args.n_trajs, sort_f=sort_f).sample(policy, arqmc_noises) for traj in data: states, actions, rewards = np.asarray(traj['states']), np.asarray( traj['actions']), np.asarray(traj['rewards']) if len(states) < args.H: out_set.add('arqmc_{}'.format(sorter)) break arqmc_grads.append( get_gaussian_policy_gradient(states, actions, rewards, policy, variance_reduced_loss)) arqmc_grads = np.asarray(arqmc_grads) arqmc_means = np.cumsum(arqmc_grads, axis=0) / np.arange( 1, len(arqmc_grads) + 1)[:, np.newaxis, np.newaxis] arqmc_means_dict[sorter] = arqmc_means expected_grad = env.expected_policy_gradient(K, Sigma_a) mc_errors = [np.nan] if 'mc' in out_set else (( mc_means - expected_grad)**2).reshape(mc_means.shape[0], -1).mean( 1) # why the sign is reversed? 
rqmc_errors = [np.nan] if 'rqmc' in out_set else ( (rqmc_means - expected_grad)**2).reshape(rqmc_means.shape[0], -1).mean(1) arqmc_errors_dict = { sorter: [np.nan] if 'arqmc_{}'.format(sorter) in out_set else ((arqmc_means - expected_grad)**2).reshape(arqmc_means.shape[0], -1).mean(1) for sorter, arqmc_means in arqmc_means_dict.items() } info = { **vars(args), 'out': out_set, 'expected_grad': expected_grad, 'means': { 'mc': mc_means, 'rqmc': rqmc_means, **arqmc_means_dict, }, } if args.save_fn is not None: with open(save_fn, 'wb') as f: dill.dump( dict(mc_errors=mc_errors, rqmc_errors=rqmc_errors, arqmc_errors_dict=arqmc_errors_dict, info=info), f) if args.show_fig: mc_data = pd.DataFrame({ 'name': 'mc', 'x': np.arange(len(mc_errors)), 'error': mc_errors, }) rqmc_data = pd.DataFrame({ 'name': 'rqmc', 'x': np.arange(len(rqmc_errors)), 'error': rqmc_errors, }) arqmc_data = pd.concat([ pd.DataFrame({ 'name': 'arqmc_{}'.format(sorter), 'x': np.arange(len(arqmc_errors)), 'error': arqmc_errors, }) for sorter, arqmc_errors in arqmc_errors_dict.items() ]) plot = sns.lineplot(x='x', y='error', hue='name', data=pd.concat([mc_data, rqmc_data, arqmc_data])) plot.set(yscale='log') plt.show() return mc_errors, rqmc_errors, arqmc_errors_dict, info
class SACAgent: def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr, buffer_maxlen): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.env = env self.action_range = [env.action_space.low, env.action_space.high] self.obs_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] # hyperparameters self.gamma = gamma self.tau = tau # initialize networks self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.policy_net = GaussianPolicy(self.obs_dim, self.action_dim).to(self.device) # copy params to target param for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()): target_param.data.copy_(param) for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): target_param.data.copy_(param) # initialize optimizers self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr) self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr) self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) # entropy temperature self.alpha = alpha self.target_entropy = -torch.prod( torch.Tensor(self.env.action_space.shape).to(self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr) self.replay_buffer = Buffer(buffer_maxlen) def get_action(self, state): state = torch.FloatTensor(state).unsqueeze(0).to(self.device) mean, log_std = self.policy_net.forward(state) std = log_std.exp() normal = Normal(mean, std) z = normal.sample() action = torch.tanh(z) action = action.cpu().detach().squeeze(0).numpy() return action def update(self, batch_size): states, actions, rewards, next_states, dones = self.replay_buffer.sample( batch_size) states = torch.FloatTensor(states).to(self.device) actions = torch.FloatTensor(actions).to(self.device) rewards = torch.FloatTensor(rewards).to(self.device) next_states = torch.FloatTensor(next_states).to(self.device) dones = torch.FloatTensor(dones).to(self.device) dones = dones.view(dones.size(0), -1) _, _, next_zs, next_log_pi = self.policy_net.sample(next_states) next_actions = torch.tanh(next_zs) next_q1 = self.target_q_net1(next_states, next_actions) next_q2 = self.target_q_net2(next_states, next_actions) next_q_target = torch.min(next_q1, next_q2) - self.alpha * next_log_pi expected_q = rewards + (1 - dones) * self.gamma * next_q_target # q loss curr_q1 = self.q_net1.forward(states, actions) curr_q2 = self.q_net2.forward(states, actions) q1_loss = F.mse_loss(curr_q1, expected_q.detach()) q2_loss = F.mse_loss(curr_q2, expected_q.detach()) # update q networks self.q1_optimizer.zero_grad() q1_loss.backward() self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward() self.q2_optimizer.step() # delayed update for policy network and target q networks _, _, new_zs, log_pi = self.policy_net.sample(states) new_actions = torch.tanh(new_zs) min_q = torch.min(self.q_net1.forward(states, new_actions), self.q_net2.forward(states, new_actions)) policy_loss = (self.alpha * log_pi - min_q).mean() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() # target networks for target_param, param in zip(self.target_q_net1.parameters(), 
self.q_net1.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) # update temperature alpha_loss = (self.log_alpha * (-log_pi - self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp()
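# Hypothetical driver loop for SACAgent. The environment id, hyperparameter values and
# the replay-buffer interface (`push`, `__len__`) are assumptions, not part of the code above.
import gym

env = gym.make("Pendulum-v1")
agent = SACAgent(env, gamma=0.99, tau=0.005, alpha=0.2,
                 q_lr=3e-4, policy_lr=3e-4, a_lr=3e-4, buffer_maxlen=1_000_000)

batch_size = 256
for episode in range(100):
    state, done = env.reset(), False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)
        state = next_state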
class SAC(object): """Soft Actor-Critic algorithm [1] Haarnoja(2018), "Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor" """ def __init__( self, env, policy=None, # Learning models nets_hidden_sizes=(64, 64), nets_nonlinear_op='relu', use_q2=True, explicit_vf=False, # RL algorithm behavior total_episodes=10, train_steps=100, eval_rollouts=10, max_horizon=100, fixed_horizon=True, # Target models update soft_target_tau=5e-3, target_update_interval=1, # Replay Buffer replay_buffer_size=1e6, batch_size=64, discount=0.99, # Optimization optimization_steps=1, optimizer='adam', optimizer_kwargs=None, policy_lr=3e-4, qf_lr=3e-4, policy_weight_decay=1.e-5, q_weight_decay=1.e-5, # Entropy entropy_scale=1., auto_alpha=True, max_alpha=10, min_alpha=0.01, tgt_entro=None, # Others norm_input_pol=False, norm_input_vfs=False, seed=610, render=False, gpu_id=-1, ): """Soft Actor-Critic algorithm. Args: env (gym.Env): OpenAI-Gym-like environment with multigoal option. policy (torch.nn.module): A pytorch stochastic Gaussian Policy nets_hidden_sizes (list or tuple of int): Number of units in hidden layers for all the networks. use_q2 (bool): Use two parameterized Q-functions. explicit_vf (bool): total_episodes (int): train_steps (int): eval_rollouts (int): max_horizon (int): fixed_horizon (bool): soft_target_tau (float): target_update_interval (int): replay_buffer_size (int): batch_size (int): discount (float): optimization_steps (int): optimizer (str): optimizer_kwargs (dict): policy_lr (float): qf_lr (float): policy_weight_decay (float): q_weight_decay (float): entropy_scale (float): auto_alpha (int): max_alpha (float): min_alpha (float): tgt_entro (float): norm_input_pol (bool): norm_input_vfs (bool): seed (int): render (bool): gpu_id (int): """ self.seed = seed np.random.seed(seed) torch.cuda.manual_seed(seed) torch.manual_seed(seed) self.env = env self.env.seed(seed) # Algorithm hyperparameters self.obs_dim = np.prod(env.observation_space.shape).item() self.action_dim = np.prod(env.action_space.shape).item() self.total_episodes = total_episodes self.train_steps = train_steps self.eval_rollouts = eval_rollouts self.max_horizon = max_horizon self.fixed_horizon = fixed_horizon self.render = render self.discount = discount self.soft_target_tau = soft_target_tau self.target_update_interval = target_update_interval self.norm_input_pol = norm_input_pol self.norm_input_vfs = norm_input_vfs # Policy Network if policy is None: self.policy = GaussianPolicy( self.obs_dim, self.action_dim, nets_hidden_sizes, non_linear=nets_nonlinear_op, final_non_linear='linear', batch_norm=False, input_normalization=norm_input_pol, ) else: self.policy = policy # Value Function Networks self.qf1 = QFunction( self.obs_dim, self.action_dim, nets_hidden_sizes, non_linear=nets_nonlinear_op, final_non_linear='linear', batch_norm=False, input_normalization=norm_input_vfs, ) if use_q2: self.qf2 = QFunction( self.obs_dim, self.action_dim, nets_hidden_sizes, non_linear=nets_nonlinear_op, final_non_linear='linear', batch_norm=False, input_normalization=norm_input_vfs, ) else: self.qf2 = None if explicit_vf: self.vf = VFunction( self.obs_dim, nets_hidden_sizes, non_linear=nets_nonlinear_op, final_non_linear='linear', batch_norm=False, input_normalization=norm_input_vfs, ) self.target_vf = VFunction( self.obs_dim, nets_hidden_sizes, non_linear=nets_nonlinear_op, final_non_linear='linear', batch_norm=False, input_normalization=norm_input_vfs, ) self.target_vf.load_state_dict(self.vf.state_dict()) 
self.target_vf.eval() self.target_qf1 = None self.target_qf2 = None else: self.vf = None self.target_vf = None self.target_qf1 = QFunction( self.obs_dim, self.action_dim, nets_hidden_sizes, non_linear=nets_nonlinear_op, final_non_linear='linear', batch_norm=False, input_normalization=norm_input_vfs, ) self.target_qf1.load_state_dict(self.qf1.state_dict()) self.target_qf1.eval() if use_q2: self.target_qf2 = QFunction( self.obs_dim, self.action_dim, nets_hidden_sizes, non_linear=nets_nonlinear_op, final_non_linear='linear', batch_norm=False, input_normalization=norm_input_vfs, ) self.target_qf2.load_state_dict(self.qf2.state_dict()) self.target_qf2.eval() else: self.target_qf2 = None # Replay Buffer self.replay_buffer = ReplayBuffer( max_size=int(replay_buffer_size), obs_dim=self.obs_dim, action_dim=self.action_dim, ) self.batch_size = batch_size # Move models to GPU self.torch_device = \ torch.device("cuda:" + str(gpu_id) if gpu_id >= 0 else "cpu") for model in self.trainable_models + self.non_trainable_models: model.to(device=self.torch_device) # Ensure non trainable models have fixed parameters for model in self.non_trainable_models: model.eval() # # TODO: Should we also set its parameters to requires_grad=False? # for param in model.parameters(): # param.requires_grad = False # ###### # # Alphas # # ###### # self.entropy_scale = torch.tensor(entropy_scale, device=self.torch_device) if tgt_entro is None: tgt_entro = -self.action_dim self.tgt_entro = torch.tensor(tgt_entro, device=self.torch_device) self._auto_alpha = auto_alpha self.max_alpha = max_alpha self.min_alpha = min_alpha self.log_alpha = torch.zeros(1, device=self.torch_device, requires_grad=True) # ########## # # Optimizers # # ########## # self.optimization_steps = optimization_steps if optimizer.lower() == 'adam': optimizer_class = torch.optim.Adam if optimizer_kwargs is None: optimizer_kwargs = dict(amsgrad=True, # amsgrad=False, ) elif optimizer.lower() == 'rmsprop': optimizer_class = torch.optim.RMSprop if optimizer_kwargs is None: optimizer_kwargs = dict() else: raise ValueError('Wrong optimizer') # Values optimizer qvals_params = self.qf1.parameters() if self.qf2 is not None: qvals_params = chain(qvals_params, self.qf2.parameters()) self.qvalues_optimizer = optimizer_class(qvals_params, lr=qf_lr, weight_decay=q_weight_decay, **optimizer_kwargs) if self.vf is not None: self.vvalues_optimizer = optimizer_class( self.vf.parameters(), lr=qf_lr, weight_decay=q_weight_decay, **optimizer_kwargs) else: self.vvalues_optimizer = None # Policy optimizer self._policy_optimizer = optimizer_class( self.policy.parameters(), lr=policy_lr, weight_decay=policy_weight_decay, **optimizer_kwargs) # Alpha optimizers self._alphas_optimizer = optimizer_class([self.log_alpha], lr=policy_lr, **optimizer_kwargs) # Internal variables self.num_train_interactions = 0 self.num_train_steps = 0 self.num_eval_interactions = 0 self.num_episodes = 0 # Log variables self.logging_qvalues_error = 0 self.logging_vvalues_error = 0 self.logging_policies_error = 0 self.logging_entropy = torch.zeros(self.batch_size) self.logging_mean = torch.zeros((self.batch_size, self.action_dim)) self.logging_std = torch.zeros((self.batch_size, self.action_dim)) self.logging_eval_rewards = torch.zeros(self.eval_rollouts) self.logging_eval_returns = torch.zeros(self.eval_rollouts) @property def trainable_models(self): models = [self.policy, self.qf1] if self.qf2 is not None: models.append(self.qf2) if self.vf is not None: models.append(self.vf) return models @property def 
non_trainable_models(self): models = [self.target_qf1] if self.target_qf2 is not None: models.append(self.target_qf2) if self.target_vf is not None: models.append(self.target_vf) return models def train(self, init_episode=0): if init_episode == 0: # Eval and log self.eval() self.log(write_table_header=True) gt.reset() gt.set_def_unique(False) expected_accum_rewards = np.zeros(self.total_episodes) episodes_iter = range(init_episode, self.total_episodes) if not logger.get_log_stdout(): # Fancy iterable bar episodes_iter = tqdm.tqdm(episodes_iter) for it in gt.timed_for(episodes_iter, save_itrs=True): # Put models in training mode for model in self.trainable_models: model.train() obs = self.env.reset() rollout_steps = 0 for step in range(self.train_steps): if self.render: self.env.render() interaction_info = interaction( self.env, self.policy, obs, device=self.torch_device, deterministic=False, ) self.num_train_interactions += 1 rollout_steps += 1 gt.stamp('sample') # Add data to replay_buffer self.replay_buffer.add_sample(**interaction_info) # Only train when there are enough samples from buffer if self.replay_buffer.available_samples() > self.batch_size: for ii in range(self.optimization_steps): self.learn() gt.stamp('train') # Reset environment if it is done if interaction_info['termination'] \ or rollout_steps > self.max_horizon: obs = self.env.reset() rollout_steps = 0 else: obs = interaction_info['next_obs'] # Evaluate current policy to check performance expected_accum_rewards[it] = self.eval() self.log() self.num_episodes += 1 return expected_accum_rewards def eval(self): """Evaluate deterministically the Gaussian policy. Returns: np.array: Expected accumulated reward """ # Put models in evaluation mode for model in self.trainable_models: model.eval() for rr in range(self.eval_rollouts): rollout_info = rollout( self.env, self.policy, max_horizon=self.max_horizon, fixed_horizon=self.fixed_horizon, render=self.render, return_info_dict=True, device=self.torch_device, deterministic=True, ) self.logging_eval_rewards[rr] = torch.tensor( rollout_info['reward']).mean() self.logging_eval_returns[rr] = torch.tensor( rollout_info['reward']).sum() self.num_eval_interactions += 1 gt.stamp('eval') return self.logging_eval_returns.mean().item() def learn(self): """Improve the Gaussian policy with the Soft Actor-Critic algorithm. 
Returns: None """ # Get batch from the replay buffer batch = self.replay_buffer.random_batch(self.batch_size, device=self.torch_device) # Get common data from batch obs = batch['observations'] actions = batch['actions'] next_obs = batch['next_observations'] rewards = batch['rewards'] terminations = batch['terminations'] policy_prior_log_prob = 0.0 # Uniform prior # TODO: Normal prior # Alphas alpha = self.entropy_scale * self.log_alpha.exp() # Actions for batch observation new_actions, policy_info = self.policy(obs, deterministic=False, return_log_prob=True) new_log_pi = policy_info['log_prob'] new_mean = policy_info['mean'] new_std = policy_info['std'] # Actions for batch next_observation with torch.no_grad(): next_actions, policy_info = self.policy(next_obs, deterministic=False, return_log_prob=True) next_log_pi = policy_info['log_prob'] # ###################### # # Policy Evaluation Step # # ###################### # if self.target_vf is None: with torch.no_grad(): # Estimate from target Q-value(s) # Q1_target(s', a') next_q1 = self.target_qf1(next_obs, next_actions) if self.target_qf2 is not None: # Q2_target(s', a') next_q2 = self.target_qf2(next_obs, next_actions) # Minimum Unintentional Double-Q next_q = torch.min(next_q1, next_q2) else: next_q = next_q1 # Vtarget(s') next_v = next_q - alpha * next_log_pi else: with torch.no_grad(): # Vtarget(s') next_v = self.target_vf(next_obs) # Calculate Bellman Backup for Q-values q_backup = rewards + (1. - terminations) * self.discount * next_v # Prediction Q(s,a) q1_pred = self.qf1(obs, actions) # Critic loss: Mean Squared Bellman Error (MSBE) qf1_loss = \ 0.5 * torch.mean((q1_pred - q_backup) ** 2, dim=0).squeeze(-1) if self.qf2 is not None: q2_pred = self.qf2(obs, actions) # Critic loss: Mean Squared Bellman Error (MSBE) qf2_loss = \ 0.5 * torch.mean((q2_pred - q_backup)**2, dim=0).squeeze(-1) else: qf2_loss = 0 self.qvalues_optimizer.zero_grad() qvalues_loss = qf1_loss + qf2_loss qvalues_loss.backward() self.qvalues_optimizer.step() # ####################### # # Policy Improvement Step # # ####################### # # TODO: Decide if use the minimum btw q1 and q2. 
Using new_q1 for now new_q1 = self.qf1(obs, new_actions) new_q = new_q1 # Policy KL loss: - (E_a[Q(s, a) + H(.)]) policy_kl_loss = -torch.mean( new_q - alpha * new_log_pi + policy_prior_log_prob, dim=0) policy_regu_loss = 0 # TODO: It can include regularization of mean, std policy_loss = torch.sum(policy_kl_loss + policy_regu_loss) # Update both Intentional and Unintentional Policies at the same time self._policy_optimizer.zero_grad() policy_loss.backward() self._policy_optimizer.step() # ################################# # # (Optional) V-fcn improvement step # # ################################# # if self.vf is not None: v_pred = self.vf(obs) # Calculate Bellman Backup for Q-values v_backup = new_q - alpha * new_log_pi + policy_prior_log_prob v_backup.detach_() # Critic loss: Mean Squared Bellman Error (MSBE) vf_loss = \ 0.5 * torch.mean((v_pred - v_backup)**2, dim=0).squeeze(-1) self.vvalues_optimizer.zero_grad() vvalues_loss = vf_loss vvalues_loss.backward() self.vvalues_optimizer.step() # ####################### # # Entropy Adjustment Step # # ####################### # if self._auto_alpha: # NOTE: In formula is alphas and not log_alphas alphas_loss = -( self.log_alpha * (new_log_pi.squeeze(-1) + self.tgt_entro).mean(dim=0).detach()) hiu_alphas_loss = alphas_loss.sum() self._alphas_optimizer.zero_grad() hiu_alphas_loss.backward() self._alphas_optimizer.step() self.log_alpha.data.clamp_(min=math.log(self.min_alpha), max=math.log(self.max_alpha)) # ########################### # # Target Networks Update Step # # ########################### # if self.num_train_steps % self.target_update_interval == 0: if self.target_vf is None: soft_param_update_from_to(source=self.qf1, target=self.target_qf1, tau=self.soft_target_tau) if self.target_qf2 is not None: soft_param_update_from_to(source=self.qf2, target=self.target_qf2, tau=self.soft_target_tau) else: soft_param_update_from_to(source=self.vf, target=self.target_vf, tau=self.soft_target_tau) # Always hard_update of input normalizer (if active) if self.norm_input_vfs: if self.target_vf is None: hard_buffer_update_from_to( source=self.qf1, target=self.target_qf1, ) if self.target_qf2 is not None: hard_buffer_update_from_to( source=self.qf2, target=self.target_qf2, ) else: hard_buffer_update_from_to( source=self.vf, target=self.target_vf, ) # Increase internal counter self.num_train_steps += 1 # ######## # # Log data # # ######## # self.logging_policies_error = policy_loss.item() self.logging_qvalues_error = qvalues_loss.item() self.logging_vvalues_error = vvalues_loss.item() \ if self.target_vf is not None else 0. 
self.logging_entropy.data.copy_(-new_log_pi.squeeze(dim=-1).data) self.logging_mean.data.copy_(new_mean.data) self.logging_std.data.copy_(new_std.data) def save_training_state(self): """Save models Returns: None """ models_dict = { 'policy': self.policy, 'qf1': self.qf1, 'qf2': self.qf2, 'target_qf1': self.target_qf1, 'target_qf2': self.target_qf2, 'vf': self.vf, } replaceable_models_dict = { 'replay_buffer', self.replay_buffer, } logger.save_torch_models(self.num_episodes, models_dict, replaceable_models_dict) def load_training_state(self): pass def log(self, write_table_header=False): logger.log("Logging data in directory: %s" % logger.get_snapshot_dir()) logger.record_tabular("Episode", self.num_episodes) logger.record_tabular("Accumulated Training Steps", self.num_train_interactions) logger.record_tabular("Policy Error", self.logging_policies_error) logger.record_tabular("Q-Value Error", self.logging_qvalues_error) logger.record_tabular("V-Value Error", self.logging_vvalues_error) logger.record_tabular("Alpha", np_ify(self.log_alpha.exp()).item()) logger.record_tabular("Entropy", np_ify(self.logging_entropy.mean(dim=(0, )))) act_mean = np_ify(self.logging_mean.mean(dim=(0, ))) act_std = np_ify(self.logging_std.mean(dim=(0, ))) for aa in range(self.action_dim): logger.record_tabular("Mean Action %02d" % aa, act_mean[aa]) logger.record_tabular("Std Action %02d" % aa, act_std[aa]) # Evaluation Stats to plot logger.record_tabular("Test Rewards Mean", np_ify(self.logging_eval_rewards.mean())) logger.record_tabular("Test Rewards Std", np_ify(self.logging_eval_rewards.std())) logger.record_tabular("Test Returns Mean", np_ify(self.logging_eval_returns.mean())) logger.record_tabular("Test Returns Std", np_ify(self.logging_eval_returns.std())) # Add the previous times to the logger times_itrs = gt.get_times().stamps.itrs train_time = times_itrs.get('train', [0])[-1] sample_time = times_itrs.get('sample', [0])[-1] eval_time = times_itrs.get('eval', [0])[-1] epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) # Dump the logger data logger.dump_tabular(with_prefix=False, with_timestamp=False, write_header=write_table_header) # Save pytorch models self.save_training_state() logger.log("----")
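# `learn()` above delegates the target-network update to `soft_param_update_from_to`.
# A sketch of the Polyak averaging that helper presumably performs,
# target <- tau * source + (1 - tau) * target (the real helper may differ in details):
import torch

def soft_param_update_from_to(source, target, tau):
    with torch.no_grad():
        for src_p, tgt_p in zip(source.parameters(), target.parameters()):
            tgt_p.data.mul_(1.0 - tau).add_(tau * src_p.data)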
class PPO(BaseRL, OnPolicy): def __init__(self, env, param=None): super(PPO, self).__init__(env, param=param) self.name = 'PPO' self.critic = ValueFunction(self.param.value , self.device) self.actor = GaussianPolicy(self.param.policy, self.device) self.steps = 0 self.episode_steps = 0 if self.param.LR_SCHEDULE: schedule = lambda epoch: 1 - epoch/(self.param.evaluation['total_timesteps'] // self.param.BATCH_SIZE) else: schedule = lambda epoch: 1 self.actor_scheduler = optim.lr_scheduler.LambdaLR(self.actor.optimizer, schedule) self.critic_scheduler = optim.lr_scheduler.LambdaLR(self.critic.optimizer, schedule) def act(self, state, deterministic=False): self.steps += 1 with torch.no_grad(): s = torch.from_numpy(state).float().to(self.device) if self.steps < self.param.DELAYED_START: action = self.env.action_space.sample() else: self.actor.eval() action = self.actor(s, deterministic=deterministic).cpu().numpy() a = torch.from_numpy(action).float().to(self.device) next_state, reward, done, _ = self.env.step(action) if not deterministic: done_bool = float(done) self.critic.eval() s_ = np.stack([state, next_state]) s_ = torch.from_numpy(s_).float().to(self.device) value, next_value = self.critic(s_) log_pi = self.actor.log_prob(s, a) self.memory.store(state, action, reward, next_state, done_bool, value, next_value, log_pi) if done: self.memory.process_episode(maximum_entropy=self.param.MAX_ENTROPY) return next_state, reward, done @OnPolicy.loop def learn(self): pg_norm = 0 rollouts = self.onPolicyData if self.param.ADVANTAGE_NORMALIZATION: rollouts['advantages'] = (rollouts['advantages'] - rollouts['advantages'].mean()) / (rollouts['advantages'].std() + 1e-5) for _ in range(self.param.EPOCHS): generator = self.data_generator(rollouts) for mini_batch in generator: s, a, returns, old_values, old_log_probs, advantages = mini_batch # Critic Step self.critic.train() values = self.critic(s) if self.param.CLIPPED_VALUE: critic_loss = self.clipped_value_loss(old_val, values, returns) else: critic_loss = F.mse_loss(values, returns) self.critic.optimize(critic_loss) # Actor Step self.actor.train() log_probs = self.actor.log_prob(s,a) kl_div = (old_log_probs-log_probs).mean() # Early Stopping if self.param.EARLY_STOPPING and kl_div > 2 * self.param.MAX_KL_DIV: # print('Early stopping at epoch {} due to reaching max kl.'.format(i)) break actor_loss = self.clipped_policy_objective(old_log_probs, log_probs, advantages) actor_loss -= self.param.ENTROPY_COEFFICIENT * log_probs.mean() actor_loss += self.param.CUTOFF_COEFFICIENT * (kl_div > 2 * self.param.MAX_KL_DIV) * (kl_div - self.param.MAX_KL_DIV)**2 pg_norm += self.actor.optimize(actor_loss) self.critic_scheduler.step() self.actor_scheduler.step() metrics = dict() with torch.no_grad(): metrics['explained_variance'] = (1 - (rollouts['returns_mc'] - rollouts['values']).pow(2).sum()/(rollouts['returns_mc']-rollouts['returns_mc'].mean() + 1e-5).pow(2).sum()).item() metrics['entropy'] = self.actor.entropy(rollouts['states']).mean().item() metrics['kl'] = kl_div.item() metrics['pg_norm'] = pg_norm return metrics ################################################################ ########################## Utilities ########################### ################################################################ def clipped_policy_objective(self, old_log_pi, log_pi, adv): ratio = torch.exp(log_pi - old_log_pi) loss = ratio * adv clipped_loss = torch.clamp(ratio, 1 - self.param.CLIP, 1 + self.param.CLIP) * adv return -torch.min(loss, clipped_loss).mean() def 
clipped_value_loss(self, old_val, val, ret): loss = (val - ret).pow(2) clipped_loss = ((old_val + torch.clamp(val - old_val, -self.param.CLIP, self.param.CLIP)) - ret).pow(2) return torch.max(loss, clipped_loss).mean() def data_generator(self, rollouts): if self.param.NUM_MINI_BATCHES > 0: mini_batch_size = self.param.BATCH_SIZE // self.param.NUM_MINI_BATCHES random_sampler = SubsetRandomSampler(range(self.param.BATCH_SIZE)) batch_sampler = BatchSampler(random_sampler, mini_batch_size, drop_last=True) for indices in batch_sampler: s = rollouts['states'][indices] a = rollouts['actions'][indices] ret = rollouts['returns_gae'][indices] val = rollouts['values'][indices] pi = rollouts['log_probs'][indices] adv = rollouts['advantages'][indices] yield s, a, ret, val, pi, adv else: s = rollouts['states'] a = rollouts['actions'] ret = rollouts['returns_gae'] val = rollouts['values'] pi = rollouts['log_probs'] adv = rollouts['advantages'] yield s, a, ret, val, pi, adv
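# Quick numeric illustration of clipped_policy_objective above: with CLIP = 0.2, a
# probability ratio of 1.5 on a positive advantage is cut back to 1.2 * advantage.
import torch

clip = 0.2
old_log_pi = torch.log(torch.tensor([0.2]))
log_pi = torch.log(torch.tensor([0.3]))  # ratio = 1.5
adv = torch.tensor([1.0])

ratio = torch.exp(log_pi - old_log_pi)
loss = ratio * adv
clipped_loss = torch.clamp(ratio, 1 - clip, 1 + clip) * adv
objective = -torch.min(loss, clipped_loss).mean()
print(round(ratio.item(), 3), round(objective.item(), 3))  # 1.5 -1.2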
class SAC:

    MAX_EXPERIENCES = 100000
    MIN_EXPERIENCES = 512
    UPDATE_PERIOD = 4
    GAMMA = 0.99
    TAU = 0.005
    BATCH_SIZE = 256

    def __init__(self, env_id, action_space, action_bound):
        self.env_id = env_id
        self.action_space = action_space
        self.action_bound = action_bound
        self.env = gym.make(self.env_id)
        self.replay_buffer = ReplayBuffer(max_len=self.MAX_EXPERIENCES)
        self.policy = GaussianPolicy(action_space=self.action_space,
                                     action_bound=self.action_bound)
        self.duqlqnet = DualQNetwork()
        self.target_dualqnet = DualQNetwork()
        self.log_alpha = tf.Variable(0.)  #: alpha=1
        self.alpha_optimizer = tf.keras.optimizers.Adam(3e-4)
        self.target_entropy = -0.5 * self.action_space
        self.global_steps = 0
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize the network weights by calling each model once."""
        env = gym.make(self.env_id)
        dummy_state = env.reset()
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)
        dummy_action = np.random.normal(0, 0.1, size=self.action_space)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.policy(dummy_state)
        self.duqlqnet(dummy_state, dummy_action)
        self.target_dualqnet(dummy_state, dummy_action)
        self.target_dualqnet.set_weights(self.duqlqnet.get_weights())

    def play_episode(self):
        episode_reward = 0
        local_steps = 0
        done = False
        state = self.env.reset()

        while not done:
            action, _ = self.policy.sample_action(np.atleast_2d(state))
            action = action.numpy()[0]
            next_state, reward, done, _ = self.env.step(action)
            exp = Experience(state, action, reward, next_state, done)
            self.replay_buffer.push(exp)

            state = next_state
            episode_reward += reward
            local_steps += 1
            self.global_steps += 1

            if (len(self.replay_buffer) >= self.MIN_EXPERIENCES
                    and self.global_steps % self.UPDATE_PERIOD == 0):
                self.update_networks()

        return episode_reward, local_steps, tf.exp(self.log_alpha)

    def update_networks(self):
        (states, actions, rewards,
         next_states, dones) = self.replay_buffer.get_minibatch(self.BATCH_SIZE)

        alpha = tf.math.exp(self.log_alpha)

        #: Update Q-functions
        next_actions, next_logprobs = self.policy.sample_action(next_states)
        target_q1, target_q2 = self.target_dualqnet(next_states, next_actions)
        target = rewards + (1 - dones) * self.GAMMA * (
            tf.minimum(target_q1, target_q2) + -1 * alpha * next_logprobs)

        with tf.GradientTape() as tape:
            q1, q2 = self.duqlqnet(states, actions)
            loss_1 = tf.reduce_mean(tf.square(target - q1))
            loss_2 = tf.reduce_mean(tf.square(target - q2))
            loss = 0.5 * loss_1 + 0.5 * loss_2

        variables = self.duqlqnet.trainable_variables
        grads = tape.gradient(loss, variables)
        self.duqlqnet.optimizer.apply_gradients(zip(grads, variables))

        #: Update policy
        with tf.GradientTape() as tape:
            selected_actions, logprobs = self.policy.sample_action(states)
            q1, q2 = self.duqlqnet(states, selected_actions)
            q_min = tf.minimum(q1, q2)
            loss = -1 * tf.reduce_mean(q_min + -1 * alpha * logprobs)

        variables = self.policy.trainable_variables
        grads = tape.gradient(loss, variables)
        self.policy.optimizer.apply_gradients(zip(grads, variables))

        #: Adjust alpha
        entropy_diff = -1 * logprobs - self.target_entropy
        with tf.GradientTape() as tape:
            tape.watch(self.log_alpha)
            selected_actions, logprobs = self.policy.sample_action(states)
            alpha_loss = tf.reduce_mean(tf.exp(self.log_alpha) * entropy_diff)

        grad = tape.gradient(alpha_loss, self.log_alpha)
        self.alpha_optimizer.apply_gradients([(grad, self.log_alpha)])

        #: Soft target update
        self.target_dualqnet.set_weights(
            (1 - self.TAU) * np.array(self.target_dualqnet.get_weights())
            + self.TAU * np.array(self.duqlqnet.get_weights()))

    def save_model(self):
        self.policy.save_weights("checkpoints/actor")
        self.duqlqnet.save_weights("checkpoints/critic")

    def load_model(self):
        self.policy.load_weights("checkpoints/actor")
        self.duqlqnet.load_weights("checkpoints/critic")
        self.target_dualqnet.load_weights("checkpoints/critic")

    def testplay(self, n=1, monitordir=None):
        if monitordir:
            env = wrappers.Monitor(gym.make(self.env_id), monitordir,
                                   force=True,
                                   video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_id)

        total_rewards = []

        for _ in range(n):
            state = env.reset()
            done = False
            total_reward = 0
            while not done:
                action, _ = self.policy.sample_action(np.atleast_2d(state))
                action = action.numpy()[0]
                next_state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break
                else:
                    state = next_state
            total_rewards.append(total_reward)
            print()
            print(total_reward)
            print()

        return total_rewards
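A hedged usage sketch for the TensorFlow agent above: the environment id, episode budget, and the way action_space / action_bound are derived are assumptions chosen for illustration; the constructor signature and the play_episode / save_model / testplay calls come from the class itself.

import gym

env_id = "Pendulum-v0"           # assumed environment
probe_env = gym.make(env_id)

agent = SAC(env_id=env_id,
            action_space=probe_env.action_space.shape[0],
            action_bound=float(probe_env.action_space.high[0]))

for episode in range(150):       # assumed episode budget
    episode_reward, steps, alpha = agent.play_episode()
    print(f"episode {episode}: reward={episode_reward:.1f}, "
          f"steps={steps}, alpha={float(alpha):.3f}")

agent.save_model()
agent.testplay(n=3, monitordir="videos")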
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = GaussianPolicy(self.obs_dim, self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = Buffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return action

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        _, _, next_zs, next_log_pi = self.policy_net.sample(next_states)
        next_actions = torch.tanh(next_zs)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # policy loss
        _, _, new_zs, log_pi = self.policy_net.sample(states)
        new_actions = torch.tanh(new_zs)
        min_q = torch.min(
            self.q_net1.forward(states, new_actions),
            self.q_net2.forward(states, new_actions)
        )
        policy_loss = (log_pi - min_q).mean()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # target networks
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
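A minimal interaction-loop sketch for SACAgent: only the constructor signature, get_action, and update are taken from the class above; the environment, hyperparameter values, and the buffer's push and __len__ methods are assumptions about the (unshown) Buffer implementation.

import gym

env = gym.make("Pendulum-v0")    # assumed environment
agent = SACAgent(env, gamma=0.99, tau=0.005,
                 v_lr=3e-4, q_lr=3e-4, policy_lr=3e-4, buffer_maxlen=100000)

batch_size = 64
for episode in range(100):
    state = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        # push() is assumed to exist on Buffer; it is not shown in the source.
        agent.replay_buffer.push(state, action, reward, next_state, done)
        episode_reward += reward
        state = next_state
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)
    print(f"episode {episode}: reward={episode_reward:.1f}")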
def __init__(
        self,
        state_shape,
        action_dim,
        max_action,
        save_freq,
        discount=0.99,
        tau=0.005,
        actor_freq=2,
        lr=3e-4,
        entropy_tune=False,
        seed=0,
):
    self.rng = PRNGSequence(seed)

    actor_input_dim = [((1, *state_shape), jnp.float32)]
    critic_input_dim = self.critic_input_dim = [
        ((1, *state_shape), jnp.float32),
        ((1, action_dim), jnp.float32),
    ]

    self.actor = None
    self.critic = None
    self.log_alpha = None

    self.entropy_tune = entropy_tune
    self.target_entropy = -action_dim

    self.adam = Optimizers(
        actor=optim.Adam(learning_rate=lr),
        critic=optim.Adam(learning_rate=lr),
        log_alpha=optim.Adam(learning_rate=lr),
    )

    self.module = Modules(
        actor=GaussianPolicy.partial(action_dim=action_dim, max_action=max_action),
        critic=DoubleCritic.partial(),
        alpha=Constant.partial(start_value=1),
    )

    self.optimizer = None

    self.max_action = max_action
    self.discount = discount
    self.tau = tau
    self.policy_freq = actor_freq
    self.save_freq = save_freq

    self.total_it = 0
    self.model = None

    def new_params(module: nn.Module, shape=None):
        _, params = (module.init(next(self.rng)) if shape is None
                     else module.init_by_shape(next(self.rng), shape))
        return params

    def new_model(module: nn.Module, shape=None) -> nn.Model:
        return nn.Model(module, new_params(module, shape))

    def update_model(model: nn.Model, shape=None) -> nn.Model:
        return model.replace(params=new_params(model.module, shape))

    def reset_models() -> Models:
        if self.model is None:
            critic = new_model(self.module.critic, critic_input_dim)
            return Models(
                actor=new_model(self.module.actor, actor_input_dim),
                critic=critic,
                target_critic=critic.replace(params=critic.params),
                alpha=new_model(self.module.alpha),
            )
        else:
            critic = update_model(self.model.critic, critic_input_dim)
            return Models(
                actor=update_model(self.model.actor, actor_input_dim),
                critic=critic,
                target_critic=critic.replace(params=critic.params),
                alpha=update_model(self.model.alpha),
            )

    self.reset_models = reset_models

    def reset_optimizer(adam: Adam, model: nn.Model) -> Optimizer:
        return jax.device_put(adam.create(model))

    def reset_optimizers() -> Optimizers:
        return Optimizers(
            actor=reset_optimizer(self.adam.actor, self.model.actor),
            critic=reset_optimizer(self.adam.critic, self.model.critic),
            log_alpha=reset_optimizer(self.adam.log_alpha, self.model.alpha),
        )

    self.reset_optimizers = reset_optimizers

    self.i = 0
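When this __init__ returns, self.model and self.optimizer are still None; the factories built above must be invoked by surrounding code that is not shown. A hedged sketch, under the assumption that the agent calls the factories once and then soft-updates the target critic by Polyak-averaging the parameter pytrees:

import jax

# Assumed one-time initialization after __init__ (not shown in the source):
# self.model = self.reset_models()
# self.optimizer = self.reset_optimizers()

def soft_update(target_params, online_params, tau):
    # Polyak averaging over parameter pytrees: tgt <- (1 - tau) * tgt + tau * online
    return jax.tree_util.tree_map(
        lambda t, o: (1.0 - tau) * t + tau * o, target_params, online_params)

# Hypothetical target update on the Models container defined above:
# new_target = self.model.target_critic.replace(
#     params=soft_update(self.model.target_critic.params,
#                        self.model.critic.params, self.tau))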