def __init__(self, device, env, hyperparams):
    """Initialize the agent.

    Params
    ======
        device: torch device the networks are placed on
        env: Unity ML-Agents environment
        hyperparams (dict): algorithm hyperparameters
    """
    self.device = device
    self.env = env

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    self.action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    self.state_size = states.shape[1]

    # hyperparameters
    self.gamma = hyperparams["gamma"]
    self.tau = hyperparams["tau"]
    self.update_step = hyperparams.get("update_step", 0)
    self.delay_step = hyperparams.get("delay_step", 2)

    # initialize networks
    self.q_net1 = model.QNetwork(self.state_size, self.action_size, hyperparams).to(self.device)
    self.q_net2 = model.QNetwork(self.state_size, self.action_size, hyperparams).to(self.device)
    self.target_q_net1 = model.QNetwork(self.state_size, self.action_size, hyperparams).to(self.device)
    self.target_q_net2 = model.QNetwork(self.state_size, self.action_size, hyperparams).to(self.device)
    self.policy_net = model.GaussianPolicyNetwork(
        self.state_size, self.action_size, hyperparams).to(self.device)

    # copy params into the target networks
    for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()):
        target_param.data.copy_(param)
    for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()):
        target_param.data.copy_(param)

    # initialize optimizers
    q_learn_rate = hyperparams["q_learn_rate"]
    policy_learn_rate = hyperparams["policy_learn_rate"]
    self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_learn_rate)
    self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_learn_rate)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_learn_rate)

    # entropy temperature
    self.alpha = hyperparams["alpha"]
    a_learn_rate = hyperparams["a_learn_rate"]
    self.target_entropy = -brain.vector_action_space_size  # heuristic target entropy: -|A|
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha_optim = optim.Adam([self.log_alpha], lr=a_learn_rate)

    self.replay_buffer = buffer.SimpleBuffer(self.device, 0, hyperparams)
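# The constructor above sets up automatic entropy-temperature tuning
# (log_alpha, alpha_optim, target_entropy). A minimal sketch of that update,
# not taken from this repo: `log_pi` is a hypothetical tensor holding the
# log-probabilities of actions sampled from policy_net for the current batch,
# and the actual learning step here may organize this differently.
def tune_entropy_temperature(log_alpha, alpha_optim, target_entropy, log_pi):
    """One gradient step on the temperature so policy entropy tracks target_entropy."""
    # Standard temperature loss: pushes alpha up when entropy falls below the
    # target and down when it rises above it. log_pi is detached so only
    # log_alpha receives gradients.
    alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    # New temperature used to weight the entropy bonus in the Q and policy losses.
    return log_alpha.exp().item()

# illustrative call from a learn step:
#   self.alpha = tune_entropy_temperature(
#       self.log_alpha, self.alpha_optim, self.target_entropy, log_pi)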
def __init__(self, device, env, hyperparams):
    """Initialize the agent.

    Params
    ======
        device: torch device the networks are placed on
        env: Unity ML-Agents environment
        hyperparams (dict): algorithm hyperparameters
    """
    self.device = device
    self.env = env

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    self.action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    self.state_size = states.shape[1]

    # hyperparameters
    self.gamma = hyperparams["gamma"]
    self.tau = hyperparams["tau"]
    self.update_step = hyperparams.get("update_step", 0)
    self.delay_step = hyperparams.get("delay_step", 2)

    # initialize networks
    self.q_net = model.QNetwork(self.state_size, self.action_size, hyperparams).to(self.device)
    self.target_q_net = model.QNetwork(self.state_size, self.action_size, hyperparams).to(self.device)
    self.policy_net = model.DeterministicPolicyNetwork(
        self.state_size, self.action_size, hyperparams).to(self.device)
    self.target_policy_net = model.DeterministicPolicyNetwork(
        self.state_size, self.action_size, hyperparams).to(self.device)

    # copy params into the target networks
    for target_param, param in zip(self.target_q_net.parameters(), self.q_net.parameters()):
        target_param.data.copy_(param)
    for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()):
        target_param.data.copy_(param)

    # initialize optimizers
    q_learn_rate = hyperparams["q_learn_rate"]
    policy_learn_rate = hyperparams["policy_learn_rate"]
    self.q_optimizer = optim.Adam(self.q_net.parameters(), lr=q_learn_rate)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_learn_rate)

    self.replay_buffer = buffer.SimpleBuffer(self.device, 0, hyperparams)
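# Both constructors above copy parameters into target networks once and store
# tau; during learning those targets are normally tracked with a soft (Polyak)
# update. A small sketch of that update under that assumption; the repo's own
# learn step may wrap it differently.
import torch


def soft_update(target_net, source_net, tau):
    """Blend source parameters into the target: theta_target <- tau*theta + (1 - tau)*theta_target."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# illustrative calls from a learn step:
#   soft_update(self.target_q_net, self.q_net, self.tau)
#   soft_update(self.target_policy_net, self.policy_net, self.tau)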
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = m.QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = m.QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
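# The t_step counter initialized above is typically used to throttle learning
# to every UPDATE_EVERY environment steps. A minimal sketch of that gating,
# assuming the module-level UPDATE_EVERY and BATCH_SIZE constants already
# referenced here; the buffer's add()/sample() interface, the GAMMA constant,
# and the learn() method are hypothetical, and the repo's actual step() may differ.
def step(self, state, action, reward, next_state, done):
    # Save the experience, then learn only every UPDATE_EVERY steps
    # and only once enough samples have been collected.
    self.memory.add(state, action, reward, next_state, done)
    self.t_step = (self.t_step + 1) % UPDATE_EVERY
    if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
        experiences = self.memory.sample()
        self.learn(experiences, GAMMA)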