    def __init__(self, device, env, hyperparams):
        """Initialize a SAC-style agent (twin Q-networks, Gaussian policy,
        learned entropy temperature) for a Unity ML-Agents environment."""

        self.device = device
        self.env = env
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        self.action_size = brain.vector_action_space_size
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        self.state_size = states.shape[1]

        # hyperparameters
        self.gamma = hyperparams["gamma"]
        self.tau = hyperparams["tau"]
        self.update_step = hyperparams.get("update_step", 0)
        self.delay_step = hyperparams.get("delay_step", 2)

        # initialize networks
        self.q_net1 = model.QNetwork(self.state_size, self.action_size,
                                     hyperparams).to(self.device)
        self.q_net2 = model.QNetwork(self.state_size, self.action_size,
                                     hyperparams).to(self.device)
        self.target_q_net1 = model.QNetwork(self.state_size, self.action_size,
                                            hyperparams).to(self.device)
        self.target_q_net2 = model.QNetwork(self.state_size, self.action_size,
                                            hyperparams).to(self.device)
        self.policy_net = model.GaussianPolicyNetwork(
            self.state_size, self.action_size, hyperparams).to(self.device)

        # copy parameters to the target networks
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        q_learn_rate = hyperparams["q_learn_rate"]
        policy_learn_rate = hyperparams["policy_learn_rate"]
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(),
                                       lr=q_learn_rate)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(),
                                       lr=q_learn_rate)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_learn_rate)

        # entropy temperature
        self.alpha = hyperparams["alpha"]
        a_learn_rate = hyperparams["a_learn_rate"]
        self.target_entropy = -brain.vector_action_space_size
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_learn_rate)

        self.replay_buffer = buffer.SimpleBuffer(self.device, 0, hyperparams)
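For reference, a hyperparameter dictionary covering the keys this constructor reads might look like the sketch below; the values are illustrative assumptions, and model.QNetwork and buffer.SimpleBuffer may require additional entries not shown in this excerpt.

# Illustrative only: keys taken from the constructor above, values assumed.
sac_hyperparams = {
    "gamma": 0.99,              # discount factor
    "tau": 5e-3,                # soft-update interpolation factor
    "update_step": 0,           # update counter (defaults to 0 above)
    "delay_step": 2,            # delayed policy/target update interval
    "q_learn_rate": 3e-4,       # learning rate for both Q-networks
    "policy_learn_rate": 3e-4,  # learning rate for the Gaussian policy
    "alpha": 0.2,               # initial entropy temperature
    "a_learn_rate": 3e-4,       # learning rate for the temperature parameter
}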
    def __init__(self, device, env, hyperparams):
        """Initialize a DDPG-style agent with a deterministic policy and
        target copies of both the Q-network and the policy network."""

        self.device = device
        self.env = env
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        self.action_size = brain.vector_action_space_size
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        self.state_size = states.shape[1]

        # hyperparameters
        self.gamma = hyperparams["gamma"]
        self.tau = hyperparams["tau"]
        self.update_step = hyperparams.get("update_step", 0)
        self.delay_step = hyperparams.get("delay_step", 2)

        # initialize networks
        self.q_net = model.QNetwork(self.state_size, self.action_size,
                                    hyperparams).to(self.device)
        self.target_q_net = model.QNetwork(self.state_size, self.action_size,
                                           hyperparams).to(self.device)
        self.policy_net = model.DeterministicPolicyNetwork(
            self.state_size, self.action_size, hyperparams).to(self.device)
        self.target_policy_net = model.DeterministicPolicyNetwork(
            self.state_size, self.action_size, hyperparams).to(self.device)

        # copy parameters to the target networks
        for target_param, param in zip(self.target_q_net.parameters(),
                                       self.q_net.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_policy_net.parameters(),
                                       self.policy_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        q_learn_rate = hyperparams["q_learn_rate"]
        policy_learn_rate = hyperparams["policy_learn_rate"]
        self.q_optimizer = optim.Adam(self.q_net.parameters(), lr=q_learn_rate)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_learn_rate)

        self.replay_buffer = buffer.SimpleBuffer(self.device, 0, hyperparams)
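The tau hyperparameter and the hard parameter copies above imply Polyak averaging of the target networks during learning. A minimal sketch of such a soft update is shown below; it is an assumption about how tau is used, not code taken from this agent.

def soft_update(target_net, source_net, tau):
    # Polyak-average source parameters into the target network:
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(tau * param.data
                                + (1.0 - tau) * target_param.data)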
Example #3
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        random.seed(seed)

        # Q-Network
        self.qnetwork_local = m.QNetwork(state_size, action_size,
                                         seed).to(device)
        self.qnetwork_target = m.QNetwork(state_size, action_size,
                                          seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
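The time-step counter above is normally paired with a step() method that stores each transition and triggers learning every UPDATE_EVERY steps. The sketch below illustrates that pattern; the ReplayBuffer add/sample/len interface, the GAMMA constant, and self.learn are assumptions, since they do not appear in this excerpt.

    def step(self, state, action, reward, next_state, done):
        # Save the transition in replay memory (assumed ReplayBuffer.add signature).
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps, once enough samples are stored.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)  # GAMMA and learn() are assumed here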