Example #1
    def __init__(self,
                 state_shape,
                 action_shape,
                 learning_rate=0.005,
                 gamma=0.98,
                 memory=None):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.gamma = gamma  # Agent's discount factor
        self.learning_rate = learning_rate  # Agent's Q-learning rate
        # self.Q is the Action-Value function. This agent represents Q using a
        # Neural Network.
        print(self.state_shape, self.action_shape)
        self.Q = DQNAgent().build_model(self.state_shape[0], self.action_shape,
                                        0.01, 0.01)
        self.tQ = DQNAgent().build_model(self.state_shape[0],
                                         self.action_shape, 0.01, 0.01)
        # self.policy is the policy followed by the agent. This agent follows
        # an epsilon-greedy policy w.r.t. its Q estimate.
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEPS_PER_EPISODE)
        self.step_num = 0
        self.update_steps = 64
        # self.memory = deque(maxlen=2000)
        # Avoid a mutable default argument: build the replay memory here instead.
        self.memory = memory if memory is not None else Memory(capacity=2000)
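This example (like the ones below) assigns self.policy = self.epsilon_greedy_Q without showing the method itself. A minimal sketch of what such an epsilon-greedy policy could look like follows; the callable decay schedule and the per-action output of self.Q are assumptions, not details taken from the snippet.

import random

import numpy as np


class EpsilonGreedyQMixin:
    """Hypothetical sketch of the epsilon_greedy_Q policy referenced above."""

    def epsilon_greedy_Q(self, observation):
        # Assumption: the LinearDecaySchedule instance is callable with the
        # current step count and returns the epsilon value for that step.
        epsilon = self.epsilon_decay(self.step_num)
        self.step_num += 1
        if random.random() < epsilon:
            # Explore: sample a uniformly random discrete action.
            return random.randrange(self.action_shape)
        # Exploit: pick the action with the highest estimated Q-value.
        # Assumption: self.Q(observation) returns one value per discrete action.
        q_values = self.Q(observation)
        if hasattr(q_values, "detach"):  # torch tensor -> numpy
            q_values = q_values.detach().cpu().numpy()
        return int(np.argmax(q_values))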
Example #2
    def __init__(self, obs_shape, action_shape, hidden_shape, params):

        self.params = params
        self.gamma = self.params["gamma"]
        self.delta = self.params["delta"]
        self.learning_rate = self.params["learning_rate"]
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        self.action_shape = action_shape

        self.Q = CNN(obs_shape, action_shape, hidden_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)

        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = self.params["epsilon_max"]
        self.epsilon_min = self.params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params["epsilon_decay_final_step"])

        self.memory = ExperienceMemory(self.params["memory"])

        self.total_trainings = 0
        self.step_num = 0
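Several of these constructors (Examples #2, #3, #4, #6 and #8) create an ExperienceMemory whose implementation is not shown. A minimal replay-buffer sketch along those lines is given below; the Experience field names and the store/sample method names are assumptions for illustration only.

import random
from collections import deque, namedtuple

# Assumed transition layout; the real field names may differ.
Experience = namedtuple("Experience",
                        ["obs", "action", "reward", "next_obs", "done"])


class ExperienceMemory:
    """Hypothetical fixed-capacity experience replay buffer."""

    def __init__(self, capacity=int(1e6)):
        self.buffer = deque(maxlen=capacity)

    def store(self, experience):
        # Oldest experiences are discarded automatically once capacity is hit.
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniformly sample a batch of stored transitions.
        assert batch_size <= len(self.buffer), "Not enough experiences stored"
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)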
Example #3
    def __init__(self, state_shape, action_shape, params):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['lr']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0

        if len(self.state_shape) == 1:
            self.DQN = SLP
        elif len(self.state_shape) == 3:
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape, device).to(device)
        self.Q.apply(utils.weights_initializer.xavier)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(state_shape, action_shape,
                                     device).to(device)

        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = params["epsilon_max"]
        self.epsilon_min = params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0
        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_capacity']))
Example #4
    def __init__(self, state_shape, action_shape, params):
        """
        self.Q is the Action-Value function. This agent represents Q using a Neural Network
        If the observation is a single-dimensional vector, a Single-Layer Perceptron (SLP) is used;
        if the observation is a 3-dimensional image, a Convolutional Neural Network (CNN) is used.

        :param state_shape: Shape (tuple) of the observation/state
        :param action_shape: Shape (number) of the discrete action space
        :param params: A dictionary containing various Agent configuration parameters and hyper-parameters
        """
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']  # Agent's discount factor
        self.learning_rate = self.params['lr']  # Agent's Q-learning rate
        self.best_mean_reward = -float("inf")  # Agent's personal best mean episode reward
        self.best_reward = -float("inf")
        self.training_steps_completed = 0  # Number of training batch steps completed so far

        if len(self.state_shape) == 1:  # Single-dimensional observation/state space
            self.DQN = SLP
        elif len(self.state_shape) == 3:  # 3D/image observation/state
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape, device).to(device)
        self.Q.apply(utils.weights_initializer.xavier)

        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(state_shape, action_shape,
                                     device).to(device)
        # self.policy is the policy followed by the agent. This agent follows
        # an epsilon-greedy policy w.r.t. its Q estimate.
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = params["epsilon_max"]
        self.epsilon_min = params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0

        # Initialize the experience replay memory with the configured capacity
        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_capacity']))
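The docstring above describes choosing SLP for 1-D observations and CNN for image observations, but neither network is defined in these snippets. A rough PyTorch sketch of what the SLP (single-hidden-layer perceptron) could look like, assuming a 1-D observation vector, a discrete action space, and an arbitrarily chosen hidden width, is:

import torch


class SLP(torch.nn.Module):
    """Hypothetical single-hidden-layer perceptron mapping an observation to
    one Q-value per discrete action."""

    def __init__(self, input_shape, output_shape, device=torch.device("cpu")):
        super().__init__()
        self.device = device
        self.input_size = input_shape[0]  # 1-D observation vector
        self.hidden_size = 40             # assumed hidden width
        self.linear1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, output_shape)

    def forward(self, x):
        x = torch.as_tensor(x, dtype=torch.float32, device=self.device)
        x = torch.relu(self.linear1(x))
        return self.out(x)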
Example #5
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape

        self.action_shape = environment.action_space.n
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)
        self.gamma = gamma

        self.epsilon_max = 1.0
        self.epsilon_min = 0.005
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q
Example #6
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape
        self.action_shape = environment.action_space.n
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.Q = SLP(self.obs_shape, self.action_shape, self.device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)
        self.gamma = gamma
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEP_PER_EPISODE)

        self.step_num = 0
        self.policy = self.epsilon_greedy_Q
        self.memory = ExperienceMemory(capacity=int(1e5))
Example #7
    def __init__(self,
                 state_shape,
                 action_shape,
                 learning_rate=0.005,
                 gamma=0.98):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.Q = SLP(state_shape, action_shape)
        # Use the configured learning rate rather than a hard-coded value
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEPS_PER_EPISODE)
        self.step_num = 0
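Every example builds a LinearDecaySchedule(initial_value, final_value, max_steps) to anneal epsilon, but the schedule class itself is never shown. A plausible minimal implementation, assuming the schedule is called with the current step number (as in the policy sketch above), is:

class LinearDecaySchedule:
    """Hypothetical linear annealing from initial_value down to final_value."""

    def __init__(self, initial_value, final_value, max_steps):
        assert initial_value > final_value, "initial_value should exceed final_value"
        self.initial_value = initial_value
        self.final_value = final_value
        # Per-step decrement so that final_value is reached at max_steps.
        self.decay_factor = (initial_value - final_value) / max_steps

    def __call__(self, step_num):
        current_value = self.initial_value - self.decay_factor * step_num
        # Clamp so the value never drops below final_value.
        return max(current_value, self.final_value)

For instance, with initial_value=1.0, final_value=0.05 and max_steps=10000, calling this sketch with step_num=5000 would return 0.525, i.e. epsilon halfway through its decay.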
Example #8
    def __init__(self, obs_shape, action_shape, params):

        self.obs_shape = obs_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['learning_rate']
        self.best_reward_mean = -float("inf")
        self.best_mean = -float("inf")
        self.training_steps_completed = 0

        self.epsilon_max = self.params['epsilon_max']
        self.epsilon_min = self.params['epsilon_min']
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])

        self.step_num = 0
        self.policy = self.epsilon_greedy_Q

        if len(self.obs_shape) == 1:  # Only if the observation space is 1-D
            self.DQN = SLP

        elif len(self.obs_shape) == 3:  # The observation is an image or a 3-D object
            self.DQN = CNN

        self.Q = self.DQN(obs_shape, action_shape, device).to(device)
        self.Q_Optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)

        if self.params['use_target_network']:
            self.Q_target = self.DQN(obs_shape, action_shape,
                                     device).to(device)

        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_size']))
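Several of the constructors above create a Q_target network when use_target_network is set, together with a discount factor gamma, but none of the snippets shows how those pieces are used. As a rough illustration (not taken from these examples) of how a one-step TD loss is typically computed from a sampled batch, using the hypothetical Experience layout sketched earlier, one could write:

import torch


def compute_td_loss(Q, Q_target, batch, gamma, device):
    """Hypothetical one-step TD loss for a batch of sampled experiences."""
    obs = torch.as_tensor([e.obs for e in batch], dtype=torch.float32, device=device)
    actions = torch.as_tensor([e.action for e in batch], dtype=torch.int64, device=device)
    rewards = torch.as_tensor([e.reward for e in batch], dtype=torch.float32, device=device)
    next_obs = torch.as_tensor([e.next_obs for e in batch], dtype=torch.float32, device=device)
    dones = torch.as_tensor([e.done for e in batch], dtype=torch.float32, device=device)

    # Q-values of the actions that were actually taken.
    q_pred = Q(obs).gather(1, actions.unsqueeze(1)).squeeze(1)
    # Bootstrapped targets from the (frozen) target network.
    with torch.no_grad():
        q_next_max = Q_target(next_obs).max(dim=1).values
        td_target = rewards + gamma * q_next_max * (1.0 - dones)
    return torch.nn.functional.mse_loss(q_pred, td_target)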