Example no. 1
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        # agents memory
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
Example no. 2
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=f'{self.env_name}_{self.algo}_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=f'{self.env_name}_{self.algo}_q_next',
                                   chkpt_dir=self.chkpt_dir)
Example no. 3
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
Example no. 4
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, chkpt_name, eps_min, eps_dec, replace,
                 logging_dir):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace

        self.chkpt_dir = chkpt_name
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = FullyConnectedNet(self.lr,
                                        self.n_actions,
                                        input_dims=self.input_dims,
                                        chkpt_name=self.chkpt_dir,
                                        name='q_eval',
                                        logging_dir=logging_dir)

        self.q_next = FullyConnectedNet(self.lr,
                                        self.n_actions,
                                        input_dims=self.input_dims,
                                        name='q_next',
                                        chkpt_name=self.chkpt_dir,
                                        logging_dir=logging_dir)
Example no. 5
 def __init__(self, input_dims, n_actions):
     self.epsilon = Config.epsilon
     self.n_actions = n_actions
     self.input_dims = input_dims
     self.action_space = [i for i in range(n_actions)]
     self.learn_step_counter = 0
     self.memory = ReplayBuffer(input_dims, n_actions)
Example no. 6
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 device=device,
                 epsilon=0.3):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device
        self.epsilon = epsilon

        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = DDPGActor(state_size, action_size, seed).to(device)
        self.actor_target = DDPGActor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = D4PGCritic(state_size, action_size, seed, N_ATOMS,
                                       Vmin, Vmax).to(device)
        self.critic_target = D4PGCritic(state_size, action_size, seed, N_ATOMS,
                                        Vmin, Vmax).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
Example no. 7
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims)

        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims)
Example no. 8
    def __init__(self, state_size, action_size, behavior_name, index_player,
                 replay_memory_size=1e4, batch_size=128, gamma=0.99,
                 learning_rate=1e-3, target_tau=1e-3, update_rate=4, seed=0):
        self.state_size = state_size
        self.current_state = []
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        self.behavior_name = behavior_name
        self.index_player = index_player
        self.close_ball_reward = 0
        self.touch_ball_reward = 0

        """
        Now we define two models: 
        (a) one netwoek will be updated every (step % update_rate == 0),
        (b) A target network, with weights updated to equal to equal to the network (a) at a slower (target_tau) rate.
        """

        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

        # Initialize time step ( for updating every UPDATE_EVERY steps)
        self.t_step = 0
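
The docstring in this example describes learning every `update_rate` steps, but the corresponding gate is not shown. Below is a minimal sketch of how such a `step()` method is commonly written; it assumes (not confirmed by the listing) that this `ReplayBuffer` exposes `add()`, `sample()` and `__len__()`, and that the agent defines a `learn()` method.

    def step(self, state, action, reward, next_state, done):
        # Store the transition, then learn only every `update_rate` environment
        # steps and only once the buffer holds more than one batch of samples.
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0 and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)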
Example no. 9
    def __init__(self):
        self.pred_net_Q1, self.target_net_Q1 = ConvNet(), ConvNet()
        self.pred_net_Q2, self.target_net_Q2 = ConvNet(), ConvNet()
        # sync eval target
        self.target_deque1 = deque(maxlen=n)
        self.target_deque2 = deque(maxlen=n)
        self.update_target(self.target_net_Q1, self.pred_net_Q1, 1.0)
        self.update_target(self.target_net_Q2, self.pred_net_Q2, 1.0)

        self.target_deque1.append(self.target_net_Q1)
        # use gpu
        if USE_GPU:
            self.pred_net_Q1.cuda()
            self.target_net_Q1.cuda()
            self.pred_net_Q2.cuda()
            self.target_net_Q2.cuda()
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # loss function
        self.loss_function = nn.MSELoss()
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net_Q1.parameters(), lr=LR)

        self.optimizer1 = torch.optim.Adam(self.pred_net_Q2.parameters(),
                                           lr=LR)
Example no. 10
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, action_joint_dim,
                 mem_size, batch_size, eps_min, eps_dec, replace,
                 prioritized=False, prob_alpha=0.6, beta=0.4, beta_increment=1e-4,
                 temperature=0.1, tau=1e-5):

        """

        Double Deep Q-Learning Agent class.

        -----

        Args:
            gamma: Discount factor for reward. 0 indicates a myopic behaviour. 1 indicates a far-sighted behaviour.
            epsilon: Exploration/exploitation rate. 0 indicates full exploitation.
            lr: Learning Rate. The bigger 'lr' the bigger step in the gradient of the loss.
            n_actions: Number of possible actions.
            input_dims: Dimension of the state (allegedly an image). The channel goes first (CHANN, HEIGHT, WIDTH)
            action_joint_dim: Number of joints for the Multi-agent case. Normally the number of agents.
            mem_size: Number of the Replay Buffer memory.
            batch_size: Number of past experiences used for trainin Q-Network.
            eps_min: Min. value for the exploration.
            eps_dec: Epsilon decay in every epoch.
            replace: Number of epochs for replacing the target network with the behavioral network.

        ------
        """
        
        # Training hyperparameters #
        self.gamma = gamma
        self.epsilon = epsilon
        self.beta = beta
        self.beta_increment = beta_increment
        self.prob_alpha = prob_alpha
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.update_target_count = replace
        self.action_space = [i for i in range(n_actions)]
        self.action_joint_dim = action_joint_dim
        self.prioritized = prioritized
        self.temperature = temperature
        self.tau = tau
        self.mem_size = mem_size

        if not self.prioritized:
            self.memory = ReplayBuffer(mem_size, input_dims, action_joint_dim)
        else:
            self.memory = PrioritizedReplayBuffer(mem_size, input_dims, action_joint_dim, self.prob_alpha)
        
        # Model and target network functions #
        
        self.q_eval = DeepQNetwork(self.lr, num_agents=action_joint_dim, action_size=n_actions, input_size=input_dims)
        self.q_eval.cuda()
        
        self.q_next = DeepQNetwork(self.lr, num_agents=action_joint_dim, action_size=n_actions, input_size=input_dims)
        self.q_next.cuda()
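
The docstring above defines gamma, epsilon and the target-network replacement, but the learning step itself lies outside this snippet. The following is a hedged sketch of the double-DQN target such an agent would typically compute, ignoring the multi-agent joint-action dimension and assuming PyTorch is imported as `T` and that `q_eval`/`q_next` return Q-values of shape `(batch, n_actions)`.

    def double_dqn_target(self, rewards, states_, dones):
        # Illustrative helper, not from the original code: the online network
        # (q_eval) selects the greedy action, the target network (q_next)
        # evaluates it; terminal states contribute no bootstrap value.
        with T.no_grad():
            max_actions = T.argmax(self.q_eval(states_), dim=1)
            q_next = self.q_next(states_)
            q_next[dones] = 0.0
            batch_idx = T.arange(rewards.shape[0])
            return rewards + self.gamma * q_next[batch_idx, max_actions]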
Example no. 11
    def __init__(self, state_size, action_size, num_agents):
        self.policy = PolicyNetwork(state_size, action_size).to(device)
        self.old_policy = PolicyNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)

        self.episodes = [Episode() for _ in range(num_agents)]
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.t_step = 0
Example no. 12
class Agent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.batch_size = batch_size
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def choose_action(self, observation):
        raise NotImplementedError

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
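
Example no. 12 only defines the abstract base class; the concrete `q_eval`/`q_next` networks and the `learn()` logic live in subclasses. The sketch below is a hypothetical `DQNAgent` subclass, not part of the original listing, showing how the base-class helpers are typically combined. It assumes `numpy as np` and `torch as T` are imported, that the networks expose `optimizer`, `loss` and `device` attributes, and that the buffer tracks a `mem_cntr` counter.

class DQNAgent(Agent):
    def choose_action(self, observation):
        # epsilon-greedy over the Q-values of the online network
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            return T.argmax(self.q_eval(state)).item()
        return np.random.choice(self.action_space)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = T.arange(self.batch_size)
        q_pred = self.q_eval(states)[indices, actions]
        q_next = self.q_next(states_).max(dim=1)[0]
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()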
Example no. 13
    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):
        """
        DQN Agent Parameters
        ====== 
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): can be either 'DQN' for vanillia dqn learning (default) or 'DDQN' for double-DQN.
            replay_memory size (int): size of the replay memory buffer (typically 5e4 to 5e6)
            batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128)
            gamma (float): paramete for setting the discoun ted value of future rewards (typically .95 to .995)
            learning_rate (float): specifies the rate of model learing (typically 1e-4 to 1e-3))
            seed (int): random seed for initializing training point.
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        """
        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed;
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate.
        # The slower modulation of the target network weights operates to stabilize learning.
        """
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate,
                                    betas=BETAS)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
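
The comment block in this example refers to updating the target weights toward the online weights at rate `target_tau`, but the update itself is outside the snippet. A minimal sketch of that soft (Polyak) update, assuming both models are `torch.nn.Module` instances:

    def soft_update(self, local_model, target_model, tau):
        # theta_target <- tau * theta_local + (1 - tau) * theta_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)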
Example no. 14
    def __init__(self, state_size, action_size, num_agents, double_dqn=True):
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(),
                                          lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.num_agents = num_agents
        self.t_step = 0
Example no. 15
File: dqn.py Project: xuezzee/-
    def __init__(self, state_size, action_size, num_agents, double_dqn=False):
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=4000, gamma=0.98, last_epoch=-1)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.num_agents = num_agents
        self.t_step = 0
Example no. 16
    def __init__(self,
                 lr: float,
                 gamma: float,
                 obs_dims,
                 num_actions: int,
                 mem_size,
                 mini_batchsize,
                 epsilon_dec,
                 env_name,
                 algo_name,
                 epsilon=1.0,
                 replace=1000,
                 epsilon_min=0.1,
                 checkpoint_dir='results\\doubledqn'):

        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = epsilon
        self.replace_target_cnt = replace

        self.mem_counter = 0
        self.copy_counter = 0
        self.checkpoint_dir = checkpoint_dir
        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DeepQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=algo_name + '_' + env_name + '_' + 'learning',
            checkpoint_dir=self.checkpoint_dir)

        self.target_network = DeepQNetwork(lr=self.lr,
                                           num_actions=self.num_actions,
                                           input_dims=self.obs_dims,
                                           name=env_name + '_' + algo_name +
                                           '_target',
                                           checkpoint_dir=self.checkpoint_dir)

        self.loss_value = 0
        self.writer = SummaryWriter(os.path.join(self.checkpoint_dir, 'logs'))
Example no. 17
    def __init__(self, n_actions, input_dims):
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.epsilon = Config.epsilon
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(input_dims, n_actions)
        name_root = Config.env_name + '_' + Config.algo
        self.q_eval = Network(self.n_actions,
                              input_dims=self.input_dims,
                              name=name_root + '_q_eval')
        self.q_next = Network(self.n_actions,
                              input_dims=self.input_dims,
                              name=name_root + '_q_next')
Example no. 18
    def __init__(self, alpha, beta, input_dims, tau, gamma=0.99, max_action=1.0, \
                    n_actions=2, max_size=1000000, layer1_size=400, \
                    layer2_size=300, batch_size=100, reward_scale=2, path_dir='model/sac'):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  layer1_size,
                                  layer2_size,
                                  n_actions=n_actions,
                                  name='_actor',
                                  max_action=max_action,
                                  chkpt_dir=path_dir)
        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      layer1_size,
                                      layer2_size,
                                      n_actions=n_actions,
                                      name='_critic_1',
                                      chkpt_dir=path_dir)
        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      layer1_size,
                                      layer2_size,
                                      n_actions=n_actions,
                                      name='_critic_2',
                                      chkpt_dir=path_dir)
        self.value = ValueNetwork(beta,
                                  input_dims,
                                  layer1_size,
                                  layer2_size,
                                  name='_value',
                                  chkpt_dir=path_dir)
        self.target_value = ValueNetwork(beta,
                                         input_dims,
                                         layer1_size,
                                         layer2_size,
                                         name='_target_value',
                                         chkpt_dir=path_dir)

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example no. 19
    def __init__(self,
                 lr: float,
                 gamma: float,
                 obs_dims,
                 num_actions: int,
                 mem_size,
                 mini_batchsize,
                 epsilon_dec,
                 env_name,
                 algo_name,
                 epsilon=1.0,
                 replace=1000,
                 epsilon_min=0.1,
                 checkpoint_dir='temp/dqn/duelingdqn'):

        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = epsilon

        self.mem_counter = 0
        self.copy_counter = 0
        self.replace_target_cnt = replace
        self.checkpoint_dir = checkpoint_dir
        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DuelingQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_learning',
            checkpoint_dir=self.checkpoint_dir)

        self.target_network = DuelingQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_target',
            checkpoint_dir=self.checkpoint_dir)
Example no. 20
    def __init__(self, states, actions, alpha, gamma, epsilon, epsilon_min,
                 epsilon_decay, replay_buffer_sz, batch, path, path_pred):
        self.Q = Network(states.shape, actions, alpha, path)
        self.Q_pred = Network(states.shape, actions, alpha, path_pred)

        # self.memory = deque(maxlen=replay_buffer_sz)
        self.memory = ReplayBuffer(replay_buffer_sz, states.shape, actions)
        self.batch = batch
        self.learn_cnt = 0

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.actions = actions
        self.Q.path = path
        self.Q_pred.path = path_pred
Example no. 21
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval target
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0

        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)
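
The constructor above calls `self.update_target(self.target_net, self.pred_net, 1.0)` to sync the target network, but the method itself is outside the snippet. A hypothetical sketch of such a helper, assuming both networks are `torch.nn.Module`s and that a ratio of 1.0 means a hard copy:

    def update_target(self, target, pred, update_rate):
        # Blend the prediction network's weights into the target network;
        # update_rate=1.0 reduces to a hard copy of pred into target.
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) * target_param.data +
                                    update_rate * pred_param.data)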
Example no. 22
class Agent():
    def __init__(self, input_dims, n_actions):
        self.epsilon = Config.epsilon
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(input_dims, n_actions)

    def store_transition(self, state, action, reward, state_new, done):
        self.memory.store_transition(state, action, reward, state_new, done)

    def choose_action(self, observation):
        raise NotImplementedError

    def replace_target_network(self):
        if self.learn_step_counter % Config.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon * Config.eps_decay, Config.eps_min)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer()

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
Example no. 23
def replay_buffer():
    """
    """
    action_size = 10
    buffer_size = 20
    batch_size = 5
    seed = 42
    device = "cpu"
    rebuf = ReplayBuffer(action_size, buffer_size, batch_size, seed, device)
    return rebuf
Example no. 24
    def __init__(self,
                 input_dims,
                 n_actions,
                 lr,
                 mem_size,
                 batch_size,
                 epsilon,
                 gamma=0.99,
                 eps_dec=5e-7,
                 eps_min=0.01,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 checkpoint_dir='tmp/dqn'):
        self.lr = lr
        self.batch_size = batch_size
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = eps_dec
        self.eps_min = eps_min
        self.replace = replace
        self.algo = algo
        self.env_name = env_name
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + " " + self.algo +
                                   "_q_eval",
                                   checkpoint_dir=self.checkpoint_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + " " + self.algo +
                                   "_q_next",
                                   checkpoint_dir=self.checkpoint_dir)
Example no. 25
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 chkpt_dir,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None):
        self.gamma = gamma  # 0.99
        self.epsilon = epsilon  # 1.0
        self.lr = lr  # 0.0001
        self.n_actions = n_actions  # 6
        self.input_dims = input_dims  # (4, 84, 84)
        self.batch_size = batch_size  # 32
        self.eps_min = eps_min  # 0.1
        self.eps_dec = eps_dec  # 1e-05
        self.replace_target_cnt = replace  # 1000
        self.algo = algo  # 'DQNAgent'
        self.env_name = env_name  #  'PongNoFrameskip-v4'
        self.chkpt_dir = chkpt_dir  #  .\\models\\
        self.action_space = [i for i in range(self.n_actions)]  # [0, 1, 2, 3, 4, 5]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)
Example no. 26
    def agent_init(self, agent_config):
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'],
                                          agent_config.get("seed"))
        self.network = ActionValueNetwork(agent_config['network_config'])
        self.optimizer = Adam(self.network.layer_sizes,
                              agent_config["optimizer_config"])
        self.num_actions = agent_config['network_config']['num_actions']
        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']
        self.tau = agent_config['tau']

        self.rand_generator = np.random.RandomState(agent_config.get("seed"))

        self.last_state = None
        self.last_action = None

        self.sum_rewards = 0
        self.episode_steps = 0
Example no. 27
 def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, eps_min=.01, eps_dec=5e-7,
              replace_count=1000, algorithm=None, env_name=None, checkpoint_dir='/checkpoints'):
     self.gamma = gamma
     self.epsilon = epsilon
     self.lr = lr
     self.n_actions = n_actions
     self.input_dims = input_dims
     self.batch_size = batch_size
     self.eps_min = eps_min
     self.eps_dec = eps_dec
     self.replace_count = replace_count
     self.algorithm = algorithm
     self.env_name = env_name
     self.checkpoint_dir = checkpoint_dir
     self.action_space = [i for i in range(self.n_actions)]
     self.learn_step_counter = 0
     self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
     print(type(self).__name__)
     self.q_eval = object
     self.q_policy = object
Example no. 28
    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):

        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        """
        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed;
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate.
        # The slower modulation of the target network weights operates to stabilize learning.
        """
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example no. 29
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval target
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0

        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

        # discrete values
        self.value_range = torch.FloatTensor(V_RANGE)  # (N_ATOM)
        if USE_GPU:
            self.value_range = self.value_range.cuda()
Example no. 30
    def __init__(self):
        if USE_CNN:
            if USE_GPU:
                self.eval_net, self.target_net = ConvNet().cuda(), ConvNet().cuda()
            else:
                self.eval_net, self.target_net = ConvNet(), ConvNet()
        else:
            if USE_GPU:
                self.eval_net, self.target_net = Net().cuda(), Net().cuda()
            else:
                self.eval_net, self.target_net = Net(), Net()

        self.learn_step_counter = 0  # for target updating
        self.memory_counter = 0

        # Create the replay buffer
        if MEMORY_MODE == 'PER':
            self.replay_buffer = PrioritizedReplayBuffer(MEMORY_CAPACITY,
                                                         alpha=PER_ALPHA)
        else:
            self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)