    def __init__(self, env_name, gamma, nstep, target_update_period, n_frames):

        self.env_name = env_name

        self.gamma = gamma

        self.nstep = nstep

        self.action_space = gym.make(env_name).action_space.n

        self.qnet = DuelingQNetwork(action_space=self.action_space)

        self.target_qnet = DuelingQNetwork(action_space=self.action_space)

        self.target_update_period = target_update_period

        self.n_frames = n_frames

        #self.optimizer = tf.keras.optimizers.Adam(lr=0.0001)

        self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.00025 / 4,
                                                     rho=0.95,
                                                     momentum=0.0,
                                                     epsilon=1.5e-07,
                                                     centered=True)

        self.update_count = 0
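The `update_count` / `target_update_period` pair above implies a periodic hard copy of the online network into the target network. A minimal sketch of such a sync step, assuming the Keras `get_weights`/`set_weights` API of the TensorFlow `DuelingQNetwork` (the method name is illustrative, not part of the original class):

    def sync_target_network(self):
        # Hypothetical companion method: hard-update the target network
        # every `target_update_period` optimizer steps.
        self.update_count += 1
        if self.update_count % self.target_update_period == 0:
            self.target_qnet.set_weights(self.qnet.get_weights())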
Example #2
    def __init__(self, state_size, action_size, seed, use_is=True):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            use_is (bool): whether to apply importance-sampling weights when learning from prioritized samples
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = PrioritizedExperienceReplayBuffer(
            action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.use_is = use_is
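The `use_is` flag above only matters at learning time, where the sampling probabilities returned by the prioritized buffer are turned into importance-sampling weights. A minimal sketch of that weighting with illustrative names (the agent's actual learn step is not shown in this snippet):

def weighted_mse_loss(q_expected, q_targets, probs, buffer_len, beta):
    """Hypothetical helper: apply importance-sampling weights
    w_i = (N * P(i)) ** (-beta), normalised by their maximum,
    to the squared TD error. Tensor arguments are torch tensors."""
    weights = (buffer_len * probs) ** (-beta)
    weights = weights / weights.max()
    return (weights * (q_expected - q_targets) ** 2).mean()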
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # Use the optim package to define an Optimizer that will update the weights of
        # the model for us. Here we will use Adam; the optim package contains many other
        # optimization algorithms.

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
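Nearly every example here assumes a `DuelingQNetwork(state_size, action_size, seed)` module. A minimal PyTorch sketch of what such a network typically looks like (hidden sizes and layer layout are assumptions, not the original definition): a shared feature layer splits into a state-value stream V(s) and an advantage stream A(s, a), aggregated as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).

import torch
import torch.nn as nn


class DuelingQNetworkSketch(nn.Module):
    """Illustrative dueling architecture; not the implementation used above."""

    def __init__(self, state_size, action_size, seed, hidden=64):
        super().__init__()
        torch.manual_seed(seed)
        self.feature = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU())
        self.value = nn.Linear(hidden, 1)                 # V(s)
        self.advantage = nn.Linear(hidden, action_size)   # A(s, a)

    def forward(self, state):
        x = self.feature(state)
        v = self.value(x)
        a = self.advantage(x)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        return v + a - a.mean(dim=1, keepdim=True)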
Example #4
    def __init__(self, state_size, action_size, params, seed=None, model='dqn'):
        self.seed = seed
        if seed:
            random.seed(seed)
            np.random.seed(seed)

        self.params = params
        self.state_size = state_size
        self.action_size = action_size
        self.eps = self.params['EPS']
        
        # Memory to learn from.
        self.memory = ReplayBuffer(memory_size=self.params['BUFFER_SIZE'], sample_size=self.params['BATCH_SIZE'])

        # Network
        if model == 'dqn':
            # Vanilla DQN
            self.target = QNetwork(state_size=state_size, action_size=action_size, seed=seed).to(device)
            self.local = QNetwork(state_size=state_size, action_size=action_size, seed=seed).to(device)

        elif model == 'ddqn':
            # Dueling DQN
            self.target = DuelingQNetwork(state_size=state_size, action_size=action_size, seed=seed).to(device)
            self.local = DuelingQNetwork(state_size=state_size, action_size=action_size, seed=seed).to(device)

        self.optimizer = torch.optim.Adam(self.local.parameters(), lr=self.params['LR'])
       
        self.t_step = 0
Example #5
    def __init__(self, state_size, action_size, num_episodes, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            num_episodes (int): number of training episodes
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.anneal_beta = (1. - BETA) / num_episodes

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA, BETA)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.t_learning_step = 0
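The `anneal_beta` value above implies a per-episode schedule that moves the importance-sampling exponent from BETA to 1.0 over `num_episodes` episodes. A hedged sketch of how it might be applied; the `memory.beta` attribute is an assumption, not shown in the snippet:

# Hypothetical driver-side annealing; `agent` is an instance of the class above.
for episode in range(num_episodes):
    agent.memory.beta = min(1.0, BETA + episode * agent.anneal_beta)
    # ... run one episode, calling agent.step(...) at every time step ...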
Example #6
    def __init__(self, state_size, action_size, double_dqn, dueling, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            double_dqn (bool): True to use the Double DQN target
            dueling (bool): True to use a dueling network architecture
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = double_dqn
        self.seed = random.seed(seed)
        self.dueling = dueling

        # Q-Network
        if dueling:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed).to(device)

        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #7
    def __init__(self, state_size, action_size, seed):  #, writer):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory (prioritised; the plain ReplayBuffer has been swapped out)
        self.memory = PrioritisedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, ALPHA, EPSILON)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.beta = BETA_START
Example #8
    def __init__(self, state_size, action_size, seed, use_double_dqn,
                 use_dueling_dqn):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            use_double_dqn (bool): use the Double DQN target
            use_dueling_dqn (bool): use a dueling network architecture
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.use_double_dqn = use_double_dqn

        if use_dueling_dqn:
            # Dueling Q-Network
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed).to(device)
        else:
            # Q-Network
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_NETWORK_EVERY steps)
        self.t_step = 0
Example #9
    def __init__(self, state_size, action_size, seed, network="Dueling", stepkey="Double"):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            network (str): "Dueling", "Convolutional", or any other value for a plain QNetwork
            stepkey (str): update rule label, e.g. "Double"
        """
        print ("Architecture: " + str(network) + " " + str(stepkey) + " QN")
        self.stepkey = stepkey
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        if (network=="Dueling"):
            self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
        elif (network=="Convolutional"):
            self.qnetwork_local = ConvolutionalDuelingQNetwork(state_size, action_size, seed).to(device)
            self.qnetwork_target = ConvolutionalDuelingQNetwork(state_size, action_size, seed).to(device)             
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 
        print (self.qnetwork_local)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #10
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #11
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 ddqn=False,
                 dueling=False,
                 init_td=1e-5,
                 prioritize_weight=0.0,
                 beta_scheduler=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            ddqn (bool): use the Double DQN target
            dueling (bool): use a dueling network architecture
            init_td (float): initial TD error assigned to new experiences
            prioritize_weight (float): prioritization exponent (0.0 disables prioritized replay)
            beta_scheduler: schedule for the importance-sampling exponent beta
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network

        if not dueling:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)

        else:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if prioritize_weight != 0.0:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed,
                                                  prioritize_weight,
                                                  beta_scheduler)

            self.prioritize_weight = prioritize_weight

        else:
            self.prioritize_weight = 0.0

            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        self.init_td = init_td

        # Initialize time step (for updating every  steps)
        self.t_step = 0

        self.ddqn = ddqn
    def __init__(self, state_size, action_size, seed, prioritized=False):
        """Dueling Q network agent."""
        super().__init__(state_size, action_size, seed)

        # Dueling Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)  # use GPU or not
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
Example #13
    def __init__(self, seed, **kwargs):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.seed = random.seed(seed)

        # Hyper-parameters
        self.network_args = kwargs.get('network_args', {})
        self.buffer_size = kwargs.get('buffer_size', BUFFER_SIZE)
        self.batch_size = kwargs.get('batch_size', BATCH_SIZE)
        self.gamma = kwargs.get('gamma', GAMMA)
        self.tau = kwargs.get('tau', TAU)
        self.update_every = kwargs.get('update_every', UPDATE_EVERY)
        self.lr = kwargs.get('lr', LR)
        self.double_q = kwargs.get('double_q', False)
        self.dueling = kwargs.get('dueling', False)
        self.ray_layer = kwargs.get('ray_layer', False)

        # Q-Network
        if self.dueling:
            if self.ray_layer:
                self.qnetwork_local = DuelingQNetworkWithRayLayer(
                    seed, **self.network_args).to(device)
                self.qnetwork_target = DuelingQNetworkWithRayLayer(
                    seed, **self.network_args).to(device)
            else:
                self.qnetwork_local = DuelingQNetwork(
                    seed, **self.network_args).to(device)
                self.qnetwork_target = DuelingQNetwork(
                    seed, **self.network_args).to(device)
        else:
            if self.ray_layer:
                self.qnetwork_local = QNetworkWithRayLayer(
                    seed, **self.network_args).to(device)
                self.qnetwork_target = QNetworkWithRayLayer(
                    seed, **self.network_args).to(device)
            else:
                self.qnetwork_local = QNetwork(seed,
                                               **self.network_args).to(device)
                self.qnetwork_target = QNetwork(seed,
                                                **self.network_args).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(ACTION_SIZE, self.buffer_size,
                                   self.batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
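A hedged usage sketch for the kwargs-driven constructor above; the `network_args` keys depend on the (unshown) QNetwork/DuelingQNetwork signatures, so the class name and values here are purely illustrative:

# Illustrative construction only; argument values are assumptions.
agent = Agent(seed=0,
              network_args={'state_size': 37, 'action_size': 4},
              dueling=True,
              double_q=True,
              lr=5e-4,
              update_every=4)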
Example #14
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 double_dqn=True,
                 priority_replay=True,
                 dueling_network=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): use the Double DQN target
            priority_replay (bool): use prioritized experience replay
            dueling_network (bool): use a dueling network architecture
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.B = B_START

        self.double_dqn = double_dqn
        self.priority_replay = priority_replay
        self.dueling_network = dueling_network

        # Q-Network
        if self.dueling_network:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if self.priority_replay:
            self.memory = PrioritizedReplayBuffer(state_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  use_rank=False)
        else:
            self.memory = ReplayBuffer(state_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    def __init__(self,
                 state_size,
                 action_size,
                 parameters,
                 evaluation_mode=False):
        self.evaluation_mode = evaluation_mode

        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = True
        self.hidsize = 1

        if not evaluation_mode:
            self.hidsize = parameters.hidden_size
            self.buffer_size = parameters.buffer_size
            self.batch_size = parameters.batch_size
            self.update_every = parameters.update_every
            self.learning_rate = parameters.learning_rate
            self.tau = parameters.tau
            self.gamma = parameters.gamma
            self.buffer_min_size = parameters.buffer_min_size

        # Device
        if parameters.use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda:0")
            print(" Using GPU")
            print(" GPU")

        else:
            self.device = torch.device("cpu")
            print(" Using CPU")

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size,
                                              action_size,
                                              hidsize1=self.hidsize,
                                              hidsize2=self.hidsize).to(
                                                  self.device)

        if not evaluation_mode:
            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=self.learning_rate)
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, self.device)

            self.t_step = 0
            self.loss = 0.0
Example #16
    def __init__(self, state_size, action_size, seed, lr_decay_rate=0.999):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            lr_decay_rate (float): decay factor for the exponential learning-rate schedule
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        # self.qnetwork_target.eval() # No need to compute gradients 
        print(self.qnetwork_target)
        
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=lr_decay_rate)
       

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 DDQN=False,
                 PRB=False,
                 Dueling=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            DDQN (bool): apply Double DDQN algorithm
            PRB (bool): use a Prioritized ReplayBuffer
            Dueling (bool): use a Dueling NN-architecture
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.DDQN = DDQN
        self.PRB = PRB

        # Q-Network
        if Dueling:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if self.PRB:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed, ALPHA,
                                                  BETA_START, BETA_INCREASE)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #18
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        super(DuelingAgent, self).__init__(state_size, action_size, seed)

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.model = DuelingQNetwork(state_size, action_size, seed).to(device)
#         self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
#         for target_param, param in zip(self.qnetwork_local.parameters(),self.qnetwork_target.parameters()):
#             target_param.data.copy_(param)
            
        self.optimizer = optim.Adam(self.model.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    def __init__(self, env_name, epsilon=0.05, n_frames=4):

        self.env_name = env_name

        self.env = gym.make(env_name)

        self.action_space = self.env.action_space.n

        self.epsilon = epsilon

        self.n_frames = n_frames

        self.frames = collections.deque(maxlen=n_frames)

        self.qnet = DuelingQNetwork(action_space=self.action_space)

        self.define_network()
    def __init__(self, state_size, action_size, seed, args):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            args: parsed arguments providing double_dqn, dueling_dqn, lr,
                buffer_size and batch_size
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = args.double_dqn
        self.dueling_dqn = args.dueling_dqn
        self.args = args
        assert not (self.double_dqn and self.dueling_dqn)  # at most one of the two flavours
        if self.double_dqn:
            print("Implementing Double DQN!")
        elif self.dueling_dqn:
            print("Implementing Dueling DQN!")
        else:
            print("Implementing DQN")

        # Q-Network
        if self.dueling_dqn:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.args.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args.buffer_size,
                                   args.batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #22
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_sizes=[64, 64],
                 flavor='plain'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_sizes (list): list of neurons in each layer
            flavor (str): flavor of the network - plain, double, dueling, double-dueling
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.hidden_sizes = hidden_sizes
        self.flavor = flavor

        # Q-Network
        if self.flavor == 'plain' or self.flavor == 'double':
            self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                           hidden_sizes).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            hidden_sizes).to(device)
        # Dueling Q-Network
        if self.flavor == 'dueling' or self.flavor == 'double-dueling':
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed,
                                                  hidden_sizes).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed,
                                                   hidden_sizes).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #23
    def __init__(self, id, state_size, action_size, seed, use_double=False, use_prio=False, use_dueling=False):
        """Initialize an Agent object.
        
        Params
        ======
            id (int): id used to identify the agent
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            use_double (bool): use the Double DQN algorithm
            use_prio (boolean): Use Prioritized Experience Replay
            use_dueling (boolean): Use Dueling DQN algorithm
        """
        self.state_size = state_size
        self.action_size = action_size
        self.id = id

        self.use_double = use_double
        self.use_prio = use_prio
        self.use_dueling = use_dueling
        self.seed = random.seed(seed)

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        if use_dueling:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(self.device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(self.device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device)
            
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        
        # Replay memory
        if use_prio:
            self.memory = NaivePrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, PRIO_ALPHA, PRIO_EPSILON)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    def _create_nn(self, nn_type, state_size, action_size, seed, device):
        if nn_type == 'noisydueling':
            self._sample_noise = True
            return NoisyDuelingQNetwork(state_size,
                                        action_size,
                                        seed,
                                        device=device).to(device)
        elif nn_type == 'dueling':
            return DuelingQNetwork(state_size, action_size, seed).to(device)
        elif nn_type == 'q':
            return QNetwork(state_size, action_size, seed).to(device)
        else:
            raise Exception(
                "Unknown NN type - must be one of 'noisydueling', 'dueling' or 'q'")
Example #25
    def __init__(self, state_size, action_size, mem_length=100000, ddqn=True):
        self.gamma = 0.99
        self.batch_size = 64
        self.action_size = action_size
        self.ddqn = ddqn

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        if ddqn:
            self.model = DuelingQNetwork(state_size,
                                         action_size).to(self.device)
            self.target_model = DuelingQNetwork(state_size,
                                                action_size).to(self.device)
            self.optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
            self.experience = self.ddqn_experience
        else:
            self.model = QNetwork(state_size, action_size).to(self.device)
            self.optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
            self.experience = self.dqn_experience

        # replay memory
        self.memory = deque(maxlen=mem_length)
    def __init__(self, pid, env_name, epsilon, alpha, buffer_size, n_frames,
                 gamma, nstep, reward_clip):

        self.pid = pid

        self.env = gym.make(env_name)

        self.epsilon = epsilon

        self.gamma = gamma

        self.alpha = alpha

        self.n_frames = n_frames

        self.action_space = self.env.action_space.n

        self.frames = collections.deque(maxlen=n_frames)

        self.nstep = nstep

        self.buffer_size = buffer_size

        self.local_buffer = LocalReplayBuffer(reward_clip=reward_clip,
                                              gamma=gamma,
                                              nstep=nstep)

        self.local_qnet = DuelingQNetwork(action_space=self.action_space)

        self.episode_steps = 0

        self.episode_rewards = 0

        self.lives = 5  #: Breakout only

        self.define_network()
Example #27
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN
        # Local network picks action
        next_action = self.qnetwork_local(next_states).detach().argmax(
            1).unsqueeze(1)
        # Target network estimates the value of said action
        Q_targets_next = self.qnetwork_target(next_states).gather(
            1, next_action)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
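A minimal usage sketch of the Agent class above in a standard epsilon-greedy training loop; the environment name and the epsilon schedule are illustrative assumptions, and the old Gym step API (4-tuple) is assumed to match the rest of this listing:

import gym

env = gym.make("LunarLander-v2")
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n,
              seed=0)

eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        # stores the transition and learns every UPDATE_EVERY steps
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, 0.995 * eps)  # decay exploration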
class DDDQNPolicy(Policy):
    """Dueling Double DQN policy"""
    def __init__(self,
                 state_size,
                 action_size,
                 parameters,
                 evaluation_mode=False):
        self.evaluation_mode = evaluation_mode

        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = True
        self.hidsize = 1

        if not evaluation_mode:
            self.hidsize = parameters.hidden_size
            self.buffer_size = parameters.buffer_size
            self.batch_size = parameters.batch_size
            self.update_every = parameters.update_every
            self.learning_rate = parameters.learning_rate
            self.tau = parameters.tau
            self.gamma = parameters.gamma
            self.buffer_min_size = parameters.buffer_min_size

        # Device
        if parameters.use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda:0")
            print(" Using GPU")
            print(" GPU")

        else:
            self.device = torch.device("cpu")
            print(" Using CPU")

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size,
                                              action_size,
                                              hidsize1=self.hidsize,
                                              hidsize2=self.hidsize).to(
                                                  self.device)

        if not evaluation_mode:
            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=self.learning_rate)
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, self.device)

            self.t_step = 0
            self.loss = 0.0

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        assert not self.evaluation_mode, "Policy has been initialized for evaluation only."

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.buffer_min_size and len(
                    self.memory) > self.batch_size:
                self._learn()

    def _learn(self):
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            # Double DQN
            q_best_action = self.qnetwork_local(next_states).max(1)[1]
            q_targets_next = self.qnetwork_target(next_states).gather(
                1, q_best_action.unsqueeze(-1))
        else:
            # DQN
            q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))

        # Compute loss
        self.loss = F.mse_loss(q_expected, q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

        # Update target network
        self._soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def _soft_update(self, local_model, target_model, tau):
        # Soft update model parameters.
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self, filename):
        torch.save(self.qnetwork_local.state_dict(), filename + ".local")
        torch.save(self.qnetwork_target.state_dict(), filename + ".target")

    def load(self, filename):
        if os.path.exists(filename + ".local"):
            self.qnetwork_local.load_state_dict(
                torch.load(filename + ".local",
                           map_location=torch.device('cpu')))
            print('local')
        if os.path.exists(filename + ".target"):
            self.qnetwork_target.load_state_dict(
                torch.load(filename + ".target",
                           map_location=torch.device('cpu')))
            print('target')
class Actor:
    def __init__(self, pid, env_name, epsilon, alpha, buffer_size, n_frames,
                 gamma, nstep, reward_clip):

        self.pid = pid

        self.env = gym.make(env_name)

        self.epsilon = epsilon

        self.gamma = gamma

        self.alpha = alpha

        self.n_frames = n_frames

        self.action_space = self.env.action_space.n

        self.frames = collections.deque(maxlen=n_frames)

        self.nstep = nstep

        self.buffer_size = buffer_size

        self.local_buffer = LocalReplayBuffer(reward_clip=reward_clip,
                                              gamma=gamma,
                                              nstep=nstep)

        self.local_qnet = DuelingQNetwork(action_space=self.action_space)

        self.episode_steps = 0

        self.episode_rewards = 0

        self.lives = 5  #: Breakout only

        self.define_network()

    def define_network(self):

        #: hide GPU from remote actor
        tf.config.set_visible_devices([], 'GPU')

        #: define-by-run: build the network weights with a dummy forward pass
        frame = preprocess_frame(self.env.reset())
        for _ in range(self.n_frames):
            self.frames.append(frame)

        state = np.stack(self.frames, axis=2)[np.newaxis, ...]
        self.local_qnet(state)

    def rollout(self, current_weights):

        tf.config.set_visible_devices([], 'GPU')

        self.local_qnet.set_weights(current_weights)

        state = np.stack(self.frames, axis=2)[np.newaxis, ...]

        for _ in range(self.buffer_size):

            state = np.stack(self.frames, axis=2)[np.newaxis, ...]

            action = self.local_qnet.sample_action(state, self.epsilon)

            next_frame, reward, done, info = self.env.step(action)

            self.episode_steps += 1

            self.episode_rewards += reward

            self.frames.append(preprocess_frame(next_frame))

            next_state = np.stack(self.frames, axis=2)[np.newaxis, ...]

            if self.lives != info["ale.lives"]:
                #: treat a lost life as episode termination
                transition = (state, action, reward, next_state, True)
                self.lives = info["ale.lives"]
            else:
                transition = (state, action, reward, next_state, done)

            self.local_buffer.push(transition)

            if done:
                print(self.pid, self.episode_steps, self.episode_rewards,
                      round(self.epsilon, 3))
                self.episode_steps = 0
                self.episode_rewards = 0
                self.lives = 5
                frame = preprocess_frame(self.env.reset())
                for _ in range(self.n_frames):
                    self.frames.append(frame)

        experiences = self.local_buffer.pull()

        states = np.vstack([exp.state
                            for exp in experiences]).astype(np.float32)
        actions = np.vstack([exp.action
                             for exp in experiences]).astype(np.float32)
        rewards = np.array([exp.reward for exp in experiences]).reshape(-1, 1)
        next_states = np.vstack([exp.next_state
                                 for exp in experiences]).astype(np.float32)
        dones = np.array([exp.done for exp in experiences]).reshape(-1, 1)

        next_actions, next_qvalues = self.local_qnet.sample_actions(
            next_states)

        next_actions_onehot = tf.one_hot(next_actions, self.action_space)

        max_next_qvalues = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                         axis=1,
                                         keepdims=True)

        TQ = rewards + self.gamma**(self.nstep) * (1 -
                                                   dones) * max_next_qvalues

        qvalues = self.local_qnet(states)
        actions_onehot = tf.one_hot(actions.flatten().astype(np.int32),
                                    self.action_space)
        Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)

        priorities = ((np.abs(TQ - Q) + 0.001)**self.alpha).flatten()

        experiences = [zlib.compress(pickle.dumps(exp)) for exp in experiences]

        return priorities, experiences, self.pid
class RemoteTestActor:
    def __init__(self, env_name, epsilon=0.05, n_frames=4):

        self.env_name = env_name

        self.env = gym.make(env_name)

        self.action_space = self.env.action_space.n

        self.epsilon = epsilon

        self.n_frames = n_frames

        self.frames = collections.deque(maxlen=n_frames)

        self.qnet = DuelingQNetwork(action_space=self.action_space)

        self.define_network()

    def define_network(self):

        #: hide GPU from remote actor
        tf.config.set_visible_devices([], 'GPU')

        #: define-by-run: build the network weights with a dummy forward pass
        frame = preprocess_frame(self.env.reset())
        for _ in range(self.n_frames):
            self.frames.append(frame)

        state = np.stack(self.frames, axis=2)[np.newaxis, ...]

        self.qnet(state)

    def get_layers(self, idx):
        return self.qnet.layers[idx:]

    def play(self, current_weights, epsilon=0.01):

        tf.config.set_visible_devices([], 'GPU')

        self.qnet.set_weights(current_weights)

        episode_steps, episode_rewards = 0, 0

        frame = preprocess_frame(self.env.reset())
        for _ in range(self.n_frames):
            self.frames.append(frame)

        state = np.stack(self.frames, axis=2)[np.newaxis, ...]

        done = False
        while not done:

            state = np.stack(self.frames, axis=2)[np.newaxis, ...]

            action = self.qnet.sample_action(state, epsilon=epsilon)

            next_frame, reward, done, _ = self.env.step(action)

            self.frames.append(preprocess_frame(next_frame))

            episode_steps += 1

            episode_rewards += reward

            if episode_steps > 1000 and episode_rewards < 10:
                break

        return episode_steps, episode_rewards

    def play_with_video(self, checkpoint_path, monitor_dir, epsilon=0.01):

        monitor_dir = Path(monitor_dir)
        if monitor_dir.exists():
            shutil.rmtree(monitor_dir)
        monitor_dir.mkdir()
        env = gym.wrappers.Monitor(gym.make(self.env_name),
                                   monitor_dir,
                                   force=True,
                                   video_callable=(lambda ep: True))

        frame = preprocess_frame(env.reset())
        frames = collections.deque([frame] * self.n_frames,
                                   maxlen=self.n_frames)

        state = np.stack(frames, axis=2)[np.newaxis, ...]
        self.qnet(state)
        self.qnet.load_weights(checkpoint_path)

        episode_steps, episode_rewards = 0, 0

        state = np.stack(frames, axis=2)[np.newaxis, ...]

        done = False
        while not done:

            state = np.stack(frames, axis=2)[np.newaxis, ...]

            action = self.qnet.sample_action(state, epsilon)

            #: step the recording (Monitor) env created above, not self.env
            next_frame, reward, done, _ = env.step(action)

            frames.append(preprocess_frame(next_frame))

            episode_steps += 1

            episode_rewards += reward

        return episode_rewards