Example #1
class Agent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.batch_size = batch_size
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def choose_action(self, observation):
        raise NotImplementedError

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
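
All of the examples on this page rely on a ReplayBuffer class that is not shown. As a point of reference, here is a minimal NumPy sketch with the interface the agents expect (store_transition, sample_buffer, and a fill counter). It is purely illustrative, not the buffer these examples actually use; note that some examples (e.g. #4) appear to store actions one-hot, whereas this sketch stores integer indices.

import numpy as np

class ReplayBuffer:
    """Minimal circular experience-replay buffer (illustrative sketch only)."""
    def __init__(self, max_size, input_shape, n_actions=None):
        # input_shape is assumed to be a tuple, e.g. (8,) or (4, 84, 84)
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((max_size, *input_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((max_size, *input_shape), dtype=np.float32)
        self.action_memory = np.zeros(max_size, dtype=np.int64)
        self.reward_memory = np.zeros(max_size, dtype=np.float32)
        self.terminal_memory = np.zeros(max_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size  # overwrite the oldest entries first
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.new_state_memory[batch],
                self.terminal_memory[batch])
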
Example #2
class Noisy_DQN_Agent(object):
    def __init__(self,
                 env,
                 input_dim,
                 n_actions,
                 alpha,
                 gamma,
                 batch_size,
                 lr=5e-4,
                 memory_size=10000000,
                 replace_target=5,
                 filename='noisy_dqn.h5'):
        self.env = env
        self.action_space = np.arange(n_actions)
        self.input_dim = input_dim
        self.n_actions = n_actions
        self.alpha = alpha  #learning rate
        self.gamma = gamma  #discount factor
        self.batch_size = batch_size
        self.filename = filename
        self.memory = ReplayBuffer(memory_size, input_dim)
        self.scores = []  # to keep track of scores
        self.avg_scores = []
        self.replace_target = replace_target
        self.online_network = Neural_Network(
            lr, n_actions, input_dim)  # network used for action selection and updates
        self.target_network = Neural_Network(
            lr, n_actions, input_dim)  # network used to compute the bootstrap target
        # The online and target networks share the same architecture; the target
        # network's parameters are copied from the online network every
        # `replace_target` steps and kept fixed in between.

    # to interface with memory
    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    # choose the greedy action (exploration comes from the noisy layers)
    def choose_action(self, state):
        state = state.reshape(1, -1)
        actions = self.online_network.predict(state)
        action = np.argmax(actions)
        return action

    def update_online(self):  # update the parameters of the online network
        # start learning only once at least batch_size samples are in memory
        if self.memory.memory_count < self.batch_size:
            return
        states, actions, rewards, new_states, done = self.memory.sample_buffer(
            self.batch_size)
        q_estimate = self.online_network.predict(states)
        q_next = self.target_network.predict(new_states)  # used to compute the bootstrap target
        q_target = q_estimate.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, actions] = rewards + self.gamma * np.max(
            q_next, axis=1) * (1 - done)
        # if the episode ended, 1 - done = 0, so the terminal state contributes no future value
        self.online_network.fit(states, q_target, verbose=0)
        if self.memory.memory_count % self.replace_target == 0:
            self.update_target()

    def update_target(self):  # copy the online network's parameters into the target network
        self.target_network.set_weights(self.online_network.get_weights())

    def train(self, n_games, path):
        # path : path where to save the model
        for i in range(n_games):
            score = 0
            done = False
            state = self.env.reset()
            while not done:
                action = self.choose_action(state)
                new_state, reward, done, info = self.env.step(action)
                score += reward
                self.remember(state, action, reward, new_state, done)
                state = new_state
                self.update_online()
            self.scores.append(score)
            avg_score = np.mean(self.scores[max(0, i - 50):i + 1])  # rolling mean of the last ~50 scores
            self.avg_scores.append(avg_score)
            print('episode ', i, 'score = %.2f' % score,
                  ' Rolling-score = %.2f' % avg_score)
            # save the model every 100 games
            if i % 100 == 0 and i > 0:
                self.save_model(path)

    def save_model(self, path):
        self.online_network.save(path + '/' + self.filename)

    def load_model(self, path):
        self.online_network = load_model(path)
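
A hedged usage sketch for the class above, assuming a Gym environment with a discrete action space and the old 4-tuple step API (obs, reward, done, info) that train() relies on; the environment name and hyperparameters are placeholders:

import gym

env = gym.make('LunarLander-v2')  # placeholder environment
agent = Noisy_DQN_Agent(env=env,
                        input_dim=env.observation_space.shape[0],
                        n_actions=env.action_space.n,
                        alpha=5e-4,
                        gamma=0.99,
                        batch_size=64)
agent.train(n_games=500, path='checkpoints')  # trains and periodically saves to checkpoints/noisy_dqn.h5
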
Example #3
class Agent():
    def __init__(self,
                 lr,
                 gamma,
                 epsilon,
                 batch_size,
                 input_dims,
                 env,
                 epsilon_dec=1e-3,
                 epsilon_end=0.01,
                 mem_size=1000,
                 fname='dqn_model.h5'):
        self.env = env
        self.action_space = self.env.action_space  #discrete
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = epsilon_dec
        self.eps_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims)
        self.n_actions = self.env.num_action
        self.q_eval = build_dqn(lr, self.n_actions, input_dims, 256, 256)

    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, observation):
        # epsilon-greedy action selection; Boltzmann exploration could be tried later
        if np.random.random() < self.epsilon:
            action = self.action_space.sample()  # randomly sample an action from the action space
        else:
            state = np.array([observation])
            actions = self.q_eval.predict(state)  # Q(s, a) for every action
            action = np.argmax(actions)

        return action

    def learn(self):
        # DQN
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, dones = \
                self.memory.sample_buffer(self.batch_size)

        #actions = actions.reshape(-1, 1)
        #rewards = rewards.reshape(-1, 1)
        #dones = dones.reshape(-1, 1)

        q_eval = self.q_eval.predict(states)   # Q(s, a) for every action, for each state in the batch

        q_next = self.q_eval.predict(states_)  # Q(s_, a) for every action, for each next state

        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)  # 0 .. batch_size - 1

        # note: this assumes the buffer returns `dones` already as (1 - terminal flag);
        # if it returns raw done flags, use (1 - dones) here instead
        q_target[batch_index, actions] = rewards + \
                        self.gamma * np.max(q_next, axis=1) * dones

        self.q_eval.train_on_batch(states, q_target)

        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \
                self.eps_min else self.eps_min

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)
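
Examples #2 and #3 build their training targets with the same fancy-indexing trick: q_target starts as a copy of the current predictions, and only the entry for the action actually taken is overwritten with the Bellman target, so the loss is zero everywhere else. A small self-contained illustration with toy numbers:

import numpy as np

batch_size, gamma = 3, 0.99
q_eval = np.array([[1.0, 2.0], [0.5, 0.1], [0.0, 3.0]])   # Q(s, .) predicted for each sample
q_next = np.array([[1.5, 0.5], [2.0, 2.5], [0.0, 0.0]])   # Q(s', .) from the target network
actions = np.array([1, 0, 1])                             # actions actually taken
rewards = np.array([1.0, 0.0, -1.0])
not_done = np.array([1.0, 1.0, 0.0])                      # 1 - done; zero for a terminal s'

q_target = q_eval.copy()
batch_index = np.arange(batch_size)
# overwrite only the taken action's entry with r + gamma * max_a Q(s', a)
q_target[batch_index, actions] = rewards + gamma * q_next.max(axis=1) * not_done
print(q_target)
# row 0: 1 + 0.99 * 1.5 = 2.485 lands in column 1; all other entries stay equal to q_eval
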
Example #4
class Agent(object):
    def __init__(self,
                 lr,
                 input_dims,
                 n_actions,
                 epsilon,
                 batch_size,
                 env,
                 capacity=1000000,
                 eps_dec=4.5e-7,
                 fc1_dims=512,
                 fc2_dims=256,
                 replace=1000,
                 gamma=0.99,
                 network_name='_eval'):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_min = 0.01
        self.epsilon = epsilon
        self.env = env
        self.memory = ReplayBuffer(capacity, input_dims, n_actions)
        self.eps_dec = eps_dec
        self.replace = replace
        self.update_cntr = 0
        self.scaler = self._get_scaler(env)

        # Evaluation (target) network: periodically synced from q_train
        self.q_eval = DDQN(lr=lr,
                           input_dims=self.input_dims,
                           n_actions=self.n_actions,
                           fc1_dims=fc1_dims,
                           fc2_dims=fc2_dims,
                           network_name=network_name)
        # Training (online) network: updated on every learn() call
        self.q_train = DDQN(lr=lr,
                            input_dims=self.input_dims,
                            n_actions=self.n_actions,
                            fc1_dims=fc1_dims,
                            fc2_dims=fc2_dims,
                            network_name=network_name)

    # epsilon-greedy action selection over the normalized observation
    def pick_action(self, obs):
        if np.random.random() > self.epsilon:
            obs = self.scaler.transform([obs])
            state = T.tensor([obs], dtype=T.float).to(self.q_eval.device)
            actions = self.q_train.forward(state)
            action = T.argmax(actions).item()
        else:
            action = self.env.sample_action()

        return action

    # Fit a StandardScaler on states collected from a short random rollout
    def _get_scaler(self, env):
        states = []
        for _ in range(self.env.n_steps):
            action = self.env.sample_action()
            state_, reward, done, _ = self.env.step(action)
            states.append(state_)
            if done:
                break
        scaler = StandardScaler()
        scaler.fit(states)
        return scaler

    def store_transition(self, state, action, reward, state_, done):
        state = self.scaler.transform([state])
        state_ = self.scaler.transform([state_])
        self.memory.store_transition(state, action, reward, state_, done)

    def update_target_network(self):
        if self.update_cntr % self.replace == 0:
            self.q_eval.load_state_dict(self.q_train.state_dict())

    def save(self):
        print('Saving...')
        self.q_eval.save()
        self.q_train.save()

    def load(self):
        print('Loading...')
        self.q_eval.load()
        self.q_train.load()

    # One double-DQN update on a sampled minibatch
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, done = self.memory.sample_buffer(
            self.batch_size)

        states = T.tensor(states, dtype=T.float).to(self.q_eval.device)
        actions = T.tensor(actions, dtype=T.int64).to(self.q_eval.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.q_eval.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.q_eval.device)
        done = T.tensor(done, dtype=T.bool).to(self.q_eval.device)

        self.q_train.optimizer.zero_grad()
        self.update_target_network()

        indices = np.arange(self.batch_size)
        # actions are assumed to be stored one-hot, so the product + sum selects Q(s, a_taken)
        q_pred = (self.q_train.forward(states) * actions).sum(dim=1)
        q_next = self.q_eval.forward(states_)
        q_train = self.q_train.forward(states_)

        # double DQN: the online network selects the arg-max action, the target network evaluates it
        max_action = T.argmax(q_train, dim=1)
        q_next[done] = 0.0  # terminal next states contribute no future value

        y = rewards + self.gamma * q_next[indices, max_action]

        loss = self.q_train.loss(y, q_pred).to(self.q_eval.device)
        loss.backward()

        self.q_train.optimizer.step()

        self.update_cntr += 1
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
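
The learn() above is a double-DQN update: q_train (the online network) picks the arg-max action for the next state, while q_eval (the target network) supplies that action's value. A standalone PyTorch sketch of just the target computation, with made-up numbers, to contrast it with the vanilla DQN target used in the other examples:

import torch as T

rewards = T.tensor([1.0, 0.0])
dones = T.tensor([False, True])
gamma = 0.99
q_next_target = T.tensor([[0.2, 0.8], [1.0, 0.3]])  # Q_target(s', .)
q_next_online = T.tensor([[0.9, 0.1], [0.4, 0.6]])  # Q_online(s', .)

# double DQN: the online network selects, the target network evaluates
max_action = q_next_online.argmax(dim=1)             # -> [0, 1]
q_next = q_next_target[T.arange(2), max_action]      # -> [0.2, 0.3]
q_next[dones] = 0.0                                  # no bootstrap from terminal states
y = rewards + gamma * q_next                         # -> [1.198, 0.0]

# vanilla DQN would instead bootstrap from q_next_target.max(dim=1)[0] -> [0.8, 1.0],
# which tends to overestimate action values
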
Example #5
class DQNAgent():
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size=32,
                 eps_min=0.1,
                 eps_dec=1e-5,
                 tau=1000,
                 env_name='Pong',
                 chkpt_dir='models/'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.tau = tau
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(lr, n_actions, f'{env_name}_q_eval.pth',
                                   input_dims, chkpt_dir)
        self.q_next = DeepQNetwork(lr, n_actions, f'{env_name}_q_next.pth',
                                   input_dims, chkpt_dir).eval()

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation],
                                 dtype=torch.float).to(self.q_eval.device)
            action = self.q_eval.forward(state).argmax().item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, states_, done = self.memory.sample_buffer(
            self.batch_size)
        states = torch.tensor(state).to(self.q_eval.device)
        rewards = torch.tensor(reward).to(self.q_eval.device)
        dones = torch.tensor(done).to(self.q_eval.device)
        actions = torch.tensor(action).to(self.q_eval.device)
        states_s = torch.tensor(states_).to(self.q_eval.device)
        return states, actions, rewards, states_s, dones

    def update_target_network(self):
        if self.learn_step_counter % self.tau == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_eps(self):
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.batch_size > self.memory.mem_cntr:
            return

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        # gather the predicted Q-values only for the actions the agent actually took
        q_pred = self.q_eval.forward(states)[indices, actions]
        with torch.no_grad():
            q_next = self.q_next.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0  # terminal states have no successor, so their bootstrapped value is 0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        self.q_eval.optimizer.zero_grad()
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.update_target_network()  # sync q_next from q_eval every tau learn steps
        self.decrement_eps()
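
None of the agent classes on this page include the outer training loop that drives them. A hedged sketch of one for the DQNAgent above, assuming any Gym environment with a discrete action space and the old 4-tuple step API; the environment and hyperparameters are placeholders:

import gym
import numpy as np

env = gym.make('CartPole-v1')  # placeholder; any discrete-action Gym env works for the sketch
agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                 n_actions=env.action_space.n,
                 input_dims=env.observation_space.shape,
                 mem_size=50000, env_name='CartPole')

scores = []
for episode in range(500):
    done, score = False, 0
    observation = env.reset()
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        score += reward
        observation = observation_
    scores.append(score)
    print(f'episode {episode}  score {score:.1f}  '
          f'avg {np.mean(scores[-100:]):.1f}  eps {agent.epsilon:.3f}')
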
Example #6
class DQNAgent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=0.9999,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn',
                 device='cuda:0'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.device = device

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # Create policy and target DQN models
        self.policy = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'policy',
                          chkpt_dir=self.chkpt_dir)
        self.target = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'target',
                          chkpt_dir=self.chkpt_dir)

        # put on correct device (GPU or CPU)
        self.policy.to(device)
        self.target.to(device)

        # Optimizer
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        # Loss
        self.loss = nn.MSELoss()

    def choose_action(self, observation):
        # epsilon-greedy action selection
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation],
                                 dtype=torch.float).to(self.device)
            actions = self.policy.forward(state)
            action = torch.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.device)
        rewards = torch.tensor(reward).to(self.device)
        dones = torch.tensor(done).to(self.device)
        actions = torch.tensor(action).to(self.device)
        states_ = torch.tensor(new_state).to(self.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.target.load_state_dict(self.policy.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon *= self.eps_dec

    def save_models(self):
        self.policy.save_checkpoint()

    def load_models(self):
        self.policy.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.policy.forward(states)[indices, actions]
        q_next = self.target.forward(states_).max(dim=1)[0]

        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.loss(q_target, q_pred).to(self.device)
        loss.backward()
        self.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
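
One subtle difference from the other agents on this page: decrement_epsilon here decays epsilon multiplicatively (eps_dec=0.9999 acts as a factor) rather than subtracting a constant each step. A tiny, purely illustrative comparison of the two schedules:

# illustrative only: subtractive vs. multiplicative epsilon decay over 100k steps
eps_linear, eps_geometric = 1.0, 1.0
for step in range(100_000):
    eps_linear = max(eps_linear - 1e-5, 0.01)          # subtractive schedule (other examples)
    eps_geometric = max(eps_geometric * 0.9999, 0.01)  # multiplicative schedule (this example)
print(eps_linear, eps_geometric)  # both hit the 0.01 floor, but the geometric schedule gets there much sooner
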