import torch as T


class Agent:
    """Abstract base agent. Subclasses are expected to create the q_eval and
    q_next networks and to implement choose_action() and learn()."""

    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.batch_size = batch_size
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def choose_action(self, observation):
        raise NotImplementedError

    def replace_target_network(self):
        # hard update: copy online weights into the target network every
        # replace_target_cnt learning steps
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        # linear epsilon decay, floored at eps_min
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
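# The ReplayBuffer class is referenced but not shown. Below is a minimal
# sketch matching the three-argument interface the PyTorch agents here use
# (constructor, store_transition, sample_buffer, and the mem_cntr attribute);
# the Keras agents use a two-argument variant with a memory_count attribute.
# All names and dtypes are assumptions, not the original implementation.
import numpy as np


class ReplayBuffer:
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        # overwrite the oldest transition once the buffer is full
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        # sample uniformly, without replacement, from the filled portion
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.new_state_memory[batch],
                self.terminal_memory[batch])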
import numpy as np
from tensorflow.keras.models import load_model


class Noisy_DQN_Agent(object):
    def __init__(self, env, input_dim, n_actions, alpha, gamma, batch_size,
                 lr=5e-4, memory_size=10000000, replace_target=5,
                 filename='noisy_dqn.h5'):
        self.env = env
        self.action_space = np.arange(n_actions)
        self.input_dim = input_dim
        self.n_actions = n_actions
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.batch_size = batch_size
        self.filename = filename
        self.memory = ReplayBuffer(memory_size, input_dim)
        self.scores = []  # to keep track of scores
        self.avg_scores = []
        self.replace_target = replace_target
        self.online_network = Neural_Network(lr, n_actions, input_dim)  # network for evaluation
        self.target_network = Neural_Network(lr, n_actions, input_dim)  # network for computing targets
        # The online and target networks are identical, except that the target
        # network's parameters are copied from the online network every
        # replace_target steps and kept fixed on all other steps.

    # interface with memory
    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    # choose the greedy action (exploration is provided by the noisy layers)
    def choose_action(self, state):
        state = state.reshape(1, -1)
        actions = self.online_network.predict(state)
        action = np.argmax(actions)
        return action

    def update_online(self):
        """Update the parameters of the online network."""
        # start learning only once at least batch_size samples are in memory
        if self.memory.memory_count < self.batch_size:
            return
        states, actions, rewards, new_states, done = \
            self.memory.sample_buffer(self.batch_size)

        q_estimate = self.online_network.predict(states)
        q_next = self.target_network.predict(new_states)  # used to compute the target
        q_target = q_estimate.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        # if the episode is over, 1 - done = 0, so Q(terminal, .) = 0
        q_target[batch_index, actions] = rewards + \
            self.gamma * np.max(q_next, axis=1) * (1 - done)

        self.online_network.fit(states, q_target, verbose=0)

        if self.memory.memory_count % self.replace_target == 0:
            self.update_target()

    def update_target(self):
        """Copy the online network's parameters into the target network."""
        self.target_network.set_weights(self.online_network.get_weights())

    def train(self, n_games, path):
        # path: directory where the model is saved
        for i in range(n_games):
            score = 0
            done = False
            state = self.env.reset()
            while not done:
                action = self.choose_action(state)
                new_state, reward, done, info = self.env.step(action)
                score += reward
                self.remember(state, action, reward, new_state, done)
                state = new_state
                self.update_online()
            self.scores.append(score)
            avg_score = np.mean(self.scores[max(0, i - 50):i + 1])  # rolling mean score
            self.avg_scores.append(avg_score)
            print('episode ', i, 'score = %.2f' % score,
                  ' Rolling-score = %.2f' % avg_score)
            # save the model every 100 games
            if i % 100 == 0 and i > 0:
                self.save_model(path)

    def save_model(self, path):
        self.online_network.save(path + '/' + self.filename)

    def load_model(self, path):
        self.online_network = load_model(path)
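# Hedged usage sketch for the noisy-DQN agent above. The environment, the
# hyperparameter values, and the classic 4-tuple gym step API are assumptions,
# not values from the original; Neural_Network is assumed to be a Keras model
# whose dense layers are replaced by noisy layers, so no epsilon schedule is
# needed for exploration.
if __name__ == '__main__':
    import gym

    env = gym.make('LunarLander-v2')
    agent = Noisy_DQN_Agent(env=env,
                            input_dim=env.observation_space.shape,
                            n_actions=env.action_space.n,
                            alpha=5e-4, gamma=0.99, batch_size=64)
    # 'models' directory is assumed to exist; noisy_dqn.h5 is saved there
    # every 100 games
    agent.train(n_games=500, path='models')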
import numpy as np
from tensorflow.keras.models import load_model


class Agent:
    def __init__(self, lr, gamma, epsilon, batch_size, input_dims, env,
                 epsilon_dec=1e-3, epsilon_end=0.01, mem_size=1000,
                 fname='dqn_model.h5'):
        self.env = env
        self.action_space = self.env.action_space  # discrete
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = epsilon_dec
        self.eps_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims)
        self.n_actions = self.env.num_action
        self.q_eval = build_dqn(lr, self.n_actions, input_dims, 256, 256)

    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, observation):
        # epsilon-greedy action selection; a Boltzmann policy could be tried later
        if np.random.random() < self.epsilon:
            action = self.action_space.sample()  # random action from the action space
        else:
            state = np.array([observation])
            actions = self.q_eval.predict(state)  # Q(s, a) for all actions
            action = np.argmax(actions)
        return action

    def learn(self):
        # DQN update
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer(self.batch_size)
        # actions = actions.reshape(-1, 1)
        # rewards = rewards.reshape(-1, 1)
        # dones = dones.reshape(-1, 1)

        q_eval = self.q_eval.predict(states)   # Q(s, a) for all actions, for all states in the batch
        q_next = self.q_eval.predict(states_)  # Q(s_, a) for all actions
        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)  # 0 .. batch_size - 1

        # Note: this assumes the replay buffer stores the *non-terminal* flag
        # (1 - done) in dones, so multiplying by dones zeroes the bootstrap
        # term at terminal states.
        q_target[batch_index, actions] = rewards + \
            self.gamma * np.max(q_next, axis=1) * dones

        self.q_eval.train_on_batch(states, q_target)

        # linear epsilon decay, floored at eps_min
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)
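# A minimal sketch of the build_dqn helper assumed by the agent above: a plain
# two-hidden-layer Keras network compiled with Adam and MSE. The hidden sizes
# match the (256, 256) arguments passed in __init__; the architecture and the
# assumption that input_dims is a flat state dimension are illustrative, not
# the original implementation.
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    model = keras.Sequential([
        Dense(fc1_dims, activation='relu', input_shape=(input_dims,)),
        Dense(fc2_dims, activation='relu'),
        Dense(n_actions, activation=None),  # raw Q-values, one per action
    ])
    model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
    return model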
import numpy as np
import torch as T
from sklearn.preprocessing import StandardScaler


class Agent(object):
    def __init__(self, lr, input_dims, n_actions, epsilon, batch_size, env,
                 capacity=1000000, eps_dec=4.5e-7, fc1_dims=512, fc2_dims=256,
                 replace=1000, gamma=0.99, network_name='_eval'):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_min = 0.01
        self.epsilon = epsilon
        self.env = env
        self.memory = ReplayBuffer(capacity, input_dims, n_actions)
        self.eps_dec = eps_dec
        self.replace = replace
        self.update_cntr = 0
        self.scaler = self._get_scaler(env)  # fit a scaler for normalizing observations

        # target network (periodically synced from the training network)
        self.q_eval = DDQN(lr=lr, input_dims=self.input_dims,
                           n_actions=self.n_actions, fc1_dims=fc1_dims,
                           fc2_dims=fc2_dims, network_name=network_name)
        # training (online) network
        self.q_train = DDQN(lr=lr, input_dims=self.input_dims,
                            n_actions=self.n_actions, fc1_dims=fc1_dims,
                            fc2_dims=fc2_dims, network_name=network_name)

    def pick_action(self, obs):
        # epsilon-greedy over the normalized observation
        if np.random.random() > self.epsilon:
            obs = self.scaler.transform([obs])  # returns a (1, n_features) array
            state = T.tensor(obs, dtype=T.float).to(self.q_eval.device)
            actions = self.q_train.forward(state)
            action = T.argmax(actions).item()
        else:
            action = self.env.sample_action()
        return action

    def _get_scaler(self, env):
        # collect states from a random rollout and fit a StandardScaler on them
        states = []
        self.env.reset()  # start from a fresh episode before stepping
        for _ in range(self.env.n_steps):
            action = self.env.sample_action()
            state_, reward, done, _ = self.env.step(action)
            states.append(state_)
            if done:
                break
        scaler = StandardScaler()
        scaler.fit(states)
        return scaler

    def store_transition(self, state, action, reward, state_, done):
        # scaler.transform returns a (1, n_features) array; store the flat state
        state = self.scaler.transform([state])[0]
        state_ = self.scaler.transform([state_])[0]
        self.memory.store_transition(state, action, reward, state_, done)

    def update_target_network(self):
        if self.update_cntr % self.replace == 0:
            self.q_eval.load_state_dict(self.q_train.state_dict())

    def save(self):
        print('Saving...')
        self.q_eval.save()
        self.q_train.save()

    def load(self):
        print('Loading...')
        self.q_eval.load()
        self.q_train.load()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(states, dtype=T.float).to(self.q_eval.device)
        actions = T.tensor(actions, dtype=T.int64).to(self.q_eval.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.q_eval.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.q_eval.device)
        done = T.tensor(done, dtype=T.bool).to(self.q_eval.device)

        self.q_train.optimizer.zero_grad()
        self.update_target_network()

        indices = np.arange(self.batch_size)
        # actions are assumed to be stored one-hot, so this picks Q(s, a_taken)
        q_pred = (self.q_train.forward(states) * actions).sum(dim=1)
        q_next = self.q_eval.forward(states_)
        q_train = self.q_train.forward(states_)

        # Double DQN: the online network selects the action, the target
        # network evaluates it
        max_action = T.argmax(q_train, dim=1)
        q_next[done] = 0.0
        y = rewards + self.gamma * q_next[indices, max_action]

        loss = self.q_train.loss(y, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_train.optimizer.step()
        self.update_cntr += 1

        # linear epsilon decay, floored at eps_min
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min
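# The learn() above implements the Double DQN target: the online network
# (q_train) picks the argmax action in s', while the target network (q_eval)
# evaluates it:
#
#     y = r + gamma * Q_target(s', argmax_a Q_online(s', a))
#
# A tiny standalone sketch of that decoupling, using toy tensors with assumed
# shapes (batch of 2 states, 2 actions):
import torch as T

q_online_next = T.tensor([[1.0, 3.0], [2.0, 0.5]])  # Q_online(s', .) per state
q_target_next = T.tensor([[0.9, 2.5], [1.8, 0.4]])  # Q_target(s', .) per state
rewards = T.tensor([1.0, 0.0])

max_actions = q_online_next.argmax(dim=1)  # select actions with the online net
idx = T.arange(2)
y = rewards + 0.99 * q_target_next[idx, max_actions]  # evaluate with target net
# y == [1 + 0.99 * 2.5, 0 + 0.99 * 1.8] == [3.475, 1.782]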
import numpy as np
import torch


class DQNAgent:
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size=32, eps_min=0.1, eps_dec=1e-5, tau=1000,
                 env_name='Pong', chkpt_dir='models/'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.tau = tau
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(lr, n_actions, f'{env_name}_q_eval.pth',
                                   input_dims, chkpt_dir)
        self.q_next = DeepQNetwork(lr, n_actions, f'{env_name}_q_next.pth',
                                   input_dims, chkpt_dir).eval()

    def choose_action(self, observation):
        # epsilon-greedy action selection
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation],
                                 dtype=torch.float).to(self.q_eval.device)
            action = self.q_eval.forward(state).argmax().item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, states_, done = \
            self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.q_eval.device)
        rewards = torch.tensor(reward).to(self.q_eval.device)
        dones = torch.tensor(done).to(self.q_eval.device)
        actions = torch.tensor(action).to(self.q_eval.device)
        states_s = torch.tensor(states_).to(self.q_eval.device)

        return states, actions, rewards, states_s, dones

    def update_target_network(self):
        # hard update: copy online weights into the target network every tau steps
        if self.learn_step_counter % self.tau == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_eps(self):
        # linear epsilon decay, floored at eps_min
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.batch_size > self.memory.mem_cntr:
            return

        states, actions, rewards, states_, dones = self.sample_memory()

        indices = np.arange(self.batch_size)
        # select the Q-values only for the actions the agent actually took
        q_pred = self.q_eval.forward(states)[indices, actions]
        with torch.no_grad():
            q_next = self.q_next.forward(states_).max(dim=1)[0]
            # terminal states have no successor, so their bootstrap value is 0
            q_next[dones] = 0.0
            q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        self.q_eval.optimizer.zero_grad()
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.update_target_network()  # sync the target network if it's time
        self.decrement_eps()
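# Hedged usage sketch: a standard training loop driving the DQNAgent above.
# The environment name, the absence of Atari preprocessing wrappers (frame
# stacking, resizing), the hyperparameters, and the classic 4-tuple gym step
# API are all assumptions for illustration.
if __name__ == '__main__':
    import gym

    env = gym.make('PongNoFrameskip-v4')
    agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                     n_actions=env.action_space.n,
                     input_dims=env.observation_space.shape,
                     mem_size=50000)

    for episode in range(500):
        observation, done, score = env.reset(), False, 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            agent.learn()  # no-op until the buffer holds batch_size samples
            observation = observation_
            score += reward
        print(f'episode {episode} score {score:.1f} '
              f'epsilon {agent.epsilon:.3f}')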
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class DQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=0.9999, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn',
                 device='cuda:0'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.device = device
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # create the policy and target DQN models
        self.policy = DQN(self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + 'policy',
                          chkpt_dir=self.chkpt_dir)
        self.target = DQN(self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + 'target',
                          chkpt_dir=self.chkpt_dir)

        # put the networks on the correct device (GPU or CPU)
        self.policy.to(device)
        self.target.to(device)

        # optimizer and loss
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.loss = nn.MSELoss()

    def choose_action(self, observation):
        # epsilon-greedy action selection
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation],
                                 dtype=torch.float).to(self.device)
            actions = self.policy.forward(state)
            action = torch.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.device)
        rewards = torch.tensor(reward).to(self.device)
        dones = torch.tensor(done).to(self.device)
        actions = torch.tensor(action).to(self.device)
        states_ = torch.tensor(new_state).to(self.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        # hard update every replace_target_cnt learning steps
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.target.load_state_dict(self.policy.state_dict())

    def decrement_epsilon(self):
        # multiplicative epsilon decay, stopped once eps_min is reached
        if self.epsilon > self.eps_min:
            self.epsilon *= self.eps_dec

    def save_models(self):
        self.policy.save_checkpoint()

    def load_models(self):
        self.policy.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()

        indices = np.arange(self.batch_size)
        q_pred = self.policy.forward(states)[indices, actions]
        q_next = self.target.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0  # no bootstrapping from terminal states
        q_target = rewards + self.gamma * q_next

        loss = self.loss(q_target, q_pred).to(self.device)
        loss.backward()
        self.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
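# A minimal sketch of the DQN network class assumed by the agent above. The
# constructor signature and the checkpoint methods match how the agent calls
# it (forward, save_checkpoint, load_checkpoint); the fully connected
# architecture itself is an assumption — the original may well be
# convolutional for image inputs.
import os
import torch
import torch.nn as nn


class DQN(nn.Module):
    def __init__(self, n_actions, input_dims, name, chkpt_dir):
        super().__init__()
        os.makedirs(chkpt_dir, exist_ok=True)  # ensure the checkpoint dir exists
        self.checkpoint_file = os.path.join(chkpt_dir, name)
        # assumes input_dims is a 1-tuple with the flat state dimension
        self.net = nn.Sequential(
            nn.Linear(*input_dims, 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU(),
            nn.Linear(256, n_actions),  # one Q-value per action
        )

    def forward(self, state):
        return self.net(state)

    def save_checkpoint(self):
        torch.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        self.load_state_dict(torch.load(self.checkpoint_file))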