def test(num=500):
    agent = DuelingDQN(env.observation_space.shape[0], env.action_space.n, e=0)
    batch_size = 32
    agent.load_model()
    steps = []
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = env.action_space.sample()
        done = False
        step = 0
        while not done:
            step = step + 1
            # env.render()
            observation, reward, done, info = env.step(old_action)
            if done:
                reward = -200
            old_observation = observation
            old_action = agent.get_action(
                np.reshape(observation, [1, env.observation_space.shape[0]]))
            if done:
                steps.append(step)
                print("{}:{} steps".format(i_episode, step))
                break
        # if the average number of steps over the last 200 episodes reaches 195,
        # we consider the method to have solved the environment
        if len(steps) > 200 and sum(steps[-200:]) / 200 >= 195:
            print(sum(steps[-200:]) / 200)
            break
class Agent:
    def __init__(self, gamma, epsilon, learning_rate, n_actions, input_dimensions,
                 memory_size, size_of_batch, min_epsilon=0.01, decrement_epsilon=5e-7,
                 replace=1000, mdl_checkpoint='temp/duelingDDQN'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.n_actions = n_actions
        self.input_dimensions = input_dimensions
        self.size_of_batch = size_of_batch
        self.min_epsilon = min_epsilon
        self.decrement_epsilon = decrement_epsilon
        self.cnt_target_replace = replace
        self.mdl_checkpoint = mdl_checkpoint
        self.action_space = [i for i in range(self.n_actions)]
        self.memory = ExperienceReplay(memory_size, input_dimensions)
        self.counter_learn = 0

        # instance of the deep q-network - tells the value of the current state
        self.q_network_eval = DuelingDQN(
            self.learning_rate, self.n_actions,
            input_dimensions=self.input_dimensions,
            name='lunarLanderDuelingDDQN_q_network_eval',
            mdl_checkpoint=self.mdl_checkpoint)

        # instance of the deep q-network - tells the value of the next actions
        self.q_network_next = DuelingDQN(
            self.learning_rate, self.n_actions,
            input_dimensions=self.input_dimensions,
            name='lunarLanderDuelingDDQN_q_network_next',
            mdl_checkpoint=self.mdl_checkpoint)

    # choosing an action (epsilon-greedy)
    def action_choice(self, observe):
        # exploitation
        if np.random.random() > self.epsilon:
            # convert the observation to a PyTorch tensor, send it to our device and
            # feed it forward through the network to get the advantage function out
            state = torch.tensor([observe], dtype=torch.float).to(
                self.q_network_eval.device)
            # we only need the advantage function here; the value function is a
            # per-state constant, so it does not affect the argmax and _ discards it
            _, advntge = self.q_network_eval.forward(state)
            # the advantage function is used to pick the maximal action;
            # argmax returns a PyTorch tensor, which OpenAI Gym will not accept as
            # input to its step function, so .item() converts it to a plain integer
            action = torch.argmax(advntge).item()
        # exploration
        else:
            # random choice from the action space
            action = np.random.choice(self.action_space)
        return action

    # storing state-action transitions (interface with the agent's memory)
    def transition_store(self, state, action, reward, new_state, flag_done):
        self.memory.transition_store(state, action, reward, new_state, flag_done)

    def target_network_replace(self):
        # counter_learn is how many times the agent has executed the learning function
        if self.counter_learn % self.cnt_target_replace == 0:
            # load the state dictionary from the evaluation network onto the Q-next network
            self.q_network_next.load_state_dict(self.q_network_eval.state_dict())

    # linear epsilon decay
    def epsilon_decay(self):
        self.epsilon = (self.epsilon - self.decrement_epsilon
                        if self.epsilon > self.min_epsilon else self.min_epsilon)

    # agent's network-saving functionality
    def models_save(self):
        self.q_network_eval.checkpoint_save()
        self.q_network_next.checkpoint_save()

    # agent's network-loading functionality
    def models_load(self):
        self.q_network_eval.checkpoint_load()
        self.q_network_next.checkpoint_load()

    # learning functionality
    def learn(self):
        # handle the case where the agent has not yet filled up enough memory to perform
        # learning, e.g. with a batch size of 64 the agent may only have completed 10
        # steps (or even 1), so there is not yet enough memory to satisfy the batch size;
        # we simply wait until the memory holds at least one full batch
        if self.memory.memory_counter < self.size_of_batch:
            return

        # in PyTorch the first thing to do in a learning function is to zero the
        # gradients on the optimizer
        self.q_network_eval.optimizer.zero_grad()
        self.target_network_replace()

        # sampling of memory
        state, action, reward, next_state, flag_done = self.memory.buffer_sample(
            self.size_of_batch)

        # converting numpy arrays to PyTorch tensors
        states = torch.tensor(state).to(self.q_network_eval.device)
        actions = torch.tensor(action).to(self.q_network_eval.device)
        flag_dones = torch.tensor(flag_done).to(self.q_network_eval.device)
        rewards = torch.tensor(reward).to(self.q_network_eval.device)
        next_states = torch.tensor(next_state).to(self.q_network_eval.device)

        # array from 0 to size_of_batch-1 that handles array indexing and slicing later on
        indices = np.arange(self.size_of_batch)

        # passing the states and next states to the respective networks
        value_s, advantage_s = self.q_network_eval.forward(states)
        value_s_new, advantage_s_new = self.q_network_next.forward(next_states)
        # this line comes from the methodology of the paper introducing Double Deep Q-learning
        value_s_eval, advantage_s_eval = self.q_network_eval.forward(next_states)
        # these three (value, advantage) pairs are needed to perform the update rule
        # from the paper introducing Double Deep Q-learning

        # dueling aggregation of the value and advantage functions:
        # the paper introducing Dueling Deep Q-learning settles on summing the value and
        # advantage functions while normalizing by subtracting off the mean of the
        # advantage stream; summing them without this normalization leads to a problem
        # called "identifiability", which is discussed in the report.
        # the array indexing below uses the batch indices together with the actions the
        # agent actually took to select the corresponding Q-values
        q_network_pred = torch.add(
            value_s, (advantage_s - advantage_s.mean(dim=1, keepdim=True)))[indices, actions]
        # no action indexing below, since we want the values for all actions
        q_network_next = torch.add(
            value_s_new, (advantage_s_new - advantage_s_new.mean(dim=1, keepdim=True)))
        q_network_eval = torch.add(
            value_s_eval, (advantage_s_eval - advantage_s_eval.mean(dim=1, keepdim=True)))

        # maximal actions of the next state according to the evaluation network
        maximum_actions = torch.argmax(q_network_eval, dim=1)

        # do not value future states that are flagged as terminal
        q_network_next[flag_dones] = 0.0

        # the target value uses q_network_next evaluated at the actions chosen by the
        # evaluation network
        q_network_target = rewards + self.gamma * q_network_next[indices, maximum_actions]

        # calculation of the loss function
        loss = self.q_network_eval.loss_func(
            q_network_target, q_network_pred).to(self.q_network_eval.device)
        # back-propagation
        loss.backward()
        # stepping the optimiser
        self.q_network_eval.optimizer.step()
        # increment the learn-function counter
        self.counter_learn += 1
        # epsilon decay
        self.epsilon_decay()
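For readability, the aggregation and target computed inside learn() can be restated compactly; this is only a restatement of what the code above already does, using the standard dueling/double-DQN notation:

\[
Q(s, a) = V(s) + \Big( A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a') \Big)
\]
\[
y = r + \gamma \, Q_{\text{next}}\big(s',\ \arg\max_{a'} Q_{\text{eval}}(s', a')\big)
\]

with the bootstrap term set to zero whenever \(s'\) is terminal (the `q_network_next[flag_dones] = 0.0` line).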
class DuelingAgent:
    def __init__(self, env, learning_rate, gamma, buffer_size, prioritized):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.prioritized = prioritized
        if self.prioritized == False:
            self.replay_buffer = BasicBuffer(max_size=buffer_size)
        else:
            self.replay_buffer = PrioritizedBuffer(max_size=buffer_size)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = DuelingDQN(env.observation_space.shape,
                                env.action_space.n).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state, eps=0.20):
        # state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        # explore with probability eps (np.random.rand draws from a uniform [0, 1),
        # so the comparison is a proper epsilon-greedy check)
        if np.random.rand() < eps:
            return self.env.action_space.sample()
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        return action

    def tensor_states(self, states_list):
        # build a (batch, obs_dim) tensor on the agent's device; the unsqueeze adds
        # the batch dimension for each individual state before concatenation
        states_tensor = torch.Tensor().to(self.device)
        for state in states_list:
            state_float = torch.FloatTensor(state).to(self.device)
            states_tensor = torch.cat((states_tensor, state_float.unsqueeze(0)))
        return states_tensor

    def compute_loss(self, batch):
        if self.prioritized == False:
            states, actions, rewards, next_states, dones = batch
        else:
            states, actions, rewards, next_states, dones = batch[0]
        states = self.tensor_states(states)
        next_states = self.tensor_states(next_states)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        # mask out the bootstrap term for terminal transitions
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q * (1 - dones)

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
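Both agents rely on a DuelingDQN network whose definition is not reproduced in this section. As a point of reference, a minimal sketch of such a network is given below, assuming a PyTorch nn.Module whose forward pass aggregates a value stream and an advantage stream into Q-values (matching the interface DuelingAgent expects, where forward returns Q-values directly). The class name, layer sizes, and stream names are illustrative, not the project's actual definition.

import numpy as np
import torch
import torch.nn as nn


class DuelingDQNSketch(nn.Module):
    """Illustrative dueling network head; not the repository's actual DuelingDQN."""

    def __init__(self, input_dim, n_actions, hidden=128):
        super().__init__()
        # shared feature extractor over the flattened observation
        self.feature = nn.Sequential(
            nn.Linear(int(np.prod(input_dim)), hidden), nn.ReLU())
        # state-value stream V(s)
        self.value_stream = nn.Sequential(
            nn.Linear(hidden, hidden), nn.ReLU(), nn.Linear(hidden, 1))
        # advantage stream A(s, a)
        self.advantage_stream = nn.Sequential(
            nn.Linear(hidden, hidden), nn.ReLU(), nn.Linear(hidden, n_actions))

    def forward(self, state):
        features = self.feature(state)
        value = self.value_stream(features)
        advantage = self.advantage_stream(features)
        # dueling aggregation with mean-subtraction to keep V and A identifiable
        return value + advantage - advantage.mean(dim=1, keepdim=True)

Note the interface difference between the two agents above: DuelingAgent treats the forward output as aggregated Q-values, whereas the earlier Agent class expects forward to return the (value, advantage) pair and performs the aggregation itself inside learn().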
import numpy as np
import os

from IHiterEnv.parameter import TP
from IHiterEnv.policy import RandomPolicy
from IHiterEnv.agent import TeamAction

MaxEpisode = 2000
MaxEpisodeSteps = 100000

# DuelingDQN and ICRA_Env are assumed to be imported from the project's own modules
train_file = os.path.abspath('.') + '/train_data' + '/'
if not os.path.exists(os.path.abspath('.') + '/train_data'):
    os.mkdir(train_file)

RLBrain = DuelingDQN(train_dir=train_file)
env = ICRA_Env()
team_action = TeamAction()

# record the hyper-parameters used for this training run
with open(train_file + 'param.txt', 'w') as f:
    f.writelines([
        "learning rate : " + str(RLBrain.LearningRate) + '\n',
        "ReplaceTargetIter : " + str(RLBrain.ReplaceTargetIter) + '\n',
        "BatchSize : " + str(RLBrain.BatchSize) + '\n',
        "Epsilon : " + str(RLBrain.Epsilon) + '\n',
        "EpsilonMin : " + str(RLBrain.EpsilonMin) + '\n',
        "Gamma : " + str(RLBrain.Gamma)
    ])
import gym

from DuelingDQN import DuelingDQN

env_name = 'CartPole-v0'
episodes = 3000
steps = 300
test = 10
lr = 0.0001
ini_epsilon = 0.3
decay_steps = 200000  # after the agent's learn() has been called decay_steps times, ini_epsilon has decayed to 0
replay_size = 100
gamma = 0.9
batch_size = 32
update_frequency = 10  # the target network parameters are updated once every update_frequency training steps

env = gym.make(env_name)
agent = DuelingDQN(env, lr, ini_epsilon, decay_steps, replay_size, gamma,
                   batch_size, update_frequency)


def train():
    for episode in range(episodes):
        state = env.reset()
        for step in range(steps):
            action = agent.greedy_action(state)
            next_state, reward, done, _ = env.step(action)
            # reshape the reward: penalize termination, small bonus for surviving
            reward = -1 if done else 0.1
            agent.store_transition(state, action, reward, next_state, done)
            agent.learn()
            state = next_state
            if done:
                break
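The script above only shows the training loop; an evaluation pass is not included. Below is a minimal sketch of one, reusing only the env, agent.greedy_action, steps, and test names that already appear in the script (it assumes greedy_action is also usable at evaluation time, once epsilon has decayed); the helper name run_test_episodes is hypothetical.

def run_test_episodes(n_episodes=test):
    """Roll out a few episodes without learning and report the average return."""
    returns = []
    for _ in range(n_episodes):
        state = env.reset()
        episode_return = 0
        for _ in range(steps):
            action = agent.greedy_action(state)
            state, reward, done, _ = env.step(action)
            episode_return += reward
            if done:
                break
        returns.append(episode_return)
    print("average return over {} episodes: {}".format(
        n_episodes, sum(returns) / n_episodes))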