class Agent:
    """
    The intelligent agent of the simulation. Sets up the neural network model
    and the general parameters. It is responsible for selecting actions,
    optimizing the neural network and managing the saved models.
    """

    def __init__(self, action_set, train=True, load_path=None):
        # 1. Initialize agent params
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_set = action_set
        self.action_number = len(action_set)
        self.steps_done = 0
        self.epsilon = Config.EPS_START
        self.episode_durations = []

        # 2. Build networks
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

        if not train:
            self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)
            self.policy_net.load(load_path, optimizer=self.optimizer)
            self.policy_net.eval()

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # 3. Create Prioritized Experience Replay Memory
        self.memory = Memory(Config.MEMORY_SIZE)

    def append_sample(self, state, action, next_state, reward):
        """
        Save a sample (error, <s, a, s', r>) to the replay memory.
        """
        # Check whether this is the end of the episode
        done = next_state is None

        # Compute Q(s_t, a): the model computes Q(s_t), then we select the
        # columns of the actions taken
        state_action_values = self.policy_net(state)
        state_action_values = state_action_values.gather(1, action.view(-1, 1))

        if not done:
            # Compute argmax_a Q(s', a; θ)
            next_state_actions = self.policy_net(next_state).max(1)[1].detach().unsqueeze(1)
            # Compute Q(s', argmax_a Q(s', a; θ); θ⁻)
            next_state_values = self.target_net(next_state).gather(1, next_state_actions).squeeze(1).detach()
            # Compute the expected Q values
            expected_state_action_values = (next_state_values * Config.GAMMA) + reward
        else:
            expected_state_action_values = reward

        error = abs(state_action_values - expected_state_action_values).data.cpu().numpy()

        self.memory.add(error, state, action, next_state, reward)

    def select_action(self, state, train=True):
        """
        Select the best action according to the Q-values output by the neural network.

        Parameters
        ----------
        state: float ndarray
            The current state of the simulation
        train: bool
            Whether we are evaluating or training the model

        Returns
        -------
        a.max(1)[1]: int
            The action with the highest Q-value
        a.max(0): float
            The Q-value of the action taken
        """
        sample = random.random()

        # 1. Perform an epsilon-greedy selection
        # a. Set the value of epsilon
        self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
            math.exp(-1. * self.steps_done / Config.EPS_DECAY)
        self.steps_done += 1

        # b. Decide between a random action and the action from the neural network
        if sample > self.epsilon or (not train):
            # Select an action from the neural network
            with torch.no_grad():
                # a <- argmax_a Q(s, a; θ)
                a = self.policy_net(state)
                return a.max(1)[1].view(1, 1), a.max(0)
        else:
            # Select a random action
            print('random action')
            return torch.tensor([[random.randrange(self.action_number)]],
                                device=self.device, dtype=torch.long), None

    # Alternative select_action kept (commented out) from the original source:
    # it estimates the Q-values with several stochastic forward passes
    # (Monte Carlo dropout) and also returns an uncertainty estimate.
    """
    def select_action(self, state, train=True):
        Select the best action according to the Q-values output by the neural network

        Parameters
        ----------
        state: float ndarray
            The current state of the simulation
        train: bool
            Whether we are evaluating or training the model

        Returns
        -------
        action: int
            The action with the highest mean Q-value
        q_value: float
            The mean Q-value of the action taken
        uncertainty: float
            The mean variance across the stochastic forward passes

        sample = random.random()

        # 1. Perform an epsilon-greedy selection
        # a. Set the value of epsilon
        self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
            math.exp(-1. * self.steps_done / Config.EPS_DECAY)
        self.steps_done += 1

        # b. Decide between a random action and the action from the neural network
        if sample > self.epsilon or (not train):
            # Select an action from the neural network
            with torch.no_grad():
                # a <- argmax_a Q(s, a; θ)
                # Setting the network to train mode is important to enable dropout
                self.policy_net.train()
                output_list = []
                # Run n stochastic forward passes to build a statistical model of the output
                for i in range(Config.STOCHASTIC_PASSES):
                    output_list.append(torch.unsqueeze(F.softmax(self.policy_net(state), dim=1), 0))
                self.policy_net.eval()
                # The result of the network is the mean of the n passes
                output_mean = torch.cat(output_list, 0).mean(0)
                q_value = output_mean.data.cpu().numpy().max()
                action = output_mean.max(1)[1].view(1, 1)
                uncertainty = torch.cat(output_list, 0).var(0).mean().item()
                return action, q_value, uncertainty
        else:
            # Select a random action
            print('random action')
            return torch.tensor([[random.randrange(2)]], device=self.device,
                                dtype=torch.long), None, None
    """

    def optimize_model(self):
        """
        Perform one step of optimization on the neural network.
        """
        if self.memory.tree.n_entries < Config.BATCH_SIZE:
            return

        transitions, idxs, is_weights = self.memory.sample(Config.BATCH_SIZE)

        # Transpose the batch (see http://stackoverflow.com/a/19343/3343043
        # for a detailed explanation).
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device, dtype=torch.uint8)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a): the model computes Q(s_t), then we select the
        # columns of the actions taken
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute argmax_a Q(s', a; θ)
        next_state_actions = self.policy_net(non_final_next_states).max(1)[1].detach().unsqueeze(1)

        # Compute Q(s', argmax_a Q(s', a; θ); θ⁻)
        next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(
            1, next_state_actions).squeeze(1).detach()

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * Config.GAMMA) + reward_batch

        # Update the priorities in the replay memory with the new TD errors
        errors = torch.abs(state_action_values.squeeze()
                           - expected_state_action_values).data.cpu().numpy()
        for i in range(Config.BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
        loss_return = loss.item()

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        return loss_return

    def save(self, step, logs_path, label):
        """
        Save the model to disk.

        Parameters
        ----------
        step: int
            Current step of the simulation
        logs_path: string
            Path where the model will be stored
        label: string
            Label used to name the stored model
        """
        os.makedirs(logs_path + label, exist_ok=True)

        full_label = label + str(step) + '.pth'
        logs_path = os.path.join(logs_path, label, full_label)

        self.policy_net.save(logs_path, step=step, optimizer=self.optimizer)

    def restore(self, logs_path):
        """
        Load the model from disk.

        Parameters
        ----------
        logs_path: string
            Path where the model is stored
        """
        self.policy_net.load(logs_path)
        self.target_net.load(logs_path)
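# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal training loop for
# the Agent class above, showing how select_action, append_sample and
# optimize_model fit together with a periodic target-network update. It
# assumes a Gym-style environment `env` whose reset()/step() return states as
# tensors already shaped for policy_net, plus the hypothetical constants
# Config.NUM_EPISODES and Config.TARGET_UPDATE; adapt the names to the actual
# project before running.


def train_dqn(env, action_set):
    agent = Agent(action_set, train=True)
    for episode in range(Config.NUM_EPISODES):  # Config.NUM_EPISODES is assumed
        state = env.reset()
        done = False
        while not done:
            # Epsilon-greedy action from the policy network
            action, _ = agent.select_action(state, train=True)
            next_state, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=agent.device, dtype=torch.float)
            if done:
                next_state = None
            # Store the transition with its TD-error priority, then learn
            agent.append_sample(state, action, next_state, reward)
            agent.optimize_model()
            state = next_state
        # Periodically copy the policy weights into the target network
        if episode % Config.TARGET_UPDATE == 0:  # Config.TARGET_UPDATE is assumed
            agent.target_net.load_state_dict(agent.policy_net.state_dict())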
class TD3Agent(object):

    def __init__(self, state_size, action_size, actor_lr, critic_lr, tau,
                 gamma, lambd, batch_size, memory_size, actor_delay,
                 target_noise, epsilon, epsilon_end, decay_step, load_model,
                 play):
        self.state_size = state_size
        self.vel_size = 3
        self.action_size = action_size
        self.action_high = 1.5
        self.action_low = -self.action_high
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.tau = tau
        self.gamma = gamma
        self.lambd = lambd
        self.actor_delay = actor_delay
        self.target_noise = target_noise
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.epsilon = epsilon
        self.epsilon_end = epsilon_end
        self.decay_step = decay_step
        self.epsilon_decay = (epsilon - epsilon_end) / decay_step

        if play:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
            self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        else:
            self.sess = tf.Session()
        K.set_session(self.sess)

        self.actor, self.critic, self.critic2 = self.build_model()
        self.target_actor, self.target_critic, self.target_critic2 = self.build_model()
        self.actor_update = self.build_actor_optimizer()
        self.critic_update = self.build_critic_optimizer()
        self.critic2_update = self.build_critic2_optimizer()
        self.sess.run(tf.global_variables_initializer())

        if load_model:
            self.load_model('./save_model/' + agent_name)

        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_critic2.set_weights(self.critic2.get_weights())

        self.memory = Memory(self.memory_size)

    def build_model(self):
        # Shared network
        # Image processing branch
        image = Input(shape=self.state_size)
        image_process = BatchNormalization()(image)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu', padding='same',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((3, 3)))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(32, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(32, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(8, (1, 1), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(Flatten())(image_process)
        image_process = GRU(48, kernel_initializer='he_normal',
                            use_bias=False)(image_process)
        image_process = BatchNormalization()(image_process)
        image_process = Activation('tanh')(image_process)

        # Velocity processing branch
        vel = Input(shape=[self.vel_size])
        vel_process = Dense(48, kernel_initializer='he_normal',
                            use_bias=False)(vel)
        vel_process = BatchNormalization()(vel_process)
        vel_process = Activation('tanh')(vel_process)

        # State processing
        # state_process = Concatenate()([image_process, vel_process])
        state_process = Add()([image_process, vel_process])

        # Actor
        policy = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(state_process)
        policy = BatchNormalization()(policy)
        policy = ELU()(policy)
        policy = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(policy)
        policy = BatchNormalization()(policy)
        policy = ELU()(policy)
        policy = Dense(self.action_size,
                       kernel_initializer=tf.random_uniform_initializer(
                           minval=-3e-3, maxval=3e-3))(policy)
        policy = Lambda(
            lambda x: K.clip(x, self.action_low, self.action_high))(policy)
        actor = Model(inputs=[image, vel], outputs=policy)

        # Critic
        action = Input(shape=[self.action_size])
        action_process = Dense(48, kernel_initializer='he_normal',
                               use_bias=False)(action)
        action_process = BatchNormalization()(action_process)
        action_process = Activation('tanh')(action_process)
        state_action = Add()([state_process, action_process])
        Qvalue = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(state_action)
        Qvalue = BatchNormalization()(Qvalue)
        Qvalue = ELU()(Qvalue)
        Qvalue = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(Qvalue)
        Qvalue = BatchNormalization()(Qvalue)
        Qvalue = ELU()(Qvalue)
        Qvalue = Dense(1, kernel_initializer=tf.random_uniform_initializer(
            minval=-3e-3, maxval=3e-3))(Qvalue)
        critic = Model(inputs=[image, vel, action], outputs=Qvalue)

        # Critic2
        action = Input(shape=[self.action_size])
        action_process2 = Dense(48, kernel_initializer='he_normal',
                                use_bias=False)(action)
        action_process2 = BatchNormalization()(action_process2)
        action_process2 = Activation('tanh')(action_process2)
        state_action2 = Add()([state_process, action_process2])
        Qvalue2 = Dense(32, kernel_initializer='he_normal',
                        use_bias=False)(state_action2)
        Qvalue2 = BatchNormalization()(Qvalue2)
        Qvalue2 = ELU()(Qvalue2)
        Qvalue2 = Dense(32, kernel_initializer='he_normal',
                        use_bias=False)(Qvalue2)
        Qvalue2 = BatchNormalization()(Qvalue2)
        Qvalue2 = ELU()(Qvalue2)
        Qvalue2 = Dense(1, kernel_initializer=tf.random_uniform_initializer(
            minval=-3e-3, maxval=3e-3))(Qvalue2)
        critic2 = Model(inputs=[image, vel, action], outputs=Qvalue2)

        actor._make_predict_function()
        critic._make_predict_function()
        critic2._make_predict_function()

        return actor, critic, critic2

    def build_actor_optimizer(self):
        pred_Q = self.critic.output
        action_grad = tf.gradients(pred_Q, self.critic.input[2])
        target = -action_grad[0] / self.batch_size
        params_grad = tf.gradients(self.actor.output,
                                   self.actor.trainable_weights, target)
        params_grad, global_norm = tf.clip_by_global_norm(params_grad, 5.0)
        grads = zip(params_grad, self.actor.trainable_weights)
        optimizer = tf.train.AdamOptimizer(self.actor_lr)
        updates = optimizer.apply_gradients(grads)
        train = K.function(
            [self.actor.input[0], self.actor.input[1], self.critic.input[2]],
            [global_norm],
            updates=[updates])
        return train

    def build_critic_optimizer(self):
        y = K.placeholder(shape=(None, 1), dtype='float32')
        pred = self.critic.output
        loss = K.mean(K.square(pred - y))
        # Huber loss
        # error = K.abs(y - pred)
        # quadratic = K.clip(error, 0.0, 1.0)
        # linear = error - quadratic
        # loss = K.mean(0.5 * K.square(quadratic) + linear)
        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
        train = K.function([
            self.critic.input[0], self.critic.input[1], self.critic.input[2], y
        ], [pred, loss], updates=updates)
        return train

    def build_critic2_optimizer(self):
        y = K.placeholder(shape=(None, 1), dtype='float32')
        pred = self.critic2.output
        loss = K.mean(K.square(pred - y))
        # Huber loss
        # error = K.abs(y - pred)
        # quadratic = K.clip(error, 0.0, 1.0)
        # linear = error - quadratic
        # loss = K.mean(0.5 * K.square(quadratic) + linear)
        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic2.trainable_weights, [], loss)
        train = K.function([
            self.critic2.input[0], self.critic2.input[1],
            self.critic2.input[2], y
        ], [loss], updates=updates)
        return train

    def get_action(self, state):
        policy = self.actor.predict(state)[0]
        noise = np.random.normal(0, self.epsilon, self.action_size)
        action = np.clip(policy + noise, self.action_low, self.action_high)
        return action, policy

    def train_model(self):
        batch, idxs, _ = self.memory.sample(self.batch_size)

        images = np.zeros([self.batch_size] + self.state_size)
        vels = np.zeros([self.batch_size, self.vel_size])
        actions = np.zeros((self.batch_size, self.action_size))
        rewards = np.zeros((self.batch_size, 1))
        next_images = np.zeros([self.batch_size] + self.state_size)
        next_vels = np.zeros([self.batch_size, self.vel_size])
        dones = np.zeros((self.batch_size, 1))

        for i, sample in enumerate(batch):
            images[i], vels[i] = sample[0]
            actions[i] = sample[1]
            rewards[i] = sample[2]
            next_images[i], next_vels[i] = sample[3]
            dones[i] = sample[4]

        states = [images, vels]
        next_states = [next_images, next_vels]

        policy = self.actor.predict(states)
        target_actions = self.target_actor.predict(next_states)
        target_noises = np.random.normal(0, self.target_noise,
                                         target_actions.shape)
        target_actions = np.clip(target_actions + target_noises,
                                 self.action_low, self.action_high)

        target_next_Qs1 = self.target_critic.predict(next_states + [target_actions])
        target_next_Qs2 = self.target_critic2.predict(next_states + [target_actions])
        target_next_Qs = np.minimum(target_next_Qs1, target_next_Qs2)
        targets = rewards + self.gamma * (1 - dones) * target_next_Qs

        critic_loss = 0
        for _ in range(self.actor_delay):
            pred, c_loss = self.critic_update(states + [actions, targets])
            c2_loss = self.critic2_update(states + [actions, targets])
            critic_loss += c_loss + c2_loss[0]
        actor_loss = self.actor_update(states + [policy])

        tds = np.abs(pred - targets)
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, tds[i])

        return actor_loss[0], critic_loss / (self.actor_delay * 2.0)

    def append_memory(self, state, action, reward, next_state, done):
        Q = self.critic.predict(state + [action.reshape(1, -1)])[0]
        target_action = self.target_actor.predict(next_state)[0]
        target_Q1 = self.target_critic.predict(
            next_state + [target_action.reshape(1, -1)])[0]
        target_Q2 = self.target_critic2.predict(
            next_state + [target_action.reshape(1, -1)])[0]
        target_Q = np.minimum(target_Q1, target_Q2)
        td = reward + (1 - done) * self.gamma * target_Q - Q
        td = float(abs(td[0]))
        self.memory.add(td, (state, action, reward, next_state, done))
        return td

    def load_model(self, name):
        if os.path.exists(name + '_actor.h5'):
            self.actor.load_weights(name + '_actor.h5')
            print('Actor loaded')
        if os.path.exists(name + '_critic.h5'):
            self.critic.load_weights(name + '_critic.h5')
            print('Critic loaded')
        if os.path.exists(name + '_critic2.h5'):
            self.critic2.load_weights(name + '_critic2.h5')
            print('Critic2 loaded')

    def save_model(self, name):
        self.actor.save_weights(name + '_actor.h5')
        self.critic.save_weights(name + '_critic.h5')
        self.critic2.save_weights(name + '_critic2.h5')

    def update_target_model(self):
        self.target_actor.set_weights(
            self.tau * np.array(self.actor.get_weights())
            + (1 - self.tau) * np.array(self.target_actor.get_weights()))
        self.target_critic.set_weights(
            self.tau * np.array(self.critic.get_weights())
            + (1 - self.tau) * np.array(self.target_critic.get_weights()))
        self.target_critic2.set_weights(
            self.tau * np.array(self.critic2.get_weights())
            + (1 - self.tau) * np.array(self.target_critic2.get_weights()))
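# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): one episode of interaction
# and training with the TD3Agent above, showing where the exploration noise,
# prioritized replay, delayed/twin critic updates and Polyak target updates
# sit in the loop. The environment wrapper `env`, its [image, vel] state
# format, and the `max_steps` / `start_train` parameters are hypothetical;
# epsilon annealing is also assumed to happen here, since the class only
# precomputes epsilon_decay.


def run_td3_episode(agent, env, max_steps=1000, start_train=64):
    state = env.reset()  # expected to be an [image_batch, vel_batch] list
    for step in range(max_steps):
        # Exploration: deterministic policy plus Gaussian noise scaled by epsilon
        action, _ = agent.get_action(state)
        next_state, reward, done = env.step(action)
        # Store the transition with a TD-error priority from the twin critics
        agent.append_memory(state, action, reward, next_state, done)
        if step >= start_train:
            # train_model performs `actor_delay` critic updates per actor update
            actor_loss, critic_loss = agent.train_model()
            # Soft (Polyak) update of the target networks with factor tau
            agent.update_target_model()
        # Anneal the exploration noise towards epsilon_end
        if agent.epsilon > agent.epsilon_end:
            agent.epsilon -= agent.epsilon_decay
        state = next_state
        if done:
            break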