class Agent:
	"""
	The intelligent agent of the simulation. Set the model of the neural network used and general parameters.
	It is responsible to select the actions, optimize the neural network and manage the models.
	"""

	def __init__(self, action_set, train=True, load_path=None):
		#1. Initialize agent params
		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
		self.action_set = action_set
		self.action_number = len(action_set)
		self.steps_done = 0
		self.epsilon = Config.EPS_START
		self.episode_durations = []

		#2. Build networks
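		# policy_net is optimized and selects actions; target_net provides the
		# Q-value targets for the Double DQN update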
		self.policy_net = DQN().to(self.device)
		self.target_net = DQN().to(self.device)
		
		self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

		if not train:		
			self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)	
			self.policy_net.load(load_path, optimizer=self.optimizer)
			self.policy_net.eval()

		self.target_net.load_state_dict(self.policy_net.state_dict())
		self.target_net.eval()

		#3. Create Prioritized Experience Replay Memory
		self.memory = Memory(Config.MEMORY_SIZE)


	 
	def append_sample(self, state, action, next_state, reward):
		"""
		save sample (error,<s,a,s',r>) to the replay memory
		"""

		# Define if is the end of the simulation
		done = True if next_state is None else False

		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
		state_action_values = self.policy_net(state)
		state_action_values = state_action_values.gather(1, action.view(-1,1))

		
		if not done:
			# Compute argmax Q(s', a; θ)		
			next_state_actions = self.policy_net(next_state).max(1)[1].detach().unsqueeze(1)

			# Compute Q(s', argmax Q(s', a; θ), θ-)
			next_state_values = self.target_net(next_state).gather(1, next_state_actions).squeeze(1).detach()

			# Compute the expected Q values
			expected_state_action_values = (next_state_values * Config.GAMMA) + reward
		else:
			expected_state_action_values = reward


		error = abs(state_action_values - expected_state_action_values).data.cpu().numpy()


		self.memory.add(error, state, action, next_state, reward)

	def select_action(self, state, train=True):
		"""
		Selet the best action according to the Q-values outputed from the neural network

		Parameters
		----------
			state: float ndarray
				The current state on the simulation
			train: bool
				Define if we are evaluating or trainning the model
		Returns
		-------
			a.max(1)[1]: int
				The action with the highest Q-value
			a.max(0): float
				The Q-value of the action taken
		"""
		sample = random.random()
		#1. Perform an epsilon-greedy policy
		#a. Anneal epsilon exponentially from EPS_START towards EPS_END
		self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
			math.exp(-1. * self.steps_done / Config.EPS_DECAY)
			
		self.steps_done += 1

		#b. Decide between a random action and the greedy action from the network
		if sample > self.epsilon or (not train):
			# select an action from the neural network
			with torch.no_grad():
				# a <- argmax Q(s, theta)
				a = self.policy_net(state)
				return a.max(1)[1].view(1, 1), a.max(0)
		else:
			# select a random action
			print('random action')
			return torch.tensor([[random.randrange(self.action_number)]], device=self.device, dtype=torch.long), None

	"""
	def select_action(self, state, train=True):
		
		Select the best action according to the Q-values output by the neural network,
		using several stochastic (dropout-enabled) forward passes to estimate uncertainty

		Parameters
		----------
			state: float ndarray
				The current state of the simulation
			train: bool
				Whether the model is being trained or evaluated
		Returns
		-------
			action: int
				The action with the highest mean output
			q_value: float
				The mean network output for the selected action
			uncertainty: float
				The mean variance of the outputs across the stochastic passes
		
		global steps_done
		sample = random.random()
		#1. Perform an epsilon-greedy policy
		#a. Anneal epsilon exponentially from EPS_START towards EPS_END
		self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
			math.exp(-1. * self.steps_done / Config.EPS_DECAY)
			
		self.steps_done += 1

		#b. Decide between a random action and the greedy action from the network
		if sample > self.epsilon or (not train):
			# select an action from the neural network
			with torch.no_grad():
				# a <- argmax Q(s, theta)
				# setting the network to train mode is important so that dropout stays active
				self.policy_net.train()
				output_list = []
				# Run the forward pass n times with dropout enabled to build a statistical estimate of the output
				for i in range(Config.STOCHASTIC_PASSES):
					#print(agent.policy_net(data))
					output_list.append(torch.unsqueeze(F.softmax(self.policy_net(state)), 0))
					#print(output_list[i])

				self.policy_net.eval()
				# The result of the network is the mean of n passes
				output_mean = torch.cat(output_list, 0).mean(0)
				q_value = output_mean.data.cpu().numpy().max()
				action = output_mean.max(1)[1].view(1, 1)

				uncertainty = torch.cat(output_list, 0).var(0).mean().item()
				
				return action, q_value, uncertainty
				
		else:
			# select a random action
			print('random action')
			return torch.tensor([[random.randrange(2)]], device=self.device, dtype=torch.long), None, None

	"""
	def optimize_model(self):
		"""
		Perform one step of optimization on the neural network
		"""

		if self.memory.tree.n_entries < Config.BATCH_SIZE:
			return
		transitions, idxs, is_weights = self.memory.sample(Config.BATCH_SIZE)

		# Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for detailed explanation).
		batch = Transition(*zip(*transitions))

		# Compute a mask of non-final states and concatenate the batch elements
		non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
											  batch.next_state)), device=self.device, dtype=torch.bool)
		non_final_next_states = torch.cat([s for s in batch.next_state
													if s is not None])
		
		state_batch = torch.cat(batch.state)
		action_batch = torch.cat(batch.action)
		reward_batch = torch.cat(batch.reward)
		
		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
		state_action_values = self.policy_net(state_batch).gather(1, action_batch)
		
	
		# Compute argmax Q(s', a; θ)		
		next_state_actions = self.policy_net(non_final_next_states).max(1)[1].detach().unsqueeze(1)

		# Compute Q(s', argmax Q(s', a; θ), θ-)
		next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device)
		next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, next_state_actions).squeeze(1).detach()

		# Compute the expected Q values
		expected_state_action_values = (next_state_values * Config.GAMMA) + reward_batch

		# Compute the new TD errors and use them to update priorities in the replay memory
		errors = torch.abs(state_action_values.squeeze() - expected_state_action_values).data.cpu().numpy()

		for i in range(Config.BATCH_SIZE):
			idx = idxs[i]
			self.memory.update(idx, errors[i])


		# Compute Huber loss, weighted by the importance-sampling weights returned by the prioritized replay memory
		losses = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1), reduction='none')
		is_weights = torch.as_tensor(is_weights, device=self.device, dtype=torch.float32).unsqueeze(1)
		loss = (is_weights * losses).mean()
		loss_return = loss.item()

		# Optimize the model
		self.optimizer.zero_grad()
		loss.backward()
		for param in self.policy_net.parameters():
			param.grad.data.clamp_(-1, 1)
		self.optimizer.step()

		return loss_return

	def save(self, step, logs_path, label):
		"""
		Save the model to disk

		Parameters
		----------
			step: int
				current step on the simulation
			logs_path: string
				path to where we will store the model
			label: string
				label that will be used to store the model
		"""

		os.makedirs(os.path.join(logs_path, label), exist_ok=True)

		full_label = label + str(step) + '.pth'
		logs_path = os.path.join(logs_path, label, full_label)

		self.policy_net.save(logs_path, step=step, optimizer=self.optimizer)
	
	def restore(self, logs_path):
		"""
		Load the model from disk

		Parameters
		----------
			logs_path: string
				path from which the model will be loaded
		"""
		self.policy_net.load(logs_path)
		self.target_net.load(logs_path)
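
A minimal sketch of how the Agent above is typically driven from a training loop, wiring together select_action, append_sample, optimize_model and a periodic target-network sync. The environment object and the sync interval are not defined in this file, so they are illustrative assumptions; only the agent calls mirror the class above.

def train(agent, env, num_episodes, target_update_every=10):
	# Hypothetical training loop; `env` (reset()/step()) and `target_update_every`
	# are assumptions, only the agent calls mirror the Agent class above
	for episode in range(num_episodes):
		state = torch.tensor([env.reset()], dtype=torch.float32, device=agent.device)
		done = False
		while not done:
			# Epsilon-greedy action selection on the policy network
			action, _ = agent.select_action(state)
			observation, reward, done, _ = env.step(action.item())
			reward = torch.tensor([reward], dtype=torch.float32, device=agent.device)
			next_state = None if done else torch.tensor([observation], dtype=torch.float32, device=agent.device)

			# Store the transition with its TD error as the initial priority
			agent.append_sample(state, action, next_state, reward)

			# One gradient step on the policy network (a no-op until the memory holds a full batch)
			agent.optimize_model()

			if next_state is not None:
				state = next_state

		# Periodically copy the policy weights into the target network
		if episode % target_update_every == 0:
			agent.target_net.load_state_dict(agent.policy_net.state_dict())
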
Example #2
class TD3Agent(object):
    def __init__(self, state_size, action_size, actor_lr, critic_lr, tau,
                 gamma, lambd, batch_size, memory_size, actor_delay,
                 target_noise, epsilon, epsilon_end, decay_step, load_model,
                 play):
        self.state_size = state_size
        self.vel_size = 3
        self.action_size = action_size
        self.action_high = 1.5
        self.action_low = -self.action_high
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.tau = tau
        self.gamma = gamma
        self.lambd = lambd
        self.actor_delay = actor_delay
        self.target_noise = target_noise

        self.batch_size = batch_size
        self.memory_size = memory_size
        self.epsilon = epsilon
        self.epsilon_end = epsilon_end
        self.decay_step = decay_step
        self.epsilon_decay = (epsilon - epsilon_end) / decay_step

        if play:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
            self.sess = tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options))
        else:
            self.sess = tf.Session()
        K.set_session(self.sess)

        self.actor, self.critic, self.critic2 = self.build_model()
        self.target_actor, self.target_critic, self.target_critic2 = self.build_model(
        )
        self.actor_update = self.build_actor_optimizer()
        self.critic_update = self.build_critic_optimizer()
        self.critic2_update = self.build_critic2_optimizer()
        self.sess.run(tf.global_variables_initializer())
        if load_model:
            self.load_model('./save_model/' + agent_name)

        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_critic2.set_weights(self.critic2.get_weights())

        self.memory = Memory(self.memory_size)

    def build_model(self):
        # shared network
        # image process
        image = Input(shape=self.state_size)
        image_process = BatchNormalization()(image)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   padding='same',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((3, 3)))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(32, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(32, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(8, (1, 1), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(Flatten())(image_process)
        image_process = GRU(48, kernel_initializer='he_normal',
                            use_bias=False)(image_process)
        image_process = BatchNormalization()(image_process)
        image_process = Activation('tanh')(image_process)

        # vel process
        vel = Input(shape=[self.vel_size])
        vel_process = Dense(48, kernel_initializer='he_normal',
                            use_bias=False)(vel)
        vel_process = BatchNormalization()(vel_process)
        vel_process = Activation('tanh')(vel_process)

        # state process
        # state_process = Concatenate()([image_process, vel_process])
        state_process = Add()([image_process, vel_process])

        # Actor
        policy = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(state_process)
        policy = BatchNormalization()(policy)
        policy = ELU()(policy)
        policy = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(policy)
        policy = BatchNormalization()(policy)
        policy = ELU()(policy)
        policy = Dense(self.action_size,
                       kernel_initializer=tf.random_uniform_initializer(
                           minval=-3e-3, maxval=3e-3))(policy)
        policy = Lambda(
            lambda x: K.clip(x, self.action_low, self.action_high))(policy)
        actor = Model(inputs=[image, vel], outputs=policy)

        # Critic
        action = Input(shape=[self.action_size])
        action_process = Dense(48,
                               kernel_initializer='he_normal',
                               use_bias=False)(action)
        action_process = BatchNormalization()(action_process)
        action_process = Activation('tanh')(action_process)
        state_action = Add()([state_process, action_process])

        Qvalue = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(state_action)
        Qvalue = BatchNormalization()(Qvalue)
        Qvalue = ELU()(Qvalue)
        Qvalue = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(Qvalue)
        Qvalue = BatchNormalization()(Qvalue)
        Qvalue = ELU()(Qvalue)
        Qvalue = Dense(1,
                       kernel_initializer=tf.random_uniform_initializer(
                           minval=-3e-3, maxval=3e-3))(Qvalue)
        critic = Model(inputs=[image, vel, action], outputs=Qvalue)

        # Critic2
        action = Input(shape=[self.action_size])
        action_process2 = Dense(48,
                                kernel_initializer='he_normal',
                                use_bias=False)(action)
        action_process2 = BatchNormalization()(action_process2)
        action_process2 = Activation('tanh')(action_process2)
        state_action2 = Add()([state_process, action_process2])

        Qvalue2 = Dense(32, kernel_initializer='he_normal',
                        use_bias=False)(state_action2)
        Qvalue2 = BatchNormalization()(Qvalue2)
        Qvalue2 = ELU()(Qvalue2)
        Qvalue2 = Dense(32, kernel_initializer='he_normal',
                        use_bias=False)(Qvalue2)
        Qvalue2 = BatchNormalization()(Qvalue2)
        Qvalue2 = ELU()(Qvalue2)
        Qvalue2 = Dense(1,
                        kernel_initializer=tf.random_uniform_initializer(
                            minval=-3e-3, maxval=3e-3))(Qvalue2)
        critic2 = Model(inputs=[image, vel, action], outputs=Qvalue2)

        actor._make_predict_function()
        critic._make_predict_function()
        critic2._make_predict_function()

        return actor, critic, critic2

    def build_actor_optimizer(self):
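        # Deterministic policy gradient: take dQ/da from the first critic, negate it
        # (gradient ascent on Q) and chain it through the actor's weights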
        pred_Q = self.critic.output
        action_grad = tf.gradients(pred_Q, self.critic.input[2])
        target = -action_grad[0] / self.batch_size
        params_grad = tf.gradients(self.actor.output,
                                   self.actor.trainable_weights, target)
        params_grad, global_norm = tf.clip_by_global_norm(params_grad, 5.0)
        grads = zip(params_grad, self.actor.trainable_weights)
        optimizer = tf.train.AdamOptimizer(self.actor_lr)
        updates = optimizer.apply_gradients(grads)
        train = K.function(
            [self.actor.input[0], self.actor.input[1], self.critic.input[2]],
            [global_norm],
            updates=[updates])
        return train

    def build_critic_optimizer(self):
        y = K.placeholder(shape=(None, 1), dtype='float32')
        pred = self.critic.output

        loss = K.mean(K.square(pred - y))
        # Huber Loss
        # error = K.abs(y - pred)
        # quadratic = K.clip(error, 0.0, 1.0)
        # linear = error - quadratic
        # loss = K.mean(0.5 * K.square(quadratic) + linear)

        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic.trainable_weights, [],
                                        loss)
        train = K.function([
            self.critic.input[0], self.critic.input[1], self.critic.input[2], y
        ], [pred, loss],
                           updates=updates)
        return train

    def build_critic2_optimizer(self):
        y = K.placeholder(shape=(None, 1), dtype='float32')
        pred = self.critic2.output

        loss = K.mean(K.square(pred - y))
        # # Huber Loss
        # error = K.abs(y - pred)
        # quadratic = K.clip(error, 0.0, 1.0)
        # linear = error - quadratic
        # loss = K.mean(0.5 * K.square(quadratic) + linear)

        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic2.trainable_weights, [],
                                        loss)
        train = K.function([
            self.critic2.input[0], self.critic2.input[1],
            self.critic2.input[2], y
        ], [loss],
                           updates=updates)
        return train

    def get_action(self, state):
        policy = self.actor.predict(state)[0]
        noise = np.random.normal(0, self.epsilon, self.action_size)
        action = np.clip(policy + noise, self.action_low, self.action_high)
        return action, policy

    def train_model(self):
        batch, idxs, _ = self.memory.sample(self.batch_size)

        images = np.zeros([self.batch_size] + self.state_size)
        vels = np.zeros([self.batch_size, self.vel_size])
        actions = np.zeros((self.batch_size, self.action_size))
        rewards = np.zeros((self.batch_size, 1))
        next_images = np.zeros([self.batch_size] + self.state_size)
        next_vels = np.zeros([self.batch_size, self.vel_size])
        dones = np.zeros((self.batch_size, 1))

        targets = np.zeros((self.batch_size, 1))

        for i, sample in enumerate(batch):
            images[i], vels[i] = sample[0]
            actions[i] = sample[1]
            rewards[i] = sample[2]
            next_images[i], next_vels[i] = sample[3]
            dones[i] = sample[4]
        states = [images, vels]
        next_states = [next_images, next_vels]
        policy = self.actor.predict(states)
        target_actions = self.target_actor.predict(next_states)
        target_noises = np.random.normal(0, self.target_noise,
                                         target_actions.shape)
        target_actions = np.clip(target_actions + target_noises,
                                 self.action_low, self.action_high)

        target_next_Qs1 = self.target_critic.predict(next_states +
                                                     [target_actions])
        target_next_Qs2 = self.target_critic2.predict(next_states +
                                                      [target_actions])
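        # Clipped double-Q target: element-wise minimum of the two target critics to curb overestimation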
        target_next_Qs = np.minimum(target_next_Qs1, target_next_Qs2)
        targets = rewards + self.gamma * (1 - dones) * target_next_Qs

        critic_loss = 0
        for _ in range(self.actor_delay):
            pred, c_loss = self.critic_update(states + [actions, targets])
            c2_loss = self.critic2_update(states + [actions, targets])
            critic_loss += c_loss + c2_loss[0]
        actor_loss = self.actor_update(states + [policy])
        tds = np.abs(pred - targets)
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, tds[i])

        return actor_loss[0], critic_loss / (self.actor_delay * 2.0)

    def append_memory(self, state, action, reward, next_state, done):
        Q = self.critic.predict(state + [action.reshape(1, -1)])[0]
        target_action = self.target_actor.predict(next_state)[0]
        target_Q1 = self.target_critic.predict(
            next_state + [target_action.reshape(1, -1)])[0]
        target_Q2 = self.target_critic2.predict(
            next_state + [target_action.reshape(1, -1)])[0]
        target_Q = np.minimum(target_Q1, target_Q2)
        td = reward + (1 - done) * self.gamma * target_Q - Q
        td = float(abs(td[0]))
        self.memory.add(td, (state, action, reward, next_state, done))
        return td

    def load_model(self, name):
        if os.path.exists(name + '_actor.h5'):
            self.actor.load_weights(name + '_actor.h5')
            print('Actor loaded')
        if os.path.exists(name + '_critic.h5'):
            self.critic.load_weights(name + '_critic.h5')
            print('Critic loaded')
        if os.path.exists(name + '_critic2.h5'):
            self.critic2.load_weights(name + '_critic2.h5')
            print('Critic2 loaded')

    def save_model(self, name):
        self.actor.save_weights(name + '_actor.h5')
        self.critic.save_weights(name + '_critic.h5')
        self.critic2.save_weights(name + '_critic2.h5')

    def update_target_model(self):
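        # Soft (Polyak) target update: target <- tau * online + (1 - tau) * target for each network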
        self.target_actor.set_weights(
            self.tau * np.array(self.actor.get_weights()) \
            + (1 - self.tau) * np.array(self.target_actor.get_weights())
        )
        self.target_critic.set_weights(
            self.tau * np.array(self.critic.get_weights()) \
            + (1 - self.tau) * np.array(self.target_critic.get_weights())
        )
        self.target_critic2.set_weights(
            self.tau * np.array(self.critic2.get_weights()) \
            + (1 - self.tau) * np.array(self.target_critic2.get_weights())
        )
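
Similarly, a minimal sketch of one episode with the TD3Agent above. The environment, the (image, velocity) observation layout and the warm-up handling of the replay memory are illustrative assumptions; in practice train_model should only be called once the memory holds at least batch_size transitions. Only the agent calls mirror the class above.

def run_episode(agent, env, train=True):
    # Hypothetical interaction loop; `env` and the observation layout are assumptions
    image, vel = env.reset()                          # assumed (image, velocity) observation
    state = [image[np.newaxis], vel[np.newaxis]]      # batch dimension of 1 for Keras predict
    done, episode_reward = False, 0.0
    while not done:
        # Deterministic policy output plus Gaussian exploration noise
        action, _ = agent.get_action(state)
        (next_image, next_vel), reward, done = env.step(action)
        next_state = [next_image[np.newaxis], next_vel[np.newaxis]]
        episode_reward += reward

        if train:
            # Store the transition with its TD error as the initial priority
            agent.append_memory(state, action, reward, next_state, float(done))
            # Clipped double-Q update of both critics, delayed actor update,
            # followed by a soft (Polyak) update of the target networks
            agent.train_model()
            agent.update_target_model()

        state = next_state
    return episode_reward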