Example #1
class Actor:
	def __init__(self, config, num, epsilon_min):
		#Define the actor's device, exploration parameters and local buffer
		self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

		self.epsilon = config['epsilon']
		self.epsilon_min = epsilon_min

		self.local_replay = []

		self.num = num

		self.Q = Network(config).to(self.device)
class Agent:
	def __init__(self, config):

		#Initialisation of environment variables
		self.replay = deque(maxlen = config["size_buffer"])
		self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

		self.Q = Network(config).to(self.device)
		self.target_Q = Network(config).to(self.device)
		self.target_Q.load_state_dict(self.Q.state_dict()) #synchronise the target network parameters

		#Initialisation of agent variables
		self.epsilon = config["epsilon"]
		self.target_update_counter = 0
		self.loss = None
		self.name = config["model_name"]
class Learner:
    def __init__(self, config):
        #Define the online and target networks
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        self.Q = Network(config).to(self.device)
        self.target_Q = Network(config).to(self.device)
        self.target_Q.load_state_dict(self.Q.state_dict())

        self.gamma = config["gamma"]
        self.batch_size = config["batch_size"]
        self.min_replay_size = config["min_replay_size"]

        self.target_update_counter = 0
        self.target_update = config["target_update"]

        self.learning_step = config["learning_step"]
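
All three constructors read their hyperparameters from a single config dictionary. As a rough illustration only (the values below are assumptions, not taken from the original project, and Network(config) may expect additional keys), a compatible config could look like:

config = {
    "epsilon": 1.0,             #initial exploration rate (assumed value)
    "size_buffer": 100000,      #replay buffer capacity (assumed value)
    "model_name": "dqn_agent",  #used for the save-file name (assumed value)
    "gamma": 0.99,              #discount factor (assumed value)
    "batch_size": 32,           #minibatch size (assumed value)
    "min_replay_size": 1000,    #transitions required before learning starts (assumed value)
    "target_update": 1000,      #updates between target-network syncs (assumed value)
    "learning_step": 100000,    #total number of learner updates (assumed value)
}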
Example #4
class Actor:
	def __init__(self, config, num, epsilon_min):
		#Define the actor's device, exploration parameters and local buffer
		self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

		self.epsilon = config['epsilon']
		self.epsilon_min = epsilon_min

		self.local_replay = []

		self.num = num

		self.Q = Network(config).to(self.device)

	def save_model(self):
		torch.save(self.Q.state_dict(), "./ModelSaved/Actor" + str(self.epsilon_min) + ".pth")
		print("Model saved")


	def choose_action(self, obs):
		#Choose an action with an epsilon-greedy policy
		if np.random.uniform() < self.epsilon:
			action = np.random.choice(N_ACTIONS)
		else:
			with torch.no_grad():
				y = self.Q(torch.tensor(obs[0], device=self.device, dtype=torch.float).unsqueeze(0),
							torch.tensor(obs[1], device=self.device, dtype=torch.float).unsqueeze(0))
			action = torch.argmax(y).item()

		return action

	def update_epsilon(self):
		self.epsilon *= EPSILON_DECAY
		self.epsilon = max(self.epsilon, self.epsilon_min)

	def add_transition(self, obs, action, reward, next_obs, done):
		self.local_replay.append([obs, action, self.reward_clipping(reward), next_obs, done]) 

	def reward_clipping(self, reward):
		if reward > 1:
			reward = 1
		elif reward <-1:
			reward = -1
		return reward
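
# Hedged usage sketch (not part of the original example): one way an Actor
# instance could be driven, assuming a Gym-style env whose observations are
# (image, nav) pairs and a remote replay object; push_transitions is a
# hypothetical name for shipping the local buffer to the shared replay.
def run_actor_episode(actor, env, replay):
	obs = env.reset()
	done = False
	while not done:
		action = actor.choose_action(obs)
		next_obs, reward, done, info = env.step(action)
		actor.add_transition(obs, action, reward, next_obs, done)
		obs = next_obs
	actor.update_epsilon() #decay exploration after each episode
	replay.push_transitions(actor.local_replay) #hypothetical remote call
	actor.local_replay.clear()
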
class Agent:
	def __init__(self, config):

		#Initialisation of environment variables
		self.replay = deque(maxlen = config["size_buffer"])
		self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

		self.Q = Network(config).to(self.device)
		self.target_Q = Network(config).to(self.device)
		self.target_Q.load_state_dict(self.Q.state_dict()) #synchronise the target network parameters

		#Initialisation of agent variables
		self.epsilon = config["epsilon"]
		self.target_update_counter = 0
		self.loss = None
		self.name = config["model_name"]

	def save_model(self):
		torch.save(self.Q.state_dict(), "./ModelSaved/" + self.name + ".pth")
		print("Model saved")

	def add_transition(self, obs, action, reward, next_obs, done):
		self.replay.append((obs, action, self.reward_clipping(reward), next_obs, done)) 

	def reward_clipping(self, reward):
		if reward > 1:
			reward = 1
		elif reward <-1:
			reward = -1
		return reward
			
	def choose_action(self, obs):
		#Choose an action with an epsilon-greedy policy
		if np.random.uniform() < self.epsilon:
			action = np.random.choice(N_ACTIONS)
		else:
			with torch.no_grad():
				y = self.Q(torch.tensor(obs[0], device=self.device, dtype=torch.float).unsqueeze(0),
							torch.tensor(obs[1], device=self.device, dtype=torch.float).unsqueeze(0))
			action = torch.argmax(y).item()
		return action

	def train_nn(self):

		if len(self.replay) < MIN_REPLAY_SIZE:
			return
		#Sample a random minibatch of transitions from the replay buffer
		idx = np.random.choice(len(self.replay), BATCH_SIZE, replace=True)
		mini_batch = [self.replay[i] for i in idx]

		#Split the sampled transitions into separate tensors
		current_states_img = torch.tensor([transition[0][0] for transition in mini_batch], device=self.device, dtype=torch.float)
		current_states_nav = torch.tensor([transition[0][1] for transition in mini_batch], device=self.device, dtype=torch.float)
			
		actions = torch.tensor([transition[1] for transition in mini_batch], device=self.device, dtype=torch.long)
		rewards = torch.tensor([transition[2] for transition in mini_batch], device=self.device, dtype=torch.float)

		new_current_states_img = torch.tensor([transition[3][0] for transition in mini_batch], device=self.device, dtype=torch.float)
		new_current_states_nav = torch.tensor([transition[3][1] for transition in mini_batch], device=self.device, dtype=torch.float)
			
		dones = torch.tensor([not transition[4] for transition in mini_batch], device=self.device, dtype=torch.bool) #mask of non-terminal transitions


		#Double DQN target: select the next action with the online network
		#and evaluate it with the target network
		actions_eval = torch.argmax(self.Q(new_current_states_img, new_current_states_nav), dim=1)

		next_state_values = self.target_Q(new_current_states_img, new_current_states_nav).gather(dim=1, index=actions_eval.unsqueeze(-1)).squeeze(-1)
		values = rewards + GAMMA*next_state_values*dones

		target_values = self.Q(current_states_img, current_states_nav).gather(dim=1, index=actions.unsqueeze(-1)).squeeze(-1)


		#Compute the MSE loss between the current Q estimates and the TD targets,
		#then perform a gradient descent step
		loss_t = self.Q.loss_function(values, target_values)
		self.loss = loss_t

		self.Q.optimizer.zero_grad()
		loss_t.backward()
		for param in self.Q.parameters():
			param.grad.data.clamp_(-1, 1) #clamp_ is in-place; clamp() alone has no effect
		self.Q.optimizer.step()

		self.update_target()
		self.update_epsilon()
	 
		 
	def update_target(self):
		#update target counter
		self.target_update_counter +=1   
		#Every C update target network 
		if self.target_update_counter > TARGET_UPDATE:
			self.target_Q.load_state_dict(self.Q.state_dict())
			self.target_update_counter = 0

	def update_epsilon(self):
		#update epsilon
		self.epsilon *= EPSILON_DECAY
		self.epsilon = max(self.epsilon, EPSILON_MIN)
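
# Hedged usage sketch (not part of the original example): a minimal training
# loop for the Agent above, assuming a Gym-style env whose observations are
# (image, nav) pairs. All names introduced here are hypothetical illustrations
# of how the methods fit together.
def run_dqn_training(env, agent, episodes):
	for episode in range(episodes):
		obs = env.reset()
		done = False
		while not done:
			action = agent.choose_action(obs)
			next_obs, reward, done, info = env.step(action)
			agent.add_transition(obs, action, reward, next_obs, done)
			agent.train_nn() #no-op until the replay holds MIN_REPLAY_SIZE transitions
			obs = next_obs
	agent.save_model()
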
class Learner:
    def __init__(self, config):
        #Define the online and target networks
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        self.Q = Network(config).to(self.device)
        self.target_Q = Network(config).to(self.device)
        self.target_Q.load_state_dict(self.Q.state_dict())

        self.gamma = config["gamma"]
        self.batch_size = config["batch_size"]
        self.min_replay_size = config["min_replay_size"]

        self.target_update_counter = 0
        self.target_update = config["target_update"]

        self.learning_step = config["learning_step"]

    def compute_td(self, mini_batch):
        current_states_img = torch.tensor(
            [transition[0][0] for transition in mini_batch],
            device=self.device,
            dtype=torch.float)
        current_states_nav = torch.tensor(
            [transition[0][1] for transition in mini_batch],
            device=self.device,
            dtype=torch.float)

        actions = torch.tensor([transition[1] for transition in mini_batch],
                               device=self.device,
                               dtype=torch.long)
        rewards = torch.tensor([transition[2] for transition in mini_batch],
                               device=self.device,
                               dtype=torch.float)

        new_current_states_img = torch.tensor(
            [transition[3][0] for transition in mini_batch],
            device=self.device,
            dtype=torch.float)
        new_current_states_nav = torch.tensor(
            [transition[3][1] for transition in mini_batch],
            device=self.device,
            dtype=torch.float)

        dones = torch.tensor(
            [not transition[4] for transition in mini_batch],
            device=self.device,
            dtype=torch.bool) #mask of non-terminal transitions

        #Double DQN target: select the next action with the online network
        #and evaluate it with the target network
        actions_eval = torch.argmax(self.Q(new_current_states_img,
                                           new_current_states_nav),
                                    dim=1)
        next_state_values = self.target_Q(
            new_current_states_img, new_current_states_nav).gather(
                dim=1, index=actions_eval.unsqueeze(-1)).squeeze(-1)
        values = rewards + self.gamma * next_state_values * dones

        target_values = self.Q(current_states_img, current_states_nav).gather(
            dim=1, index=actions.unsqueeze(-1)).squeeze(-1)

        td_error = target_values - values
        return td_error.detach().cpu().numpy()
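
    # Note (assumption, not stated in the original code): the TD errors returned
    # above are presumably used on the replay side to set transition priorities
    # for prioritized experience replay, e.g. priority = abs(td_error) + eps,
    # which is consistent with replay.get_batch() returning importance weights
    # and replay.update_error() being called in train_nn below.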

    def train_nn(self, replay, server):
        updated_step = 0
        start_mem = psutil.virtual_memory().used
        #Busy-wait until the remote replay buffer holds enough transitions
        while replay.get_size() < self.min_replay_size:
            continue
        print("Start learning")
        writer = SummaryWriter(comment="LossLearner")
        while updated_step < self.learning_step:

            #Sample a random minibatch of transitions from the remote replay
            r = replay.get_size()
            mini_batch, weight = replay.get_batch(
                self.batch_size)  # remote object method call

            weight = torch.tensor(weight,
                                  device=self.device,
                                  dtype=torch.float)
            #Split the sampled transitions into separate tensors
            current_states_img = torch.tensor(
                [transition[0][0] for transition in mini_batch],
                device=self.device,
                dtype=torch.float)
            current_states_nav = torch.tensor(
                [transition[0][1] for transition in mini_batch],
                device=self.device,
                dtype=torch.float)

            actions = torch.tensor(
                [transition[1] for transition in mini_batch],
                device=self.device,
                dtype=torch.long)
            rewards = torch.tensor(
                [transition[2] for transition in mini_batch],
                device=self.device,
                dtype=torch.float)

            new_current_states_img = torch.tensor(
                [transition[3][0] for transition in mini_batch],
                device=self.device,
                dtype=torch.float)
            new_current_states_nav = torch.tensor(
                [transition[3][1] for transition in mini_batch],
                device=self.device,
                dtype=torch.float)

            dones = torch.tensor(
                [not transition[4] for transition in mini_batch],
                device=self.device,
                dtype=torch.bool) #mask of non-terminal transitions

            #Double DQN target: select the next action with the online network
            #and evaluate it with the target network
            actions_eval = torch.argmax(self.Q(new_current_states_img,
                                               new_current_states_nav),
                                        dim=1)

            next_state_values = self.target_Q(
                new_current_states_img, new_current_states_nav).gather(
                    dim=1, index=actions_eval.unsqueeze(-1)).squeeze(-1)

            values = rewards + self.gamma * next_state_values * dones

            target_values = self.Q(current_states_img,
                                   current_states_nav).gather(
                                       dim=1,
                                       index=actions.unsqueeze(-1)).squeeze(-1)

            td_error = target_values - values

            replay.update_error(
                td_error.detach().cpu().numpy())  # remote object method call

            #fit/backpropagation
            self.Q.optimizer.zero_grad()
            loss_t = self.Q.loss_function(values * weight,
                                          target_values * weight)
            loss_t.backward()
            #for param in self.Q.parameters():
            #param.grad.data.clamp(-1,1)
            self.Q.optimizer.step()
            #self.replay.update_memory()
            self.update_target()
            server.update_params(
                self.return_params())  # remote object method call
            updated_step += 1

            writer.add_scalar("Loss", loss_t, updated_step)
            writer.add_scalar("replay_size:", r, updated_step)
            writer.add_scalar("mem used:",
                              (psutil.virtual_memory().used - start_mem) /
                              1_000_000, updated_step)

        print("finish")
        torch.save(self.Q.state_dict(), "./ModelSaved/Learner.pth")
        writer.close()

    def update_target(self):
        #Sync the target network with the online network every C learner updates
        self.target_update_counter += 1
        if self.target_update_counter > self.target_update:
            self.target_Q.load_state_dict(self.Q.state_dict())
            self.target_update_counter = 0

    def return_params(self):
        #Return a CPU copy of the online network parameters for the actors
        params = []
        for q_param in self.Q.parameters():
            params.append(q_param.detach().cpu())
        return params
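
Learner.return_params ships a plain list of CPU tensors rather than a state_dict, so the receiving side has to copy them back into its own network parameter by parameter. A minimal sketch of that receiving step, assuming an actor periodically pulls the list from the parameter server (sync_actor_params and pull_params are hypothetical names, mirroring the server.update_params call above):

def sync_actor_params(actor, server):
    #Pull the list produced by Learner.return_params() (hypothetical remote call)
    params = server.pull_params()
    with torch.no_grad():
        for q_param, new_param in zip(actor.Q.parameters(), params):
            #Copy the learner's weights into the actor's network on the actor's device
            q_param.copy_(new_param.to(q_param.device))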