import numpy as np
import torch

# Network, the config dictionary and the capitalised hyperparameters
# (N_ACTIONS, EPSILON_DECAY, ...) are defined elsewhere in the project.


class Actor:
    def __init__(self, config, num, epsilon_min):
        # Define the actor: its own exploration rate, a local buffer and a copy of the Q-network
        self.epsilon = config['epsilon']
        self.epsilon_min = epsilon_min
        self.local_replay = []
        self.num = num
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.Q = Network(config).to(self.device)

    def save_model(self):
        torch.save(self.Q.state_dict(), "./ModelSaved/Actor" + str(self.epsilon_min) + ".pth")
        print("Model saved")

    def choose_action(self, obs):
        # Choose an action according to the epsilon-greedy policy
        if np.random.uniform() < self.epsilon:
            action = np.random.choice(N_ACTIONS)
        else:
            y = self.Q(torch.tensor(obs[0], device=self.device, dtype=torch.float).unsqueeze(0),
                       torch.tensor(obs[1], device=self.device, dtype=torch.float).unsqueeze(0))
            action = torch.argmax(y).item()
        return action

    def update_epsilon(self):
        # Decay epsilon geometrically, but never below epsilon_min
        self.epsilon *= EPSILON_DECAY
        self.epsilon = max(self.epsilon, self.epsilon_min)

    def add_transition(self, obs, action, reward, next_obs, done):
        self.local_replay.append([obs, action, self.reward_clipping(reward), next_obs, done])

    def reward_clipping(self, reward):
        # Clip rewards to the range [-1, 1]
        if reward > 1:
            reward = 1
        elif reward < -1:
            reward = -1
        return reward
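update_epsilon decays the exploration rate geometrically towards epsilon_min. A minimal arithmetic sketch, with illustrative numbers that are not taken from the project's configuration, of how many update_epsilon() calls such a schedule needs to reach its floor:

import math

epsilon, epsilon_min, decay = 1.0, 0.05, 0.999  # illustrative values only

# epsilon * decay**n <= epsilon_min  =>  n >= log(epsilon_min / epsilon) / log(decay)
n = math.ceil(math.log(epsilon_min / epsilon) / math.log(decay))
print(n)  # 2995 calls with these values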
from collections import deque

import numpy as np
import torch


class Agent:
    def __init__(self):
        # Initialisation of the environment variables
        self.replay = deque(maxlen=config["size_buffer"])
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.Q = Network(config).to(self.device)
        self.target_Q = Network(config).to(self.device)
        self.target_Q.load_state_dict(self.Q.state_dict())  # synchronisation of the parameters
        # Initialisation of the agent variables
        self.epsilon = config["epsilon"]
        self.target_update_counter = 0
        self.loss = None
        self.name = config["model_name"]

    def save_model(self):
        torch.save(self.Q.state_dict(), "./ModelSaved/" + NAME + '.pth')
        print("Model saved")

    def add_transition(self, obs, action, reward, next_obs, done):
        self.replay.append((obs, action, self.reward_clipping(reward), next_obs, done))

    def reward_clipping(self, reward):
        # Clip rewards to the range [-1, 1]
        if reward > 1:
            reward = 1
        elif reward < -1:
            reward = -1
        return reward

    def choose_action(self, obs):
        # Choose an action according to the epsilon-greedy policy
        if np.random.uniform() < self.epsilon:
            action = np.random.choice(N_ACTIONS)
        else:
            y = self.Q(torch.tensor(obs[0], device=self.device, dtype=torch.float).unsqueeze(0),
                       torch.tensor(obs[1], device=self.device, dtype=torch.float).unsqueeze(0))
            action = torch.argmax(y).item()
        return action

    def train_nn(self):
        if len(self.replay) < MIN_REPLAY_SIZE:
            return
        # Sample a minibatch of transitions (with replacement) from the replay buffer
        idx = np.random.choice(len(self.replay), BATCH_SIZE, replace=True)
        mini_batch = [self.replay[i] for i in idx]
        # Split the transitions into separate tensors
        current_states_img = torch.tensor([transition[0][0] for transition in mini_batch],
                                          device=self.device, dtype=torch.float)
        current_states_nav = torch.tensor([transition[0][1] for transition in mini_batch],
                                          device=self.device, dtype=torch.float)
        actions = torch.tensor([transition[1] for transition in mini_batch],
                               device=self.device, dtype=torch.long)
        rewards = torch.tensor([transition[2] for transition in mini_batch],
                               device=self.device, dtype=torch.float)
        new_current_states_img = torch.tensor([transition[3][0] for transition in mini_batch],
                                              device=self.device, dtype=torch.float)
        new_current_states_nav = torch.tensor([transition[3][1] for transition in mini_batch],
                                              device=self.device, dtype=torch.float)
        # Non-terminal mask: False zeroes out the bootstrap term below
        dones = torch.tensor([not transition[4] for transition in mini_batch],
                             device=self.device, dtype=torch.bool)
        # Double DQN target: the online network selects the next action,
        # the target network evaluates it
        actions_eval = torch.argmax(self.Q(new_current_states_img, new_current_states_nav), dim=1)
        next_state_values = self.target_Q(new_current_states_img, new_current_states_nav).gather(
            dim=1, index=actions_eval.unsqueeze(-1)).squeeze(-1)
        values = rewards + GAMMA * next_state_values * dones
        target_values = self.Q(current_states_img, current_states_nav).gather(
            dim=1, index=actions.unsqueeze(-1)).squeeze(-1)
        # Perform a gradient-descent step on the MSE between the TD target (values)
        # and the current estimate (target_values)
        loss_t = self.Q.loss_function(values, target_values)
        self.loss = loss_t
        self.Q.optimizer.zero_grad()
        loss_t.backward()
        for param in self.Q.parameters():
            param.grad.data.clamp_(-1, 1)  # in-place gradient clipping
        self.Q.optimizer.step()
        self.update_target()
        self.update_epsilon()

    def update_target(self):
        # Every C training steps, copy the online weights into the target network
        self.target_update_counter += 1
        if self.target_update_counter > TARGET_UPDATE:
            self.target_Q.load_state_dict(self.Q.state_dict())
            self.target_update_counter = 0

    def update_epsilon(self):
        # Decay epsilon geometrically, but never below EPSILON_MIN
        self.epsilon *= EPSILON_DECAY
        self.epsilon = max(self.epsilon, EPSILON_MIN)
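The target computed in train_nn is the Double DQN target: the online network selects the next action, the target network evaluates it, and the non-terminal mask removes the bootstrap term on terminal transitions. A self-contained sketch of that computation on made-up tensors, independent of Network and the replay buffer:

import torch

gamma = 0.99
rewards = torch.tensor([0.5, -1.0, 1.0])                # clipped rewards, batch of 3
not_done = torch.tensor([True, True, False])            # False on terminal transitions
q_online_next = torch.tensor([[0.2, 0.7], [0.1, 0.4], [0.9, 0.3]])  # online Q(s', .)
q_target_next = torch.tensor([[0.3, 0.6], [0.2, 0.5], [0.8, 0.2]])  # target Q(s', .)

# The online network picks the next action, the target network provides its value
next_actions = torch.argmax(q_online_next, dim=1)       # tensor([1, 1, 0])
next_values = q_target_next.gather(1, next_actions.unsqueeze(-1)).squeeze(-1)  # tensor([0.6, 0.5, 0.8])
targets = rewards + gamma * next_values * not_done
print(targets)  # tensor([ 1.0940, -0.5050,  1.0000])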
import psutil
import torch
from torch.utils.tensorboard import SummaryWriter


class Learner:
    def __init__(self, config):
        # Define the online and target networks
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.Q = Network(config).to(self.device)
        self.target_Q = Network(config).to(self.device)
        self.target_Q.load_state_dict(self.Q.state_dict())
        self.gamma = config["gamma"]
        self.batch_size = config["batch_size"]
        self.min_replay_size = config["min_replay_size"]
        self.target_update_counter = 0
        self.target_update = config["target_update"]
        self.learning_step = config["learning_step"]

    def compute_td(self, mini_batch):
        # Split the transitions into separate tensors
        current_states_img = torch.tensor([transition[0][0] for transition in mini_batch],
                                          device=self.device, dtype=torch.float)
        current_states_nav = torch.tensor([transition[0][1] for transition in mini_batch],
                                          device=self.device, dtype=torch.float)
        actions = torch.tensor([transition[1] for transition in mini_batch],
                               device=self.device, dtype=torch.long)
        rewards = torch.tensor([transition[2] for transition in mini_batch],
                               device=self.device, dtype=torch.float)
        new_current_states_img = torch.tensor([transition[3][0] for transition in mini_batch],
                                              device=self.device, dtype=torch.float)
        new_current_states_nav = torch.tensor([transition[3][1] for transition in mini_batch],
                                              device=self.device, dtype=torch.float)
        # Non-terminal mask: False zeroes out the bootstrap term below
        dones = torch.tensor([not transition[4] for transition in mini_batch],
                             device=self.device, dtype=torch.bool)
        # Double DQN: the online network selects the next action, the target network evaluates it
        actions_eval = torch.argmax(self.Q(new_current_states_img, new_current_states_nav), dim=1)
        next_state_values = self.target_Q(new_current_states_img, new_current_states_nav).gather(
            dim=1, index=actions_eval.unsqueeze(-1)).squeeze(-1)
        values = rewards + self.gamma * next_state_values * dones
        target_values = self.Q(current_states_img, current_states_nav).gather(
            dim=1, index=actions.unsqueeze(-1)).squeeze(-1)
        td_error = target_values - values
        return td_error.detach().cpu().numpy()

    def train_nn(self, replay, server):
        updated_step = 0
        start_mem = psutil.virtual_memory().used
        # Busy-wait until the remote replay buffer holds enough transitions
        while replay.get_size() < self.min_replay_size:
            continue
        print("Start learning")
        writer = SummaryWriter(comment="LossLearner")
        while updated_step < self.learning_step:
            # Sample a random minibatch of transitions from the replay buffer
            r = replay.get_size()
            mini_batch, weight = replay.get_batch(self.batch_size)  # call to the remote replay object
            weight = torch.tensor(weight, device=self.device, dtype=torch.float)
            # Split the transitions into separate tensors
            current_states_img = torch.tensor([transition[0][0] for transition in mini_batch],
                                              device=self.device, dtype=torch.float)
            current_states_nav = torch.tensor([transition[0][1] for transition in mini_batch],
                                              device=self.device, dtype=torch.float)
            actions = torch.tensor([transition[1] for transition in mini_batch],
                                   device=self.device, dtype=torch.long)
            rewards = torch.tensor([transition[2] for transition in mini_batch],
                                   device=self.device, dtype=torch.float)
            new_current_states_img = torch.tensor([transition[3][0] for transition in mini_batch],
                                                  device=self.device, dtype=torch.float)
            new_current_states_nav = torch.tensor([transition[3][1] for transition in mini_batch],
                                                  device=self.device, dtype=torch.float)
            # Non-terminal mask: False zeroes out the bootstrap term below
            dones = torch.tensor([not transition[4] for transition in mini_batch],
                                 device=self.device, dtype=torch.bool)
            # Double DQN target, then the TD error sent back to the prioritized replay
            actions_eval = torch.argmax(self.Q(new_current_states_img, new_current_states_nav), dim=1)
            next_state_values = self.target_Q(new_current_states_img, new_current_states_nav).gather(
                dim=1, index=actions_eval.unsqueeze(-1)).squeeze(-1)
            values = rewards + self.gamma * next_state_values * dones
            target_values = self.Q(current_states_img, current_states_nav).gather(
                dim=1, index=actions.unsqueeze(-1)).squeeze(-1)
            td_error = target_values - values
            replay.update_error(td_error.detach().cpu().numpy())  # call to the remote replay object
            # Gradient-descent step, scaled by the importance-sampling weights
            self.Q.optimizer.zero_grad()
            loss_t = self.Q.loss_function(values * weight, target_values * weight)
            loss_t.backward()
            # for param in self.Q.parameters():
            #     param.grad.data.clamp_(-1, 1)
            self.Q.optimizer.step()
            # self.replay.update_memory()
            self.update_target()
            server.update_params(self.return_params())  # call to the remote parameter server
            updated_step += 1
            writer.add_scalar("Loss", loss_t, updated_step)
            writer.add_scalar("replay_size", r, updated_step)
            writer.add_scalar("mem_used_MB", (psutil.virtual_memory().used - start_mem) / 1_000_000,
                              updated_step)
        print("finish")
        torch.save(self.Q.state_dict(), "./ModelSaved/Learner.pth")
        writer.close()

    def update_target(self):
        # Every C learning steps, copy the online weights into the target network
        self.target_update_counter += 1
        if self.target_update_counter > self.target_update:
            self.target_Q.load_state_dict(self.Q.state_dict())
            self.target_update_counter = 0

    def return_params(self):
        # Return the online-network parameters as detached CPU tensors so the actors can copy them
        params = []
        for q_param in self.Q.parameters():
            params.append(q_param.detach().cpu())
        return params
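return_params ships the online weights as a list of detached CPU tensors. A hedged sketch of how an actor could copy such a list into its own network; the helper name load_params is hypothetical and assumes both networks enumerate their parameters in the same order:

import torch

def load_params(network: torch.nn.Module, params: list) -> None:
    # Copy a list of CPU tensors (as produced by Learner.return_params)
    # into the network's parameters, in order and without tracking gradients.
    with torch.no_grad():
        for p, new_p in zip(network.parameters(), params):
            p.copy_(new_p.to(p.device))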