def update(self, target_Q, count):
    if len(self.buffer_object) < self.params['batch_size']:
        return 0
    # Sample a minibatch of transitions and clip rewards before building the target.
    s_matrix, a_matrix, r_matrix, done_matrix, sp_matrix = self.buffer_object.sample(
        self.params['batch_size'])
    r_matrix = numpy.clip(r_matrix,
                          a_min=-self.params['reward_clip'],
                          a_max=self.params['reward_clip'])
    s_matrix = torch.from_numpy(s_matrix).float().to(self.device)
    a_matrix = torch.from_numpy(a_matrix).float().to(self.device)
    r_matrix = torch.from_numpy(r_matrix).float().to(self.device)
    done_matrix = torch.from_numpy(done_matrix).float().to(self.device)
    sp_matrix = torch.from_numpy(sp_matrix).float().to(self.device)
    # Bootstrapped value of the next state, taken from the target network.
    Q_star, _ = target_Q.get_best_qvalue_and_action(sp_matrix)
    Q_star = Q_star.reshape((self.params['batch_size'], -1))
    with torch.no_grad():
        y = r_matrix + self.params['gamma'] * (1 - done_matrix) * Q_star
    y_hat = self.forward(s_matrix, a_matrix)
    loss = self.criterion(y_hat, y)
    self.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.zero_grad()
    # Soft-update the target network toward the online network.
    utils_for_q_learning.sync_networks(
        target=target_Q,
        online=self,
        alpha=self.params['target_network_learning_rate'],
        copy=False)
    return loss.cpu().data.numpy()
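# The update above assumes a replay buffer that supports __len__ and a sample(batch_size)
# method returning five numpy arrays (s, a, r, done, sp). The class below is a minimal
# sketch of such a buffer; its name and internal layout are assumptions for illustration,
# not the actual buffer_object implementation used by this code.
import numpy

class SimpleReplayBuffer:
    def __init__(self, max_size=int(1e6)):
        self.storage = []
        self.max_size = max_size

    def __len__(self):
        return len(self.storage)

    def append(self, s, a, r, done, sp):
        # Drop the oldest transition once the buffer is full.
        if len(self.storage) >= self.max_size:
            self.storage.pop(0)
        self.storage.append((s, a, r, done, sp))

    def sample(self, batch_size):
        idx = numpy.random.randint(0, len(self.storage), size=batch_size)
        s, a, r, done, sp = zip(*[self.storage[i] for i in idx])
        return (numpy.array(s),
                numpy.array(a),
                numpy.array(r).reshape(batch_size, 1),
                numpy.array(done).reshape(batch_size, 1),
                numpy.array(sp))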
def update(self, target_Q):
    if len(self.buffer_object.storage) < self.params['batch_size']:
        return
    # Sample a minibatch of transition dicts from the replay buffer.
    batch = random.sample(self.buffer_object.storage, self.params['batch_size'])
    s_li = [b['s'] for b in batch]
    sp_li = [b['sp'] for b in batch]
    r_li = [b['r'] for b in batch]
    done_li = [b['done'] for b in batch]
    a_li = [b['a'] for b in batch]
    s_matrix = numpy.array(s_li).reshape(self.params['batch_size'], self.state_size)
    a_matrix = numpy.array(a_li).reshape(self.params['batch_size'], self.action_size)
    r_matrix = numpy.array(r_li).reshape(self.params['batch_size'], 1)
    r_matrix = numpy.clip(r_matrix,
                          a_min=-self.params['reward_clip'],
                          a_max=self.params['reward_clip'])
    sp_matrix = numpy.array(sp_li).reshape(self.params['batch_size'], self.state_size)
    done_matrix = numpy.array(done_li).reshape(self.params['batch_size'], 1)
    # Bootstrapped next-state value from the target network.
    Q_star = target_Q.get_best_centroid_batch(torch.FloatTensor(sp_matrix))
    Q_star = Q_star.reshape((self.params['batch_size'], -1))
    y = r_matrix + self.params['gamma'] * (1 - done_matrix) * Q_star
    y_hat = self.forward(torch.FloatTensor(s_matrix), torch.FloatTensor(a_matrix))
    loss = self.criterion(y_hat, torch.FloatTensor(y))
    self.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.zero_grad()
    # Soft-update the target network toward the online network.
    utils_for_q_learning.sync_networks(
        target=target_Q,
        online=self,
        alpha=self.params['target_network_learning_rate'],
        copy=False)
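# Both update() variants end with utils_for_q_learning.sync_networks(...). A plausible
# reading, consistent with the alpha and copy arguments, is a Polyak (soft) update of the
# target network, with copy=True performing a hard copy at initialization. The function
# below sketches that behavior; it is an assumption, not the actual helper's source.
import torch

def sync_networks_sketch(target, online, alpha, copy=False):
    if copy:
        # Hard copy: target <- online.
        target.load_state_dict(online.state_dict())
    else:
        # Soft update: target <- alpha * online + (1 - alpha) * target.
        with torch.no_grad():
            for t_param, o_param in zip(target.parameters(), online.parameters()):
                t_param.data.copy_(alpha * o_param.data + (1.0 - alpha) * t_param.data)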
s0 = env.reset()
utils_for_q_learning.action_checker(env)
Q_object = Net(params,
               env,
               state_size=len(s0),
               action_size=len(env.action_space.low),
               device=device)
Q_object_target = Net(params,
                      env,
                      state_size=len(s0),
                      action_size=len(env.action_space.low),
                      device=device)
Q_object_target.eval()
# Initialize the target network as an exact copy of the online network.
utils_for_q_learning.sync_networks(target=Q_object_target,
                                   online=Q_object,
                                   alpha=params['target_network_learning_rate'],
                                   copy=True)
G_li = []
loss_li = []
all_times_per_steps = []
all_times_per_updates = []
for episode in range(params['max_episode']):
    print("episode {}".format(episode))
    Q_this_episode = Net(params,
                         env,
                         state_size=len(s0),
                         action_size=len(env.action_space.low),
                         device=device)
    utils_for_q_learning.sync_networks(target=Q_this_episode,
                                       online=Q_object,
                                       alpha=params['target_network_learning_rate'],
                                       copy=True)
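    # What typically follows inside the episode loop (a hedged sketch, not part of the
    # excerpt above): interact with the environment for one episode, store transitions
    # in the replay buffer using the dict layout seen in the second update() variant,
    # then perform one update per environment step. Random actions stand in for the
    # agent's actual exploration policy, which is not shown here.
    s, done, t, G = env.reset(), False, 0, 0
    while not done:
        a = env.action_space.sample()  # placeholder for the real exploration policy
        sp, r, done, _ = env.step(numpy.array(a))
        t += 1
        G += r
        Q_object.buffer_object.storage.append(
            {'s': s, 'a': a, 'r': r, 'done': done, 'sp': sp})
        s = sp
    G_li.append(G)
    # One gradient update per step taken this episode.
    for count in range(t):
        loss = Q_object.update(Q_object_target, count)
        loss_li.append(loss)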