def __init__(self, meta_file):
    np.random.seed(seed=int(time.time()))
    self.num_slaves = 128  # 80%  # 96=75%  # 48=70%  # 16=50%
    self.env = EnvManager(meta_file, self.num_slaves)
    self.use_muscle = self.env.UseMuscle()

    self.num_state = self.env.GetNumState()
    self.num_action = self.env.GetNumAction()
    self.num_muscles = self.env.GetNumMuscles()

    self.num_epochs = 10
    self.num_epochs_muscle = 3
    self.num_evaluation = 0
    self.num_tuple_so_far = 0
    self.num_episode = 0
    self.num_tuple = 0
    self.num_simulation_Hz = self.env.GetSimulationHz()
    self.num_control_Hz = self.env.GetControlHz()
    self.num_simulation_per_control = self.num_simulation_Hz // self.num_control_Hz

    self.gamma = 0.99
    self.lb = 0.99

    self.buffer_size = 2048  # 8192  # default=2048
    self.batch_size = 128
    self.muscle_batch_size = 128
    self.replay_buffer = ReplayBuffer(30000)
    self.muscle_buffer = MuscleBuffer(30000)

    self.model = SimulationNN(self.num_state, self.num_action)
    self.muscle_model = MuscleNN(self.env.GetNumTotalMuscleRelatedDofs(), self.num_action, self.num_muscles)

    if use_cuda:
        self.model.cuda()
        self.muscle_model.cuda()

    self.default_learning_rate = 1E-4
    self.default_clip_ratio = 0.2
    self.learning_rate = self.default_learning_rate
    self.clip_ratio = self.default_clip_ratio
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
    self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=self.learning_rate)
    self.max_iteration = 50000

    self.w_entropy = -0.001

    self.loss_actor = 0.0
    self.loss_critic = 0.0
    self.loss_muscle = 0.0

    self.rewards = []
    self.stepPerEpisodeList = []
    self.sum_return = 0.0
    self.max_return = -1.0
    self.max_return_epoch = 1
    self.tic = time.time()

    self.episodes = [None] * self.num_slaves
    for j in range(self.num_slaves):
        self.episodes[j] = EpisodeBuffer()
    self.env.Resets(True)
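# Worked example of the sub-stepping arithmetic configured above (the numbers are
# illustrative, not taken from the source): if GetSimulationHz() returned 600 and
# GetControlHz() returned 30, then num_simulation_per_control = 600 // 30 = 20,
# i.e. one policy action is held over 20 physics sub-steps. When use_muscle is set,
# the rollout loops below advance the simulation with env.Steps(2), so the muscle
# network recomputes activation levels every 2 sub-steps (10 times per control
# step in this example).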
class PPO(object):
    def __init__(self):
        np.random.seed(seed=int(time.time()))
        self.num_slaves = 16
        self.env = EnvManager(self.num_slaves)
        self.use_muscle = self.env.UseMuscle()

        self.num_state = self.env.GetStateDofs()
        self.num_action = self.env.GetActionDofs()
        self.num_dofs = self.env.GetSystemDofs()
        self.num_muscles = self.env.GetNumMuscles()

        self.num_epochs = 10
        self.num_epochs_muscle = 3
        self.num_evaluation = 0
        self.num_tuple_so_far = 0
        self.num_episode = 0
        self.num_tuple = 0
        self.num_simulation_Hz = self.env.GetSimulationHz()
        self.num_control_Hz = self.env.GetControlHz()
        self.num_simulation_per_control = self.num_simulation_Hz // self.num_control_Hz

        self.gamma = 0.95
        self.lb = 0.95

        self.buffer_size = 2048
        self.batch_size = 128
        self.muscle_batch_size = 128
        self.replay_buffer = ReplayBuffer(30000)
        self.muscle_buffer = MuscleBuffer(30000)

        self.model = SimulationNN(self.num_state, self.num_action)
        self.muscle_model = MuscleNN(self.env.GetNumTotalMuscleRelatedDofs(), self.num_dofs - 6, self.num_muscles)

        if use_cuda:
            self.model.cuda()
            self.muscle_model.cuda()

        self.default_learning_rate = 1E-4
        self.default_clip_ratio = 0.2
        self.learning_rate = self.default_learning_rate
        self.clip_ratio = self.default_clip_ratio
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=self.learning_rate)
        self.max_iteration = 50000

        self.w_entropy = 0.001

        self.loss_actor = 0.0
        self.loss_critic = 0.0
        self.loss_muscle = 0.0

        self.rewards = []
        self.sum_return = 0.0
        self.max_return = -1.0
        self.max_return_epoch = 1
        self.tic = time.time()

        self.episodes = [None] * self.num_slaves
        for j in range(self.num_slaves):
            self.episodes[j] = EpisodeBuffer()
        self.env.Resets(True)

    def SaveModel(self):
        self.model.save('../nn/current.pt')
        self.muscle_model.save('../nn/current_muscle.pt')

        if self.max_return_epoch == self.num_evaluation:
            self.model.save('../nn/max.pt')
            self.muscle_model.save('../nn/max_muscle.pt')
        if self.num_evaluation % 100 == 0:
            self.model.save('../nn/' + str(self.num_evaluation // 100) + '.pt')
            self.muscle_model.save('../nn/' + str(self.num_evaluation // 100) + '_muscle.pt')

    def LoadModel(self, path):
        self.model.load('../nn/' + path + '.pt')
        self.muscle_model.load('../nn/' + path + '_muscle.pt')

    def ComputeTDandGAE(self):
        self.replay_buffer.Clear()
        self.muscle_buffer.Clear()
        self.sum_return = 0.0
        for epi in self.total_episodes:
            data = epi.GetData()
            size = len(data)
            if size == 0:
                continue
            states, actions, rewards, values, logprobs = zip(*data)
            values = np.concatenate((values, np.zeros(1)), axis=0)
            advantages = np.zeros(size)
            ad_t = 0

            epi_return = 0.0
            # GAE: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),  A_t = delta_t + gamma * lambda * A_{t+1}
            for i in reversed(range(len(data))):
                epi_return += rewards[i]
                delta = rewards[i] + values[i + 1] * self.gamma - values[i]
                ad_t = delta + self.gamma * self.lb * ad_t
                advantages[i] = ad_t
            self.sum_return += epi_return
            TD = values[:size] + advantages

            for i in range(size):
                self.replay_buffer.Push(states[i], actions[i], logprobs[i], TD[i], advantages[i])
        self.num_episode = len(self.total_episodes)
        self.num_tuple = len(self.replay_buffer.buffer)
        print('SIM : {}'.format(self.num_tuple))
        self.num_tuple_so_far += self.num_tuple

        muscle_tuples = self.env.GetMuscleTuples()
        for i in range(len(muscle_tuples)):
            self.muscle_buffer.Push(muscle_tuples[i][0], muscle_tuples[i][1], muscle_tuples[i][2], muscle_tuples[i][3])

    def GenerateTransitions(self):
        self.total_episodes = []
        states = [None] * self.num_slaves
        actions = [None] * self.num_slaves
        rewards = [None] * self.num_slaves
        states_next = [None] * self.num_slaves
        states = self.env.GetStates()
        local_step = 0
        terminated = [False] * self.num_slaves
        counter = 0
        while True:
            counter += 1
            if counter % 10 == 0:
                print('SIM : {}'.format(local_step), end='\r')

            a_dist, v = self.model(Tensor(states))
            actions = a_dist.sample().cpu().detach().numpy()
            # actions = a_dist.loc.cpu().detach().numpy()
            logprobs = a_dist.log_prob(Tensor(actions)).cpu().detach().numpy().reshape(-1)
            values = v.cpu().detach().numpy().reshape(-1)
            self.env.SetActions(actions)
            if self.use_muscle:
                mt = Tensor(self.env.GetMuscleTorques())
                # two physics sub-steps per muscle-activation update
                for i in range(self.num_simulation_per_control // 2):
                    dt = Tensor(self.env.GetDesiredTorques())
                    activations = self.muscle_model(mt, dt).cpu().detach().numpy()
                    self.env.SetActivationLevels(activations)
                    self.env.Steps(2)
            else:
                self.env.StepsAtOnce()

            for j in range(self.num_slaves):
                nan_occur = False
                terminated_state = True

                if np.any(np.isnan(states[j])) or np.any(np.isnan(actions[j])) or \
                        np.any(np.isnan(values[j])) or np.any(np.isnan(logprobs[j])):
                    nan_occur = True
                elif self.env.IsEndOfEpisode(j) is False:
                    terminated_state = False
                    rewards[j] = self.env.GetReward(j)
                    self.episodes[j].Push(states[j], actions[j], rewards[j], values[j], logprobs[j])
                    local_step += 1

                if terminated_state or nan_occur:
                    if nan_occur:
                        self.episodes[j].Pop()
                    self.total_episodes.append(self.episodes[j])
                    self.episodes[j] = EpisodeBuffer()
                    self.env.Reset(True, j)

            if local_step >= self.buffer_size:
                break
            states = self.env.GetStates()

    def OptimizeSimulationNN(self):
        all_transitions = np.array(self.replay_buffer.buffer)
        for j in range(self.num_epochs):
            np.random.shuffle(all_transitions)
            for i in range(len(all_transitions) // self.batch_size):
                transitions = all_transitions[i * self.batch_size:(i + 1) * self.batch_size]
                batch = Transition(*zip(*transitions))

                stack_s = np.vstack(batch.s).astype(np.float32)
                stack_a = np.vstack(batch.a).astype(np.float32)
                stack_lp = np.vstack(batch.logprob).astype(np.float32)
                stack_td = np.vstack(batch.TD).astype(np.float32)
                stack_gae = np.vstack(batch.GAE).astype(np.float32)

                a_dist, v = self.model(Tensor(stack_s))
                '''Critic Loss'''
                loss_critic = ((v - Tensor(stack_td)).pow(2)).mean()

                '''Actor Loss'''
                ratio = torch.exp(a_dist.log_prob(Tensor(stack_a)) - Tensor(stack_lp))
                stack_gae = (stack_gae - stack_gae.mean()) / (stack_gae.std() + 1E-5)
                stack_gae = Tensor(stack_gae)
                surrogate1 = ratio * stack_gae
                surrogate2 = torch.clamp(ratio, min=1.0 - self.clip_ratio, max=1.0 + self.clip_ratio) * stack_gae
                loss_actor = -torch.min(surrogate1, surrogate2).mean()

                '''Entropy Loss'''
                loss_entropy = -self.w_entropy * a_dist.entropy().mean()

                self.loss_actor = loss_actor.cpu().detach().numpy().tolist()
                self.loss_critic = loss_critic.cpu().detach().numpy().tolist()

                loss = loss_actor + loss_entropy + loss_critic

                self.optimizer.zero_grad()
                loss.backward(retain_graph=True)
                for param in self.model.parameters():
                    if param.grad is not None:
                        param.grad.data.clamp_(-0.5, 0.5)
                self.optimizer.step()
            print('Optimizing sim nn : {}/{}'.format(j + 1, self.num_epochs), end='\r')
        print('')

    def OptimizeMuscleNN(self):
        muscle_transitions = np.array(self.muscle_buffer.buffer)
        for j in range(self.num_epochs_muscle):
            np.random.shuffle(muscle_transitions)
            for i in range(len(muscle_transitions) // self.muscle_batch_size):
                tuples = muscle_transitions[i * self.muscle_batch_size:(i + 1) * self.muscle_batch_size]
                batch = MuscleTransition(*zip(*tuples))

                stack_JtA = np.vstack(batch.JtA).astype(np.float32)
                stack_tau_des = np.vstack(batch.tau_des).astype(np.float32)
                stack_L = np.vstack(batch.L).astype(np.float32)
                stack_L = stack_L.reshape(self.muscle_batch_size, self.num_dofs - 6, self.num_muscles)
                stack_b = np.vstack(batch.b).astype(np.float32)

                stack_JtA = Tensor(stack_JtA)
                stack_tau_des = Tensor(stack_tau_des)
                stack_L = Tensor(stack_L)
                stack_b = Tensor(stack_b)

                activation = self.muscle_model(stack_JtA, stack_tau_des)
                # reconstruct joint torques from the predicted activations: tau = L a + b
                tau = torch.einsum('ijk,ik->ij', (stack_L, activation)) + stack_b

                loss_reg = (activation).pow(2).mean()
                loss_target = (((tau - stack_tau_des) / 100.0).pow(2)).mean()

                loss = 0.01 * loss_reg + loss_target
                # loss = loss_target

                self.optimizer_muscle.zero_grad()
                loss.backward(retain_graph=True)
                for param in self.muscle_model.parameters():
                    if param.grad is not None:
                        param.grad.data.clamp_(-0.5, 0.5)
                self.optimizer_muscle.step()
            print('Optimizing muscle nn : {}/{}'.format(j + 1, self.num_epochs_muscle), end='\r')
        self.loss_muscle = loss.cpu().detach().numpy().tolist()
        print('')

    def OptimizeModel(self):
        self.ComputeTDandGAE()
        self.OptimizeSimulationNN()
        if self.use_muscle:
            self.OptimizeMuscleNN()

    def Train(self):
        self.GenerateTransitions()
        self.OptimizeModel()

    def Evaluate(self):
        self.num_evaluation = self.num_evaluation + 1
        h = int((time.time() - self.tic) // 3600.0)
        m = int((time.time() - self.tic) // 60.0)
        s = int(time.time() - self.tic)
        m = m - h * 60
        s = s - h * 3600 - m * 60
        if self.num_episode == 0:
            self.num_episode = 1
        if self.num_tuple == 0:
            self.num_tuple = 1
        if self.max_return < self.sum_return / self.num_episode:
            self.max_return = self.sum_return / self.num_episode
            self.max_return_epoch = self.num_evaluation
        print('# {} === {}h:{}m:{}s ==='.format(self.num_evaluation, h, m, s))
        print('||Loss Actor               : {:.4f}'.format(self.loss_actor))
        print('||Loss Critic              : {:.4f}'.format(self.loss_critic))
        print('||Loss Muscle              : {:.4f}'.format(self.loss_muscle))
        print('||Noise                    : {:.3f}'.format(self.model.log_std.exp().mean()))
        print('||Num Transition So far    : {}'.format(self.num_tuple_so_far))
        print('||Num Transition           : {}'.format(self.num_tuple))
        print('||Num Episode              : {}'.format(self.num_episode))
        print('||Avg Return per episode   : {:.3f}'.format(self.sum_return / self.num_episode))
        print('||Avg Reward per transition: {:.3f}'.format(self.sum_return / self.num_tuple))
        print('||Avg Step per episode     : {:.1f}'.format(self.num_tuple / self.num_episode))
        print('||Max Avg Return So far    : {:.3f} at #{}'.format(self.max_return, self.max_return_epoch))
        self.rewards.append(self.sum_return / self.num_episode)
        self.SaveModel()
        print('=============================================')
        return np.array(self.rewards)
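# Minimal usage sketch for the PPO class above (an assumption about the driving
# script, not reproduced from the original entry point; the function name,
# iteration count and checkpoint name are illustrative).
def run_training(num_iterations=50000, resume_from=None):
    ppo = PPO()
    if resume_from is not None:
        ppo.LoadModel(resume_from)   # e.g. resume_from='max' loads ../nn/max.pt and ../nn/max_muscle.pt
    rewards = None
    for _ in range(num_iterations):
        ppo.Train()                  # rollout generation + PPO / muscle-network updates
        rewards = ppo.Evaluate()     # prints stats, saves ../nn/current.pt, returns avg-return history
    return rewards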
def __init__(self, meta_file, num_slaves=16):
    # plt.ion()
    np.random.seed(seed=int(time.time()))
    self.num_slaves = num_slaves
    self.env = EnvManager(meta_file, self.num_slaves)
    self.use_muscle = self.env.UseMuscle()

    self.num_state = self.env.GetNumState()
    self.num_action = self.env.GetNumAction()
    self.num_muscles = self.env.GetNumMuscles()

    self.num_epochs = 10
    self.num_epochs_muscle = 3
    self.num_evaluation = 0
    self.num_tuple_so_far = 0
    self.num_episode = 0
    self.num_tuple = 0
    self.num_simulation_Hz = self.env.GetSimulationHz()
    self.num_control_Hz = self.env.GetControlHz()
    self.num_simulation_per_control = self.num_simulation_Hz // self.num_control_Hz

    self.gamma = 0.95
    self.lb = 0.99

    self.buffer_size = 8192
    self.batch_size = 256
    self.muscle_batch_size = 128
    self.replay_buffer = ReplayBuffer(30000)
    self.muscle_buffer = MuscleBuffer(30000)

    self.model = SimulationNN(self.num_state, self.num_action)
    self.muscle_model = MuscleNN(self.env.GetNumTotalMuscleRelatedDofs(), self.num_action, self.num_muscles)

    if use_cuda:
        self.model.cuda()
        self.muscle_model.cuda()

    self.default_learning_rate = 1E-4
    self.default_clip_ratio = 0.2
    self.learning_rate = self.default_learning_rate
    self.clip_ratio = self.default_clip_ratio
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
    self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=self.learning_rate)
    self.max_iteration = 50000

    self.w_entropy = -0.001

    self.loss_actor = 0.0
    self.loss_critic = 0.0
    self.loss_muscle = 0.0

    self.rewards = []
    self.sum_return = 0.0
    self.max_return = -1.0
    self.max_return_epoch = 1
    self.tic = time.time()

    # for adaptive sampling, marginal value training
    self.use_adaptive_sampling = self.env.UseAdaptiveSampling()
    self.marginal_state_num = self.env.GetMarginalStateNum()
    self.marginal_buffer = MargianlBuffer(30000)
    self.marginal_model = MarginalNN(self.marginal_state_num)
    self.marginal_value_avg = 1.
    self.marginal_learning_rate = 1e-3
    self.marginal_optimizer = optim.SGD(self.marginal_model.parameters(), lr=self.marginal_learning_rate)
    self.marginal_loss = 0.0
    self.marginal_samples = []
    self.marginal_sample_num = 2000
    self.marginal_k = self.env.GetMarginalParameter()
    self.mcmc_burn_in = 1000
    self.mcmc_period = 20
    if use_cuda:
        self.marginal_model.cuda()

    self.total_episodes = []
    self.episodes = [None] * self.num_slaves
    for j in range(self.num_slaves):
        self.episodes[j] = EpisodeBuffer()
    self.env.Resets(True)
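# Notes on the adaptive-sampling fields above, summarising how the methods that
# follow use them (no new behaviour is assumed): SampleStatesForMarginal() runs a
# Metropolis-Hastings-style chain in which a candidate marginal state s_b' replaces
# the current sample with probability min(1, p(s_b') / p(s_b)), where
#     p(s_b) = exp(marginal_k * (1 - V_marginal(s_b) / marginal_value_avg)),
# so regions where the learned marginal value network predicts low value (i.e.
# where the policy currently performs poorly) are sampled more often. The first
# mcmc_burn_in proposals are discarded, marginal_sample_num accepted states are
# collected, and Train(idx) refreshes this sample set every mcmc_period iterations.
# marginal_value_avg is a running average updated in OptimizeMarginalNN() with
# step size marginal_learning_rate.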
class PPO(object):
    def __init__(self, meta_file, num_slaves=16):
        # plt.ion()
        np.random.seed(seed=int(time.time()))
        self.num_slaves = num_slaves
        self.env = EnvManager(meta_file, self.num_slaves)
        self.use_muscle = self.env.UseMuscle()

        self.num_state = self.env.GetNumState()
        self.num_action = self.env.GetNumAction()
        self.num_muscles = self.env.GetNumMuscles()

        self.num_epochs = 10
        self.num_epochs_muscle = 3
        self.num_evaluation = 0
        self.num_tuple_so_far = 0
        self.num_episode = 0
        self.num_tuple = 0
        self.num_simulation_Hz = self.env.GetSimulationHz()
        self.num_control_Hz = self.env.GetControlHz()
        self.num_simulation_per_control = self.num_simulation_Hz // self.num_control_Hz

        self.gamma = 0.95
        self.lb = 0.99

        self.buffer_size = 8192
        self.batch_size = 256
        self.muscle_batch_size = 128
        self.replay_buffer = ReplayBuffer(30000)
        self.muscle_buffer = MuscleBuffer(30000)

        self.model = SimulationNN(self.num_state, self.num_action)
        self.muscle_model = MuscleNN(self.env.GetNumTotalMuscleRelatedDofs(), self.num_action, self.num_muscles)

        if use_cuda:
            self.model.cuda()
            self.muscle_model.cuda()

        self.default_learning_rate = 1E-4
        self.default_clip_ratio = 0.2
        self.learning_rate = self.default_learning_rate
        self.clip_ratio = self.default_clip_ratio
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=self.learning_rate)
        self.max_iteration = 50000

        self.w_entropy = -0.001

        self.loss_actor = 0.0
        self.loss_critic = 0.0
        self.loss_muscle = 0.0

        self.rewards = []
        self.sum_return = 0.0
        self.max_return = -1.0
        self.max_return_epoch = 1
        self.tic = time.time()

        # for adaptive sampling, marginal value training
        self.use_adaptive_sampling = self.env.UseAdaptiveSampling()
        self.marginal_state_num = self.env.GetMarginalStateNum()
        self.marginal_buffer = MargianlBuffer(30000)
        self.marginal_model = MarginalNN(self.marginal_state_num)
        self.marginal_value_avg = 1.
        self.marginal_learning_rate = 1e-3
        self.marginal_optimizer = optim.SGD(self.marginal_model.parameters(), lr=self.marginal_learning_rate)
        self.marginal_loss = 0.0
        self.marginal_samples = []
        self.marginal_sample_num = 2000
        self.marginal_k = self.env.GetMarginalParameter()
        self.mcmc_burn_in = 1000
        self.mcmc_period = 20
        if use_cuda:
            self.marginal_model.cuda()

        self.total_episodes = []
        self.episodes = [None] * self.num_slaves
        for j in range(self.num_slaves):
            self.episodes[j] = EpisodeBuffer()
        self.env.Resets(True)

    def SaveModel(self):
        self.model.save('../nn/current.pt')
        self.muscle_model.save('../nn/current_muscle.pt')

        if self.max_return_epoch == self.num_evaluation:
            self.model.save('../nn/max.pt')
            self.muscle_model.save('../nn/max_muscle.pt')
        if self.num_evaluation % 100 == 0:
            self.model.save('../nn/' + str(self.num_evaluation // 100) + '.pt')
            self.muscle_model.save('../nn/' + str(self.num_evaluation // 100) + '_muscle.pt')

    def LoadModel(self, path):
        self.model.load('../nn/' + path + '.pt')
        self.muscle_model.load('../nn/' + path + '_muscle.pt')

    def ComputeTDandGAE(self):
        self.replay_buffer.Clear()
        self.muscle_buffer.Clear()
        self.marginal_buffer.Clear()
        self.sum_return = 0.0
        for epi in self.total_episodes:
            data = epi.GetData()
            size = len(data)
            if size == 0:
                continue
            states, actions, rewards, values, logprobs = zip(*data)
            values = np.concatenate((values, np.zeros(1)), axis=0)
            advantages = np.zeros(size)
            ad_t = 0

            epi_return = 0.0
            # GAE: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),  A_t = delta_t + gamma * lambda * A_{t+1}
            for i in reversed(range(len(data))):
                epi_return += rewards[i]
                delta = rewards[i] + values[i + 1] * self.gamma - values[i]
                ad_t = delta + self.gamma * self.lb * ad_t
                advantages[i] = ad_t
            self.sum_return += epi_return
            TD = values[:size] + advantages

            for i in range(size):
                self.replay_buffer.Push(states[i], actions[i], logprobs[i], TD[i], advantages[i])
            if self.use_adaptive_sampling:
                for i in range(size):
                    self.marginal_buffer.Push(states[i][-self.marginal_state_num:], values[i])
        self.num_episode = len(self.total_episodes)
        self.num_tuple = len(self.replay_buffer.buffer)
        # print('SIM : {}'.format(self.num_tuple))
        self.num_tuple_so_far += self.num_tuple

        muscle_tuples = self.env.GetMuscleTuples()
        for i in range(len(muscle_tuples)):
            self.muscle_buffer.Push(muscle_tuples[i][0], muscle_tuples[i][1], muscle_tuples[i][2], muscle_tuples[i][3])

    def SampleStatesForMarginal(self):
        # MCMC : Metropolis-Hastings
        _marginal_samples = []
        marginal_sample_prob = []
        marginal_sample_cumulative_prob = []
        p_sb = 0.
        mcmc_idx = 0
        while len(_marginal_samples) < self.marginal_sample_num:
            # Generation
            state_sb_prime = self.env.SampleMarginalState()
            # Evaluation
            marginal_value = self.marginal_model(Tensor(state_sb_prime)).cpu().detach().numpy().reshape(-1)
            # print(marginal_value, state_sb_prime)
            p_sb_prime = math.exp(self.marginal_k * (1. - marginal_value / self.marginal_value_avg))
            # Rejection
            if p_sb_prime > np.random.rand() * p_sb:
                if mcmc_idx > self.mcmc_burn_in:
                    _marginal_samples.append(state_sb_prime)
                    marginal_sample_prob.append(p_sb_prime)
                p_sb = p_sb_prime
            mcmc_idx += 1

        sorted_y_idx_list = sorted(range(len(marginal_sample_prob)), key=lambda x: marginal_sample_prob[x])
        marginal_samples = [_marginal_samples[i] for i in sorted_y_idx_list]
        marginal_sample_prob.sort()

        marginal_sample_cumulative_prob.append(marginal_sample_prob[0])
        for i in range(1, len(marginal_sample_prob)):
            marginal_sample_cumulative_prob.append(marginal_sample_prob[i] + marginal_sample_cumulative_prob[-1])
        for i in range(len(marginal_sample_cumulative_prob)):
            marginal_sample_cumulative_prob[i] = marginal_sample_cumulative_prob[i] / marginal_sample_cumulative_prob[-1]
        # print(self.marginal_value_avg, sum(marginal_sample_cumulative_prob))

        # plt.figure(0)
        # plt.clf()
        # stride_idx = len(marginal_samples[0])-2
        # speed_idx = len(marginal_samples[0])-1
        # xx = []
        # yy = []
        #
        # for marginal_sample in marginal_samples:
        #     marginal_sample_exact = marginal_sample.copy()
        #     marginal_sample_exact[stride_idx] *= math.sqrt(0.00323409929)
        #     marginal_sample_exact[speed_idx] *= math.sqrt(0.00692930964)
        #     marginal_sample_exact[stride_idx] += 1.12620703
        #     marginal_sample_exact[speed_idx] += 0.994335964
        #
        #     xx.append(marginal_sample_exact[stride_idx])
        #     yy.append(marginal_sample_exact[speed_idx])
        #
        # plt.scatter(xx, yy)
        #
        # # plt.xlim(left=-3., right=3.)
        # # plt.ylim(bottom=-3., top=3.)
        # plt.xlim(left=0., right=2.)
        # plt.ylim(bottom=0., top=2.)
        # plt.show()
        # plt.pause(0.001)

        self.env.SetMarginalSampled(np.asarray(marginal_samples), marginal_sample_cumulative_prob)

    def GenerateTransitions(self):
        self.total_episodes = []
        states = [None] * self.num_slaves
        actions = [None] * self.num_slaves
        rewards = [None] * self.num_slaves
        states_next = [None] * self.num_slaves
        states = self.env.GetStates()
        local_step = 0
        terminated = [False] * self.num_slaves
        counter = 0
        while True:
            counter += 1
            # if counter % 10 == 0:
            #     print('SIM : {}'.format(local_step), end='\r')

            a_dist, v = self.model(Tensor(states))
            actions = a_dist.sample().cpu().detach().numpy()
            # actions = a_dist.loc.cpu().detach().numpy()
            logprobs = a_dist.log_prob(Tensor(actions)).cpu().detach().numpy().reshape(-1)
            values = v.cpu().detach().numpy().reshape(-1)
            self.env.SetActions(actions)
            if self.use_muscle:
                mt = Tensor(self.env.GetMuscleTorques())
                # two physics sub-steps per muscle-activation update
                for _ in range(self.num_simulation_per_control // 2):
                    dt = Tensor(self.env.GetDesiredTorques())
                    activations = self.muscle_model(mt, dt).cpu().detach().numpy()
                    self.env.SetActivationLevels(activations)
                    self.env.Steps(2)
            else:
                self.env.StepsAtOnce()

            for j in range(self.num_slaves):
                nan_occur = False
                terminated_state = True

                if np.any(np.isnan(states[j])) or np.any(np.isnan(actions[j])) or \
                        np.any(np.isnan(values[j])) or np.any(np.isnan(logprobs[j])):
                    nan_occur = True
                elif self.env.IsEndOfEpisode(j) is False:
                    terminated_state = False
                    rewards[j] = self.env.GetReward(j)
                    self.episodes[j].Push(states[j], actions[j], rewards[j], values[j], logprobs[j])
                    local_step += 1

                if terminated_state or nan_occur:
                    if nan_occur:
                        self.episodes[j].Pop()
                    self.total_episodes.append(self.episodes[j])
                    self.episodes[j] = EpisodeBuffer()
                    self.env.Reset(True, j)

            if local_step >= self.buffer_size:
                break
            states = self.env.GetStates()

    def OptimizeSimulationNN(self):
        all_transitions = np.array(self.replay_buffer.buffer)
        for j in range(self.num_epochs):
            np.random.shuffle(all_transitions)
            for i in range(len(all_transitions) // self.batch_size):
                transitions = all_transitions[i * self.batch_size:(i + 1) * self.batch_size]
                batch = Transition(*zip(*transitions))

                stack_s = np.vstack(batch.s).astype(np.float32)
                stack_a = np.vstack(batch.a).astype(np.float32)
                stack_lp = np.vstack(batch.logprob).astype(np.float32)
                stack_td = np.vstack(batch.TD).astype(np.float32)
                stack_gae = np.vstack(batch.GAE).astype(np.float32)

                a_dist, v = self.model(Tensor(stack_s))
                '''Critic Loss'''
                loss_critic = ((v - Tensor(stack_td)).pow(2)).mean()

                '''Actor Loss'''
                ratio = torch.exp(a_dist.log_prob(Tensor(stack_a)) - Tensor(stack_lp))
                stack_gae = (stack_gae - stack_gae.mean()) / (stack_gae.std() + 1E-5)
                stack_gae = Tensor(stack_gae)
                surrogate1 = ratio * stack_gae
                surrogate2 = torch.clamp(ratio, min=1.0 - self.clip_ratio, max=1.0 + self.clip_ratio) * stack_gae
                loss_actor = -torch.min(surrogate1, surrogate2).mean()

                '''Entropy Loss'''
                loss_entropy = -self.w_entropy * a_dist.entropy().mean()

                self.loss_actor = loss_actor.cpu().detach().numpy().tolist()
                self.loss_critic = loss_critic.cpu().detach().numpy().tolist()

                loss = loss_actor + loss_entropy + loss_critic

                self.optimizer.zero_grad()
                loss.backward(retain_graph=True)
                for param in self.model.parameters():
                    if param.grad is not None:
                        param.grad.data.clamp_(-0.5, 0.5)
                self.optimizer.step()
            # print('Optimizing sim nn : {}/{}'.format(j+1, self.num_epochs), end='\r')
        # print('')

    def OptimizeMuscleNN(self):
        muscle_transitions = np.array(self.muscle_buffer.buffer)
        for j in range(self.num_epochs_muscle):
            np.random.shuffle(muscle_transitions)
            for i in range(len(muscle_transitions) // self.muscle_batch_size):
                tuples = muscle_transitions[i * self.muscle_batch_size:(i + 1) * self.muscle_batch_size]
                batch = MuscleTransition(*zip(*tuples))

                stack_JtA = np.vstack(batch.JtA).astype(np.float32)
                stack_tau_des = np.vstack(batch.tau_des).astype(np.float32)
                stack_L = np.vstack(batch.L).astype(np.float32)
                stack_L = stack_L.reshape(self.muscle_batch_size, self.num_action, self.num_muscles)
                stack_b = np.vstack(batch.b).astype(np.float32)

                stack_JtA = Tensor(stack_JtA)
                stack_tau_des = Tensor(stack_tau_des)
                stack_L = Tensor(stack_L)
                stack_b = Tensor(stack_b)

                activation = self.muscle_model(stack_JtA, stack_tau_des)
                # reconstruct joint torques from the predicted activations: tau = L a + b
                tau = torch.einsum('ijk,ik->ij', (stack_L, activation)) + stack_b

                loss_reg = (activation).pow(2).mean()
                loss_target = (((tau - stack_tau_des) / 100.0).pow(2)).mean()

                loss = 0.01 * loss_reg + loss_target
                # loss = loss_target

                self.optimizer_muscle.zero_grad()
                loss.backward(retain_graph=True)
                for param in self.muscle_model.parameters():
                    if param.grad is not None:
                        param.grad.data.clamp_(-0.5, 0.5)
                self.optimizer_muscle.step()
            # print('Optimizing muscle nn : {}/{}'.format(j+1, self.num_epochs_muscle), end='\r')
        self.loss_muscle = loss.cpu().detach().numpy().tolist()
        # print('')

    def OptimizeMarginalNN(self):
        marginal_transitions = np.array(self.marginal_buffer.buffer)
        for j in range(self.num_epochs):
            np.random.shuffle(marginal_transitions)
            for i in range(len(marginal_transitions) // self.batch_size):
                transitions = marginal_transitions[i * self.batch_size:(i + 1) * self.batch_size]
                batch = MarginalTransition(*zip(*transitions))

                stack_sb = np.vstack(batch.sb).astype(np.float32)
                stack_v = np.vstack(batch.v).astype(np.float32)

                v = self.marginal_model(Tensor(stack_sb))

                # Marginal Loss
                loss_marginal = ((v - Tensor(stack_v)).pow(2)).mean()
                self.marginal_loss = loss_marginal.cpu().detach().numpy().tolist()

                self.marginal_optimizer.zero_grad()
                loss_marginal.backward(retain_graph=True)
                for param in self.marginal_model.parameters():
                    if param.grad is not None:
                        param.grad.data.clamp_(-0.5, 0.5)
                self.marginal_optimizer.step()

                # Marginal value average (running mean of observed marginal values)
                avg_marginal = Tensor(stack_v).mean().cpu().detach().numpy().tolist()
                self.marginal_value_avg -= self.marginal_learning_rate * (self.marginal_value_avg - avg_marginal)
            # print('Optimizing margin nn : {}/{}'.format(j+1, self.num_epochs), end='\r')
        # print('')

    def OptimizeModel(self):
        self.ComputeTDandGAE()
        self.OptimizeSimulationNN()
        if self.use_muscle:
            self.OptimizeMuscleNN()
        if self.use_adaptive_sampling:
            self.OptimizeMarginalNN()

    def Train(self, idx):
        if self.use_adaptive_sampling and (idx % self.mcmc_period == 0):
            self.SampleStatesForMarginal()
        self.GenerateTransitions()
        self.OptimizeModel()

    def Evaluate(self):
        self.num_evaluation = self.num_evaluation + 1
        h = int((time.time() - self.tic) // 3600.0)
        m = int((time.time() - self.tic) // 60.0)
        s = int(time.time() - self.tic)
        s = s - m * 60
        m = m - h * 60
        if self.num_episode == 0:
            self.num_episode = 1
        if self.num_tuple == 0:
            self.num_tuple = 1
        if self.max_return < self.sum_return / self.num_episode:
            self.max_return = self.sum_return / self.num_episode
            self.max_return_epoch = self.num_evaluation

        with open('../nn/log.txt', 'a') as f:
            f.write('# {} === {}h:{}m:{}s ===\n'.format(self.num_evaluation, h, m, s))
            f.write('||Loss Actor               : {:.4f}\n'.format(self.loss_actor))
            f.write('||Loss Critic              : {:.4f}\n'.format(self.loss_critic))
            if self.use_muscle:
                f.write('||Loss Muscle              : {:.4f}\n'.format(self.loss_muscle))
            if self.use_adaptive_sampling:
                f.write('||Loss Marginal            : {:.4f}\n'.format(self.marginal_loss))
            f.write('||Noise                    : {:.3f}\n'.format(self.model.log_std.exp().mean()))
            f.write('||Num Transition So far    : {}\n'.format(self.num_tuple_so_far))
            f.write('||Num Transition           : {}\n'.format(self.num_tuple))
            f.write('||Num Episode              : {}\n'.format(self.num_episode))
            f.write('||Avg Return per episode   : {:.3f}\n'.format(self.sum_return / self.num_episode))
            f.write('||Avg Reward per transition: {:.3f}\n'.format(self.sum_return / self.num_tuple))
            f.write('||Avg Step per episode     : {:.1f}\n'.format(self.num_tuple / self.num_episode))
            f.write('||Max Avg Return So far    : {:.3f} at #{}\n'.format(self.max_return, self.max_return_epoch))
            f.write('=============================================\n')
        self.rewards.append(self.sum_return / self.num_episode)
        self.SaveModel()
        return np.array(self.rewards)
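# Minimal usage sketch for the adaptive-sampling variant above (an assumption
# about the driving script; the function name, meta-file argument, slave count
# and iteration count are illustrative).
def run_adaptive_training(meta_file, num_slaves=16, num_iterations=50000):
    ppo = PPO(meta_file, num_slaves)
    rewards = None
    for i in range(num_iterations):
        ppo.Train(i)                 # re-runs SampleStatesForMarginal() every mcmc_period iterations
        rewards = ppo.Evaluate()     # appends avg return, writes ../nn/log.txt, saves checkpoints
    return rewards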