def train_v2(self, model_path, log_path):
    '''
    Every time slot, the network parameters are updated with the TD error.
    '''
    self.epi_rewards_mean, self.epi_durations, self.episode = [], [], []
    self.actor_losses, self.critic_losses = [], []
    self.step_count = []
    for epi in range(self.conf['num_episode']):  # episodes
        log_probs, values, rewards, masks = [], [], [], []
        entropy, epi_reward, epi_duration, step = 0, 0.0, 0.0, 0
        print("--- episode %s ---" % (epi))
        self.episode.append(epi)
        state = self.env.reset()  # [2*num_plant, 1]
        t = P.t_start
        while t < P.t_end:  # one episode (simulation)
            epi_duration = t
            if round(t, 3) * 1000 % 10 == 0:  # every 10 ms, schedule update
                # action = self.agent.select_action(state_ts)  # action type: tensor [1X1]
                state_ts = to_tensor(state).reshape(-1)  # [2*num_plant]
                dist, value = self.agent.actor(state_ts), self.agent.critic(state_ts)  # pi(s) and V(s)
                action = dist.sample()  # scalar tensor
                next_state, reward, done, info = self.env.step(
                    action.cpu().numpy().item(), t)  # next_state shape: [(2*num_plant) X 1]
                actor_loss, critic_loss = self.agent.optimization_model_v2(
                    dist, action, state_ts,
                    to_tensor(next_state).reshape(-1), reward, done)
                state = next_state
                epi_reward += reward
                step += 1
                self.step_count.append(step)
                rewards.append(reward)
                self.actor_losses.append(actor_loss)
                self.critic_losses.append(critic_loss)
                if done:
                    break
            else:  # every 1 ms
                self.env.update_plant_state(t)  # plant state update
            t = t + P.Ts

        print("epi_duration:", epi_duration)
        print("mean epi_reward:", epi_reward / len(rewards))

        # episode done
        self.epi_rewards_mean.append(epi_reward / len(rewards))
        self.epi_durations.append(epi_duration)

    # Save state_dict
    torch.save(self.agent.actor.state_dict(), model_path)
    # torch.save(self.agent.critic.state_dict(), CRITIC_MODEL)
    self.save_log(log_path)
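# optimization_model_v2 is called above but not shown in this section. The sketch
# below is a minimal one-step TD actor-critic update on the agent class that matches
# the call signature; the gamma attribute is an assumption, while optimizerA and
# optimizerC reuse the optimizer names from optimization_model further down.
def optimization_model_v2(self, dist, action, state_ts, next_state_ts, reward, done):
    value = self.critic(state_ts)                     # V(s_t)
    next_value = self.critic(next_state_ts).detach()  # V(s_{t+1}), no gradient through the target
    # TD target and TD error (used as the advantage estimate)
    td_target = reward + self.gamma * next_value * (1.0 - float(done))
    advantage = td_target - value

    actor_loss = -(dist.log_prob(action) * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()

    self.optimizerA.zero_grad()
    actor_loss.backward()
    self.optimizerA.step()

    self.optimizerC.zero_grad()
    critic_loss.backward()
    self.optimizerC.step()

    return actor_loss.item(), critic_loss.item()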
def train(self, model_path, log_path):
    self.epi_durations, self.epi_rewards_mean, self.episode, self.step_count, self.epi_loss = [], [], [], [], []
    for epi in range(self.conf['num_episode']):  # episodes
        rewards = []
        self.episode.append(epi)
        step = 0
        epi_reward = 0.0
        print("--- episode %s ---" % (epi))
        state = self.env.reset()
        state_ts = to_tensor(state).reshape(-1).unsqueeze(0)
        t = P.t_start
        while t < P.t_end:  # one episode (simulation)
            epi_duration = t
            if round(t, 3) * 1000 % 10 == 0:  # every 10 ms, schedule update
                action = self.agent.select_action(state_ts)
                next_state, reward, done, info = self.env.step(
                    action.item(), t)  # next_state shape: [(2*num_plant) X 1]
                done_mask = 0.0 if done else 1.0
                next_state_ts = to_tensor(next_state).reshape(-1).unsqueeze(0)
                reward_ts = to_tensor(np.asarray(reward).reshape(-1))
                self.agent.memory.push_transition(state_ts, action,
                                                  next_state_ts, reward_ts)
                state_ts = next_state_ts
                epi_reward += reward
                step += 1
                self.step_count.append(step)
                rewards.append(reward)
                if self.agent.memory.length() >= self.conf['memory_capacity']:
                    loss = self.agent.update()
                    self.epi_loss.append(loss)
                if done:
                    break
            else:  # every 1 ms
                self.env.update_plant_state(t)  # plant state update
            t = t + P.Ts

        if epi % self.conf['target_update'] == 0 and epi != 0:
            print("target update")
            self.agent.q_target.load_state_dict(self.agent.q.state_dict())

        self.epi_durations.append(epi_duration)
        self.epi_rewards_mean.append(epi_reward)
        print("epi_duration:", epi_duration)
        # print("epi_reward:%s, len:%s" % (epi_reward, len(rewards)))
        print("mean epi_reward:", epi_reward / len(rewards))

        if epi % 10 == 0 and epi != 0:
            torch.save(self.agent.q.state_dict(), model_path)
            self.save_log(log_path)

    torch.save(self.agent.q.state_dict(), model_path)
    self.save_log(log_path)
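# The DQN agent's update() method is called above but defined elsewhere. The sketch
# below shows the usual mini-batch Q-learning step it would perform; sample_batch(),
# the 'gamma' and 'batch_size' config keys, and the optimizer attribute are assumptions,
# and module-level imports (torch, F) from this file are reused.
def update(self):
    # Sample a mini-batch of transitions (s, a, s', r) from replay memory;
    # actions are expected as a [batch, 1] long tensor.
    states, actions, next_states, rewards = self.memory.sample_batch(self.conf['batch_size'])

    q_values = self.q(states).gather(1, actions.long())  # Q(s, a)
    with torch.no_grad():
        max_next_q = self.q_target(next_states).max(1, keepdim=True)[0]  # max_a' Q_target(s', a')
        target = rewards + self.conf['gamma'] * max_next_q

    loss = F.smooth_l1_loss(q_values, target)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()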
def test(self, env, model_path):
    test_actions = []
    # new agent
    new_agent = DQN(self.conf, self.device)
    new_agent.load_model(model_path)
    epi_reward = 0.0
    epi_duration = 0.0
    done = False
    state = env.reset()  # [2*num_plant, 1]
    state_ts = to_tensor(state).unsqueeze(0)  # [1, 2*num_plant, 1]; unsqueeze(0) matches the shape used by the replay memory
    # realtimePlot = testDataPlotter(self.conf)
    t = P.t_start
    while t < P.t_end:
        t_next_plot = t + P.t_plot
        epi_duration = t
        while t < t_next_plot:  # data plot period
            if round(t, 3) * 1000 % 10 == 0:  # every 10 ms, schedule update
                action = new_agent.select_action(state_ts)  # action type: tensor [1X1]
                next_state, reward, done, info = env.step(
                    action.item(), t)  # next_state shape: [(2*num_plant) X 1]
                state_ts = to_tensor(next_state).unsqueeze(0)  # track the latest state for the next action
                epi_reward += reward
                test_actions.append(action.item())
                if done:
                    break
            else:  # every 1 ms
                env.update_plant_state(t)  # plant state update
            t = t + P.Ts
        # self.update_dataPlot(realtimePlot, t, env)  # update data plot
        if done:
            break
def optimization_model(self, next_state, rewards, log_probs, values, masks):
    '''
    next_state : last state of the episode, used to bootstrap the return G_t
    rewards    : list of all rewards collected during the episode
    log_probs  : list of all log pi(a_t|s_t) during the episode (actor network)
    values     : list of all V(s_t) during the episode (critic network)
    '''
    next_state_ts = to_tensor(next_state).reshape(-1)  # [5*num_plant]
    next_value = self.critic(next_state_ts)  # V(s_{t+1})
    returns = self.compute_returns(next_value, rewards, masks)  # G_t = R + gamma * G_{t+1}

    log_probs = torch.cat(log_probs)
    returns = torch.cat(returns).detach()
    values = torch.cat(values)

    advantage = returns - values  # A = G_t - V(s_t)

    actor_loss = -(log_probs * advantage.detach()).mean()  # mean of -log pi(a|s) * A
    critic_loss = advantage.pow(2).mean()  # mean of A^2

    self.optimizerA.zero_grad()
    self.optimizerC.zero_grad()
    actor_loss.backward()
    critic_loss.backward()
    self.optimizerA.step()
    self.optimizerC.step()

    return actor_loss, critic_loss
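# compute_returns is referenced above but not shown in this section. A standard
# implementation of the discounted return G_t = r_t + gamma * G_{t+1}, bootstrapped
# from next_value and cut off at episode ends via masks, is sketched below
# (the default gamma value is an assumption):
def compute_returns(self, next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)  # prepend so returns[t] corresponds to rewards[t]
    return returns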
def scheduler(self, state):
    '''
    input  : states of all plants
    output : currently scheduled plant
    '''
    state_ts = to_tensor(state, is_cuda=False, device=self.device).reshape(-1)
    # print("state_ts:", state_ts)
    dist = self.agent.actor(state_ts)
    action = dist.sample()
    return action
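# to_tensor is used throughout but defined elsewhere. A typical helper matching the
# calls in this file (array-like in, float tensor out, optional device placement)
# might look like the sketch below; the exact signature and defaults are assumptions.
def to_tensor(ndarray, is_cuda=True, device=None, dtype=torch.float32):
    tensor = torch.as_tensor(np.asarray(ndarray), dtype=dtype)
    if device is not None:
        tensor = tensor.to(device)
    elif is_cuda and torch.cuda.is_available():
        tensor = tensor.cuda()
    return tensor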
def select_action(self, state, noise_enable=True, decay_epsilon=True):
    # input shape: [batch(=1) X state_dim]
    # actor output: tuple; action shape [batch X action_dim]
    action, _ = self.actor(to_tensor(state).reshape(-1).unsqueeze(0))
    action = action.cpu().detach().numpy().squeeze(0)  # action shape: [action_dim,]
    if noise_enable:
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
    action = np.clip(action, 0., 1.)  # clip values outside [0, 1] to the boundary
    if decay_epsilon:
        self.epsilon -= self.depsilon
    return action
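# random_process above is an exploration-noise generator defined elsewhere. A common
# choice for DDPG-style agents is an Ornstein-Uhlenbeck process; the class below is a
# sketch of that standard form (class name and parameter values are assumptions).
class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.reset()

    def reset(self):
        # restart the noise state at the mean
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - self.x_prev) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x_prev = self.x_prev + dx
        return self.x_prev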
def action_and_envStep(self, agent, env, t, state, algorithm):
    if algorithm == 'A2C' or algorithm == 'random':
        state_ts = to_tensor(state).reshape(-1)
        dist = agent.actor(state_ts)
        action = dist.sample()
        # schedule = env.action_to_schedule_v2(action.cpu().numpy(), self.conf['action_dim'])
        next_state, reward, done, info = env.step(action.cpu().numpy().item(), t)
    elif algorithm == 'sequence':
        action = agent.select_seqAction()
        # schedule = env.action_to_schedule_v2(action.cpu().numpy(), self.conf['action_dim'])
        next_state, reward, done, info = env.step(action.cpu().numpy().item(), t)
    if done:
        print("done true")
    return action, next_state, reward, done, info
def update_policy(self, memory, gamma=0.99):
    print("updating...")
    # Sample a batch of episodes
    # type: list | shape: (max_epi_length(2000)-1 X batch(32) X transition fields)
    experiences = memory.sample(self.conf['batch_size'])
    if len(experiences) == 0:  # not enough samples
        return

    dtype = torch.cuda.FloatTensor
    policy_loss_total = 0
    value_loss_total = 0

    for t in range(len(experiences) - 1):  # iterate over time steps
        # print("t:", t)
        target_cx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype)
        target_hx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype)
        cx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype)
        hx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype)

        # Unpack the sampled experience.
        # Shapes of state0, action, reward: [batch X state_dim], [batch X 1], [batch X 1]
        # For each episode in the batch, take only the state at time step t.
        state0 = np.stack([trajectory.state0 for trajectory in experiences[t]])
        # action = np.expand_dims(np.stack((trajectory.action for trajectory in experiences[t])), axis=1)
        action = np.stack([trajectory.action for trajectory in experiences[t]])
        reward = np.expand_dims(
            np.stack([trajectory.reward for trajectory in experiences[t]]), axis=1)
        # reward = np.stack((trajectory.reward for trajectory in experiences[t]))
        state1 = np.stack([trajectory.state0 for trajectory in experiences[t + 1]])

        target_action, (target_hx, target_cx) = self.actor_target(
            to_tensor(state1).reshape(self.conf['batch_size'], -1),
            (target_hx, target_cx))
        next_q_value = self.critic_target([
            to_tensor(state1).reshape(self.conf['batch_size'], -1),
            target_action
        ])
        target_q = to_tensor(reward) + gamma * next_q_value

        # Critic update
        current_q = self.critic([
            to_tensor(state0).reshape(self.conf['batch_size'], -1),
            to_tensor(action)
        ])
        value_loss = F.smooth_l1_loss(current_q, target_q)
        value_loss /= len(experiences)  # divide by trajectory length
        value_loss_total += value_loss

        # update per trajectory
        self.critic.zero_grad()
        value_loss.backward()

        # Actor update
        action, (hx, cx) = self.actor(
            to_tensor(state0).reshape(self.conf['batch_size'], -1), (hx, cx))
        policy_loss = -self.critic([
            to_tensor(state0).reshape(self.conf['batch_size'], -1), action
        ])
        policy_loss /= len(experiences)  # divide by trajectory length
        policy_loss_total += policy_loss.mean()
        policy_loss = policy_loss.mean()
        self.actor.zero_grad()
        policy_loss.backward()

        self.critic_optim.step()
        self.actor_optim.step()

    # Target update
    soft_update(self.actor_target, self.actor, self.tau)
    soft_update(self.critic_target, self.critic, self.tau)
    print("update finish!")
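# soft_update is called above but defined elsewhere. The standard Polyak averaging
# used for DDPG target networks is sketched below, assuming tau is the interpolation
# weight toward the online network:
def soft_update(target, source, tau):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)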
def train(self, model_path, log_path):
    self.epi_rewards_mean, self.epi_durations, self.episode = [], [], []
    self.actor_losses, self.critic_losses = [], []
    self.step_count = []
    for epi in range(self.conf['num_episode']):  # episodes
        log_probs, values, rewards, masks = [], [], [], []
        entropy, epi_reward, epi_duration, step = 0, 0.0, 0.0, 0
        print("--- episode %s ---" % (epi))
        self.episode.append(epi)
        state = self.env.reset()  # [2*num_plant, 1]
        t = P.t_start
        while t < P.t_end:  # one episode (simulation)
            epi_duration = t
            if round(t, 3) * 1000 % 10 == 0:  # every 10 ms, schedule update
                # action = self.agent.select_action(state_ts)  # action type: tensor [1X1]
                state_ts = to_tensor(state).reshape(-1)  # [2*num_plant]
                dist, value = self.agent.actor(state_ts), self.agent.critic(state_ts)  # pi(s) and V(s)
                action = dist.sample()  # scalar tensor
                next_state, reward, done, info = self.env.step(
                    action.cpu().numpy().item(), t)  # next_state shape: [(2*num_plant) X 1]
                log_prob = dist.log_prob(action).unsqueeze(0)  # [1] : log pi(a_t|s_t)
                entropy += dist.entropy().mean()
                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.tensor([reward], dtype=torch.float, device=self.device))
                masks.append(torch.tensor([1 - done], dtype=torch.float, device=self.device))
                # print("step reward: ", reward)
                state = next_state
                epi_reward += reward
                step += 1
                self.step_count.append(step)
                if done:
                    break
            else:  # every 1 ms
                self.env.update_plant_state(t)  # plant state update
            t = t + P.Ts

        # optimize - Monte Carlo update at the end of the episode
        actor_loss, critic_loss = self.agent.optimization_model(
            next_state, rewards, log_probs, values, masks)

        print("epi_duration:", epi_duration)
        print("mean epi_reward:", epi_reward / len(rewards))

        # episode done
        self.epi_rewards_mean.append(epi_reward / len(rewards))
        self.epi_durations.append(epi_duration)
        self.actor_losses.append(actor_loss.item())
        self.critic_losses.append(critic_loss.item())

        if epi % 10 == 0:
            torch.save(self.agent.actor.state_dict(), model_path)
            self.save_log(log_path)

    # Save state_dict
    torch.save(self.agent.actor.state_dict(), model_path)
    self.save_log(log_path)