import os
from datetime import datetime

import gym
import numpy as np
import torch
from torch import optim
from torch.utils.tensorboard import SummaryWriter  # or `from tensorboardX import SummaryWriter` on older setups
from tqdm import tqdm

# Agent, Memory, Transition and to_tensor are project-local; import them from
# this repo's own modules.


class Trainer(object):
    def __init__(self, agent: Agent, val_env: gym.Env, lr, memory_size,
                 target_update_freq, gradient_update_freq, batch_size,
                 replay_start, val_freq, log_freq_by_step, log_freq_by_ep,
                 val_epsilon, log_dir, weight_dir):
        """
        :param agent: agent object
        :param val_env: environment for validation
        :param lr: learning rate of the optimizer
        :param memory_size: size of the replay memory
        :param target_update_freq: frequency of target-network updates, in steps
        :param gradient_update_freq: frequency of Q-network updates, in steps
        :param batch_size: batch size for the Q-network
        :param replay_start: number of random-exploration steps before training starts
        :param val_freq: frequency of validation, in steps
        :param log_freq_by_step: frequency of logging, in steps
        :param log_freq_by_ep: frequency of logging, in episodes
        :param val_epsilon: exploration rate used during validation
        :param log_dir: directory for TensorBoard logs
        :param weight_dir: directory for saving weights at validation time
        """
        self.agent = agent
        self.env = self.agent.env
        self.val_env = val_env
        self.optimizer = optim.RMSprop(params=self.agent.net.parameters(), lr=lr)
        self.memory = Memory(memory_size)
        self.target_update_freq = target_update_freq
        self.batch_size = batch_size
        self.replay_start = replay_start
        self.gradient_update_freq = gradient_update_freq
        self._step = 0
        self._episode = 0
        self._warmed = False
        self._val_freq = val_freq
        self.log_freq_by_step = log_freq_by_step
        self.log_freq_by_ep = log_freq_by_ep
        self._val_epsilon = val_epsilon
        self._writer = SummaryWriter(
            os.path.join(log_dir, datetime.now().strftime('%b%d_%H-%M-%S')))
        if weight_dir is not None and not os.path.exists(weight_dir):
            os.makedirs(weight_dir)
        self.weight_dir = weight_dir

    def warm_up(self):
        # populate the replay memory with random-policy transitions
        state_before = self.env.reset()
        self._warmed = True
        for _ in tqdm(range(self.replay_start)):
            action = self.env.action_space.sample()
            state_after, reward, done, _ = self.env.step(action)
            self.memory(
                Transition(state_before, action, reward, state_after, done))
            state_before = self.env.reset() if done else state_after

    def _train_nn(self):
        # neural-network part: one loss evaluation and (periodically) one gradient step
        self.optimizer.zero_grad()
        (batch_state_before, batch_action, batch_reward,
         batch_state_after, batch_done) = self.get_batch()
        target = self.agent.estimate_value(batch_reward, batch_state_after, batch_done)
        q_value = self.agent.q_value(batch_state_before, batch_action)
        loss = self.agent.net.loss(q_value, target)
        if self._step % self.gradient_update_freq == 0:
            loss.backward()
            self.optimizer.step()
        if self._step % self.log_freq_by_step == 0:
            self._writer.add_scalar("epsilon", self.agent.epsilon, self._step)
            self._writer.add_scalar("q_net-target",
                                    (q_value.data - target.data).mean().item(),
                                    self._step)
            self._writer.add_scalar("loss", loss.item(), self._step)
        return loss.item()

    def _loop(self, is_train):
        # internal loop shared by training and validation
        done = False
        state_before = self.env.reset() if is_train else self.val_env.reset()
        loss_list = []
        reward_list = []
        while not done:
            epsilon = self.agent.epsilon if is_train else self._val_epsilon
            action = self.agent.policy(state_before, epsilon)
            state_after, reward, done, _ = (self.env.step(action) if is_train
                                            else self.val_env.step(action))
            if is_train:
                self._step += 1
                self.memory(
                    Transition(state_before, action, reward, state_after, done))
                self.agent.parameter_scheduler(self._step)
                loss_list.append(self._train_nn())
            state_before = state_after
            reward_list.append(reward)
            if self._step % self.target_update_freq == 0 and is_train:
                self.agent.update_target_net()
            if self._step % self._val_freq == 0 and is_train:
                self.val()
        return loss_list, reward_list, state_after

    def train(self, max_step):
        if not self._warmed:
            self.warm_up()
        try:
            while self._step < max_step:
                self._episode += 1
                train_loss, train_reward, _state = self._loop(is_train=True)
                if self._episode == 1:
                    # for checking whether the input is correct
                    self._writer.add_image("input", to_tensor(_state)[0], 0)
                if self._episode % self.log_freq_by_ep == 0:
                    self._writer.add_scalar("reward", sum(train_reward), self._step)
                    for name, param in self.agent.net.named_parameters():
                        self._writer.add_histogram(
                            f"qnet-{name}", param.clone().cpu().data.numpy(),
                            self._step)
                    for name, param in self.agent.target_net.named_parameters():
                        self._writer.add_histogram(
                            f"target-{name}", param.clone().cpu().data.numpy(),
                            self._step)
                    print(
                        f"episode: {self._episode:>5}/step: {self._step:>6}/"
                        f"loss: {np.mean(train_loss):>7.2f}/"
                        f"reward: {sum(train_reward):.2f}/size: {len(train_loss)}")
        except KeyboardInterrupt as ke:
            print(ke)
        finally:
            self._writer.close()

    def val(self):
        # validation: run one episode with val_epsilon and checkpoint the weights
        _, val_reward, _ = self._loop(is_train=False)
        self._writer.add_scalar("val_reward", sum(val_reward), self._step)
        if self.weight_dir is not None:
            self.agent.save(os.path.join(self.weight_dir, f"{self._step}.pkl"))

    def get_batch(self):
        # sample a minibatch from the replay memory and convert it to tensors
        batch = self.memory.sample(self.batch_size)
        batch_state_before = torch.cat(
            [to_tensor(m.state_before).unsqueeze(0) for m in batch], dim=0)
        batch_action = torch.LongTensor([m.action for m in batch])
        batch_reward = torch.Tensor([m.reward for m in batch])
        batch_state_after = torch.cat(
            [to_tensor(m.state_after).unsqueeze(0) for m in batch], dim=0)
        # tensor: 0 if done else 1
        batch_done = 1 - torch.Tensor([m.done for m in batch])
        return (batch_state_before, batch_action, batch_reward,
                batch_state_after, batch_done)
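
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): how a Trainer
# might be wired up. The environment id, the `Agent(env)` constructor
# signature, and every hyperparameter value below are assumptions chosen for
# demonstration only.
if __name__ == "__main__":
    env = gym.make("PongNoFrameskip-v4")       # assumed environment id
    val_env = gym.make("PongNoFrameskip-v4")   # separate env for validation
    agent = Agent(env)                         # constructor signature assumed

    trainer = Trainer(agent,
                      val_env,
                      lr=2.5e-4,
                      memory_size=100_000,
                      target_update_freq=10_000,
                      gradient_update_freq=4,
                      batch_size=32,
                      replay_start=10_000,
                      val_freq=50_000,
                      log_freq_by_step=1_000,
                      log_freq_by_ep=10,
                      val_epsilon=0.05,
                      log_dir="runs",
                      weight_dir="weights")
    trainer.train(max_step=1_000_000)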